Quellcode durchsuchen

Merge branch 'master' of https://github.com/okamstudio/godot

Conflicts:
	platform/windows/detect.py
Juan Linietsky vor 10 Jahren
Ursprung
Commit
b3cda43a0f
100 geänderte Dateien mit 36225 neuen und 11121 gelöschten Zeilen
  1. 4 0
      .gitattributes
  2. 9 1
      .gitignore
  3. 3 0
      SConstruct
  4. 107 0
      core/image.cpp
  5. 1 0
      core/image.h
  6. 359 359
      core/io/aes256.cpp
  7. 46 46
      core/io/aes256.h
  8. 2 2
      core/math/math_funcs.h
  9. 1 306
      core/os/input.cpp
  10. 2 71
      core/os/input.h
  11. 2 1
      core/os/main_loop.cpp
  12. 2 0
      core/os/main_loop.h
  13. 4 4
      core/ustring.cpp
  14. 456 186
      doc/base/classes.xml
  15. 3 1
      drivers/SCsub
  16. 2454 2454
      drivers/etc1/rg_etc1.cpp
  17. 76 76
      drivers/etc1/rg_etc1.h
  18. 0 1
      drivers/gles2/rasterizer_gles2.cpp
  19. 5814 5814
      drivers/nedmalloc/malloc.c.h
  20. 1467 1467
      drivers/nedmalloc/nedmalloc.cpp
  21. 302 302
      drivers/nedmalloc/nedmalloc.h
  22. 19 19
      drivers/openssl/register_openssl.cpp
  23. 11 11
      drivers/openssl/register_openssl.h
  24. 200 0
      drivers/opus/SCsub
  25. 645 0
      drivers/opus/analysis.c
  26. 90 0
      drivers/opus/analysis.h
  27. 376 0
      drivers/opus/audio_stream_opus.cpp
  28. 141 0
      drivers/opus/audio_stream_opus.h
  29. 183 0
      drivers/opus/celt/_kiss_fft_guts.h
  30. 214 0
      drivers/opus/celt/arch.h
  31. 316 0
      drivers/opus/celt/arm/arm2gnu.pl
  32. 49 0
      drivers/opus/celt/arm/arm_celt_map.c
  33. 174 0
      drivers/opus/celt/arm/armcpu.c
  34. 71 0
      drivers/opus/celt/arm/armcpu.h
  35. 37 0
      drivers/opus/celt/arm/armopts.s
  36. 37 0
      drivers/opus/celt/arm/armopts.s.in
  37. 545 0
      drivers/opus/celt/arm/celt_pitch_xcorr_arm.s
  38. 76 0
      drivers/opus/celt/arm/fixed_armv4.h
  39. 116 0
      drivers/opus/celt/arm/fixed_armv5e.h
  40. 121 0
      drivers/opus/celt/arm/kiss_fft_armv4.h
  41. 118 0
      drivers/opus/celt/arm/kiss_fft_armv5e.h
  42. 57 0
      drivers/opus/celt/arm/pitch_arm.h
  43. 1518 0
      drivers/opus/celt/bands.c
  44. 114 0
      drivers/opus/celt/bands.h
  45. 223 0
      drivers/opus/celt/celt.c
  46. 218 0
      drivers/opus/celt/celt.h
  47. 1195 0
      drivers/opus/celt/celt_decoder.c
  48. 2353 0
      drivers/opus/celt/celt_encoder.c
  49. 309 0
      drivers/opus/celt/celt_lpc.c
  50. 54 0
      drivers/opus/celt/celt_lpc.h
  51. 54 0
      drivers/opus/celt/cpu_support.h
  52. 697 0
      drivers/opus/celt/cwrs.c
  53. 48 0
      drivers/opus/celt/cwrs.h
  54. 87 0
      drivers/opus/celt/ecintrin.h
  55. 93 0
      drivers/opus/celt/entcode.c
  56. 117 0
      drivers/opus/celt/entcode.h
  57. 245 0
      drivers/opus/celt/entdec.c
  58. 100 0
      drivers/opus/celt/entdec.h
  59. 294 0
      drivers/opus/celt/entenc.c
  60. 110 0
      drivers/opus/celt/entenc.h
  61. 773 0
      drivers/opus/celt/fixed_debug.h
  62. 134 0
      drivers/opus/celt/fixed_generic.h
  63. 140 0
      drivers/opus/celt/float_cast.h
  64. 719 0
      drivers/opus/celt/kiss_fft.c
  65. 139 0
      drivers/opus/celt/kiss_fft.h
  66. 134 0
      drivers/opus/celt/laplace.c
  67. 48 0
      drivers/opus/celt/laplace.h
  68. 208 0
      drivers/opus/celt/mathops.c
  69. 258 0
      drivers/opus/celt/mathops.h
  70. 311 0
      drivers/opus/celt/mdct.c
  71. 70 0
      drivers/opus/celt/mdct.h
  72. 48 0
      drivers/opus/celt/mfrngcod.h
  73. 438 0
      drivers/opus/celt/modes.c
  74. 210 0
      drivers/opus/celt/opus_custom_demo.c
  75. 83 0
      drivers/opus/celt/opus_modes.h
  76. 92 0
      drivers/opus/celt/os_support.h
  77. 537 0
      drivers/opus/celt/pitch.c
  78. 173 0
      drivers/opus/celt/pitch.h
  79. 556 0
      drivers/opus/celt/quant_bands.c
  80. 66 0
      drivers/opus/celt/quant_bands.h
  81. 638 0
      drivers/opus/celt/rate.c
  82. 101 0
      drivers/opus/celt/rate.h
  83. 182 0
      drivers/opus/celt/stack_alloc.h
  84. 595 0
      drivers/opus/celt/static_modes_fixed.h
  85. 599 0
      drivers/opus/celt/static_modes_float.h
  86. 161 0
      drivers/opus/celt/tests/test_unit_cwrs32.c
  87. 164 0
      drivers/opus/celt/tests/test_unit_dft.c
  88. 382 0
      drivers/opus/celt/tests/test_unit_entropy.c
  89. 92 0
      drivers/opus/celt/tests/test_unit_laplace.c
  90. 275 0
      drivers/opus/celt/tests/test_unit_mathops.c
  91. 210 0
      drivers/opus/celt/tests/test_unit_mdct.c
  92. 90 0
      drivers/opus/celt/tests/test_unit_rotation.c
  93. 50 0
      drivers/opus/celt/tests/test_unit_types.c
  94. 415 0
      drivers/opus/celt/vq.c
  95. 70 0
      drivers/opus/celt/vq.h
  96. 156 0
      drivers/opus/celt/x86/pitch_sse.h
  97. 3391 0
      drivers/opus/http.c
  98. 687 0
      drivers/opus/info.c
  99. 42 0
      drivers/opus/internal.c
  100. 249 0
      drivers/opus/internal.h

+ 4 - 0
.gitattributes

@@ -0,0 +1,4 @@
+*.cpp eol=lf
+*.h eol=lf
+*.py eol=lf
+*.hpp eol=lf

+ 9 - 1
.gitignore

@@ -50,6 +50,14 @@ platform/android/libs/play_licensing/gen/*
 *.d
 *.so
 *.os
+*.Plo
+*.lo
+*.Po
+
+# Libs generated files
+.deps/*
+.dirstamp
+
 
 # QT project files
 *.config
@@ -282,4 +290,4 @@ cscope.in.out
 cscope.po.out
 godot.creator.*
 
-projects/
+projects/

+ 3 - 0
SConstruct

@@ -102,6 +102,7 @@ opts.Add('p','Platform (same as platform=).',"")
 opts.Add('tools','Build Tools (Including Editor): (yes/no)','yes')
 opts.Add('gdscript','Build GDSCript support: (yes/no)','yes')
 opts.Add('vorbis','Build Ogg Vorbis Support: (yes/no)','yes')
+opts.Add('opus','Build Opus Audio Format Support: (yes/no)','yes')
 opts.Add('minizip','Build Minizip Archive Support: (yes/no)','yes')
 opts.Add('squish','Squish BC Texture Compression in editor (yes/no)','yes')
 opts.Add('theora','Theora Video (yes/no)','yes')
@@ -299,6 +300,8 @@ if selected_platform in platform_list:
 
 	if (env['vorbis']=='yes'):
 		env.Append(CPPFLAGS=['-DVORBIS_ENABLED']);
+	if (env['opus']=='yes'):
+		env.Append(CPPFLAGS=['-DOPUS_ENABLED']);
 
 	if (env['theora']=='yes'):
 		env.Append(CPPFLAGS=['-DTHEORA_ENABLED']);

+ 107 - 0
core/image.cpp

@@ -400,6 +400,102 @@ Image::Format Image::get_format() const{
 	return format;
 }
 
+static double _bicubic_interp_kernel( double x ) {
+
+	x = ABS(x);
+
+	double bc = 0;
+
+	if ( x <= 1 )
+		bc = ( 1.5 * x - 2.5 ) * x * x + 1;
+	else if ( x < 2 )
+		bc = ( ( -0.5 * x + 2.5 ) * x - 4 ) * x + 2;
+
+
+	return bc;
+}
+
+template<int CC>
+static void _scale_cubic(const uint8_t* p_src, uint8_t* p_dst, uint32_t p_src_width, uint32_t p_src_height, uint32_t p_dst_width, uint32_t p_dst_height) {
+
+
+	// get source image size
+	int width   = p_src_width;
+	int height  = p_src_height;
+	double xfac = (double) width / p_dst_width;
+	double yfac = (double) height / p_dst_height;
+	// coordinates of source points and cooefficiens
+	double  ox, oy, dx, dy, k1, k2;
+	int     ox1, oy1, ox2, oy2;
+	// destination pixel values
+	// width and height decreased by 1
+	int ymax = height - 1;
+	int xmax = width - 1;
+	// temporary pointer
+
+	for ( int y = 0; y < p_dst_height; y++ ) {
+		// Y coordinates
+		oy  = (double) y * yfac - 0.5f;
+		oy1 = (int) oy;
+		dy  = oy - (double) oy1;
+
+		for ( int x = 0; x < p_dst_width; x++ )	{
+			// X coordinates
+			ox  = (double) x * xfac - 0.5f;
+			ox1 = (int) ox;
+			dx  = ox - (double) ox1;
+
+			// initial pixel value
+
+			uint8_t *dst=p_dst + (y*p_dst_width+x)*CC;
+
+			double color[CC];
+			for(int i=0;i<CC;i++) {
+				color[i]=0;
+			}
+
+
+
+			for ( int n = -1; n < 3; n++ ) {
+				// get Y cooefficient
+				k1 = _bicubic_interp_kernel( dy - (double) n );
+
+				oy2 = oy1 + n;
+				if ( oy2 < 0 )
+					oy2 = 0;
+				if ( oy2 > ymax )
+					oy2 = ymax;
+
+				for ( int m = -1; m < 3; m++ ) {
+					// get X cooefficient
+					k2 = k1 * _bicubic_interp_kernel( (double) m - dx );
+
+					ox2 = ox1 + m;
+					if ( ox2 < 0 )
+						ox2 = 0;
+					if ( ox2 > xmax )
+						ox2 = xmax;
+
+					// get pixel of original image
+					const uint8_t *p = p_src + (oy2 * p_src_width + ox2)*CC;
+
+					for(int i=0;i<CC;i++) {
+
+						color[i]+=p[i]*k2;
+					}
+				}
+			}
+
+			for(int i=0;i<CC;i++) {
+				dst[i]=CLAMP(Math::fast_ftoi(color[i]),0,255);
+			}
+		}
+	}
+}
+
+
+
+
 template<int CC>
 static void _scale_bilinear(const uint8_t* p_src, uint8_t* p_dst, uint32_t p_src_width, uint32_t p_src_height, uint32_t p_dst_width, uint32_t p_dst_height) {
 
@@ -559,6 +655,17 @@ void Image::resize( int p_width, int p_height, Interpolation p_interpolation ) {
 			}
 
 		} break;
+		case INTERPOLATE_CUBIC: {
+
+			switch(get_format_pixel_size(format)) {
+				case 1: _scale_cubic<1>(r_ptr,w_ptr,width,height,p_width,p_height); break;
+				case 2: _scale_cubic<2>(r_ptr,w_ptr,width,height,p_width,p_height); break;
+				case 3: _scale_cubic<3>(r_ptr,w_ptr,width,height,p_width,p_height); break;
+				case 4: _scale_cubic<4>(r_ptr,w_ptr,width,height,p_width,p_height); break;
+			}
+
+		} break;
+
 
 	}
 

+ 1 - 0
core/image.h

@@ -91,6 +91,7 @@ public:
 	
 		INTERPOLATE_NEAREST,
 		INTERPOLATE_BILINEAR,
+		INTERPOLATE_CUBIC,
 		/* INTERPOLATE GAUSS */
 	};
 

+ 359 - 359
core/io/aes256.cpp

@@ -1,359 +1,359 @@
-/*  
-*   Byte-oriented AES-256 implementation.
-*   All lookup tables replaced with 'on the fly' calculations. 
-*
-*   Copyright (c) 2007-2009 Ilya O. Levin, http://www.literatecode.com
-*   Other contributors: Hal Finney
-*
-*   Permission to use, copy, modify, and distribute this software for any
-*   purpose with or without fee is hereby granted, provided that the above
-*   copyright notice and this permission notice appear in all copies.
-*
-*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-*/
-#include "aes256.h"
-
-#define F(x)   (((x)<<1) ^ ((((x)>>7) & 1) * 0x1b))
-#define FD(x)  (((x) >> 1) ^ (((x) & 1) ? 0x8d : 0))
-
-// #define BACK_TO_TABLES
-#ifdef BACK_TO_TABLES
-
-const uint8_t sbox[256] = {
-    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
-    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
-    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
-    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
-    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
-    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
-    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
-    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
-    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
-    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
-    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
-    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
-    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
-    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
-    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
-    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
-    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
-    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
-    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
-    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
-    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
-    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
-    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
-    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
-    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
-    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
-    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
-    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
-    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
-    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
-    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
-    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
-};
-const uint8_t sboxinv[256] = {
-    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
-    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
-    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
-    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
-    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
-    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
-    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
-    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
-    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
-    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
-    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
-    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
-    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
-    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
-    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
-    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
-    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
-    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
-    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
-    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
-    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
-    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
-    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
-    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
-    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
-    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
-    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
-    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
-    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
-    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
-    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
-    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
-};
-
-#define rj_sbox(x)     sbox[(x)]
-#define rj_sbox_inv(x) sboxinv[(x)]
-
-#else /* tableless subroutines */
-
-/* -------------------------------------------------------------------------- */
-uint8_t gf_alog(uint8_t x) // calculate anti-logarithm gen 3
-{
-    uint8_t atb = 1, z;
-
-    while (x--) {z = atb; atb <<= 1; if (z & 0x80) atb^= 0x1b; atb ^= z;}
-
-    return atb;
-} /* gf_alog */
-
-/* -------------------------------------------------------------------------- */
-uint8_t gf_log(uint8_t x) // calculate logarithm gen 3
-{
-    uint8_t atb = 1, i = 0, z;
-
-    do {
-        if (atb == x) break;
-        z = atb; atb <<= 1; if (z & 0x80) atb^= 0x1b; atb ^= z;
-    } while (++i > 0);
-
-    return i;
-} /* gf_log */
-
-
-/* -------------------------------------------------------------------------- */
-uint8_t gf_mulinv(uint8_t x) // calculate multiplicative inverse
-{
-    return (x) ? gf_alog(255 - gf_log(x)) : 0;
-} /* gf_mulinv */
-
-/* -------------------------------------------------------------------------- */
-uint8_t rj_sbox(uint8_t x)
-{
-    uint8_t y, sb;
-
-    sb = y = gf_mulinv(x);
-    y = (y<<1)|(y>>7); sb ^= y;  y = (y<<1)|(y>>7); sb ^= y; 
-    y = (y<<1)|(y>>7); sb ^= y;  y = (y<<1)|(y>>7); sb ^= y;
-
-    return (sb ^ 0x63);
-} /* rj_sbox */
-
-/* -------------------------------------------------------------------------- */
-uint8_t rj_sbox_inv(uint8_t x)
-{
-    uint8_t y, sb;
-
-    y = x ^ 0x63;
-    sb = y = (y<<1)|(y>>7);
-    y = (y<<2)|(y>>6); sb ^= y; y = (y<<3)|(y>>5); sb ^= y;
-
-    return gf_mulinv(sb);
-} /* rj_sbox_inv */
-
-#endif
-
-/* -------------------------------------------------------------------------- */
-uint8_t rj_xtime(uint8_t x) 
-{
-    return (x & 0x80) ? ((x << 1) ^ 0x1b) : (x << 1);
-} /* rj_xtime */
-
-/* -------------------------------------------------------------------------- */
-void aes_subBytes(uint8_t *buf)
-{
-    register uint8_t i = 16;
-
-    while (i--) buf[i] = rj_sbox(buf[i]);
-} /* aes_subBytes */
-
-/* -------------------------------------------------------------------------- */
-void aes_subBytes_inv(uint8_t *buf)
-{
-    register uint8_t i = 16;
-
-    while (i--) buf[i] = rj_sbox_inv(buf[i]);
-} /* aes_subBytes_inv */
-
-/* -------------------------------------------------------------------------- */
-void aes_addRoundKey(uint8_t *buf, uint8_t *key)
-{
-    register uint8_t i = 16;
-
-    while (i--) buf[i] ^= key[i];
-} /* aes_addRoundKey */
-
-/* -------------------------------------------------------------------------- */
-void aes_addRoundKey_cpy(uint8_t *buf, uint8_t *key, uint8_t *cpk)
-{
-    register uint8_t i = 16;
-
-    while (i--)  buf[i] ^= (cpk[i] = key[i]), cpk[16+i] = key[16 + i];
-} /* aes_addRoundKey_cpy */
-
-
-/* -------------------------------------------------------------------------- */
-void aes_shiftRows(uint8_t *buf)
-{
-    register uint8_t i, j; /* to make it potentially parallelable :) */
-
-    i = buf[1]; buf[1] = buf[5]; buf[5] = buf[9]; buf[9] = buf[13]; buf[13] = i;
-    i = buf[10]; buf[10] = buf[2]; buf[2] = i;
-    j = buf[3]; buf[3] = buf[15]; buf[15] = buf[11]; buf[11] = buf[7]; buf[7] = j;
-    j = buf[14]; buf[14] = buf[6]; buf[6]  = j;
-
-} /* aes_shiftRows */
-
-/* -------------------------------------------------------------------------- */
-void aes_shiftRows_inv(uint8_t *buf)
-{
-    register uint8_t i, j; /* same as above :) */
-
-    i = buf[1]; buf[1] = buf[13]; buf[13] = buf[9]; buf[9] = buf[5]; buf[5] = i;
-    i = buf[2]; buf[2] = buf[10]; buf[10] = i;
-    j = buf[3]; buf[3] = buf[7]; buf[7] = buf[11]; buf[11] = buf[15]; buf[15] = j;
-    j = buf[6]; buf[6] = buf[14]; buf[14] = j;
-
-} /* aes_shiftRows_inv */
-
-/* -------------------------------------------------------------------------- */
-void aes_mixColumns(uint8_t *buf)
-{
-    register uint8_t i, a, b, c, d, e;
-
-    for (i = 0; i < 16; i += 4)
-    {
-        a = buf[i]; b = buf[i + 1]; c = buf[i + 2]; d = buf[i + 3];
-        e = a ^ b ^ c ^ d;
-        buf[i] ^= e ^ rj_xtime(a^b);   buf[i+1] ^= e ^ rj_xtime(b^c);
-        buf[i+2] ^= e ^ rj_xtime(c^d); buf[i+3] ^= e ^ rj_xtime(d^a);
-    }
-} /* aes_mixColumns */
-
-/* -------------------------------------------------------------------------- */
-void aes_mixColumns_inv(uint8_t *buf)
-{
-    register uint8_t i, a, b, c, d, e, x, y, z;
-
-    for (i = 0; i < 16; i += 4)
-    {
-        a = buf[i]; b = buf[i + 1]; c = buf[i + 2]; d = buf[i + 3];
-        e = a ^ b ^ c ^ d;
-        z = rj_xtime(e);
-        x = e ^ rj_xtime(rj_xtime(z^a^c));  y = e ^ rj_xtime(rj_xtime(z^b^d));
-        buf[i] ^= x ^ rj_xtime(a^b);   buf[i+1] ^= y ^ rj_xtime(b^c);
-        buf[i+2] ^= x ^ rj_xtime(c^d); buf[i+3] ^= y ^ rj_xtime(d^a);
-    }
-} /* aes_mixColumns_inv */
-
-/* -------------------------------------------------------------------------- */
-void aes_expandEncKey(uint8_t *k, uint8_t *rc) 
-{
-    register uint8_t i;
-
-    k[0] ^= rj_sbox(k[29]) ^ (*rc);
-    k[1] ^= rj_sbox(k[30]);
-    k[2] ^= rj_sbox(k[31]);
-    k[3] ^= rj_sbox(k[28]);
-    *rc = F( *rc);
-
-    for(i = 4; i < 16; i += 4)  k[i] ^= k[i-4],   k[i+1] ^= k[i-3],
-        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
-    k[16] ^= rj_sbox(k[12]);
-    k[17] ^= rj_sbox(k[13]);
-    k[18] ^= rj_sbox(k[14]);
-    k[19] ^= rj_sbox(k[15]);
-
-    for(i = 20; i < 32; i += 4) k[i] ^= k[i-4],   k[i+1] ^= k[i-3],
-        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
-
-} /* aes_expandEncKey */
-
-/* -------------------------------------------------------------------------- */
-void aes_expandDecKey(uint8_t *k, uint8_t *rc) 
-{
-    uint8_t i;
-
-    for(i = 28; i > 16; i -= 4) k[i+0] ^= k[i-4], k[i+1] ^= k[i-3], 
-        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
-
-    k[16] ^= rj_sbox(k[12]);
-    k[17] ^= rj_sbox(k[13]);
-    k[18] ^= rj_sbox(k[14]);
-    k[19] ^= rj_sbox(k[15]);
-
-    for(i = 12; i > 0; i -= 4)  k[i+0] ^= k[i-4], k[i+1] ^= k[i-3],
-        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
-
-    *rc = FD(*rc);
-    k[0] ^= rj_sbox(k[29]) ^ (*rc);
-    k[1] ^= rj_sbox(k[30]);
-    k[2] ^= rj_sbox(k[31]);
-    k[3] ^= rj_sbox(k[28]);
-} /* aes_expandDecKey */
-
-
-/* -------------------------------------------------------------------------- */
-void aes256_init(aes256_context *ctx, uint8_t *k)
-{
-    uint8_t rcon = 1;
-    register uint8_t i;
-
-    for (i = 0; i < sizeof(ctx->key); i++) ctx->enckey[i] = ctx->deckey[i] = k[i];
-    for (i = 8;--i;) aes_expandEncKey(ctx->deckey, &rcon);
-} /* aes256_init */
-
-/* -------------------------------------------------------------------------- */
-void aes256_done(aes256_context *ctx)
-{
-    register uint8_t i;
-
-    for (i = 0; i < sizeof(ctx->key); i++) 
-        ctx->key[i] = ctx->enckey[i] = ctx->deckey[i] = 0;
-} /* aes256_done */
-
-/* -------------------------------------------------------------------------- */
-void aes256_encrypt_ecb(aes256_context *ctx, uint8_t *buf)
-{
-    uint8_t i, rcon;
-
-    aes_addRoundKey_cpy(buf, ctx->enckey, ctx->key);
-    for(i = 1, rcon = 1; i < 14; ++i)
-    {
-        aes_subBytes(buf);
-        aes_shiftRows(buf);
-        aes_mixColumns(buf);
-        if( i & 1 ) aes_addRoundKey( buf, &ctx->key[16]);
-        else aes_expandEncKey(ctx->key, &rcon), aes_addRoundKey(buf, ctx->key);
-    }
-    aes_subBytes(buf);
-    aes_shiftRows(buf);
-    aes_expandEncKey(ctx->key, &rcon); 
-    aes_addRoundKey(buf, ctx->key);
-} /* aes256_encrypt */
-
-/* -------------------------------------------------------------------------- */
-void aes256_decrypt_ecb(aes256_context *ctx, uint8_t *buf)
-{
-    uint8_t i, rcon;
-
-    aes_addRoundKey_cpy(buf, ctx->deckey, ctx->key);
-    aes_shiftRows_inv(buf);
-    aes_subBytes_inv(buf);
-
-    for (i = 14, rcon = 0x80; --i;)
-    {
-        if( ( i & 1 ) )           
-        {
-            aes_expandDecKey(ctx->key, &rcon);
-            aes_addRoundKey(buf, &ctx->key[16]);
-        }
-        else aes_addRoundKey(buf, ctx->key);
-        aes_mixColumns_inv(buf);
-        aes_shiftRows_inv(buf);
-        aes_subBytes_inv(buf);
-    }
-    aes_addRoundKey( buf, ctx->key); 
-} /* aes256_decrypt */
+/*  
+*   Byte-oriented AES-256 implementation.
+*   All lookup tables replaced with 'on the fly' calculations. 
+*
+*   Copyright (c) 2007-2009 Ilya O. Levin, http://www.literatecode.com
+*   Other contributors: Hal Finney
+*
+*   Permission to use, copy, modify, and distribute this software for any
+*   purpose with or without fee is hereby granted, provided that the above
+*   copyright notice and this permission notice appear in all copies.
+*
+*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+#include "aes256.h"
+
+#define F(x)   (((x)<<1) ^ ((((x)>>7) & 1) * 0x1b))
+#define FD(x)  (((x) >> 1) ^ (((x) & 1) ? 0x8d : 0))
+
+// #define BACK_TO_TABLES
+#ifdef BACK_TO_TABLES
+
+const uint8_t sbox[256] = {
+    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
+    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
+    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
+    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
+    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
+    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
+    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
+    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
+    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
+    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
+    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
+    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+};
+const uint8_t sboxinv[256] = {
+    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+};
+
+#define rj_sbox(x)     sbox[(x)]
+#define rj_sbox_inv(x) sboxinv[(x)]
+
+#else /* tableless subroutines */
+
+/* -------------------------------------------------------------------------- */
+uint8_t gf_alog(uint8_t x) // calculate anti-logarithm gen 3
+{
+    uint8_t atb = 1, z;
+
+    while (x--) {z = atb; atb <<= 1; if (z & 0x80) atb^= 0x1b; atb ^= z;}
+
+    return atb;
+} /* gf_alog */
+
+/* -------------------------------------------------------------------------- */
+uint8_t gf_log(uint8_t x) // calculate logarithm gen 3
+{
+    uint8_t atb = 1, i = 0, z;
+
+    do {
+        if (atb == x) break;
+        z = atb; atb <<= 1; if (z & 0x80) atb^= 0x1b; atb ^= z;
+    } while (++i > 0);
+
+    return i;
+} /* gf_log */
+
+
+/* -------------------------------------------------------------------------- */
+uint8_t gf_mulinv(uint8_t x) // calculate multiplicative inverse
+{
+    return (x) ? gf_alog(255 - gf_log(x)) : 0;
+} /* gf_mulinv */
+
+/* -------------------------------------------------------------------------- */
+uint8_t rj_sbox(uint8_t x)
+{
+    uint8_t y, sb;
+
+    sb = y = gf_mulinv(x);
+    y = (y<<1)|(y>>7); sb ^= y;  y = (y<<1)|(y>>7); sb ^= y; 
+    y = (y<<1)|(y>>7); sb ^= y;  y = (y<<1)|(y>>7); sb ^= y;
+
+    return (sb ^ 0x63);
+} /* rj_sbox */
+
+/* -------------------------------------------------------------------------- */
+uint8_t rj_sbox_inv(uint8_t x)
+{
+    uint8_t y, sb;
+
+    y = x ^ 0x63;
+    sb = y = (y<<1)|(y>>7);
+    y = (y<<2)|(y>>6); sb ^= y; y = (y<<3)|(y>>5); sb ^= y;
+
+    return gf_mulinv(sb);
+} /* rj_sbox_inv */
+
+#endif
+
+/* -------------------------------------------------------------------------- */
+uint8_t rj_xtime(uint8_t x) 
+{
+    return (x & 0x80) ? ((x << 1) ^ 0x1b) : (x << 1);
+} /* rj_xtime */
+
+/* -------------------------------------------------------------------------- */
+void aes_subBytes(uint8_t *buf)
+{
+    register uint8_t i = 16;
+
+    while (i--) buf[i] = rj_sbox(buf[i]);
+} /* aes_subBytes */
+
+/* -------------------------------------------------------------------------- */
+void aes_subBytes_inv(uint8_t *buf)
+{
+    register uint8_t i = 16;
+
+    while (i--) buf[i] = rj_sbox_inv(buf[i]);
+} /* aes_subBytes_inv */
+
+/* -------------------------------------------------------------------------- */
+void aes_addRoundKey(uint8_t *buf, uint8_t *key)
+{
+    register uint8_t i = 16;
+
+    while (i--) buf[i] ^= key[i];
+} /* aes_addRoundKey */
+
+/* -------------------------------------------------------------------------- */
+void aes_addRoundKey_cpy(uint8_t *buf, uint8_t *key, uint8_t *cpk)
+{
+    register uint8_t i = 16;
+
+    while (i--)  buf[i] ^= (cpk[i] = key[i]), cpk[16+i] = key[16 + i];
+} /* aes_addRoundKey_cpy */
+
+
+/* -------------------------------------------------------------------------- */
+void aes_shiftRows(uint8_t *buf)
+{
+    register uint8_t i, j; /* to make it potentially parallelable :) */
+
+    i = buf[1]; buf[1] = buf[5]; buf[5] = buf[9]; buf[9] = buf[13]; buf[13] = i;
+    i = buf[10]; buf[10] = buf[2]; buf[2] = i;
+    j = buf[3]; buf[3] = buf[15]; buf[15] = buf[11]; buf[11] = buf[7]; buf[7] = j;
+    j = buf[14]; buf[14] = buf[6]; buf[6]  = j;
+
+} /* aes_shiftRows */
+
+/* -------------------------------------------------------------------------- */
+void aes_shiftRows_inv(uint8_t *buf)
+{
+    register uint8_t i, j; /* same as above :) */
+
+    i = buf[1]; buf[1] = buf[13]; buf[13] = buf[9]; buf[9] = buf[5]; buf[5] = i;
+    i = buf[2]; buf[2] = buf[10]; buf[10] = i;
+    j = buf[3]; buf[3] = buf[7]; buf[7] = buf[11]; buf[11] = buf[15]; buf[15] = j;
+    j = buf[6]; buf[6] = buf[14]; buf[14] = j;
+
+} /* aes_shiftRows_inv */
+
+/* -------------------------------------------------------------------------- */
+void aes_mixColumns(uint8_t *buf)
+{
+    register uint8_t i, a, b, c, d, e;
+
+    for (i = 0; i < 16; i += 4)
+    {
+        a = buf[i]; b = buf[i + 1]; c = buf[i + 2]; d = buf[i + 3];
+        e = a ^ b ^ c ^ d;
+        buf[i] ^= e ^ rj_xtime(a^b);   buf[i+1] ^= e ^ rj_xtime(b^c);
+        buf[i+2] ^= e ^ rj_xtime(c^d); buf[i+3] ^= e ^ rj_xtime(d^a);
+    }
+} /* aes_mixColumns */
+
+/* -------------------------------------------------------------------------- */
+void aes_mixColumns_inv(uint8_t *buf)
+{
+    register uint8_t i, a, b, c, d, e, x, y, z;
+
+    for (i = 0; i < 16; i += 4)
+    {
+        a = buf[i]; b = buf[i + 1]; c = buf[i + 2]; d = buf[i + 3];
+        e = a ^ b ^ c ^ d;
+        z = rj_xtime(e);
+        x = e ^ rj_xtime(rj_xtime(z^a^c));  y = e ^ rj_xtime(rj_xtime(z^b^d));
+        buf[i] ^= x ^ rj_xtime(a^b);   buf[i+1] ^= y ^ rj_xtime(b^c);
+        buf[i+2] ^= x ^ rj_xtime(c^d); buf[i+3] ^= y ^ rj_xtime(d^a);
+    }
+} /* aes_mixColumns_inv */
+
+/* -------------------------------------------------------------------------- */
+void aes_expandEncKey(uint8_t *k, uint8_t *rc) 
+{
+    register uint8_t i;
+
+    k[0] ^= rj_sbox(k[29]) ^ (*rc);
+    k[1] ^= rj_sbox(k[30]);
+    k[2] ^= rj_sbox(k[31]);
+    k[3] ^= rj_sbox(k[28]);
+    *rc = F( *rc);
+
+    for(i = 4; i < 16; i += 4)  k[i] ^= k[i-4],   k[i+1] ^= k[i-3],
+        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
+    k[16] ^= rj_sbox(k[12]);
+    k[17] ^= rj_sbox(k[13]);
+    k[18] ^= rj_sbox(k[14]);
+    k[19] ^= rj_sbox(k[15]);
+
+    for(i = 20; i < 32; i += 4) k[i] ^= k[i-4],   k[i+1] ^= k[i-3],
+        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
+
+} /* aes_expandEncKey */
+
+/* -------------------------------------------------------------------------- */
+void aes_expandDecKey(uint8_t *k, uint8_t *rc) 
+{
+    uint8_t i;
+
+    for(i = 28; i > 16; i -= 4) k[i+0] ^= k[i-4], k[i+1] ^= k[i-3], 
+        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
+
+    k[16] ^= rj_sbox(k[12]);
+    k[17] ^= rj_sbox(k[13]);
+    k[18] ^= rj_sbox(k[14]);
+    k[19] ^= rj_sbox(k[15]);
+
+    for(i = 12; i > 0; i -= 4)  k[i+0] ^= k[i-4], k[i+1] ^= k[i-3],
+        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
+
+    *rc = FD(*rc);
+    k[0] ^= rj_sbox(k[29]) ^ (*rc);
+    k[1] ^= rj_sbox(k[30]);
+    k[2] ^= rj_sbox(k[31]);
+    k[3] ^= rj_sbox(k[28]);
+} /* aes_expandDecKey */
+
+
+/* -------------------------------------------------------------------------- */
+void aes256_init(aes256_context *ctx, uint8_t *k)
+{
+    uint8_t rcon = 1;
+    register uint8_t i;
+
+    for (i = 0; i < sizeof(ctx->key); i++) ctx->enckey[i] = ctx->deckey[i] = k[i];
+    for (i = 8;--i;) aes_expandEncKey(ctx->deckey, &rcon);
+} /* aes256_init */
+
+/* -------------------------------------------------------------------------- */
+void aes256_done(aes256_context *ctx)
+{
+    register uint8_t i;
+
+    for (i = 0; i < sizeof(ctx->key); i++) 
+        ctx->key[i] = ctx->enckey[i] = ctx->deckey[i] = 0;
+} /* aes256_done */
+
+/* -------------------------------------------------------------------------- */
+void aes256_encrypt_ecb(aes256_context *ctx, uint8_t *buf)
+{
+    uint8_t i, rcon;
+
+    aes_addRoundKey_cpy(buf, ctx->enckey, ctx->key);
+    for(i = 1, rcon = 1; i < 14; ++i)
+    {
+        aes_subBytes(buf);
+        aes_shiftRows(buf);
+        aes_mixColumns(buf);
+        if( i & 1 ) aes_addRoundKey( buf, &ctx->key[16]);
+        else aes_expandEncKey(ctx->key, &rcon), aes_addRoundKey(buf, ctx->key);
+    }
+    aes_subBytes(buf);
+    aes_shiftRows(buf);
+    aes_expandEncKey(ctx->key, &rcon); 
+    aes_addRoundKey(buf, ctx->key);
+} /* aes256_encrypt */
+
+/* -------------------------------------------------------------------------- */
+void aes256_decrypt_ecb(aes256_context *ctx, uint8_t *buf)
+{
+    uint8_t i, rcon;
+
+    aes_addRoundKey_cpy(buf, ctx->deckey, ctx->key);
+    aes_shiftRows_inv(buf);
+    aes_subBytes_inv(buf);
+
+    for (i = 14, rcon = 0x80; --i;)
+    {
+        if( ( i & 1 ) )           
+        {
+            aes_expandDecKey(ctx->key, &rcon);
+            aes_addRoundKey(buf, &ctx->key[16]);
+        }
+        else aes_addRoundKey(buf, ctx->key);
+        aes_mixColumns_inv(buf);
+        aes_shiftRows_inv(buf);
+        aes_subBytes_inv(buf);
+    }
+    aes_addRoundKey( buf, ctx->key); 
+} /* aes256_decrypt */

+ 46 - 46
core/io/aes256.h

@@ -1,46 +1,46 @@
-/*  
-*   Byte-oriented AES-256 implementation.
-*   All lookup tables replaced with 'on the fly' calculations. 
-*
-*   Copyright (c) 2007-2009 Ilya O. Levin, http://www.literatecode.com
-*   Other contributors: Hal Finney
-*
-*   Permission to use, copy, modify, and distribute this software for any
-*   purpose with or without fee is hereby granted, provided that the above
-*   copyright notice and this permission notice appear in all copies.
-*
-*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-*/
-
-#ifndef AES_256_H
-#define AES_256_H
-
-#include "typedefs.h"
-
-#ifdef __cplusplus
-extern "C" { 
-#endif
-
-    typedef struct {
-        uint8_t key[32]; 
-        uint8_t enckey[32]; 
-        uint8_t deckey[32];
-    } aes256_context; 
-
-
-    void aes256_init(aes256_context *, uint8_t * /* key */);
-    void aes256_done(aes256_context *);
-    void aes256_encrypt_ecb(aes256_context *, uint8_t * /* plaintext */);
-    void aes256_decrypt_ecb(aes256_context *, uint8_t * /* cipertext */);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
+/*  
+*   Byte-oriented AES-256 implementation.
+*   All lookup tables replaced with 'on the fly' calculations. 
+*
+*   Copyright (c) 2007-2009 Ilya O. Levin, http://www.literatecode.com
+*   Other contributors: Hal Finney
+*
+*   Permission to use, copy, modify, and distribute this software for any
+*   purpose with or without fee is hereby granted, provided that the above
+*   copyright notice and this permission notice appear in all copies.
+*
+*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#ifndef AES_256_H
+#define AES_256_H
+
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+    typedef struct {
+        uint8_t key[32]; 
+        uint8_t enckey[32]; 
+        uint8_t deckey[32];
+    } aes256_context; 
+
+
+    void aes256_init(aes256_context *, uint8_t * /* key */);
+    void aes256_done(aes256_context *);
+    void aes256_encrypt_ecb(aes256_context *, uint8_t * /* plaintext */);
+    void aes256_decrypt_ecb(aes256_context *, uint8_t * /* cipertext */);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

+ 2 - 2
core/math/math_funcs.h

@@ -79,9 +79,9 @@ public:
 		return Math::log( p_linear ) * 8.6858896380650365530225783783321;
 	}
 
-	static inline double db2linear(double p_linear) {
+	static inline double db2linear(double p_db) {
 
-		return Math::exp( p_linear * 0.11512925464970228420089957273422 );
+		return Math::exp( p_db * 0.11512925464970228420089957273422 );
 	}
 
 	static bool is_nan(double p_val);

+ 1 - 306
core/os/input.cpp

@@ -64,6 +64,7 @@ void Input::_bind_methods() {
 	ObjectTypeDB::bind_method(_MD("warp_mouse_pos","to"),&Input::warp_mouse_pos);
 	ObjectTypeDB::bind_method(_MD("action_press"),&Input::action_press);
 	ObjectTypeDB::bind_method(_MD("action_release"),&Input::action_release);
+	ObjectTypeDB::bind_method(_MD("set_custom_mouse_cursor","image:Texture","hotspot"),&Input::set_custom_mouse_cursor,DEFVAL(Vector2()));
 
 	BIND_CONSTANT( MOUSE_MODE_VISIBLE );
 	BIND_CONSTANT( MOUSE_MODE_HIDDEN );
@@ -104,309 +105,3 @@ Input::Input() {
 
 //////////////////////////////////////////////////////////
 
-
-void InputDefault::SpeedTrack::update(const Vector2& p_delta_p) {
-
-	uint64_t tick = OS::get_singleton()->get_ticks_usec();
-	uint32_t tdiff = tick-last_tick;
-	float delta_t = tdiff / 1000000.0;
-	last_tick=tick;
-
-
-	accum+=p_delta_p;
-	accum_t+=delta_t;
-
-	if (accum_t>max_ref_frame*10)
-		accum_t=max_ref_frame*10;
-
-	while( accum_t>=min_ref_frame ) {
-
-		float slice_t = min_ref_frame / accum_t;
-		Vector2 slice = accum*slice_t;
-		accum=accum-slice;
-		accum_t-=min_ref_frame;
-
-		speed=(slice/min_ref_frame).linear_interpolate(speed,min_ref_frame/max_ref_frame);
-	}
-
-
-
-}
-
-void InputDefault::SpeedTrack::reset() {
-	last_tick = OS::get_singleton()->get_ticks_usec();
-	speed=Vector2();
-	accum_t=0;
-}
-
-InputDefault::SpeedTrack::SpeedTrack() {
-
-	 min_ref_frame=0.1;
-	 max_ref_frame=0.3;
-	 reset();
-}
-
-bool InputDefault::is_key_pressed(int p_scancode) {
-
-	_THREAD_SAFE_METHOD_
-	return keys_pressed.has(p_scancode);
-}
-
-bool InputDefault::is_mouse_button_pressed(int p_button) {
-
-	_THREAD_SAFE_METHOD_
-	return (mouse_button_mask&(1<<p_button))!=0;
-}
-
-
-static int _combine_device(int p_value,int p_device) {
-
-	return p_value|(p_device<<20);
-}
-
-bool InputDefault::is_joy_button_pressed(int p_device, int p_button) {
-
-	_THREAD_SAFE_METHOD_
-	return joy_buttons_pressed.has(_combine_device(p_button,p_device));
-}
-
-bool InputDefault::is_action_pressed(const StringName& p_action) {
-
-	if (custom_action_press.has(p_action))
-		return true; //simpler
-
-	const List<InputEvent> *alist = InputMap::get_singleton()->get_action_list(p_action);
-	if (!alist)
-		return NULL;
-
-
-	for (const List<InputEvent>::Element *E=alist->front();E;E=E->next()) {
-
-
-		int device=E->get().device;
-
-		switch(E->get().type) {
-
-			case InputEvent::KEY: {
-
-				const InputEventKey &iek=E->get().key;
-				if ((keys_pressed.has(iek.scancode)))
-					return true;
-			} break;
-			case InputEvent::MOUSE_BUTTON: {
-
-				const InputEventMouseButton &iemb=E->get().mouse_button;
-				 if(mouse_button_mask&(1<<iemb.button_index))
-					 return true;
-			} break;
-			case InputEvent::JOYSTICK_BUTTON: {
-
-				const InputEventJoystickButton &iejb=E->get().joy_button;
-				int c = _combine_device(iejb.button_index,device);
-				if (joy_buttons_pressed.has(c))
-					return true;
-			} break;
-		}
-	}
-
-	return false;
-}
-
-float InputDefault::get_joy_axis(int p_device,int p_axis) {
-
-	_THREAD_SAFE_METHOD_
-	int c = _combine_device(p_axis,p_device);
-	if (joy_axis.has(c)) {
-		return joy_axis[c];
-	} else {
-		return 0;
-	}
-}
-
-String InputDefault::get_joy_name(int p_idx) {
-
-	_THREAD_SAFE_METHOD_
-	return joy_names[p_idx];
-};
-
-void InputDefault::joy_connection_changed(int p_idx, bool p_connected, String p_name) {
-
-	_THREAD_SAFE_METHOD_
-	joy_names[p_idx] = p_connected ? p_name : "";
-
-	emit_signal("joy_connection_changed", p_idx, p_connected);
-};
-
-Vector3 InputDefault::get_accelerometer() {
-
-	_THREAD_SAFE_METHOD_
-	return accelerometer;
-}
-
-void InputDefault::parse_input_event(const InputEvent& p_event) {
-
-	_THREAD_SAFE_METHOD_
-	switch(p_event.type) {
-
-		case InputEvent::KEY: {
-
-			if (p_event.key.echo)
-				break;
-			if (p_event.key.scancode==0)
-				break;
-
-		//	print_line(p_event);
-
-			if (p_event.key.pressed)
-				keys_pressed.insert(p_event.key.scancode);
-			else
-				keys_pressed.erase(p_event.key.scancode);
-		} break;
-		case InputEvent::MOUSE_BUTTON: {
-
-			if (p_event.mouse_button.doubleclick)
-				break;
-
-			if (p_event.mouse_button.pressed)
-				mouse_button_mask|=(1<<p_event.mouse_button.button_index);
-			else
-				mouse_button_mask&=~(1<<p_event.mouse_button.button_index);
-
-			if (main_loop && emulate_touch && p_event.mouse_button.button_index==1) {
-				InputEventScreenTouch touch_event;
-				touch_event.index=0;
-				touch_event.pressed=p_event.mouse_button.pressed;
-				touch_event.x=p_event.mouse_button.x;
-				touch_event.y=p_event.mouse_button.y;
-				InputEvent ev;
-				ev.type=InputEvent::SCREEN_TOUCH;
-				ev.screen_touch=touch_event;
-				main_loop->input_event(ev);
-			}
-		} break;
-		case InputEvent::MOUSE_MOTION: {
-
-			if (main_loop && emulate_touch && p_event.mouse_motion.button_mask&1) {
-				InputEventScreenDrag drag_event;
-				drag_event.index=0;
-				drag_event.x=p_event.mouse_motion.x;
-				drag_event.y=p_event.mouse_motion.y;
-				drag_event.relative_x=p_event.mouse_motion.relative_x;
-				drag_event.relative_y=p_event.mouse_motion.relative_y;
-				drag_event.speed_x=p_event.mouse_motion.speed_x;
-				drag_event.speed_y=p_event.mouse_motion.speed_y;
-
-				InputEvent ev;
-				ev.type=InputEvent::SCREEN_DRAG;
-				ev.screen_drag=drag_event;
-
-				main_loop->input_event(ev);
-			}
-
-		} break;
-		case InputEvent::JOYSTICK_BUTTON: {
-
-			int c = _combine_device(p_event.joy_button.button_index,p_event.device);
-
-			if (p_event.joy_button.pressed)
-				joy_buttons_pressed.insert(c);
-			else
-				joy_buttons_pressed.erase(c);
-		} break;
-		case InputEvent::JOYSTICK_MOTION: {
-			set_joy_axis(p_event.device, p_event.joy_motion.axis, p_event.joy_motion.axis_value);
-		} break;
-
-	}
-
-	if (main_loop)
-		main_loop->input_event(p_event);
-
-}
-
-void InputDefault::set_joy_axis(int p_device,int p_axis,float p_value) {
-
-	_THREAD_SAFE_METHOD_
-	int c = _combine_device(p_axis,p_device);
-	joy_axis[c]=p_value;
-}
-
-void InputDefault::set_accelerometer(const Vector3& p_accel) {
-
-	_THREAD_SAFE_METHOD_
-
-	accelerometer=p_accel;
-
-}
-
-void InputDefault::set_main_loop(MainLoop *p_main_loop) {
-	main_loop=p_main_loop;
-
-}
-
-void InputDefault::set_mouse_pos(const Point2& p_posf) {
-
-	mouse_speed_track.update(p_posf-mouse_pos);
-	mouse_pos=p_posf;
-}
-
-Point2 InputDefault::get_mouse_pos() const {
-
-	return mouse_pos;
-}
-Point2 InputDefault::get_mouse_speed() const {
-
-	return mouse_speed_track.speed;
-}
-
-int InputDefault::get_mouse_button_mask() const {
-
-	return OS::get_singleton()->get_mouse_button_state();
-}
-
-void InputDefault::warp_mouse_pos(const Vector2& p_to) {
-
-	OS::get_singleton()->warp_mouse_pos(p_to);
-}
-
-
-void InputDefault::iteration(float p_step) {
-
-
-}
-
-void InputDefault::action_press(const StringName& p_action) {
-
-	if (custom_action_press.has(p_action)) {
-
-		custom_action_press[p_action]++;
-	} else {
-		custom_action_press[p_action]=1;
-	}
-}
-
-void InputDefault::action_release(const StringName& p_action){
-
-	ERR_FAIL_COND(!custom_action_press.has(p_action));
-	custom_action_press[p_action]--;
-	if (custom_action_press[p_action]==0) {
-		custom_action_press.erase(p_action);
-	}
-}
-
-void InputDefault::set_emulate_touch(bool p_emulate) {
-
-	emulate_touch=p_emulate;
-}
-
-bool InputDefault::is_emulating_touchscreen() const {
-
-	return emulate_touch;
-}
-
-InputDefault::InputDefault() {
-
-	mouse_button_mask=0;
-	emulate_touch=false;
-	main_loop=NULL;
-}

+ 2 - 71
core/os/input.h

@@ -80,82 +80,13 @@ public:
 
 	virtual bool is_emulating_touchscreen() const=0;
 
+	virtual void set_custom_mouse_cursor(const RES& p_cursor,const Vector2& p_hotspot=Vector2())=0;
+	virtual void set_mouse_in_window(bool p_in_window)=0;
 
 	Input();
 };
 
 VARIANT_ENUM_CAST(Input::MouseMode);
 
-class InputDefault : public Input {
-
-	OBJ_TYPE( InputDefault, Input );
-	_THREAD_SAFE_CLASS_
-
-	int mouse_button_mask;
-	Set<int> keys_pressed;
-	Set<int> joy_buttons_pressed;
-	Map<int,float> joy_axis;
-	Map<StringName,int> custom_action_press;
-	Map<int, String> joy_names;
-	Vector3 accelerometer;
-	Vector2 mouse_pos;
-	MainLoop *main_loop;
-
-	bool emulate_touch;
-
-	struct SpeedTrack {
-
-		uint64_t last_tick;
-		Vector2 speed;
-		Vector2 accum;
-		float accum_t;
-		float min_ref_frame;
-		float max_ref_frame;
-
-		void update(const Vector2& p_delta_p);
-		void reset();
-		SpeedTrack();
-	};
-
-	SpeedTrack mouse_speed_track;
-
-public:
-
-	virtual bool is_key_pressed(int p_scancode);
-	virtual bool is_mouse_button_pressed(int p_button);
-	virtual bool is_joy_button_pressed(int p_device, int p_button);
-	virtual bool is_action_pressed(const StringName& p_action);
-
-	virtual float get_joy_axis(int p_device,int p_axis);
-	String get_joy_name(int p_idx);
-	void joy_connection_changed(int p_idx, bool p_connected, String p_name);
-
-	virtual Vector3 get_accelerometer();
-
-	virtual Point2 get_mouse_pos() const;
-	virtual Point2 get_mouse_speed() const;
-	virtual int get_mouse_button_mask() const;
-
-	virtual void warp_mouse_pos(const Vector2& p_to);
-
-
-	void parse_input_event(const InputEvent& p_event);
-	void set_accelerometer(const Vector3& p_accel);
-	void set_joy_axis(int p_device,int p_axis,float p_value);
-
-	void set_main_loop(MainLoop *main_loop);
-	void set_mouse_pos(const Point2& p_posf);
-
-	void action_press(const StringName& p_action);
-	void action_release(const StringName& p_action);
-
-	void iteration(float p_step);
-
-	void set_emulate_touch(bool p_emulate);
-	virtual bool is_emulating_touchscreen() const;
-
-	InputDefault();
-
-};
 
 #endif // INPUT_H

+ 2 - 1
core/os/main_loop.cpp

@@ -45,7 +45,8 @@ void MainLoop::_bind_methods() {
 	BIND_VMETHOD( MethodInfo("_idle",PropertyInfo(Variant::REAL,"delta")) );
 	BIND_VMETHOD( MethodInfo("_finalize") );
 
-
+	BIND_CONSTANT(NOTIFICATION_WM_MOUSE_ENTER);
+	BIND_CONSTANT(NOTIFICATION_WM_MOUSE_EXIT);
 	BIND_CONSTANT(NOTIFICATION_WM_FOCUS_IN);
 	BIND_CONSTANT(NOTIFICATION_WM_FOCUS_OUT);
 	BIND_CONSTANT(NOTIFICATION_WM_QUIT_REQUEST);

+ 2 - 0
core/os/main_loop.h

@@ -47,6 +47,8 @@ protected:
 public:	
 
 	enum {
+		NOTIFICATION_WM_MOUSE_ENTER = 3,
+		NOTIFICATION_WM_MOUSE_EXIT = 4,
 		NOTIFICATION_WM_FOCUS_IN = 5,
 		NOTIFICATION_WM_FOCUS_OUT = 6,
 		NOTIFICATION_WM_QUIT_REQUEST = 7,

+ 4 - 4
core/ustring.cpp

@@ -3119,8 +3119,8 @@ String String::xml_escape(bool p_escape_quotes) const {
 
 	String str=*this;
 	str=str.replace("&","&amp;");
-	str=str.replace("<","&gt;");
-	str=str.replace(">","&lt;");
+	str=str.replace("<","&lt;");
+	str=str.replace(">","&gt;");
 	if (p_escape_quotes) {
 		str=str.replace("'","&apos;");
 		str=str.replace("\"","&quot;");
@@ -3172,12 +3172,12 @@ static _FORCE_INLINE_ int _xml_unescape(const CharType *p_src,int p_src_len,Char
 			} else if (p_src_len>=4 && p_src[1]=='g' && p_src[2]=='t' && p_src[3]==';') {
 
 				if (p_dst)
-					*p_dst='<';
+					*p_dst='>';
 				eat=4;
 			} else if (p_src_len>=4 && p_src[1]=='l' && p_src[2]=='t' && p_src[3]==';') {
 
 				if (p_dst)
-					*p_dst='>';
+					*p_dst='<';
 				eat=4;
 			} else if (p_src_len>=5 && p_src[1]=='a' && p_src[2]=='m' && p_src[3]=='p' && p_src[4]==';') {
 

Datei-Diff unterdrückt, da er zu groß ist
+ 456 - 186
doc/base/classes.xml


+ 3 - 1
drivers/SCsub

@@ -31,10 +31,12 @@ SConscript("rtaudio/SCsub");
 SConscript("nedmalloc/SCsub");
 SConscript("nrex/SCsub");
 SConscript("chibi/SCsub");
-if (env["vorbis"]=="yes" or env["speex"]=="yes" or env["theora"]=="yes"):
+if (env["vorbis"]=="yes" or env["speex"]=="yes" or env["theora"]=="yes" or env["opus"]=="yes"):
         SConscript("ogg/SCsub");
 if (env["vorbis"]=="yes"):
         SConscript("vorbis/SCsub");
+if (env["opus"]=="yes"):
+		SConscript('opus/SCsub');
 if (env["tools"]=="yes"):
 	SConscript("convex_decomp/SCsub");
 

+ 2454 - 2454
drivers/etc1/rg_etc1.cpp

@@ -1,2454 +1,2454 @@
-// File: rg_etc1.cpp - Fast, high quality ETC1 block packer/unpacker - Rich Geldreich <[email protected]>
-// Please see ZLIB license at the end of rg_etc1.h.
-//
-// For more information Ericsson Texture Compression (ETC/ETC1), see:
-// http://www.khronos.org/registry/gles/extensions/OES/OES_compressed_ETC1_RGB8_texture.txt
-//
-// v1.03 - 5/12/13 - Initial public release
-#include "rg_etc1.h"
-
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-//#include <stdio.h>
-#include <math.h>
-#include <stdio.h>
-#pragma warning (disable: 4201) //  nonstandard extension used : nameless struct/union
-
-#if defined(_DEBUG) || defined(DEBUG)
-#define RG_ETC1_BUILD_DEBUG
-#endif
-
-#define RG_ETC1_ASSERT assert
-
-namespace rg_etc1
-{
-
-   inline long labs(long val) {
-        return val < 0 ? -val : val;
-   }
-
-   inline int intabs(int val) {
-
-       return val<0?-val:val;
-   }
-
-   typedef unsigned char uint8;
-   typedef unsigned short uint16;
-   typedef unsigned int uint;
-   typedef unsigned int uint32;
-   typedef long long int64;
-   typedef unsigned long long uint64;
-
-   const uint32 cUINT32_MAX = 0xFFFFFFFFU;
-   const uint64 cUINT64_MAX = 0xFFFFFFFFFFFFFFFFULL; //0xFFFFFFFFFFFFFFFFui64;
-   
-   template<typename T> inline T minimum(T a, T b) { return (a < b) ? a : b; }
-   template<typename T> inline T minimum(T a, T b, T c) { return minimum(minimum(a, b), c); }
-   template<typename T> inline T maximum(T a, T b) { return (a > b) ? a : b; }
-   template<typename T> inline T maximum(T a, T b, T c) { return maximum(maximum(a, b), c); }
-   template<typename T> inline T clamp(T value, T low, T high) { return (value < low) ? low : ((value > high) ? high : value); }
-   template<typename T> inline T square(T value) { return value * value; }
-   template<typename T> inline void zero_object(T& obj) { memset((void*)&obj, 0, sizeof(obj)); }
-   template<typename T> inline void zero_this(T* pObj) { memset((void*)pObj, 0, sizeof(*pObj)); }
-
-   template<class T, size_t N> T decay_array_to_subtype(T (&a)[N]);   
-
-#define RG_ETC1_ARRAY_SIZE(X) (sizeof(X) / sizeof(decay_array_to_subtype(X)))
-
-   enum eNoClamp { cNoClamp };
-
-   struct color_quad_u8
-   {
-      static inline int clamp(int v) { if (v & 0xFFFFFF00U) v = (~(static_cast<int>(v) >> 31)) & 0xFF; return v; }
-
-      struct component_traits { enum { cSigned = false, cFloat = false, cMin = 0U, cMax = 255U }; };
-
-   public:
-      typedef unsigned char component_t;
-      typedef int parameter_t;
-
-      enum { cNumComps = 4 };
-
-      union
-      {
-         struct
-         {
-            component_t r;
-            component_t g;
-            component_t b;
-            component_t a;
-         };
-
-         component_t c[cNumComps];
-
-         uint32 m_u32;
-      };
-
-      inline color_quad_u8()
-      {
-      }
-
-      inline color_quad_u8(const color_quad_u8& other) : m_u32(other.m_u32)
-      {
-      }
-
-      explicit inline color_quad_u8(parameter_t y, parameter_t alpha = component_traits::cMax)
-      {
-         set(y, alpha);
-      }
-
-      inline color_quad_u8(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
-      {
-         set(red, green, blue, alpha);
-      }
-
-      explicit inline color_quad_u8(eNoClamp, parameter_t y, parameter_t alpha = component_traits::cMax)
-      {
-         set_noclamp_y_alpha(y, alpha);
-      }
-
-      inline color_quad_u8(eNoClamp, parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
-      {
-         set_noclamp_rgba(red, green, blue, alpha);
-      }
-
-      inline void clear()
-      {
-         m_u32 = 0;
-      }
-
-      inline color_quad_u8& operator= (const color_quad_u8& other)
-      {
-         m_u32 = other.m_u32;
-         return *this;
-      }
-
-      inline color_quad_u8& set_rgb(const color_quad_u8& other)
-      {
-         r = other.r;
-         g = other.g;
-         b = other.b;
-         return *this;
-      }
-
-      inline color_quad_u8& operator= (parameter_t y)
-      {
-         set(y, component_traits::cMax);
-         return *this;
-      }
-
-      inline color_quad_u8& set(parameter_t y, parameter_t alpha = component_traits::cMax)
-      {
-         y = clamp(y);
-         alpha = clamp(alpha);
-         r = static_cast<component_t>(y);
-         g = static_cast<component_t>(y);
-         b = static_cast<component_t>(y);
-         a = static_cast<component_t>(alpha);
-         return *this;
-      }
-
-      inline color_quad_u8& set_noclamp_y_alpha(parameter_t y, parameter_t alpha = component_traits::cMax)
-      {
-         RG_ETC1_ASSERT( (y >= component_traits::cMin) && (y <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (alpha >= component_traits::cMin) && (alpha <= component_traits::cMax) );
-
-         r = static_cast<component_t>(y);
-         g = static_cast<component_t>(y);
-         b = static_cast<component_t>(y);
-         a = static_cast<component_t>(alpha);
-         return *this;
-      }
-
-      inline color_quad_u8& set(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
-      {
-         r = static_cast<component_t>(clamp(red));
-         g = static_cast<component_t>(clamp(green));
-         b = static_cast<component_t>(clamp(blue));
-         a = static_cast<component_t>(clamp(alpha));
-         return *this;
-      }
-
-      inline color_quad_u8& set_noclamp_rgba(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha)
-      {
-         RG_ETC1_ASSERT( (red >= component_traits::cMin) && (red <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (green >= component_traits::cMin) && (green <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (blue >= component_traits::cMin) && (blue <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (alpha >= component_traits::cMin) && (alpha <= component_traits::cMax) );
-
-         r = static_cast<component_t>(red);
-         g = static_cast<component_t>(green);
-         b = static_cast<component_t>(blue);
-         a = static_cast<component_t>(alpha);
-         return *this;
-      }
-
-      inline color_quad_u8& set_noclamp_rgb(parameter_t red, parameter_t green, parameter_t blue)
-      {
-         RG_ETC1_ASSERT( (red >= component_traits::cMin) && (red <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (green >= component_traits::cMin) && (green <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (blue >= component_traits::cMin) && (blue <= component_traits::cMax) );
-
-         r = static_cast<component_t>(red);
-         g = static_cast<component_t>(green);
-         b = static_cast<component_t>(blue);
-         return *this;
-      }
-
-      static inline parameter_t get_min_comp() { return component_traits::cMin; }
-      static inline parameter_t get_max_comp() { return component_traits::cMax; }
-      static inline bool get_comps_are_signed() { return component_traits::cSigned; }
-
-      inline component_t operator[] (uint i) const { RG_ETC1_ASSERT(i < cNumComps); return c[i]; }
-      inline component_t& operator[] (uint i) { RG_ETC1_ASSERT(i < cNumComps); return c[i]; }
-
-      inline color_quad_u8& set_component(uint i, parameter_t f)
-      {
-         RG_ETC1_ASSERT(i < cNumComps);
-
-         c[i] = static_cast<component_t>(clamp(f));
-
-         return *this;
-      }
-
-      inline color_quad_u8& set_grayscale(parameter_t l)
-      {
-         component_t x = static_cast<component_t>(clamp(l));
-         c[0] = x;
-         c[1] = x;
-         c[2] = x;
-         return *this;
-      }
-
-      inline color_quad_u8& clamp(const color_quad_u8& l, const color_quad_u8& h)
-      {
-         for (uint i = 0; i < cNumComps; i++)
-            c[i] = static_cast<component_t>(rg_etc1::clamp<parameter_t>(c[i], l[i], h[i]));
-         return *this;
-      }
-
-      inline color_quad_u8& clamp(parameter_t l, parameter_t h)
-      {
-         for (uint i = 0; i < cNumComps; i++)
-            c[i] = static_cast<component_t>(rg_etc1::clamp<parameter_t>(c[i], l, h));
-         return *this;
-      }
-
-      // Returns CCIR 601 luma (consistent with color_utils::RGB_To_Y).
-      inline parameter_t get_luma() const
-      {
-         return static_cast<parameter_t>((19595U * r + 38470U * g + 7471U * b + 32768U) >> 16U);
-      }
-
-      // Returns REC 709 luma.
-      inline parameter_t get_luma_rec709() const
-      {
-         return static_cast<parameter_t>((13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U);
-      }
-
-      inline uint squared_distance_rgb(const color_quad_u8& c) const
-      {
-         return rg_etc1::square(r - c.r) + rg_etc1::square(g - c.g) + rg_etc1::square(b - c.b);
-      }
-
-      inline uint squared_distance_rgba(const color_quad_u8& c) const
-      {
-         return rg_etc1::square(r - c.r) + rg_etc1::square(g - c.g) + rg_etc1::square(b - c.b) + rg_etc1::square(a - c.a);
-      }
-
-      inline bool rgb_equals(const color_quad_u8& rhs) const
-      {
-         return (r == rhs.r) && (g == rhs.g) && (b == rhs.b);
-      }
-
-      inline bool operator== (const color_quad_u8& rhs) const
-      {
-         return m_u32 == rhs.m_u32;
-      }
-
-      color_quad_u8& operator+= (const color_quad_u8& other)
-      {
-         for (uint i = 0; i < 4; i++)
-            c[i] = static_cast<component_t>(clamp(c[i] + other.c[i]));
-         return *this;
-      }
-
-      color_quad_u8& operator-= (const color_quad_u8& other)
-      {
-         for (uint i = 0; i < 4; i++)
-            c[i] = static_cast<component_t>(clamp(c[i] - other.c[i]));
-         return *this;
-      }
-
-      friend color_quad_u8 operator+ (const color_quad_u8& lhs, const color_quad_u8& rhs)
-      {
-         color_quad_u8 result(lhs);
-         result += rhs;
-         return result;
-      }
-
-      friend color_quad_u8 operator- (const color_quad_u8& lhs, const color_quad_u8& rhs)
-      {
-         color_quad_u8 result(lhs);
-         result -= rhs;
-         return result;
-      }
-   }; // class color_quad_u8
-
-   struct vec3F
-   {
-      float m_s[3];
-      
-      inline vec3F() { }
-      inline vec3F(float s) { m_s[0] = s; m_s[1] = s; m_s[2] = s; }
-      inline vec3F(float x, float y, float z) { m_s[0] = x; m_s[1] = y; m_s[2] = z; }
-      
-      inline float operator[] (uint i) const { RG_ETC1_ASSERT(i < 3); return m_s[i]; }
-
-      inline vec3F& operator += (const vec3F& other) { for (uint i = 0; i < 3; i++) m_s[i] += other.m_s[i]; return *this; }
-
-      inline vec3F& operator *= (float s) { for (uint i = 0; i < 3; i++) m_s[i] *= s; return *this; }
-   };
-     
-   enum etc_constants
-   {
-      cETC1BytesPerBlock = 8U,
-
-      cETC1SelectorBits = 2U,
-      cETC1SelectorValues = 1U << cETC1SelectorBits,
-      cETC1SelectorMask = cETC1SelectorValues - 1U,
-
-      cETC1BlockShift = 2U,
-      cETC1BlockSize = 1U << cETC1BlockShift,
-
-      cETC1LSBSelectorIndicesBitOffset = 0,
-      cETC1MSBSelectorIndicesBitOffset = 16,
-
-      cETC1FlipBitOffset = 32,
-      cETC1DiffBitOffset = 33,
-
-      cETC1IntenModifierNumBits = 3,
-      cETC1IntenModifierValues = 1 << cETC1IntenModifierNumBits,
-      cETC1RightIntenModifierTableBitOffset = 34,
-      cETC1LeftIntenModifierTableBitOffset = 37,
-
-      // Base+Delta encoding (5 bit bases, 3 bit delta)
-      cETC1BaseColorCompNumBits = 5,
-      cETC1BaseColorCompMax = 1 << cETC1BaseColorCompNumBits,
-
-      cETC1DeltaColorCompNumBits = 3,
-      cETC1DeltaColorComp = 1 << cETC1DeltaColorCompNumBits,
-      cETC1DeltaColorCompMax = 1 << cETC1DeltaColorCompNumBits,
-
-      cETC1BaseColor5RBitOffset = 59,
-      cETC1BaseColor5GBitOffset = 51,
-      cETC1BaseColor5BBitOffset = 43,
-
-      cETC1DeltaColor3RBitOffset = 56,
-      cETC1DeltaColor3GBitOffset = 48,
-      cETC1DeltaColor3BBitOffset = 40,
-
-      // Absolute (non-delta) encoding (two 4-bit per component bases)
-      cETC1AbsColorCompNumBits = 4,
-      cETC1AbsColorCompMax = 1 << cETC1AbsColorCompNumBits,
-
-      cETC1AbsColor4R1BitOffset = 60,
-      cETC1AbsColor4G1BitOffset = 52,
-      cETC1AbsColor4B1BitOffset = 44,
-
-      cETC1AbsColor4R2BitOffset = 56,
-      cETC1AbsColor4G2BitOffset = 48,
-      cETC1AbsColor4B2BitOffset = 40,
-
-      cETC1ColorDeltaMin = -4,
-      cETC1ColorDeltaMax = 3,
-
-      // Delta3:
-      // 0   1   2   3   4   5   6   7
-      // 000 001 010 011 100 101 110 111
-      // 0   1   2   3   -4  -3  -2  -1
-   };
-   
-   static uint8 g_quant5_tab[256+16];
-
-
-   static const int g_etc1_inten_tables[cETC1IntenModifierValues][cETC1SelectorValues] = 
-   { 
-      { -8,  -2,   2,   8 }, { -17,  -5,  5,  17 }, { -29,  -9,   9,  29 }, {  -42, -13, 13,  42 }, 
-      { -60, -18, 18,  60 }, { -80, -24, 24,  80 }, { -106, -33, 33, 106 }, { -183, -47, 47, 183 } 
-   };
-
-   static const uint8 g_etc1_to_selector_index[cETC1SelectorValues] = { 2, 3, 1, 0 };
-   static const uint8 g_selector_index_to_etc1[cETC1SelectorValues] = { 3, 2, 0, 1 };
-      
-   // Given an ETC1 diff/inten_table/selector, and an 8-bit desired color, this table encodes the best packed_color in the low byte, and the abs error in the high byte.
-   static uint16 g_etc1_inverse_lookup[2*8*4][256];      // [diff/inten_table/selector][desired_color]
-
-   // g_color8_to_etc_block_config[color][table_index] = Supplies for each 8-bit color value a list of packed ETC1 diff/intensity table/selectors/packed_colors that map to that color.
-   // To pack: diff | (inten << 1) | (selector << 4) | (packed_c << 8)
-   static const uint16 g_color8_to_etc_block_config_0_255[2][33] =
-   {
-      { 0x0000,  0x0010,  0x0002,  0x0012,  0x0004,  0x0014,  0x0006,  0x0016,  0x0008,  0x0018,  0x000A,  0x001A,  0x000C,  0x001C,  0x000E,  0x001E,
-        0x0001,  0x0011,  0x0003,  0x0013,  0x0005,  0x0015,  0x0007,  0x0017,  0x0009,  0x0019,  0x000B,  0x001B,  0x000D,  0x001D,  0x000F,  0x001F, 0xFFFF },
-      { 0x0F20,  0x0F30,  0x0E32,  0x0F22,  0x0E34,  0x0F24,  0x0D36,  0x0F26,  0x0C38,  0x0E28,  0x0B3A,  0x0E2A,  0x093C,  0x0E2C,  0x053E,  0x0D2E,
-        0x1E31,  0x1F21,  0x1D33,  0x1F23,  0x1C35,  0x1E25,  0x1A37,  0x1E27,  0x1839,  0x1D29,  0x163B,  0x1C2B,  0x133D,  0x1B2D,  0x093F,  0x1A2F, 0xFFFF },
-   };
-
-   // Really only [254][11].
-   static const uint16 g_color8_to_etc_block_config_1_to_254[254][12] = 
-   {
-      { 0x021C, 0x0D0D, 0xFFFF }, { 0x0020, 0x0021, 0x0A0B, 0x061F, 0xFFFF }, { 0x0113, 0x0217, 0xFFFF }, { 0x0116, 0x031E,
-      0x0B0E, 0x0405, 0xFFFF }, { 0x0022, 0x0204, 0x050A, 0x0023, 0xFFFF }, { 0x0111, 0x0319, 0x0809, 0x170F, 0xFFFF }, {
-      0x0303, 0x0215, 0x0607, 0xFFFF }, { 0x0030, 0x0114, 0x0408, 0x0031, 0x0201, 0x051D, 0xFFFF }, { 0x0100, 0x0024, 0x0306,
-      0x0025, 0x041B, 0x0E0D, 0xFFFF }, { 0x021A, 0x0121, 0x0B0B, 0x071F, 0xFFFF }, { 0x0213, 0x0317, 0xFFFF }, { 0x0112,
-      0x0505, 0xFFFF }, { 0x0026, 0x070C, 0x0123, 0x0027, 0xFFFF }, { 0x0211, 0x0909, 0xFFFF }, { 0x0110, 0x0315, 0x0707,
-      0x0419, 0x180F, 0xFFFF }, { 0x0218, 0x0131, 0x0301, 0x0403, 0x061D, 0xFFFF }, { 0x0032, 0x0202, 0x0033, 0x0125, 0x051B,
-      0x0F0D, 0xFFFF }, { 0x0028, 0x031C, 0x0221, 0x0029, 0xFFFF }, { 0x0120, 0x0313, 0x0C0B, 0x081F, 0xFFFF }, { 0x0605,
-      0x0417, 0xFFFF }, { 0x0216, 0x041E, 0x0C0E, 0x0223, 0x0127, 0xFFFF }, { 0x0122, 0x0304, 0x060A, 0x0311, 0x0A09, 0xFFFF
-      }, { 0x0519, 0x190F, 0xFFFF }, { 0x002A, 0x0231, 0x0503, 0x0415, 0x0807, 0x002B, 0x071D, 0xFFFF }, { 0x0130, 0x0214,
-      0x0508, 0x0401, 0x0133, 0x0225, 0x061B, 0xFFFF }, { 0x0200, 0x0124, 0x0406, 0x0321, 0x0129, 0x100D, 0xFFFF }, { 0x031A,
-      0x0D0B, 0x091F, 0xFFFF }, { 0x0413, 0x0705, 0x0517, 0xFFFF }, { 0x0212, 0x0034, 0x0323, 0x0035, 0x0227, 0xFFFF }, {
-      0x0126, 0x080C, 0x0B09, 0xFFFF }, { 0x0411, 0x0619, 0x1A0F, 0xFFFF }, { 0x0210, 0x0331, 0x0603, 0x0515, 0x0907, 0x012B,
-      0xFFFF }, { 0x0318, 0x002C, 0x0501, 0x0233, 0x0325, 0x071B, 0x002D, 0x081D, 0xFFFF }, { 0x0132, 0x0302, 0x0229, 0x110D,
-      0xFFFF }, { 0x0128, 0x041C, 0x0421, 0x0E0B, 0x0A1F, 0xFFFF }, { 0x0220, 0x0513, 0x0617, 0xFFFF }, { 0x0135, 0x0805,
-      0x0327, 0xFFFF }, { 0x0316, 0x051E, 0x0D0E, 0x0423, 0xFFFF }, { 0x0222, 0x0404, 0x070A, 0x0511, 0x0719, 0x0C09, 0x1B0F,
-      0xFFFF }, { 0x0703, 0x0615, 0x0A07, 0x022B, 0xFFFF }, { 0x012A, 0x0431, 0x0601, 0x0333, 0x012D, 0x091D, 0xFFFF }, {
-      0x0230, 0x0314, 0x0036, 0x0608, 0x0425, 0x0037, 0x0329, 0x081B, 0x120D, 0xFFFF }, { 0x0300, 0x0224, 0x0506, 0x0521,
-      0x0F0B, 0x0B1F, 0xFFFF }, { 0x041A, 0x0613, 0x0717, 0xFFFF }, { 0x0235, 0x0905, 0xFFFF }, { 0x0312, 0x0134, 0x0523,
-      0x0427, 0xFFFF }, { 0x0226, 0x090C, 0x002E, 0x0611, 0x0D09, 0x002F, 0xFFFF }, { 0x0715, 0x0B07, 0x0819, 0x032B, 0x1C0F,
-      0xFFFF }, { 0x0310, 0x0531, 0x0701, 0x0803, 0x022D, 0x0A1D, 0xFFFF }, { 0x0418, 0x012C, 0x0433, 0x0525, 0x0137, 0x091B,
-      0x130D, 0xFFFF }, { 0x0232, 0x0402, 0x0621, 0x0429, 0xFFFF }, { 0x0228, 0x051C, 0x0713, 0x100B, 0x0C1F, 0xFFFF }, {
-      0x0320, 0x0335, 0x0A05, 0x0817, 0xFFFF }, { 0x0623, 0x0527, 0xFFFF }, { 0x0416, 0x061E, 0x0E0E, 0x0711, 0x0E09, 0x012F,
-      0xFFFF }, { 0x0322, 0x0504, 0x080A, 0x0919, 0x1D0F, 0xFFFF }, { 0x0631, 0x0903, 0x0815, 0x0C07, 0x042B, 0x032D, 0x0B1D,
-      0xFFFF }, { 0x022A, 0x0801, 0x0533, 0x0625, 0x0237, 0x0A1B, 0xFFFF }, { 0x0330, 0x0414, 0x0136, 0x0708, 0x0721, 0x0529,
-      0x140D, 0xFFFF }, { 0x0400, 0x0324, 0x0606, 0x0038, 0x0039, 0x110B, 0x0D1F, 0xFFFF }, { 0x051A, 0x0813, 0x0B05, 0x0917,
-      0xFFFF }, { 0x0723, 0x0435, 0x0627, 0xFFFF }, { 0x0412, 0x0234, 0x0F09, 0x022F, 0xFFFF }, { 0x0326, 0x0A0C, 0x012E,
-      0x0811, 0x0A19, 0x1E0F, 0xFFFF }, { 0x0731, 0x0A03, 0x0915, 0x0D07, 0x052B, 0xFFFF }, { 0x0410, 0x0901, 0x0633, 0x0725,
-      0x0337, 0x0B1B, 0x042D, 0x0C1D, 0xFFFF }, { 0x0518, 0x022C, 0x0629, 0x150D, 0xFFFF }, { 0x0332, 0x0502, 0x0821, 0x0139,
-      0x120B, 0x0E1F, 0xFFFF }, { 0x0328, 0x061C, 0x0913, 0x0A17, 0xFFFF }, { 0x0420, 0x0535, 0x0C05, 0x0727, 0xFFFF }, {
-      0x0823, 0x032F, 0xFFFF }, { 0x0516, 0x071E, 0x0F0E, 0x0911, 0x0B19, 0x1009, 0x1F0F, 0xFFFF }, { 0x0422, 0x0604, 0x090A,
-      0x0B03, 0x0A15, 0x0E07, 0x062B, 0xFFFF }, { 0x0831, 0x0A01, 0x0733, 0x052D, 0x0D1D, 0xFFFF }, { 0x032A, 0x0825, 0x0437,
-      0x0729, 0x0C1B, 0x160D, 0xFFFF }, { 0x0430, 0x0514, 0x0236, 0x0808, 0x0921, 0x0239, 0x130B, 0x0F1F, 0xFFFF }, { 0x0500,
-      0x0424, 0x0706, 0x0138, 0x0A13, 0x0B17, 0xFFFF }, { 0x061A, 0x0635, 0x0D05, 0xFFFF }, { 0x0923, 0x0827, 0xFFFF }, {
-      0x0512, 0x0334, 0x003A, 0x0A11, 0x1109, 0x003B, 0x042F, 0xFFFF }, { 0x0426, 0x0B0C, 0x022E, 0x0B15, 0x0F07, 0x0C19,
-      0x072B, 0xFFFF }, { 0x0931, 0x0B01, 0x0C03, 0x062D, 0x0E1D, 0xFFFF }, { 0x0510, 0x0833, 0x0925, 0x0537, 0x0D1B, 0x170D,
-      0xFFFF }, { 0x0618, 0x032C, 0x0A21, 0x0339, 0x0829, 0xFFFF }, { 0x0432, 0x0602, 0x0B13, 0x140B, 0x101F, 0xFFFF }, {
-      0x0428, 0x071C, 0x0735, 0x0E05, 0x0C17, 0xFFFF }, { 0x0520, 0x0A23, 0x0927, 0xFFFF }, { 0x0B11, 0x1209, 0x013B, 0x052F,
-      0xFFFF }, { 0x0616, 0x081E, 0x0D19, 0xFFFF }, { 0x0522, 0x0704, 0x0A0A, 0x0A31, 0x0D03, 0x0C15, 0x1007, 0x082B, 0x072D,
-      0x0F1D, 0xFFFF }, { 0x0C01, 0x0933, 0x0A25, 0x0637, 0x0E1B, 0xFFFF }, { 0x042A, 0x0B21, 0x0929, 0x180D, 0xFFFF }, {
-	      0x0530, 0x0614, 0x0336, 0x0908, 0x0439, 0x150B, 0x111F, 0xFFFF }, { 0x0600, 0x0524, 0x0806, 0x0238, 0x0C13, 0x0F05,
-      0x0D17, 0xFFFF }, { 0x071A, 0x0B23, 0x0835, 0x0A27, 0xFFFF }, { 0x1309, 0x023B, 0x062F, 0xFFFF }, { 0x0612, 0x0434,
-      0x013A, 0x0C11, 0x0E19, 0xFFFF }, { 0x0526, 0x0C0C, 0x032E, 0x0B31, 0x0E03, 0x0D15, 0x1107, 0x092B, 0xFFFF }, { 0x0D01,
-      0x0A33, 0x0B25, 0x0737, 0x0F1B, 0x082D, 0x101D, 0xFFFF }, { 0x0610, 0x0A29, 0x190D, 0xFFFF }, { 0x0718, 0x042C, 0x0C21,
-      0x0539, 0x160B, 0x121F, 0xFFFF }, { 0x0532, 0x0702, 0x0D13, 0x0E17, 0xFFFF }, { 0x0528, 0x081C, 0x0935, 0x1005, 0x0B27,
-      0xFFFF }, { 0x0620, 0x0C23, 0x033B, 0x072F, 0xFFFF }, { 0x0D11, 0x0F19, 0x1409, 0xFFFF }, { 0x0716, 0x003C, 0x091E,
-      0x0F03, 0x0E15, 0x1207, 0x0A2B, 0x003D, 0xFFFF }, { 0x0622, 0x0804, 0x0B0A, 0x0C31, 0x0E01, 0x0B33, 0x092D, 0x111D,
-      0xFFFF }, { 0x0C25, 0x0837, 0x0B29, 0x101B, 0x1A0D, 0xFFFF }, { 0x052A, 0x0D21, 0x0639, 0x170B, 0x131F, 0xFFFF }, {
-      0x0630, 0x0714, 0x0436, 0x0A08, 0x0E13, 0x0F17, 0xFFFF }, { 0x0700, 0x0624, 0x0906, 0x0338, 0x0A35, 0x1105, 0xFFFF }, {
-      0x081A, 0x0D23, 0x0C27, 0xFFFF }, { 0x0E11, 0x1509, 0x043B, 0x082F, 0xFFFF }, { 0x0712, 0x0534, 0x023A, 0x0F15, 0x1307,
-      0x1019, 0x0B2B, 0x013D, 0xFFFF }, { 0x0626, 0x0D0C, 0x042E, 0x0D31, 0x0F01, 0x1003, 0x0A2D, 0x121D, 0xFFFF }, { 0x0C33,
-      0x0D25, 0x0937, 0x111B, 0x1B0D, 0xFFFF }, { 0x0710, 0x0E21, 0x0739, 0x0C29, 0xFFFF }, { 0x0818, 0x052C, 0x0F13, 0x180B,
-      0x141F, 0xFFFF }, { 0x0632, 0x0802, 0x0B35, 0x1205, 0x1017, 0xFFFF }, { 0x0628, 0x091C, 0x0E23, 0x0D27, 0xFFFF }, {
-      0x0720, 0x0F11, 0x1609, 0x053B, 0x092F, 0xFFFF }, { 0x1119, 0x023D, 0xFFFF }, { 0x0816, 0x013C, 0x0A1E, 0x0E31, 0x1103,
-      0x1015, 0x1407, 0x0C2B, 0x0B2D, 0x131D, 0xFFFF }, { 0x0722, 0x0904, 0x0C0A, 0x1001, 0x0D33, 0x0E25, 0x0A37, 0x121B,
-      0xFFFF }, { 0x0F21, 0x0D29, 0x1C0D, 0xFFFF }, { 0x062A, 0x0839, 0x190B, 0x151F, 0xFFFF }, { 0x0730, 0x0814, 0x0536,
-      0x0B08, 0x1013, 0x1305, 0x1117, 0xFFFF }, { 0x0800, 0x0724, 0x0A06, 0x0438, 0x0F23, 0x0C35, 0x0E27, 0xFFFF }, { 0x091A,
-      0x1709, 0x063B, 0x0A2F, 0xFFFF }, { 0x1011, 0x1219, 0x033D, 0xFFFF }, { 0x0812, 0x0634, 0x033A, 0x0F31, 0x1203, 0x1115,
-      0x1507, 0x0D2B, 0xFFFF }, { 0x0726, 0x0E0C, 0x052E, 0x1101, 0x0E33, 0x0F25, 0x0B37, 0x131B, 0x0C2D, 0x141D, 0xFFFF }, {
-      0x0E29, 0x1D0D, 0xFFFF }, { 0x0810, 0x1021, 0x0939, 0x1A0B, 0x161F, 0xFFFF }, { 0x0918, 0x062C, 0x1113, 0x1217, 0xFFFF
-      }, { 0x0732, 0x0902, 0x0D35, 0x1405, 0x0F27, 0xFFFF }, { 0x0728, 0x0A1C, 0x1023, 0x073B, 0x0B2F, 0xFFFF }, { 0x0820,
-      0x1111, 0x1319, 0x1809, 0xFFFF }, { 0x1303, 0x1215, 0x1607, 0x0E2B, 0x043D, 0xFFFF }, { 0x0916, 0x023C, 0x0B1E, 0x1031,
-      0x1201, 0x0F33, 0x0D2D, 0x151D, 0xFFFF }, { 0x0822, 0x0A04, 0x0D0A, 0x1025, 0x0C37, 0x0F29, 0x141B, 0x1E0D, 0xFFFF }, {
-      0x1121, 0x0A39, 0x1B0B, 0x171F, 0xFFFF }, { 0x072A, 0x1213, 0x1317, 0xFFFF }, { 0x0830, 0x0914, 0x0636, 0x0C08, 0x0E35,
-      0x1505, 0xFFFF }, { 0x0900, 0x0824, 0x0B06, 0x0538, 0x1123, 0x1027, 0xFFFF }, { 0x0A1A, 0x1211, 0x1909, 0x083B, 0x0C2F,
-      0xFFFF }, { 0x1315, 0x1707, 0x1419, 0x0F2B, 0x053D, 0xFFFF }, { 0x0912, 0x0734, 0x043A, 0x1131, 0x1301, 0x1403, 0x0E2D,
-      0x161D, 0xFFFF }, { 0x0826, 0x0F0C, 0x062E, 0x1033, 0x1125, 0x0D37, 0x151B, 0x1F0D, 0xFFFF }, { 0x1221, 0x0B39, 0x1029,
-      0xFFFF }, { 0x0910, 0x1313, 0x1C0B, 0x181F, 0xFFFF }, { 0x0A18, 0x072C, 0x0F35, 0x1605, 0x1417, 0xFFFF }, { 0x0832,
-      0x0A02, 0x1223, 0x1127, 0xFFFF }, { 0x0828, 0x0B1C, 0x1311, 0x1A09, 0x093B, 0x0D2F, 0xFFFF }, { 0x0920, 0x1519, 0x063D,
-      0xFFFF }, { 0x1231, 0x1503, 0x1415, 0x1807, 0x102B, 0x0F2D, 0x171D, 0xFFFF }, { 0x0A16, 0x033C, 0x0C1E, 0x1401, 0x1133,
-      0x1225, 0x0E37, 0x161B, 0xFFFF }, { 0x0922, 0x0B04, 0x0E0A, 0x1321, 0x1129, 0xFFFF }, { 0x0C39, 0x1D0B, 0x191F, 0xFFFF
-      }, { 0x082A, 0x1413, 0x1705, 0x1517, 0xFFFF }, { 0x0930, 0x0A14, 0x0736, 0x0D08, 0x1323, 0x1035, 0x1227, 0xFFFF }, {
-      0x0A00, 0x0924, 0x0C06, 0x0638, 0x1B09, 0x0A3B, 0x0E2F, 0xFFFF }, { 0x0B1A, 0x1411, 0x1619, 0x073D, 0xFFFF }, { 0x1331,
-      0x1603, 0x1515, 0x1907, 0x112B, 0xFFFF }, { 0x0A12, 0x0834, 0x053A, 0x1501, 0x1233, 0x1325, 0x0F37, 0x171B, 0x102D,
-      0x181D, 0xFFFF }, { 0x0926, 0x072E, 0x1229, 0xFFFF }, { 0x1421, 0x0D39, 0x1E0B, 0x1A1F, 0xFFFF }, { 0x0A10, 0x1513,
-      0x1617, 0xFFFF }, { 0x0B18, 0x082C, 0x1135, 0x1805, 0x1327, 0xFFFF }, { 0x0932, 0x0B02, 0x1423, 0x0B3B, 0x0F2F, 0xFFFF
-      }, { 0x0928, 0x0C1C, 0x1511, 0x1719, 0x1C09, 0xFFFF }, { 0x0A20, 0x1703, 0x1615, 0x1A07, 0x122B, 0x083D, 0xFFFF }, {
-      0x1431, 0x1601, 0x1333, 0x112D, 0x191D, 0xFFFF }, { 0x0B16, 0x043C, 0x0D1E, 0x1425, 0x1037, 0x1329, 0x181B, 0xFFFF }, {
-      0x0A22, 0x0C04, 0x0F0A, 0x1521, 0x0E39, 0x1F0B, 0x1B1F, 0xFFFF }, { 0x1613, 0x1717, 0xFFFF }, { 0x092A, 0x1235, 0x1905,
-      0xFFFF }, { 0x0A30, 0x0B14, 0x0836, 0x0E08, 0x1523, 0x1427, 0xFFFF }, { 0x0B00, 0x0A24, 0x0D06, 0x0738, 0x1611, 0x1D09,
-      0x0C3B, 0x102F, 0xFFFF }, { 0x0C1A, 0x1715, 0x1B07, 0x1819, 0x132B, 0x093D, 0xFFFF }, { 0x1531, 0x1701, 0x1803, 0x122D,
-      0x1A1D, 0xFFFF }, { 0x0B12, 0x0934, 0x063A, 0x1433, 0x1525, 0x1137, 0x191B, 0xFFFF }, { 0x0A26, 0x003E, 0x082E, 0x1621,
-      0x0F39, 0x1429, 0x003F, 0xFFFF }, { 0x1713, 0x1C1F, 0xFFFF }, { 0x0B10, 0x1335, 0x1A05, 0x1817, 0xFFFF }, { 0x0C18,
-      0x092C, 0x1623, 0x1527, 0xFFFF }, { 0x0A32, 0x0C02, 0x1711, 0x1E09, 0x0D3B, 0x112F, 0xFFFF }, { 0x0A28, 0x0D1C, 0x1919,
-      0x0A3D, 0xFFFF }, { 0x0B20, 0x1631, 0x1903, 0x1815, 0x1C07, 0x142B, 0x132D, 0x1B1D, 0xFFFF }, { 0x1801, 0x1533, 0x1625,
-      0x1237, 0x1A1B, 0xFFFF }, { 0x0C16, 0x053C, 0x0E1E, 0x1721, 0x1529, 0x013F, 0xFFFF }, { 0x0B22, 0x0D04, 0x1039, 0x1D1F,
-      0xFFFF }, { 0x1813, 0x1B05, 0x1917, 0xFFFF }, { 0x0A2A, 0x1723, 0x1435, 0x1627, 0xFFFF }, { 0x0B30, 0x0C14, 0x0936,
-      0x0F08, 0x1F09, 0x0E3B, 0x122F, 0xFFFF }, { 0x0C00, 0x0B24, 0x0E06, 0x0838, 0x1811, 0x1A19, 0x0B3D, 0xFFFF }, { 0x0D1A,
-      0x1731, 0x1A03, 0x1915, 0x1D07, 0x152B, 0xFFFF }, { 0x1901, 0x1633, 0x1725, 0x1337, 0x1B1B, 0x142D, 0x1C1D, 0xFFFF }, {
-      0x0C12, 0x0A34, 0x073A, 0x1629, 0x023F, 0xFFFF }, { 0x0B26, 0x013E, 0x092E, 0x1821, 0x1139, 0x1E1F, 0xFFFF }, { 0x1913,
-      0x1A17, 0xFFFF }, { 0x0C10, 0x1535, 0x1C05, 0x1727, 0xFFFF }, { 0x0D18, 0x0A2C, 0x1823, 0x0F3B, 0x132F, 0xFFFF }, {
-      0x0B32, 0x0D02, 0x1911, 0x1B19, 0xFFFF }, { 0x0B28, 0x0E1C, 0x1B03, 0x1A15, 0x1E07, 0x162B, 0x0C3D, 0xFFFF }, { 0x0C20,
-      0x1831, 0x1A01, 0x1733, 0x152D, 0x1D1D, 0xFFFF }, { 0x1825, 0x1437, 0x1729, 0x1C1B, 0x033F, 0xFFFF }, { 0x0D16, 0x063C,
-      0x0F1E, 0x1921, 0x1239, 0x1F1F, 0xFFFF }, { 0x0C22, 0x0E04, 0x1A13, 0x1B17, 0xFFFF }, { 0x1635, 0x1D05, 0xFFFF }, {
-      0x0B2A, 0x1923, 0x1827, 0xFFFF }, { 0x0C30, 0x0D14, 0x0A36, 0x1A11, 0x103B, 0x142F, 0xFFFF }, { 0x0D00, 0x0C24, 0x0F06,
-      0x0938, 0x1B15, 0x1F07, 0x1C19, 0x172B, 0x0D3D, 0xFFFF }, { 0x0E1A, 0x1931, 0x1B01, 0x1C03, 0x162D, 0x1E1D, 0xFFFF }, {
-      0x1833, 0x1925, 0x1537, 0x1D1B, 0xFFFF }, { 0x0D12, 0x0B34, 0x083A, 0x1A21, 0x1339, 0x1829, 0x043F, 0xFFFF }, { 0x0C26,
-      0x023E, 0x0A2E, 0x1B13, 0xFFFF }, { 0x1735, 0x1E05, 0x1C17, 0xFFFF }, { 0x0D10, 0x1A23, 0x1927, 0xFFFF }, { 0x0E18,
-      0x0B2C, 0x1B11, 0x113B, 0x152F, 0xFFFF }, { 0x0C32, 0x0E02, 0x1D19, 0x0E3D, 0xFFFF }, { 0x0C28, 0x0F1C, 0x1A31, 0x1D03,
-      0x1C15, 0x182B, 0x172D, 0x1F1D, 0xFFFF }, { 0x0D20, 0x1C01, 0x1933, 0x1A25, 0x1637, 0x1E1B, 0xFFFF }, { 0x1B21, 0x1929,
-      0x053F, 0xFFFF }, { 0x0E16, 0x073C, 0x1439, 0xFFFF }, { 0x0D22, 0x0F04, 0x1C13, 0x1F05, 0x1D17, 0xFFFF }, { 0x1B23,
-      0x1835, 0x1A27, 0xFFFF }, { 0x0C2A, 0x123B, 0x162F, 0xFFFF }, { 0x0D30, 0x0E14, 0x0B36, 0x1C11, 0x1E19, 0x0F3D, 0xFFFF
-      }, { 0x0E00, 0x0D24, 0x0A38, 0x1B31, 0x1E03, 0x1D15, 0x192B, 0xFFFF }, { 0x0F1A, 0x1D01, 0x1A33, 0x1B25, 0x1737, 0x1F1B,
-      0x182D, 0xFFFF }, { 0x1A29, 0x063F, 0xFFFF }, { 0x0E12, 0x0C34, 0x093A, 0x1C21, 0x1539, 0xFFFF }, { 0x0D26, 0x033E,
-      0x0B2E, 0x1D13, 0x1E17, 0xFFFF }, { 0x1935, 0x1B27, 0xFFFF }, { 0x0E10, 0x1C23, 0x133B, 0x172F, 0xFFFF }, { 0x0F18,
-      0x0C2C, 0x1D11, 0x1F19, 0xFFFF }, { 0x0D32, 0x0F02, 0x1F03, 0x1E15, 0x1A2B, 0x103D, 0xFFFF }, { 0x0D28, 0x1C31, 0x1E01,
-      0x1B33, 0x192D, 0xFFFF }, { 0x0E20, 0x1C25, 0x1837, 0x1B29, 0x073F, 0xFFFF }, { 0x1D21, 0x1639, 0xFFFF }, { 0x0F16,
-      0x083C, 0x1E13, 0x1F17, 0xFFFF }, { 0x0E22, 0x1A35, 0xFFFF }, { 0x1D23, 0x1C27, 0xFFFF }, { 0x0D2A, 0x1E11, 0x143B,
-      0x182F, 0xFFFF }, { 0x0E30, 0x0F14, 0x0C36, 0x1F15, 0x1B2B, 0x113D, 0xFFFF }, { 0x0F00, 0x0E24, 0x0B38, 0x1D31, 0x1F01,
-      0x1A2D, 0xFFFF }, { 0x1C33, 0x1D25, 0x1937, 0xFFFF }, { 0x1E21, 0x1739, 0x1C29, 0x083F, 0xFFFF }, { 0x0F12, 0x0D34,
-      0x0A3A, 0x1F13, 0xFFFF }, { 0x0E26, 0x043E, 0x0C2E, 0x1B35, 0xFFFF }, { 0x1E23, 0x1D27, 0xFFFF }, { 0x0F10, 0x1F11,
-      0x153B, 0x192F, 0xFFFF }, { 0x0D2C, 0x123D, 0xFFFF },
-   };
-
-   struct etc1_block
-   {
-      // big endian uint64:
-      // bit ofs:  56  48  40  32  24  16   8   0
-      // byte ofs: b0, b1, b2, b3, b4, b5, b6, b7 
-      union 
-      {
-         uint64 m_uint64;
-         uint8 m_bytes[8];
-      };
-
-      uint8 m_low_color[2];
-      uint8 m_high_color[2];
-
-      enum { cNumSelectorBytes = 4 };
-      uint8 m_selectors[cNumSelectorBytes];
-
-      inline void clear()
-      {
-         zero_this(this);
-      }
-
-      inline uint get_byte_bits(uint ofs, uint num) const
-      {
-         RG_ETC1_ASSERT((ofs + num) <= 64U);
-         RG_ETC1_ASSERT(num && (num <= 8U));
-         RG_ETC1_ASSERT((ofs >> 3) == ((ofs + num - 1) >> 3));
-         const uint byte_ofs = 7 - (ofs >> 3);
-         const uint byte_bit_ofs = ofs & 7;
-         return (m_bytes[byte_ofs] >> byte_bit_ofs) & ((1 << num) - 1);
-      }
-
-      inline void set_byte_bits(uint ofs, uint num, uint bits)
-      {
-         RG_ETC1_ASSERT((ofs + num) <= 64U);
-         RG_ETC1_ASSERT(num && (num < 32U));
-         RG_ETC1_ASSERT((ofs >> 3) == ((ofs + num - 1) >> 3));
-         RG_ETC1_ASSERT(bits < (1U << num));
-         const uint byte_ofs = 7 - (ofs >> 3);
-         const uint byte_bit_ofs = ofs & 7;
-         const uint mask = (1 << num) - 1;
-         m_bytes[byte_ofs] &= ~(mask << byte_bit_ofs);
-         m_bytes[byte_ofs] |= (bits << byte_bit_ofs);
-      }
-
-      // false = left/right subblocks
-      // true = upper/lower subblocks
-      inline bool get_flip_bit() const 
-      {
-         return (m_bytes[3] & 1) != 0;
-      }   
-
-      inline void set_flip_bit(bool flip)
-      {
-         m_bytes[3] &= ~1;
-         m_bytes[3] |= static_cast<uint8>(flip);
-      }
-
-      inline bool get_diff_bit() const
-      {
-         return (m_bytes[3] & 2) != 0;
-      }
-
-      inline void set_diff_bit(bool diff)
-      {
-         m_bytes[3] &= ~2;
-         m_bytes[3] |= (static_cast<uint>(diff) << 1);
-      }
-
-      // Returns intensity modifier table (0-7) used by subblock subblock_id.
-      // subblock_id=0 left/top (CW 1), 1=right/bottom (CW 2)
-      inline uint get_inten_table(uint subblock_id) const
-      {
-         RG_ETC1_ASSERT(subblock_id < 2);
-         const uint ofs = subblock_id ? 2 : 5;
-         return (m_bytes[3] >> ofs) & 7;
-      }
-
-      // Sets intensity modifier table (0-7) used by subblock subblock_id (0 or 1)
-      inline void set_inten_table(uint subblock_id, uint t)
-      {
-         RG_ETC1_ASSERT(subblock_id < 2);
-         RG_ETC1_ASSERT(t < 8);
-         const uint ofs = subblock_id ? 2 : 5;
-         m_bytes[3] &= ~(7 << ofs);
-         m_bytes[3] |= (t << ofs);
-      }
-
-      // Returned selector value ranges from 0-3 and is a direct index into g_etc1_inten_tables.
-      inline uint get_selector(uint x, uint y) const
-      {
-         RG_ETC1_ASSERT((x | y) < 4);
-
-         const uint bit_index = x * 4 + y;
-         const uint byte_bit_ofs = bit_index & 7;
-         const uint8 *p = &m_bytes[7 - (bit_index >> 3)];
-         const uint lsb = (p[0] >> byte_bit_ofs) & 1;
-         const uint msb = (p[-2] >> byte_bit_ofs) & 1;
-         const uint val = lsb | (msb << 1);
-
-         return g_etc1_to_selector_index[val];
-      }
-
-      // Selector "val" ranges from 0-3 and is a direct index into g_etc1_inten_tables.
-      inline void set_selector(uint x, uint y, uint val)
-      {
-         RG_ETC1_ASSERT((x | y | val) < 4);
-         const uint bit_index = x * 4 + y;
-
-         uint8 *p = &m_bytes[7 - (bit_index >> 3)];
-
-         const uint byte_bit_ofs = bit_index & 7;
-         const uint mask = 1 << byte_bit_ofs;
-
-         const uint etc1_val = g_selector_index_to_etc1[val];
-
-         const uint lsb = etc1_val & 1;
-         const uint msb = etc1_val >> 1;
-
-         p[0] &= ~mask;
-         p[0] |= (lsb << byte_bit_ofs);
-
-         p[-2] &= ~mask;
-         p[-2] |= (msb << byte_bit_ofs);
-      }
-
-      inline void set_base4_color(uint idx, uint16 c)
-      {
-         if (idx)
-         {
-            set_byte_bits(cETC1AbsColor4R2BitOffset, 4, (c >> 8) & 15);
-            set_byte_bits(cETC1AbsColor4G2BitOffset, 4, (c >> 4) & 15);
-            set_byte_bits(cETC1AbsColor4B2BitOffset, 4, c & 15);
-         }
-         else
-         {
-            set_byte_bits(cETC1AbsColor4R1BitOffset, 4, (c >> 8) & 15);
-            set_byte_bits(cETC1AbsColor4G1BitOffset, 4, (c >> 4) & 15);
-            set_byte_bits(cETC1AbsColor4B1BitOffset, 4, c & 15);
-         }
-      }
-
-      inline uint16 get_base4_color(uint idx) const
-      {
-         uint r, g, b;
-         if (idx)
-         {
-            r = get_byte_bits(cETC1AbsColor4R2BitOffset, 4);
-            g = get_byte_bits(cETC1AbsColor4G2BitOffset, 4);
-            b = get_byte_bits(cETC1AbsColor4B2BitOffset, 4);
-         }
-         else
-         {
-            r = get_byte_bits(cETC1AbsColor4R1BitOffset, 4);
-            g = get_byte_bits(cETC1AbsColor4G1BitOffset, 4);
-            b = get_byte_bits(cETC1AbsColor4B1BitOffset, 4);
-         }
-         return static_cast<uint16>(b | (g << 4U) | (r << 8U));
-      }
-
-      inline void set_base5_color(uint16 c)
-      {
-         set_byte_bits(cETC1BaseColor5RBitOffset, 5, (c >> 10) & 31);
-         set_byte_bits(cETC1BaseColor5GBitOffset, 5, (c >> 5) & 31);
-         set_byte_bits(cETC1BaseColor5BBitOffset, 5, c & 31);
-      }
-
-      inline uint16 get_base5_color() const
-      {
-         const uint r = get_byte_bits(cETC1BaseColor5RBitOffset, 5);
-         const uint g = get_byte_bits(cETC1BaseColor5GBitOffset, 5);
-         const uint b = get_byte_bits(cETC1BaseColor5BBitOffset, 5);
-         return static_cast<uint16>(b | (g << 5U) | (r << 10U));
-      }
-
-      void set_delta3_color(uint16 c)
-      {
-         set_byte_bits(cETC1DeltaColor3RBitOffset, 3, (c >> 6) & 7);
-         set_byte_bits(cETC1DeltaColor3GBitOffset, 3, (c >> 3) & 7);
-         set_byte_bits(cETC1DeltaColor3BBitOffset, 3, c & 7);
-      }
-
-      inline uint16 get_delta3_color() const
-      {
-         const uint r = get_byte_bits(cETC1DeltaColor3RBitOffset, 3);
-         const uint g = get_byte_bits(cETC1DeltaColor3GBitOffset, 3);
-         const uint b = get_byte_bits(cETC1DeltaColor3BBitOffset, 3);
-         return static_cast<uint16>(b | (g << 3U) | (r << 6U));
-      }
-
-      // Base color 5
-      static uint16 pack_color5(const color_quad_u8& color, bool scaled, uint bias = 127U);
-      static uint16 pack_color5(uint r, uint g, uint b, bool scaled, uint bias = 127U);
-
-      static color_quad_u8 unpack_color5(uint16 packed_color5, bool scaled, uint alpha = 255U);
-      static void unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color, bool scaled);
-
-      static bool unpack_color5(color_quad_u8& result, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha = 255U);
-      static bool unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha = 255U);
-
-      // Delta color 3
-      // Inputs range from -4 to 3 (cETC1ColorDeltaMin to cETC1ColorDeltaMax)
-      static uint16 pack_delta3(int r, int g, int b);
-
-      // Results range from -4 to 3 (cETC1ColorDeltaMin to cETC1ColorDeltaMax)
-      static void unpack_delta3(int& r, int& g, int& b, uint16 packed_delta3);
-
-      // Abs color 4
-      static uint16 pack_color4(const color_quad_u8& color, bool scaled, uint bias = 127U);
-      static uint16 pack_color4(uint r, uint g, uint b, bool scaled, uint bias = 127U);
-
-      static color_quad_u8 unpack_color4(uint16 packed_color4, bool scaled, uint alpha = 255U);
-      static void unpack_color4(uint& r, uint& g, uint& b, uint16 packed_color4, bool scaled);
-
-      // subblock colors
-      static void get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint table_idx);
-      static bool get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint16 packed_delta3, uint table_idx);
-      static void get_abs_subblock_colors(color_quad_u8* pDst, uint16 packed_color4, uint table_idx);
-
-      static inline void unscaled_to_scaled_color(color_quad_u8& dst, const color_quad_u8& src, bool color4)
-      {
-         if (color4)
-         {
-            dst.r = src.r | (src.r << 4);
-            dst.g = src.g | (src.g << 4);
-            dst.b = src.b | (src.b << 4);
-         }
-         else
-         {
-            dst.r = (src.r >> 2) | (src.r << 3);
-            dst.g = (src.g >> 2) | (src.g << 3);
-            dst.b = (src.b >> 2) | (src.b << 3);
-         }
-         dst.a = src.a;
-      }
-   };
-
-   // Returns pointer to sorted array.
-   template<typename T, typename Q>
-   T* indirect_radix_sort(uint num_indices, T* pIndices0, T* pIndices1, const Q* pKeys, uint key_ofs, uint key_size, bool init_indices)
-   {  
-      RG_ETC1_ASSERT((key_ofs >= 0) && (key_ofs < sizeof(T)));
-      RG_ETC1_ASSERT((key_size >= 1) && (key_size <= 4));
-
-      if (init_indices)
-      {
-         T* p = pIndices0;
-         T* q = pIndices0 + (num_indices >> 1) * 2;
-         uint i;
-         for (i = 0; p != q; p += 2, i += 2)
-         {
-            p[0] = static_cast<T>(i);
-            p[1] = static_cast<T>(i + 1); 
-         }
-
-         if (num_indices & 1)
-            *p = static_cast<T>(i);
-      }
-
-      uint hist[256 * 4];
-
-      memset(hist, 0, sizeof(hist[0]) * 256 * key_size);
-
-#define RG_ETC1_GET_KEY(p) (*(const uint*)((const uint8*)(pKeys + *(p)) + key_ofs))
-#define RG_ETC1_GET_KEY_FROM_INDEX(i) (*(const uint*)((const uint8*)(pKeys + (i)) + key_ofs))
-
-      if (key_size == 4)
-      {
-         T* p = pIndices0;
-         T* q = pIndices0 + num_indices;
-         for ( ; p != q; p++)
-         {
-            const uint key = RG_ETC1_GET_KEY(p);
-
-            hist[        key        & 0xFF]++;
-            hist[256 + ((key >>  8) & 0xFF)]++;
-            hist[512 + ((key >> 16) & 0xFF)]++;
-            hist[768 + ((key >> 24) & 0xFF)]++;
-         }
-      }
-      else if (key_size == 3)
-      {
-         T* p = pIndices0;
-         T* q = pIndices0 + num_indices;
-         for ( ; p != q; p++)
-         {
-            const uint key = RG_ETC1_GET_KEY(p);
-
-            hist[        key        & 0xFF]++;
-            hist[256 + ((key >>  8) & 0xFF)]++;
-            hist[512 + ((key >> 16) & 0xFF)]++;
-         }
-      }   
-      else if (key_size == 2)
-      {
-         T* p = pIndices0;
-         T* q = pIndices0 + (num_indices >> 1) * 2;
-
-         for ( ; p != q; p += 2)
-         {
-            const uint key0 = RG_ETC1_GET_KEY(p);
-            const uint key1 = RG_ETC1_GET_KEY(p+1);
-
-            hist[        key0         & 0xFF]++;
-            hist[256 + ((key0 >>  8) & 0xFF)]++;
-
-            hist[        key1        & 0xFF]++;
-            hist[256 + ((key1 >>  8) & 0xFF)]++;
-         }
-
-         if (num_indices & 1)
-         {
-            const uint key = RG_ETC1_GET_KEY(p);
-
-            hist[        key        & 0xFF]++;
-            hist[256 + ((key >>  8) & 0xFF)]++;
-         }
-      }      
-      else
-      {
-         RG_ETC1_ASSERT(key_size == 1);
-         if (key_size != 1)
-            return NULL;
-
-         T* p = pIndices0;
-         T* q = pIndices0 + (num_indices >> 1) * 2;
-
-         for ( ; p != q; p += 2)
-         {
-            const uint key0 = RG_ETC1_GET_KEY(p);
-            const uint key1 = RG_ETC1_GET_KEY(p+1);
-
-            hist[key0 & 0xFF]++;
-            hist[key1 & 0xFF]++;
-         }
-
-         if (num_indices & 1)
-         {
-            const uint key = RG_ETC1_GET_KEY(p);
-
-            hist[key & 0xFF]++;
-         }
-      }      
-
-      T* pCur = pIndices0;
-      T* pNew = pIndices1;
-
-      for (uint pass = 0; pass < key_size; pass++)
-      {
-         const uint* pHist = &hist[pass << 8];
-
-         uint offsets[256];
-
-         uint cur_ofs = 0;
-         for (uint i = 0; i < 256; i += 2)
-         {
-            offsets[i] = cur_ofs;
-            cur_ofs += pHist[i];
-
-            offsets[i+1] = cur_ofs;
-            cur_ofs += pHist[i+1];
-         }
-
-         const uint pass_shift = pass << 3;
-
-         T* p = pCur;
-         T* q = pCur + (num_indices >> 1) * 2;
-
-         for ( ; p != q; p += 2)
-         {
-            uint index0 = p[0];
-            uint index1 = p[1];
-
-            uint c0 = (RG_ETC1_GET_KEY_FROM_INDEX(index0) >> pass_shift) & 0xFF;
-            uint c1 = (RG_ETC1_GET_KEY_FROM_INDEX(index1) >> pass_shift) & 0xFF;
-
-            if (c0 == c1)
-            {
-               uint dst_offset0 = offsets[c0];
-
-               offsets[c0] = dst_offset0 + 2;
-
-               pNew[dst_offset0] = static_cast<T>(index0);
-               pNew[dst_offset0 + 1] = static_cast<T>(index1);
-            }
-            else
-            {
-               uint dst_offset0 = offsets[c0]++;
-               uint dst_offset1 = offsets[c1]++;
-
-               pNew[dst_offset0] = static_cast<T>(index0);
-               pNew[dst_offset1] = static_cast<T>(index1);
-            }
-         }
-
-         if (num_indices & 1)
-         {
-            uint index = *p;
-            uint c = (RG_ETC1_GET_KEY_FROM_INDEX(index) >> pass_shift) & 0xFF;
-
-            uint dst_offset = offsets[c];
-            offsets[c] = dst_offset + 1;
-
-            pNew[dst_offset] = static_cast<T>(index);
-         }
-
-         T* t = pCur;
-         pCur = pNew;
-         pNew = t;
-      }            
-
-      return pCur;
-   }
-
-#undef RG_ETC1_GET_KEY
-#undef RG_ETC1_GET_KEY_FROM_INDEX
-
-   uint16 etc1_block::pack_color5(const color_quad_u8& color, bool scaled, uint bias)
-   {
-      return pack_color5(color.r, color.g, color.b, scaled, bias);
-   }
-   
-   uint16 etc1_block::pack_color5(uint r, uint g, uint b, bool scaled, uint bias)
-   {
-      if (scaled)
-      {
-         r = (r * 31U + bias) / 255U;
-         g = (g * 31U + bias) / 255U;
-         b = (b * 31U + bias) / 255U;
-      }
-
-      r = rg_etc1::minimum(r, 31U);
-      g = rg_etc1::minimum(g, 31U);
-      b = rg_etc1::minimum(b, 31U);
-
-      return static_cast<uint16>(b | (g << 5U) | (r << 10U));
-   }
-
-   color_quad_u8 etc1_block::unpack_color5(uint16 packed_color5, bool scaled, uint alpha)
-   {
-      uint b = packed_color5 & 31U;
-      uint g = (packed_color5 >> 5U) & 31U;
-      uint r = (packed_color5 >> 10U) & 31U;
-
-      if (scaled)
-      {
-         b = (b << 3U) | (b >> 2U);
-         g = (g << 3U) | (g >> 2U);
-         r = (r << 3U) | (r >> 2U);
-      }
-
-      return color_quad_u8(cNoClamp, r, g, b, rg_etc1::minimum(alpha, 255U));
-   }
-
-   void etc1_block::unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, bool scaled)
-   {
-      color_quad_u8 c(unpack_color5(packed_color5, scaled, 0));
-      r = c.r;
-      g = c.g;
-      b = c.b;
-   }
-
-   bool etc1_block::unpack_color5(color_quad_u8& result, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha)
-   {
-      int dc_r, dc_g, dc_b;
-      unpack_delta3(dc_r, dc_g, dc_b, packed_delta3);
-      
-      int b = (packed_color5 & 31U) + dc_b;
-      int g = ((packed_color5 >> 5U) & 31U) + dc_g;
-      int r = ((packed_color5 >> 10U) & 31U) + dc_r;
-
-      bool success = true;
-      if (static_cast<uint>(r | g | b) > 31U)
-      {
-         success = false;
-         r = rg_etc1::clamp<int>(r, 0, 31);
-         g = rg_etc1::clamp<int>(g, 0, 31);
-         b = rg_etc1::clamp<int>(b, 0, 31);
-      }
-
-      if (scaled)
-      {
-         b = (b << 3U) | (b >> 2U);
-         g = (g << 3U) | (g >> 2U);
-         r = (r << 3U) | (r >> 2U);
-      }
-
-      result.set_noclamp_rgba(r, g, b, rg_etc1::minimum(alpha, 255U));
-      return success;
-   }
-
-   bool etc1_block::unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha)
-   {
-      color_quad_u8 result;
-      const bool success = unpack_color5(result, packed_color5, packed_delta3, scaled, alpha);
-      r = result.r;
-      g = result.g;
-      b = result.b;
-      return success;
-   }
-     
-   uint16 etc1_block::pack_delta3(int r, int g, int b)
-   {
-      RG_ETC1_ASSERT((r >= cETC1ColorDeltaMin) && (r <= cETC1ColorDeltaMax));
-      RG_ETC1_ASSERT((g >= cETC1ColorDeltaMin) && (g <= cETC1ColorDeltaMax));
-      RG_ETC1_ASSERT((b >= cETC1ColorDeltaMin) && (b <= cETC1ColorDeltaMax));
-      if (r < 0) r += 8;
-      if (g < 0) g += 8;
-      if (b < 0) b += 8;
-      return static_cast<uint16>(b | (g << 3) | (r << 6));
-   }
-   
-   void etc1_block::unpack_delta3(int& r, int& g, int& b, uint16 packed_delta3)
-   {
-      r = (packed_delta3 >> 6) & 7;
-      g = (packed_delta3 >> 3) & 7;
-      b = packed_delta3 & 7;
-      if (r >= 4) r -= 8;
-      if (g >= 4) g -= 8;
-      if (b >= 4) b -= 8;
-   }
-
-   uint16 etc1_block::pack_color4(const color_quad_u8& color, bool scaled, uint bias)
-   {
-      return pack_color4(color.r, color.g, color.b, scaled, bias);
-   }
-   
-   uint16 etc1_block::pack_color4(uint r, uint g, uint b, bool scaled, uint bias)
-   {
-      if (scaled)
-      {
-         r = (r * 15U + bias) / 255U;
-         g = (g * 15U + bias) / 255U;
-         b = (b * 15U + bias) / 255U;
-      }
-
-      r = rg_etc1::minimum(r, 15U);
-      g = rg_etc1::minimum(g, 15U);
-      b = rg_etc1::minimum(b, 15U);
-
-      return static_cast<uint16>(b | (g << 4U) | (r << 8U));
-   }
-
-   color_quad_u8 etc1_block::unpack_color4(uint16 packed_color4, bool scaled, uint alpha)
-   {
-      uint b = packed_color4 & 15U;
-      uint g = (packed_color4 >> 4U) & 15U;
-      uint r = (packed_color4 >> 8U) & 15U;
-
-      if (scaled)
-      {
-         b = (b << 4U) | b;
-         g = (g << 4U) | g;
-         r = (r << 4U) | r;
-      }
-
-      return color_quad_u8(cNoClamp, r, g, b, rg_etc1::minimum(alpha, 255U));
-   }
-   
-   void etc1_block::unpack_color4(uint& r, uint& g, uint& b, uint16 packed_color4, bool scaled)
-   {
-      color_quad_u8 c(unpack_color4(packed_color4, scaled, 0));
-      r = c.r;
-      g = c.g;
-      b = c.b;
-   }
-
-   void etc1_block::get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint table_idx)
-   {
-      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
-      const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0];
-
-      uint r, g, b;
-      unpack_color5(r, g, b, packed_color5, true);
-
-      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
-
-      const int y0 = pInten_modifer_table[0];
-      pDst[0].set(ir + y0, ig + y0, ib + y0);
-
-      const int y1 = pInten_modifer_table[1];
-      pDst[1].set(ir + y1, ig + y1, ib + y1);
-
-      const int y2 = pInten_modifer_table[2];
-      pDst[2].set(ir + y2, ig + y2, ib + y2);
-
-      const int y3 = pInten_modifer_table[3];
-      pDst[3].set(ir + y3, ig + y3, ib + y3);
-   }
-   
-   bool etc1_block::get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint16 packed_delta3, uint table_idx)
-   {
-      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
-      const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0];
-
-      uint r, g, b;
-      bool success = unpack_color5(r, g, b, packed_color5, packed_delta3, true);
-
-      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
-
-      const int y0 = pInten_modifer_table[0];
-      pDst[0].set(ir + y0, ig + y0, ib + y0);
-
-      const int y1 = pInten_modifer_table[1];
-      pDst[1].set(ir + y1, ig + y1, ib + y1);
-
-      const int y2 = pInten_modifer_table[2];
-      pDst[2].set(ir + y2, ig + y2, ib + y2);
-
-      const int y3 = pInten_modifer_table[3];
-      pDst[3].set(ir + y3, ig + y3, ib + y3);
-
-      return success;
-   }
-   
-   void etc1_block::get_abs_subblock_colors(color_quad_u8* pDst, uint16 packed_color4, uint table_idx)
-   {
-      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
-      const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0];
-
-      uint r, g, b;
-      unpack_color4(r, g, b, packed_color4, true);
-      
-      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
-
-      const int y0 = pInten_modifer_table[0];
-      pDst[0].set(ir + y0, ig + y0, ib + y0);
-      
-      const int y1 = pInten_modifer_table[1];
-      pDst[1].set(ir + y1, ig + y1, ib + y1);
-
-      const int y2 = pInten_modifer_table[2];
-      pDst[2].set(ir + y2, ig + y2, ib + y2);
-
-      const int y3 = pInten_modifer_table[3];
-      pDst[3].set(ir + y3, ig + y3, ib + y3);
-   }
-      
-   bool unpack_etc1_block(const void* pETC1_block, unsigned int* pDst_pixels_rgba, bool preserve_alpha)
-   {
-      color_quad_u8* pDst = reinterpret_cast<color_quad_u8*>(pDst_pixels_rgba);
-      const etc1_block& block = *static_cast<const etc1_block*>(pETC1_block);
-
-      const bool diff_flag = block.get_diff_bit();
-      const bool flip_flag = block.get_flip_bit();
-      const uint table_index0 = block.get_inten_table(0);
-      const uint table_index1 = block.get_inten_table(1);
-
-      color_quad_u8 subblock_colors0[4];
-      color_quad_u8 subblock_colors1[4];
-      bool success = true;
-
-      if (diff_flag)
-      {
-         const uint16 base_color5 = block.get_base5_color();
-         const uint16 delta_color3 = block.get_delta3_color();
-         etc1_block::get_diff_subblock_colors(subblock_colors0, base_color5, table_index0);
-            
-         if (!etc1_block::get_diff_subblock_colors(subblock_colors1, base_color5, delta_color3, table_index1))
-            success = false;
-      }
-      else
-      {
-         const uint16 base_color4_0 = block.get_base4_color(0);
-         etc1_block::get_abs_subblock_colors(subblock_colors0, base_color4_0, table_index0);
-
-         const uint16 base_color4_1 = block.get_base4_color(1);
-         etc1_block::get_abs_subblock_colors(subblock_colors1, base_color4_1, table_index1);
-      }
-
-      if (preserve_alpha)
-      {
-         if (flip_flag)
-         {
-            for (uint y = 0; y < 2; y++)
-            {
-               pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]);
-               pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]);
-               pDst[2].set_rgb(subblock_colors0[block.get_selector(2, y)]);
-               pDst[3].set_rgb(subblock_colors0[block.get_selector(3, y)]);
-               pDst += 4;
-            }
-
-            for (uint y = 2; y < 4; y++)
-            {
-               pDst[0].set_rgb(subblock_colors1[block.get_selector(0, y)]);
-               pDst[1].set_rgb(subblock_colors1[block.get_selector(1, y)]);
-               pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]);
-               pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]);
-               pDst += 4;
-            }
-         }
-         else
-         {
-            for (uint y = 0; y < 4; y++)
-            {
-               pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]);
-               pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]);
-               pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]);
-               pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]);
-               pDst += 4;
-            }
-         }
-      }
-      else 
-      {
-         if (flip_flag)
-         {
-            // 0000
-            // 0000
-            // 1111
-            // 1111
-            for (uint y = 0; y < 2; y++)
-            {
-               pDst[0] = subblock_colors0[block.get_selector(0, y)];
-               pDst[1] = subblock_colors0[block.get_selector(1, y)];
-               pDst[2] = subblock_colors0[block.get_selector(2, y)];
-               pDst[3] = subblock_colors0[block.get_selector(3, y)];
-               pDst += 4;
-            }
-
-            for (uint y = 2; y < 4; y++)
-            {
-               pDst[0] = subblock_colors1[block.get_selector(0, y)];
-               pDst[1] = subblock_colors1[block.get_selector(1, y)];
-               pDst[2] = subblock_colors1[block.get_selector(2, y)];
-               pDst[3] = subblock_colors1[block.get_selector(3, y)];
-               pDst += 4;
-            }
-         }
-         else
-         {
-            // 0011
-            // 0011
-            // 0011
-            // 0011
-            for (uint y = 0; y < 4; y++)
-            {
-               pDst[0] = subblock_colors0[block.get_selector(0, y)];
-               pDst[1] = subblock_colors0[block.get_selector(1, y)];
-               pDst[2] = subblock_colors1[block.get_selector(2, y)];
-               pDst[3] = subblock_colors1[block.get_selector(3, y)];
-               pDst += 4;
-            }
-         }
-      }
-      
-      return success;
-   }
-
-   struct etc1_solution_coordinates
-   {
-      inline etc1_solution_coordinates() :
-      m_unscaled_color(0, 0, 0, 0),
-         m_inten_table(0),
-         m_color4(false)
-      {
-      }
-
-      inline etc1_solution_coordinates(uint r, uint g, uint b, uint inten_table, bool color4) : 
-      m_unscaled_color(r, g, b, 255),
-         m_inten_table(inten_table),
-         m_color4(color4)
-      {
-      }
-
-      inline etc1_solution_coordinates(const color_quad_u8& c, uint inten_table, bool color4) : 
-      m_unscaled_color(c),
-         m_inten_table(inten_table),
-         m_color4(color4)
-      {
-      }
-
-      inline etc1_solution_coordinates(const etc1_solution_coordinates& other)
-      {
-         *this = other;
-      }
-
-      inline etc1_solution_coordinates& operator= (const etc1_solution_coordinates& rhs)
-      {
-         m_unscaled_color = rhs.m_unscaled_color;
-         m_inten_table = rhs.m_inten_table;
-         m_color4 = rhs.m_color4;
-         return *this;
-      }
-
-      inline void clear()
-      {
-         m_unscaled_color.clear();
-         m_inten_table = 0;
-         m_color4 = false;
-      }
-
-      inline color_quad_u8 get_scaled_color() const
-      {
-         int br, bg, bb;
-         if (m_color4)
-         {
-            br = m_unscaled_color.r | (m_unscaled_color.r << 4);
-            bg = m_unscaled_color.g | (m_unscaled_color.g << 4);
-            bb = m_unscaled_color.b | (m_unscaled_color.b << 4);
-         }
-         else
-         {
-            br = (m_unscaled_color.r >> 2) | (m_unscaled_color.r << 3);
-            bg = (m_unscaled_color.g >> 2) | (m_unscaled_color.g << 3);
-            bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
-         }
-         return color_quad_u8(br, bg, bb);
-      }
-
-      inline void get_block_colors(color_quad_u8* pBlock_colors)
-      {
-         int br, bg, bb;
-         if (m_color4)
-         {
-            br = m_unscaled_color.r | (m_unscaled_color.r << 4);
-            bg = m_unscaled_color.g | (m_unscaled_color.g << 4);
-            bb = m_unscaled_color.b | (m_unscaled_color.b << 4);
-         }
-         else
-         {
-            br = (m_unscaled_color.r >> 2) | (m_unscaled_color.r << 3);
-            bg = (m_unscaled_color.g >> 2) | (m_unscaled_color.g << 3);
-            bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
-         }
-         const int* pInten_table = g_etc1_inten_tables[m_inten_table];
-         pBlock_colors[0].set(br + pInten_table[0], bg + pInten_table[0], bb + pInten_table[0]);
-         pBlock_colors[1].set(br + pInten_table[1], bg + pInten_table[1], bb + pInten_table[1]);
-         pBlock_colors[2].set(br + pInten_table[2], bg + pInten_table[2], bb + pInten_table[2]);
-         pBlock_colors[3].set(br + pInten_table[3], bg + pInten_table[3], bb + pInten_table[3]);
-      }
-
-      color_quad_u8 m_unscaled_color;
-      uint m_inten_table;
-      bool m_color4;
-   };
-
-   class etc1_optimizer
-   {
-      etc1_optimizer(const etc1_optimizer&);
-      etc1_optimizer& operator= (const etc1_optimizer&);
-
-   public:
-      etc1_optimizer()
-      {
-         clear();
-      }
-
-      void clear()
-      {
-         m_pParams = NULL;
-         m_pResult = NULL;
-         m_pSorted_luma = NULL;
-         m_pSorted_luma_indices = NULL;
-      }
-
-      struct params : etc1_pack_params
-      {
-         params()
-         {
-            clear();
-         }
-
-         params(const etc1_pack_params& base_params) : 
-         etc1_pack_params(base_params)
-         {
-            clear_optimizer_params();
-         }
-
-         void clear()
-         {
-            etc1_pack_params::clear();
-            clear_optimizer_params();
-         }
-
-         void clear_optimizer_params()
-         {
-            m_num_src_pixels = 0;
-            m_pSrc_pixels = 0;
-
-            m_use_color4 = false;
-            static const int s_default_scan_delta[] = { 0 };
-            m_pScan_deltas = s_default_scan_delta;
-            m_scan_delta_size = 1;
-
-            m_base_color5.clear();
-            m_constrain_against_base_color5 = false;
-         }
-
-         uint m_num_src_pixels;
-         const color_quad_u8* m_pSrc_pixels;
-
-         bool m_use_color4;
-         const int* m_pScan_deltas;
-         uint m_scan_delta_size;
-
-         color_quad_u8 m_base_color5;
-         bool m_constrain_against_base_color5;
-      };
-
-      struct results
-      {
-         uint64 m_error;
-         color_quad_u8 m_block_color_unscaled;
-         uint m_block_inten_table;
-         uint m_n;
-         uint8* m_pSelectors;
-         bool m_block_color4;
-
-         inline results& operator= (const results& rhs)
-         {
-            m_block_color_unscaled = rhs.m_block_color_unscaled;
-            m_block_color4 = rhs.m_block_color4;
-            m_block_inten_table = rhs.m_block_inten_table;
-            m_error = rhs.m_error;
-            RG_ETC1_ASSERT(m_n == rhs.m_n);
-            memcpy(m_pSelectors, rhs.m_pSelectors, rhs.m_n);
-            return *this;
-         }
-      };
-
-      void init(const params& params, results& result);
-      bool compute();
-
-   private:      
-      struct potential_solution
-      {
-         potential_solution() : m_coords(), m_error(cUINT64_MAX), m_valid(false)
-         {
-         }
-
-         etc1_solution_coordinates  m_coords;
-         uint8                      m_selectors[8];
-         uint64                     m_error;
-         bool                       m_valid;
-
-         void clear()
-         {
-            m_coords.clear();
-            m_error = cUINT64_MAX;
-            m_valid = false;
-         }
-      };
-
-      const params* m_pParams;
-      results* m_pResult;
-
-      int m_limit;
-
-      vec3F m_avg_color;
-      int m_br, m_bg, m_bb;
-      uint16 m_luma[8];
-      uint32 m_sorted_luma[2][8];
-      const uint32* m_pSorted_luma_indices;
-      uint32* m_pSorted_luma;
-
-      uint8 m_selectors[8];
-      uint8 m_best_selectors[8];
-
-      potential_solution m_best_solution;
-      potential_solution m_trial_solution;
-      uint8 m_temp_selectors[8];
-
-      bool evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
-      bool evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
-   };
-      
-   bool etc1_optimizer::compute()
-   {
-      const uint n = m_pParams->m_num_src_pixels;
-      const int scan_delta_size = m_pParams->m_scan_delta_size;
-      
-      // Scan through a subset of the 3D lattice centered around the avg block color trying each 3D (555 or 444) lattice point as a potential block color.
-      // Each time a better solution is found try to refine the current solution's block color based of the current selectors and intensity table index.
-      for (int zdi = 0; zdi < scan_delta_size; zdi++)
-      {
-         const int zd = m_pParams->m_pScan_deltas[zdi];
-         const int mbb = m_bb + zd;
-         if (mbb < 0) continue; else if (mbb > m_limit) break;
-         
-         for (int ydi = 0; ydi < scan_delta_size; ydi++)
-         {
-            const int yd = m_pParams->m_pScan_deltas[ydi];
-            const int mbg = m_bg + yd;
-            if (mbg < 0) continue; else if (mbg > m_limit) break;
-
-            for (int xdi = 0; xdi < scan_delta_size; xdi++)
-            {
-               const int xd = m_pParams->m_pScan_deltas[xdi];
-               const int mbr = m_br + xd;
-               if (mbr < 0) continue; else if (mbr > m_limit) break;
-      
-               etc1_solution_coordinates coords(mbr, mbg, mbb, 0, m_pParams->m_use_color4);
-               if (m_pParams->m_quality == cHighQuality)
-               {
-                  if (!evaluate_solution(coords, m_trial_solution, &m_best_solution))
-                     continue;
-               }
-               else
-               {
-                  if (!evaluate_solution_fast(coords, m_trial_solution, &m_best_solution))
-                     continue;
-               }
-               
-               // Now we have the input block, the avg. color of the input pixels, a set of trial selector indices, and the block color+intensity index.
-               // Now, for each component, attempt to refine the current solution by solving a simple linear equation. For example, for 4 colors:
-               // The goal is:
-               // pixel0 - (block_color+inten_table[selector0]) + pixel1 - (block_color+inten_table[selector1]) + pixel2 - (block_color+inten_table[selector2]) + pixel3 - (block_color+inten_table[selector3]) = 0
-               // Rearranging this:
-               // (pixel0 + pixel1 + pixel2 + pixel3) - (block_color+inten_table[selector0]) - (block_color+inten_table[selector1]) - (block_color+inten_table[selector2]) - (block_color+inten_table[selector3]) = 0
-               // (pixel0 + pixel1 + pixel2 + pixel3) - block_color - inten_table[selector0] - block_color-inten_table[selector1] - block_color-inten_table[selector2] - block_color-inten_table[selector3] = 0
-               // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - inten_table[selector0] - inten_table[selector1] - inten_table[selector2] - inten_table[selector3] = 0
-               // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3]) = 0
-               // (pixel0 + pixel1 + pixel2 + pixel3)/4 - block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4 = 0
-               // block_color = (pixel0 + pixel1 + pixel2 + pixel3)/4 - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4
-               // So what this means:
-               // optimal_block_color = avg_input - avg_inten_delta
-               // So the optimal block color can be computed by taking the average block color and subtracting the current average of the intensity delta.
-               // Unfortunately, optimal_block_color must then be quantized to 555 or 444 so it's not always possible to improve matters using this formula.
-               // Also, the above formula is for unclamped intensity deltas. The actual implementation takes into account clamping.
-
-               const uint max_refinement_trials = (m_pParams->m_quality == cLowQuality) ? 2 : (((xd | yd | zd) == 0) ? 4 : 2);
-               for (uint refinement_trial = 0; refinement_trial < max_refinement_trials; refinement_trial++)
-               {
-                  const uint8* pSelectors = m_best_solution.m_selectors;
-                  const int* pInten_table = g_etc1_inten_tables[m_best_solution.m_coords.m_inten_table];
-
-                  int delta_sum_r = 0, delta_sum_g = 0, delta_sum_b = 0;
-                  const color_quad_u8 base_color(m_best_solution.m_coords.get_scaled_color());
-                  for (uint r = 0; r < n; r++)
-                  {
-                     const uint s = *pSelectors++;
-                     const int yd = pInten_table[s];
-                     // Compute actual delta being applied to each pixel, taking into account clamping.
-                     delta_sum_r += rg_etc1::clamp<int>(base_color.r + yd, 0, 255) - base_color.r;
-                     delta_sum_g += rg_etc1::clamp<int>(base_color.g + yd, 0, 255) - base_color.g;
-                     delta_sum_b += rg_etc1::clamp<int>(base_color.b + yd, 0, 255) - base_color.b;
-                  }
-                  if ((!delta_sum_r) && (!delta_sum_g) && (!delta_sum_b))
-                     break;
-                  const float avg_delta_r_f = static_cast<float>(delta_sum_r) / n;
-                  const float avg_delta_g_f = static_cast<float>(delta_sum_g) / n;
-                  const float avg_delta_b_f = static_cast<float>(delta_sum_b) / n;
-                  const int br1 = rg_etc1::clamp<int>(static_cast<uint>((m_avg_color[0] - avg_delta_r_f) * m_limit / 255.0f + .5f), 0, m_limit);
-                  const int bg1 = rg_etc1::clamp<int>(static_cast<uint>((m_avg_color[1] - avg_delta_g_f) * m_limit / 255.0f + .5f), 0, m_limit);
-                  const int bb1 = rg_etc1::clamp<int>(static_cast<uint>((m_avg_color[2] - avg_delta_b_f) * m_limit / 255.0f + .5f), 0, m_limit);
-                  
-                  bool skip = false;
-                  
-                  if ((mbr == br1) && (mbg == bg1) && (mbb == bb1))
-                     skip = true;
-                  else if ((br1 == m_best_solution.m_coords.m_unscaled_color.r) && (bg1 == m_best_solution.m_coords.m_unscaled_color.g) && (bb1 == m_best_solution.m_coords.m_unscaled_color.b))
-                     skip = true;
-                  else if ((m_br == br1) && (m_bg == bg1) && (m_bb == bb1))
-                     skip = true;
-
-                  if (skip)
-                     break;
-
-                  etc1_solution_coordinates coords1(br1, bg1, bb1, 0, m_pParams->m_use_color4);
-                  if (m_pParams->m_quality == cHighQuality)
-                  {
-                     if (!evaluate_solution(coords1, m_trial_solution, &m_best_solution)) 
-                        break;
-                  }
-                  else
-                  {
-                     if (!evaluate_solution_fast(coords1, m_trial_solution, &m_best_solution))
-                        break;
-                  }
-
-               }  // refinement_trial
-
-            } // xdi
-         } // ydi
-      } // zdi
-
-      if (!m_best_solution.m_valid)
-      {
-         m_pResult->m_error = cUINT32_MAX;
-         return false;
-      }
-      
-      const uint8* pSelectors = m_best_solution.m_selectors;
-
-#ifdef RG_ETC1_BUILD_DEBUG
-      {
-         color_quad_u8 block_colors[4];
-         m_best_solution.m_coords.get_block_colors(block_colors);
-
-         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
-         uint64 actual_error = 0;
-         for (uint i = 0; i < n; i++)
-            actual_error += pSrc_pixels[i].squared_distance_rgb(block_colors[pSelectors[i]]);
-         
-         RG_ETC1_ASSERT(actual_error == m_best_solution.m_error);
-      }
-#endif      
-      
-      m_pResult->m_error = m_best_solution.m_error;
-
-      m_pResult->m_block_color_unscaled = m_best_solution.m_coords.m_unscaled_color;
-      m_pResult->m_block_color4 = m_best_solution.m_coords.m_color4;
-      
-      m_pResult->m_block_inten_table = m_best_solution.m_coords.m_inten_table;
-      memcpy(m_pResult->m_pSelectors, pSelectors, n);
-      m_pResult->m_n = n;
-
-      return true;
-   }
-
-   void etc1_optimizer::init(const params& p, results& r)
-   {
-      // This version is hardcoded for 8 pixel subblocks.
-      RG_ETC1_ASSERT(p.m_num_src_pixels == 8);
-      
-      m_pParams = &p;
-      m_pResult = &r;
-                  
-      const uint n = 8;
-      
-      m_limit = m_pParams->m_use_color4 ? 15 : 31;
-
-      vec3F avg_color(0.0f);
-
-      for (uint i = 0; i < n; i++)
-      {
-         const color_quad_u8& c = m_pParams->m_pSrc_pixels[i];
-         const vec3F fc(c.r, c.g, c.b);
-
-         avg_color += fc;
-
-         m_luma[i] = static_cast<uint16>(c.r + c.g + c.b);
-         m_sorted_luma[0][i] = i;
-      }
-      avg_color *= (1.0f / static_cast<float>(n));
-      m_avg_color = avg_color;
-
-      m_br = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[0] * m_limit / 255.0f + .5f), 0, m_limit);
-      m_bg = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[1] * m_limit / 255.0f + .5f), 0, m_limit);
-      m_bb = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[2] * m_limit / 255.0f + .5f), 0, m_limit);
-
-      if (m_pParams->m_quality <= cMediumQuality)
-      {
-         m_pSorted_luma_indices = indirect_radix_sort(n, m_sorted_luma[0], m_sorted_luma[1], m_luma, 0, sizeof(m_luma[0]), false);
-         m_pSorted_luma = m_sorted_luma[0];
-         if (m_pSorted_luma_indices == m_sorted_luma[0])
-            m_pSorted_luma = m_sorted_luma[1];
-      
-         for (uint i = 0; i < n; i++)
-            m_pSorted_luma[i] = m_luma[m_pSorted_luma_indices[i]];
-      }
-      
-      m_best_solution.m_coords.clear();
-      m_best_solution.m_valid = false;
-      m_best_solution.m_error = cUINT64_MAX;
-   }
-
-   bool etc1_optimizer::evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
-   {
-      trial_solution.m_valid = false;
-
-      if (m_pParams->m_constrain_against_base_color5)
-      {
-         const int dr = coords.m_unscaled_color.r - m_pParams->m_base_color5.r;
-         const int dg = coords.m_unscaled_color.g - m_pParams->m_base_color5.g;
-         const int db = coords.m_unscaled_color.b - m_pParams->m_base_color5.b;
-
-         if ((rg_etc1::minimum(dr, dg, db) < cETC1ColorDeltaMin) || (rg_etc1::maximum(dr, dg, db) > cETC1ColorDeltaMax))
-            return false;
-      }
-
-      const color_quad_u8 base_color(coords.get_scaled_color());
-      
-      const uint n = 8;
-            
-      trial_solution.m_error = cUINT64_MAX;
-            
-      for (uint inten_table = 0; inten_table < cETC1IntenModifierValues; inten_table++)
-      {
-         const int* pInten_table = g_etc1_inten_tables[inten_table];
-
-         color_quad_u8 block_colors[4];
-         for (uint s = 0; s < 4; s++)
-         {
-            const int yd = pInten_table[s];
-            block_colors[s].set(base_color.r + yd, base_color.g + yd, base_color.b + yd, 0);
-         }
-         
-         uint64 total_error = 0;
-         
-         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
-         for (uint c = 0; c < n; c++)
-         {
-            const color_quad_u8& src_pixel = *pSrc_pixels++;
-            
-            uint best_selector_index = 0;
-            uint best_error = rg_etc1::square(src_pixel.r - block_colors[0].r) + rg_etc1::square(src_pixel.g - block_colors[0].g) + rg_etc1::square(src_pixel.b - block_colors[0].b);
-
-            uint trial_error = rg_etc1::square(src_pixel.r - block_colors[1].r) + rg_etc1::square(src_pixel.g - block_colors[1].g) + rg_etc1::square(src_pixel.b - block_colors[1].b);
-            if (trial_error < best_error)
-            {
-               best_error = trial_error;
-               best_selector_index = 1;
-            }
-
-            trial_error = rg_etc1::square(src_pixel.r - block_colors[2].r) + rg_etc1::square(src_pixel.g - block_colors[2].g) + rg_etc1::square(src_pixel.b - block_colors[2].b);
-            if (trial_error < best_error)
-            {
-               best_error = trial_error;
-               best_selector_index = 2;
-            }
-
-            trial_error = rg_etc1::square(src_pixel.r - block_colors[3].r) + rg_etc1::square(src_pixel.g - block_colors[3].g) + rg_etc1::square(src_pixel.b - block_colors[3].b);
-            if (trial_error < best_error)
-            {
-               best_error = trial_error;
-               best_selector_index = 3;
-            }
-
-            m_temp_selectors[c] = static_cast<uint8>(best_selector_index);
-
-            total_error += best_error;
-            if (total_error >= trial_solution.m_error)
-               break;
-         }
-         
-         if (total_error < trial_solution.m_error)
-         {
-            trial_solution.m_error = total_error;
-            trial_solution.m_coords.m_inten_table = inten_table;
-            memcpy(trial_solution.m_selectors, m_temp_selectors, 8);
-            trial_solution.m_valid = true;
-         }
-      }
-      trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
-      trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
-
-      bool success = false;
-      if (pBest_solution)
-      {
-         if (trial_solution.m_error < pBest_solution->m_error)
-         {
-            *pBest_solution = trial_solution;
-            success = true;
-         }
-      }
-
-      return success;
-   }
-
-   bool etc1_optimizer::evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
-   {
-      if (m_pParams->m_constrain_against_base_color5)
-      {
-         const int dr = coords.m_unscaled_color.r - m_pParams->m_base_color5.r;
-         const int dg = coords.m_unscaled_color.g - m_pParams->m_base_color5.g;
-         const int db = coords.m_unscaled_color.b - m_pParams->m_base_color5.b;
-
-         if ((rg_etc1::minimum(dr, dg, db) < cETC1ColorDeltaMin) || (rg_etc1::maximum(dr, dg, db) > cETC1ColorDeltaMax))
-         {
-            trial_solution.m_valid = false;
-            return false;
-         }
-      }
-
-      const color_quad_u8 base_color(coords.get_scaled_color());
-
-      const uint n = 8;
-      
-      trial_solution.m_error = cUINT64_MAX;
-
-      for (int inten_table = cETC1IntenModifierValues - 1; inten_table >= 0; --inten_table)
-      {
-         const int* pInten_table = g_etc1_inten_tables[inten_table];
-
-         uint block_inten[4];
-         color_quad_u8 block_colors[4];
-         for (uint s = 0; s < 4; s++)
-         {
-            const int yd = pInten_table[s];
-            color_quad_u8 block_color(base_color.r + yd, base_color.g + yd, base_color.b + yd, 0);
-            block_colors[s] = block_color;
-            block_inten[s] = block_color.r + block_color.g + block_color.b;
-         }
-
-         // evaluate_solution_fast() enforces/assumesd a total ordering of the input colors along the intensity (1,1,1) axis to more quickly classify the inputs to selectors.
-         // The inputs colors have been presorted along the projection onto this axis, and ETC1 block colors are always ordered along the intensity axis, so this classification is fast.
-         // 0   1   2   3
-         //   01  12  23
-         const uint block_inten_midpoints[3] = { block_inten[0] + block_inten[1], block_inten[1] + block_inten[2], block_inten[2] + block_inten[3] };
-
-         uint64 total_error = 0;
-         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
-         if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0])
-         {
-            if (block_inten[0] > m_pSorted_luma[n - 1])
-            {
-           const uint min_error = intabs(block_inten[0] - m_pSorted_luma[n - 1]);
-               if (min_error >= trial_solution.m_error)
-                  continue;
-            }
-
-            memset(&m_temp_selectors[0], 0, n);
-
-            for (uint c = 0; c < n; c++)
-               total_error += block_colors[0].squared_distance_rgb(pSrc_pixels[c]);
-         }
-         else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2])
-         {
-            if (m_pSorted_luma[0] > block_inten[3])
-            {
-           const uint min_error = intabs(m_pSorted_luma[0] - block_inten[3]);
-               if (min_error >= trial_solution.m_error)
-                  continue;
-            }
-
-            memset(&m_temp_selectors[0], 3, n);
-
-            for (uint c = 0; c < n; c++)
-               total_error += block_colors[3].squared_distance_rgb(pSrc_pixels[c]);
-         }
-         else
-         {
-            uint cur_selector = 0, c;
-            for (c = 0; c < n; c++)
-            {
-               const uint y = m_pSorted_luma[c];
-               while ((y * 2) >= block_inten_midpoints[cur_selector])
-                  if (++cur_selector > 2)
-                     goto done;
-               const uint sorted_pixel_index = m_pSorted_luma_indices[c];
-               m_temp_selectors[sorted_pixel_index] = static_cast<uint8>(cur_selector);
-               total_error += block_colors[cur_selector].squared_distance_rgb(pSrc_pixels[sorted_pixel_index]);
-            }
-done:
-            while (c < n)
-            {
-               const uint sorted_pixel_index = m_pSorted_luma_indices[c];
-               m_temp_selectors[sorted_pixel_index] = 3;
-               total_error += block_colors[3].squared_distance_rgb(pSrc_pixels[sorted_pixel_index]);
-               ++c;
-            }
-         }
-
-         if (total_error < trial_solution.m_error)
-         {
-            trial_solution.m_error = total_error;
-            trial_solution.m_coords.m_inten_table = inten_table;
-            memcpy(trial_solution.m_selectors, m_temp_selectors, n);
-            trial_solution.m_valid = true;
-            if (!total_error)
-               break;
-         }
-      }
-      trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
-      trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
-      
-      bool success = false;
-      if (pBest_solution)
-      {
-         if (trial_solution.m_error < pBest_solution->m_error)
-         {
-            *pBest_solution = trial_solution;
-            success = true;
-         }
-      }
-
-      return success;
-   }
-         
-   static uint etc1_decode_value(uint diff, uint inten, uint selector, uint packed_c)
-   {
-      const uint limit = diff ? 32 : 16; limit;
-      RG_ETC1_ASSERT((diff < 2) && (inten < 8) && (selector < 4) && (packed_c < limit));
-      int c;
-      if (diff)
-         c = (packed_c >> 2) | (packed_c << 3);
-      else 
-         c = packed_c | (packed_c << 4);
-      c += g_etc1_inten_tables[inten][selector];
-      c = rg_etc1::clamp<int>(c, 0, 255);
-      return c;
-   }
-
-   static inline int mul_8bit(int a, int b) { int t = a*b + 128; return (t + (t >> 8)) >> 8; }
-
-   void pack_etc1_block_init()
-   {
-      for (uint diff = 0; diff < 2; diff++)
-      {
-         const uint limit = diff ? 32 : 16;
-
-         for (uint inten = 0; inten < 8; inten++)
-         {
-            for (uint selector = 0; selector < 4; selector++)
-            {
-               const uint inverse_table_index = diff + (inten << 1) + (selector << 4);
-               for (uint color = 0; color < 256; color++)
-               {
-                  uint best_error = cUINT32_MAX, best_packed_c = 0;
-                  for (uint packed_c = 0; packed_c < limit; packed_c++)
-                  {
-                     int v = etc1_decode_value(diff, inten, selector, packed_c);
-                     uint err = labs(v - static_cast<int>(color));
-		     //printf("err: %d - %u = %u\n",v,color,err);
-                     if (err < best_error)
-                     {
-                        best_error = err;
-                        best_packed_c = packed_c;
-                        if (!best_error) 
-                           break;
-                     }
-                  }
-                  RG_ETC1_ASSERT(best_error <= 255);
-                  g_etc1_inverse_lookup[inverse_table_index][color] = static_cast<uint16>(best_packed_c | (best_error << 8));
-               }
-            }
-         }
-      }
-      
-      uint expand5[32];
-      for(int i = 0; i < 32; i++)
-         expand5[i] = (i << 3) | (i >> 2);
-
-      for(int i = 0; i < 256 + 16; i++)
-      {
-         int v = clamp<int>(i - 8, 0, 255);
-         g_quant5_tab[i] = static_cast<uint8>(expand5[mul_8bit(v,31)]);
-      }
-   }
-
-   // Packs solid color blocks efficiently using a set of small precomputed tables.
-   // For random 888 inputs, MSE results are better than Erricson's ETC1 packer in "slow" mode ~9.5% of the time, is slightly worse only ~.01% of the time, and is equal the rest of the time.
-   static uint64 pack_etc1_block_solid_color(etc1_block& block, const uint8* pColor, etc1_pack_params& pack_params)
-   {
-      pack_params;
-      RG_ETC1_ASSERT(g_etc1_inverse_lookup[0][255]);
-            
-      static uint s_next_comp[4] = { 1, 2, 0, 1 };
-            
-      uint best_error = cUINT32_MAX, best_i = 0;
-      int best_x = 0, best_packed_c1 = 0, best_packed_c2 = 0;
-
-      // For each possible 8-bit value, there is a precomputed list of diff/inten/selector configurations that allow that 8-bit value to be encoded with no error.
-      for (uint i = 0; i < 3; i++)
-      {
-         const uint c1 = pColor[s_next_comp[i]], c2 = pColor[s_next_comp[i + 1]];
-
-         const int delta_range = 1;
-         for (int delta = -delta_range; delta <= delta_range; delta++)
-         {
-            const int c_plus_delta = rg_etc1::clamp<int>(pColor[i] + delta, 0, 255);
-
-            const uint16* pTable;
-            if (!c_plus_delta)
-               pTable = g_color8_to_etc_block_config_0_255[0];
-            else if (c_plus_delta == 255)
-               pTable = g_color8_to_etc_block_config_0_255[1];
-            else
-               pTable = g_color8_to_etc_block_config_1_to_254[c_plus_delta - 1];
-
-            do
-            {
-               const uint x = *pTable++;
-
-#ifdef RG_ETC1_BUILD_DEBUG
-               const uint diff = x & 1;
-               const uint inten = (x >> 1) & 7;
-               const uint selector = (x >> 4) & 3;
-               const uint p0 = (x >> 8) & 255;
-               RG_ETC1_ASSERT(etc1_decode_value(diff, inten, selector, p0) == (uint)c_plus_delta);
-#endif
-
-               const uint16* pInverse_table = g_etc1_inverse_lookup[x & 0xFF];
-               uint16 p1 = pInverse_table[c1];
-               uint16 p2 = pInverse_table[c2];
-               const uint trial_error = rg_etc1::square(c_plus_delta - pColor[i]) + rg_etc1::square(p1 >> 8) + rg_etc1::square(p2 >> 8);
-               if (trial_error < best_error)
-               {
-                  best_error = trial_error;
-                  best_x = x;
-                  best_packed_c1 = p1 & 0xFF;
-                  best_packed_c2 = p2 & 0xFF;
-                  best_i = i;
-                  if (!best_error)
-                     goto found_perfect_match;
-               }
-            } while (*pTable != 0xFFFF);
-         }
-      }
-found_perfect_match:
-
-      const uint diff = best_x & 1;
-      const uint inten = (best_x >> 1) & 7;
-
-      block.m_bytes[3] = static_cast<uint8>(((inten | (inten << 3)) << 2) | (diff << 1));
-                        
-      const uint etc1_selector = g_selector_index_to_etc1[(best_x >> 4) & 3];
-      *reinterpret_cast<uint16*>(&block.m_bytes[4]) = (etc1_selector & 2) ? 0xFFFF : 0;
-      *reinterpret_cast<uint16*>(&block.m_bytes[6]) = (etc1_selector & 1) ? 0xFFFF : 0;
-
-      const uint best_packed_c0 = (best_x >> 8) & 255;
-      if (diff)
-      {
-         block.m_bytes[best_i] = static_cast<uint8>(best_packed_c0 << 3);
-         block.m_bytes[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1 << 3);
-         block.m_bytes[s_next_comp[best_i+1]] = static_cast<uint8>(best_packed_c2 << 3);
-      }
-      else
-      {
-         block.m_bytes[best_i] = static_cast<uint8>(best_packed_c0 | (best_packed_c0 << 4));
-         block.m_bytes[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1 | (best_packed_c1 << 4));
-         block.m_bytes[s_next_comp[best_i+1]] = static_cast<uint8>(best_packed_c2 | (best_packed_c2 << 4));
-      }
-
-      return best_error;
-   }
-      
-   static uint pack_etc1_block_solid_color_constrained(
-      etc1_optimizer::results& results, 
-      uint num_colors, const uint8* pColor, 
-      etc1_pack_params& pack_params, 
-      bool use_diff,
-      const color_quad_u8* pBase_color5_unscaled)
-   {
-      RG_ETC1_ASSERT(g_etc1_inverse_lookup[0][255]);
-
-      pack_params;
-      static uint s_next_comp[4] = { 1, 2, 0, 1 };
-
-      uint best_error = cUINT32_MAX, best_i = 0;
-      int best_x = 0, best_packed_c1 = 0, best_packed_c2 = 0;
-
-      // For each possible 8-bit value, there is a precomputed list of diff/inten/selector configurations that allow that 8-bit value to be encoded with no error.
-      for (uint i = 0; i < 3; i++)
-      {
-         const uint c1 = pColor[s_next_comp[i]], c2 = pColor[s_next_comp[i + 1]];
-
-         const int delta_range = 1;
-         for (int delta = -delta_range; delta <= delta_range; delta++)
-         {
-            const int c_plus_delta = rg_etc1::clamp<int>(pColor[i] + delta, 0, 255);
-
-            const uint16* pTable;
-            if (!c_plus_delta)
-               pTable = g_color8_to_etc_block_config_0_255[0];
-            else if (c_plus_delta == 255)
-               pTable = g_color8_to_etc_block_config_0_255[1];
-            else
-               pTable = g_color8_to_etc_block_config_1_to_254[c_plus_delta - 1];
-
-            do
-            {
-               const uint x = *pTable++;
-               const uint diff = x & 1;
-               if (static_cast<uint>(use_diff) != diff)
-               {
-                  if (*pTable == 0xFFFF)
-                     break;
-                  continue;
-               }
-
-               if ((diff) && (pBase_color5_unscaled))
-               {
-                  const int p0 = (x >> 8) & 255;
-                  int delta = p0 - static_cast<int>(pBase_color5_unscaled->c[i]);
-                  if ((delta < cETC1ColorDeltaMin) || (delta > cETC1ColorDeltaMax))
-                  {
-                     if (*pTable == 0xFFFF)
-                        break;
-                     continue;
-                  }
-               }
-
-#ifdef RG_ETC1_BUILD_DEBUG
-               {
-                  const uint inten = (x >> 1) & 7;
-                  const uint selector = (x >> 4) & 3;
-                  const uint p0 = (x >> 8) & 255;
-                  RG_ETC1_ASSERT(etc1_decode_value(diff, inten, selector, p0) == (uint)c_plus_delta);
-               }
-#endif
-
-               const uint16* pInverse_table = g_etc1_inverse_lookup[x & 0xFF];
-               uint16 p1 = pInverse_table[c1];
-               uint16 p2 = pInverse_table[c2];
-
-               if ((diff) && (pBase_color5_unscaled))
-               {
-                  int delta1 = (p1 & 0xFF) - static_cast<int>(pBase_color5_unscaled->c[s_next_comp[i]]);
-                  int delta2 = (p2 & 0xFF) - static_cast<int>(pBase_color5_unscaled->c[s_next_comp[i + 1]]);
-                  if ((delta1 < cETC1ColorDeltaMin) || (delta1 > cETC1ColorDeltaMax) || (delta2 < cETC1ColorDeltaMin) || (delta2 > cETC1ColorDeltaMax))
-                  {
-                     if (*pTable == 0xFFFF)
-                        break;
-                     continue;
-                  }
-               }
-
-               const uint trial_error = rg_etc1::square(c_plus_delta - pColor[i]) + rg_etc1::square(p1 >> 8) + rg_etc1::square(p2 >> 8);
-               if (trial_error < best_error)
-               {
-                  best_error = trial_error;
-                  best_x = x;
-                  best_packed_c1 = p1 & 0xFF;
-                  best_packed_c2 = p2 & 0xFF;
-                  best_i = i;
-                  if (!best_error)
-                     goto found_perfect_match;
-               }
-            } while (*pTable != 0xFFFF);
-         }
-      }
-found_perfect_match:
-
-      if (best_error == cUINT32_MAX)
-         return best_error;
-
-      best_error *= num_colors;
-
-      results.m_n = num_colors;
-      results.m_block_color4 = !(best_x & 1);
-      results.m_block_inten_table = (best_x >> 1) & 7;
-      memset(results.m_pSelectors, (best_x >> 4) & 3, num_colors);
-
-      const uint best_packed_c0 = (best_x >> 8) & 255;
-      results.m_block_color_unscaled[best_i] = static_cast<uint8>(best_packed_c0);
-      results.m_block_color_unscaled[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1);
-      results.m_block_color_unscaled[s_next_comp[best_i + 1]] = static_cast<uint8>(best_packed_c2);
-      results.m_error = best_error;
-      
-      return best_error;
-   }
-
-   // Function originally from RYG's public domain real-time DXT1 compressor, modified for 555.
-   static void dither_block_555(color_quad_u8* dest, const color_quad_u8* block)
-   {
-      int err[8],*ep1 = err,*ep2 = err+4;
-      uint8 *quant = g_quant5_tab+8;
-
-      memset(dest, 0xFF, sizeof(color_quad_u8)*16);
-
-      // process channels seperately
-      for(int ch=0;ch<3;ch++)
-      {
-         uint8* bp = (uint8*)block;
-         uint8* dp = (uint8*)dest;
-
-         bp += ch; dp += ch;
-
-         memset(err,0, sizeof(err));
-         for(int y = 0; y < 4; y++)
-         {
-            // pixel 0
-            dp[ 0] = quant[bp[ 0] + ((3*ep2[1] + 5*ep2[0]) >> 4)];
-            ep1[0] = bp[ 0] - dp[ 0];
-
-            // pixel 1
-            dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)];
-            ep1[1] = bp[ 4] - dp[ 4];
-
-            // pixel 2
-            dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)];
-            ep1[2] = bp[ 8] - dp[ 8];
-
-            // pixel 3
-            dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)];
-            ep1[3] = bp[12] - dp[12];
-
-            // advance to next line
-            int* tmp = ep1; ep1 = ep2; ep2 = tmp;
-            bp += 16;
-            dp += 16;
-         }
-      }
-   }
-
-   unsigned int pack_etc1_block(void* pETC1_block, const unsigned int* pSrc_pixels_rgba, etc1_pack_params& pack_params)
-   {
-      const color_quad_u8* pSrc_pixels = reinterpret_cast<const color_quad_u8*>(pSrc_pixels_rgba);
-      etc1_block& dst_block = *static_cast<etc1_block*>(pETC1_block);
-
-#ifdef RG_ETC1_BUILD_DEBUG
-      // Ensure all alpha values are 0xFF.
-      for (uint i = 0; i < 16; i++)
-      {
-         RG_ETC1_ASSERT(pSrc_pixels[i].a == 255);
-      }
-#endif
-
-      color_quad_u8 src_pixel0(pSrc_pixels[0]);
-
-      // Check for solid block.
-      const uint32 first_pixel_u32 = pSrc_pixels->m_u32;
-      int r;
-      for (r = 15; r >= 1; --r)
-         if (pSrc_pixels[r].m_u32 != first_pixel_u32)
-            break;
-      if (!r)
-         return static_cast<unsigned int>(16 * pack_etc1_block_solid_color(dst_block, &pSrc_pixels[0].r, pack_params));
-      
-      color_quad_u8 dithered_pixels[16];
-      if (pack_params.m_dithering)
-      {
-         dither_block_555(dithered_pixels, pSrc_pixels);
-         pSrc_pixels = dithered_pixels;
-      }
-
-      etc1_optimizer optimizer;
-
-      uint64 best_error = cUINT64_MAX;
-      uint best_flip = false, best_use_color4 = false;
-      
-      uint8 best_selectors[2][8];
-      etc1_optimizer::results best_results[2];
-      for (uint i = 0; i < 2; i++)
-      {
-         best_results[i].m_n = 8;
-         best_results[i].m_pSelectors = best_selectors[i];
-      }
-      
-      uint8 selectors[3][8];
-      etc1_optimizer::results results[3];
-      
-      for (uint i = 0; i < 3; i++)
-      {
-         results[i].m_n = 8;
-         results[i].m_pSelectors = selectors[i];
-      }
-            
-      color_quad_u8 subblock_pixels[8];
-
-      etc1_optimizer::params params(pack_params);
-      params.m_num_src_pixels = 8;
-      params.m_pSrc_pixels = subblock_pixels;
-
-      for (uint flip = 0; flip < 2; flip++)
-      {
-         for (uint use_color4 = 0; use_color4 < 2; use_color4++)
-         {
-            uint64 trial_error = 0;
-
-            uint subblock;
-            for (subblock = 0; subblock < 2; subblock++)
-            {
-               if (flip)
-                  memcpy(subblock_pixels, pSrc_pixels + subblock * 8, sizeof(color_quad_u8) * 8);
-               else
-               {
-                  const color_quad_u8* pSrc_col = pSrc_pixels + subblock * 2;
-                  subblock_pixels[0] = pSrc_col[0]; subblock_pixels[1] = pSrc_col[4]; subblock_pixels[2] = pSrc_col[8]; subblock_pixels[3] = pSrc_col[12];
-                  subblock_pixels[4] = pSrc_col[1]; subblock_pixels[5] = pSrc_col[5]; subblock_pixels[6] = pSrc_col[9]; subblock_pixels[7] = pSrc_col[13];
-               }
-
-               results[2].m_error = cUINT64_MAX;
-               if ((params.m_quality >= cMediumQuality) && ((subblock) || (use_color4)))
-               {
-                  const uint32 subblock_pixel0_u32 = subblock_pixels[0].m_u32;
-                  for (r = 7; r >= 1; --r)
-                     if (subblock_pixels[r].m_u32 != subblock_pixel0_u32)
-                        break;
-                  if (!r)
-                  {
-                     pack_etc1_block_solid_color_constrained(results[2], 8, &subblock_pixels[0].r, pack_params, !use_color4, (subblock && !use_color4) ? &results[0].m_block_color_unscaled : NULL);
-                  }
-               }
-
-               params.m_use_color4 = (use_color4 != 0);
-               params.m_constrain_against_base_color5 = false;
-
-               if ((!use_color4) && (subblock))
-               {
-                  params.m_constrain_against_base_color5 = true;
-                  params.m_base_color5 = results[0].m_block_color_unscaled;
-               }
-                              
-               if (params.m_quality == cHighQuality)
-               {
-                  static const int s_scan_delta_0_to_4[] = { -4, -3, -2, -1, 0, 1, 2, 3, 4 };
-                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0_to_4);
-                  params.m_pScan_deltas = s_scan_delta_0_to_4;
-               }
-               else if (params.m_quality == cMediumQuality)
-               {
-                  static const int s_scan_delta_0_to_1[] = { -1, 0, 1 };
-                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0_to_1);
-                  params.m_pScan_deltas = s_scan_delta_0_to_1;
-               }
-               else
-               {
-                  static const int s_scan_delta_0[] = { 0 };
-                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0);
-                  params.m_pScan_deltas = s_scan_delta_0;
-               }
-               
-               optimizer.init(params, results[subblock]);
-               if (!optimizer.compute())
-                  break;
-                              
-               if (params.m_quality >= cMediumQuality)
-               {
-                  // TODO: Fix fairly arbitrary/unrefined thresholds that control how far away to scan for potentially better solutions.
-                  const uint refinement_error_thresh0 = 3000;
-                  const uint refinement_error_thresh1 = 6000;
-                  if (results[subblock].m_error > refinement_error_thresh0)
-                  {
-                     if (params.m_quality == cMediumQuality)
-                     {
-                        static const int s_scan_delta_2_to_3[] = { -3, -2, 2, 3 };
-                        params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_2_to_3);
-                        params.m_pScan_deltas = s_scan_delta_2_to_3;
-                     }
-                     else
-                     {
-                        static const int s_scan_delta_5_to_5[] = { -5, 5 };
-                        static const int s_scan_delta_5_to_8[] = { -8, -7, -6, -5, 5, 6, 7, 8 };
-                        if (results[subblock].m_error > refinement_error_thresh1)
-                        {
-                           params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_5_to_8);
-                           params.m_pScan_deltas = s_scan_delta_5_to_8;
-                        }
-                        else
-                        {
-                           params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_5_to_5);
-                           params.m_pScan_deltas = s_scan_delta_5_to_5;
-                        }
-                     }
-
-                     if (!optimizer.compute())
-                        break;
-                  }
-
-                  if (results[2].m_error < results[subblock].m_error)
-                     results[subblock] = results[2];
-               }
-                            
-               trial_error += results[subblock].m_error;
-               if (trial_error >= best_error)
-                  break;
-            }
-
-            if (subblock < 2)
-               continue;
-
-            best_error = trial_error;
-            best_results[0] = results[0];
-            best_results[1] = results[1];
-            best_flip = flip;
-            best_use_color4 = use_color4;
-            
-         } // use_color4
-
-      } // flip
-
-      int dr = best_results[1].m_block_color_unscaled.r - best_results[0].m_block_color_unscaled.r;
-      int dg = best_results[1].m_block_color_unscaled.g - best_results[0].m_block_color_unscaled.g;
-      int db = best_results[1].m_block_color_unscaled.b - best_results[0].m_block_color_unscaled.b;
-      RG_ETC1_ASSERT(best_use_color4 || ((rg_etc1::minimum(dr, dg, db) >= cETC1ColorDeltaMin) && (rg_etc1::maximum(dr, dg, db) <= cETC1ColorDeltaMax)));
-           
-      if (best_use_color4)
-      {
-         dst_block.m_bytes[0] = static_cast<uint8>(best_results[1].m_block_color_unscaled.r | (best_results[0].m_block_color_unscaled.r << 4));
-         dst_block.m_bytes[1] = static_cast<uint8>(best_results[1].m_block_color_unscaled.g | (best_results[0].m_block_color_unscaled.g << 4));
-         dst_block.m_bytes[2] = static_cast<uint8>(best_results[1].m_block_color_unscaled.b | (best_results[0].m_block_color_unscaled.b << 4));
-      }
-      else
-      {
-         if (dr < 0) dr += 8; dst_block.m_bytes[0] = static_cast<uint8>((best_results[0].m_block_color_unscaled.r << 3) | dr);
-         if (dg < 0) dg += 8; dst_block.m_bytes[1] = static_cast<uint8>((best_results[0].m_block_color_unscaled.g << 3) | dg);
-         if (db < 0) db += 8; dst_block.m_bytes[2] = static_cast<uint8>((best_results[0].m_block_color_unscaled.b << 3) | db);
-      }
-      
-      dst_block.m_bytes[3] = static_cast<uint8>( (best_results[1].m_block_inten_table << 2) | (best_results[0].m_block_inten_table << 5) | ((~best_use_color4 & 1) << 1) | best_flip );
-      
-      uint selector0 = 0, selector1 = 0;
-      if (best_flip)
-      {
-         // flipped:
-         // { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 },               
-         // { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 } 
-         //
-         // { 0, 2 }, { 1, 2 }, { 2, 2 }, { 3, 2 },
-         // { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 }
-         const uint8* pSelectors0 = best_results[0].m_pSelectors;
-         const uint8* pSelectors1 = best_results[1].m_pSelectors;
-         for (int x = 3; x >= 0; --x)
-         {
-            uint b;
-            b = g_selector_index_to_etc1[pSelectors1[4 + x]];
-            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-            b = g_selector_index_to_etc1[pSelectors1[x]];
-            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-            b = g_selector_index_to_etc1[pSelectors0[4 + x]];
-            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-            b = g_selector_index_to_etc1[pSelectors0[x]];
-            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-         }
-      }
-      else
-      {
-         // non-flipped:
-         // { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 },
-         // { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 }
-         //
-         // { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
-         // { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 }
-         for (int subblock = 1; subblock >= 0; --subblock)
-         {
-            const uint8* pSelectors = best_results[subblock].m_pSelectors + 4;
-            for (uint i = 0; i < 2; i++)
-            {
-               uint b;
-               b = g_selector_index_to_etc1[pSelectors[3]];
-               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-               b = g_selector_index_to_etc1[pSelectors[2]];
-               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-               b = g_selector_index_to_etc1[pSelectors[1]];
-               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-               b = g_selector_index_to_etc1[pSelectors[0]];
-               selector0 = (selector0 << 1) | (b & 1);selector1 = (selector1 << 1) | (b >> 1);
-
-               pSelectors -= 4;
-            }
-         }
-      }
-                  
-      dst_block.m_bytes[4] = static_cast<uint8>(selector1 >> 8); dst_block.m_bytes[5] = static_cast<uint8>(selector1 & 0xFF);
-      dst_block.m_bytes[6] = static_cast<uint8>(selector0 >> 8); dst_block.m_bytes[7] = static_cast<uint8>(selector0 & 0xFF);
-
-      return static_cast<unsigned int>(best_error);
-   }
-
-} // namespace rg_etc1
+// File: rg_etc1.cpp - Fast, high quality ETC1 block packer/unpacker - Rich Geldreich <[email protected]>
+// Please see ZLIB license at the end of rg_etc1.h.
+//
+// For more information Ericsson Texture Compression (ETC/ETC1), see:
+// http://www.khronos.org/registry/gles/extensions/OES/OES_compressed_ETC1_RGB8_texture.txt
+//
+// v1.03 - 5/12/13 - Initial public release
+#include "rg_etc1.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+//#include <stdio.h>
+#include <math.h>
+#include <stdio.h>
+#pragma warning (disable: 4201) //  nonstandard extension used : nameless struct/union
+
+#if defined(_DEBUG) || defined(DEBUG)
+#define RG_ETC1_BUILD_DEBUG
+#endif
+
+#define RG_ETC1_ASSERT assert
+
+namespace rg_etc1
+{
+
+   inline long labs(long val) {
+        return val < 0 ? -val : val;
+   }
+
+   inline int intabs(int val) {
+
+       return val<0?-val:val;
+   }
+
+   typedef unsigned char uint8;
+   typedef unsigned short uint16;
+   typedef unsigned int uint;
+   typedef unsigned int uint32;
+   typedef long long int64;
+   typedef unsigned long long uint64;
+
+   const uint32 cUINT32_MAX = 0xFFFFFFFFU;
+   const uint64 cUINT64_MAX = 0xFFFFFFFFFFFFFFFFULL; //0xFFFFFFFFFFFFFFFFui64;
+   
+   template<typename T> inline T minimum(T a, T b) { return (a < b) ? a : b; }
+   template<typename T> inline T minimum(T a, T b, T c) { return minimum(minimum(a, b), c); }
+   template<typename T> inline T maximum(T a, T b) { return (a > b) ? a : b; }
+   template<typename T> inline T maximum(T a, T b, T c) { return maximum(maximum(a, b), c); }
+   template<typename T> inline T clamp(T value, T low, T high) { return (value < low) ? low : ((value > high) ? high : value); }
+   template<typename T> inline T square(T value) { return value * value; }
+   template<typename T> inline void zero_object(T& obj) { memset((void*)&obj, 0, sizeof(obj)); }
+   template<typename T> inline void zero_this(T* pObj) { memset((void*)pObj, 0, sizeof(*pObj)); }
+
+   template<class T, size_t N> T decay_array_to_subtype(T (&a)[N]);   
+
+#define RG_ETC1_ARRAY_SIZE(X) (sizeof(X) / sizeof(decay_array_to_subtype(X)))
+
+   enum eNoClamp { cNoClamp };
+
+   struct color_quad_u8
+   {
+      static inline int clamp(int v) { if (v & 0xFFFFFF00U) v = (~(static_cast<int>(v) >> 31)) & 0xFF; return v; }
+
+      struct component_traits { enum { cSigned = false, cFloat = false, cMin = 0U, cMax = 255U }; };
+
+   public:
+      typedef unsigned char component_t;
+      typedef int parameter_t;
+
+      enum { cNumComps = 4 };
+
+      union
+      {
+         struct
+         {
+            component_t r;
+            component_t g;
+            component_t b;
+            component_t a;
+         };
+
+         component_t c[cNumComps];
+
+         uint32 m_u32;
+      };
+
+      inline color_quad_u8()
+      {
+      }
+
+      inline color_quad_u8(const color_quad_u8& other) : m_u32(other.m_u32)
+      {
+      }
+
+      explicit inline color_quad_u8(parameter_t y, parameter_t alpha = component_traits::cMax)
+      {
+         set(y, alpha);
+      }
+
+      inline color_quad_u8(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
+      {
+         set(red, green, blue, alpha);
+      }
+
+      explicit inline color_quad_u8(eNoClamp, parameter_t y, parameter_t alpha = component_traits::cMax)
+      {
+         set_noclamp_y_alpha(y, alpha);
+      }
+
+      inline color_quad_u8(eNoClamp, parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
+      {
+         set_noclamp_rgba(red, green, blue, alpha);
+      }
+
+      inline void clear()
+      {
+         m_u32 = 0;
+      }
+
+      inline color_quad_u8& operator= (const color_quad_u8& other)
+      {
+         m_u32 = other.m_u32;
+         return *this;
+      }
+
+      inline color_quad_u8& set_rgb(const color_quad_u8& other)
+      {
+         r = other.r;
+         g = other.g;
+         b = other.b;
+         return *this;
+      }
+
+      inline color_quad_u8& operator= (parameter_t y)
+      {
+         set(y, component_traits::cMax);
+         return *this;
+      }
+
+      inline color_quad_u8& set(parameter_t y, parameter_t alpha = component_traits::cMax)
+      {
+         y = clamp(y);
+         alpha = clamp(alpha);
+         r = static_cast<component_t>(y);
+         g = static_cast<component_t>(y);
+         b = static_cast<component_t>(y);
+         a = static_cast<component_t>(alpha);
+         return *this;
+      }
+
+      inline color_quad_u8& set_noclamp_y_alpha(parameter_t y, parameter_t alpha = component_traits::cMax)
+      {
+         RG_ETC1_ASSERT( (y >= component_traits::cMin) && (y <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (alpha >= component_traits::cMin) && (alpha <= component_traits::cMax) );
+
+         r = static_cast<component_t>(y);
+         g = static_cast<component_t>(y);
+         b = static_cast<component_t>(y);
+         a = static_cast<component_t>(alpha);
+         return *this;
+      }
+
+      inline color_quad_u8& set(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
+      {
+         r = static_cast<component_t>(clamp(red));
+         g = static_cast<component_t>(clamp(green));
+         b = static_cast<component_t>(clamp(blue));
+         a = static_cast<component_t>(clamp(alpha));
+         return *this;
+      }
+
+      inline color_quad_u8& set_noclamp_rgba(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha)
+      {
+         RG_ETC1_ASSERT( (red >= component_traits::cMin) && (red <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (green >= component_traits::cMin) && (green <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (blue >= component_traits::cMin) && (blue <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (alpha >= component_traits::cMin) && (alpha <= component_traits::cMax) );
+
+         r = static_cast<component_t>(red);
+         g = static_cast<component_t>(green);
+         b = static_cast<component_t>(blue);
+         a = static_cast<component_t>(alpha);
+         return *this;
+      }
+
+      inline color_quad_u8& set_noclamp_rgb(parameter_t red, parameter_t green, parameter_t blue)
+      {
+         RG_ETC1_ASSERT( (red >= component_traits::cMin) && (red <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (green >= component_traits::cMin) && (green <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (blue >= component_traits::cMin) && (blue <= component_traits::cMax) );
+
+         r = static_cast<component_t>(red);
+         g = static_cast<component_t>(green);
+         b = static_cast<component_t>(blue);
+         return *this;
+      }
+
+      static inline parameter_t get_min_comp() { return component_traits::cMin; }
+      static inline parameter_t get_max_comp() { return component_traits::cMax; }
+      static inline bool get_comps_are_signed() { return component_traits::cSigned; }
+
+      inline component_t operator[] (uint i) const { RG_ETC1_ASSERT(i < cNumComps); return c[i]; }
+      inline component_t& operator[] (uint i) { RG_ETC1_ASSERT(i < cNumComps); return c[i]; }
+
+      inline color_quad_u8& set_component(uint i, parameter_t f)
+      {
+         RG_ETC1_ASSERT(i < cNumComps);
+
+         c[i] = static_cast<component_t>(clamp(f));
+
+         return *this;
+      }
+
+      inline color_quad_u8& set_grayscale(parameter_t l)
+      {
+         component_t x = static_cast<component_t>(clamp(l));
+         c[0] = x;
+         c[1] = x;
+         c[2] = x;
+         return *this;
+      }
+
+      inline color_quad_u8& clamp(const color_quad_u8& l, const color_quad_u8& h)
+      {
+         for (uint i = 0; i < cNumComps; i++)
+            c[i] = static_cast<component_t>(rg_etc1::clamp<parameter_t>(c[i], l[i], h[i]));
+         return *this;
+      }
+
+      inline color_quad_u8& clamp(parameter_t l, parameter_t h)
+      {
+         for (uint i = 0; i < cNumComps; i++)
+            c[i] = static_cast<component_t>(rg_etc1::clamp<parameter_t>(c[i], l, h));
+         return *this;
+      }
+
+      // Returns CCIR 601 luma (consistent with color_utils::RGB_To_Y).
+      inline parameter_t get_luma() const
+      {
+         return static_cast<parameter_t>((19595U * r + 38470U * g + 7471U * b + 32768U) >> 16U);
+      }
+
+      // Returns REC 709 luma.
+      inline parameter_t get_luma_rec709() const
+      {
+         return static_cast<parameter_t>((13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U);
+      }
+
+      inline uint squared_distance_rgb(const color_quad_u8& c) const
+      {
+         return rg_etc1::square(r - c.r) + rg_etc1::square(g - c.g) + rg_etc1::square(b - c.b);
+      }
+
+      inline uint squared_distance_rgba(const color_quad_u8& c) const
+      {
+         return rg_etc1::square(r - c.r) + rg_etc1::square(g - c.g) + rg_etc1::square(b - c.b) + rg_etc1::square(a - c.a);
+      }
+
+      inline bool rgb_equals(const color_quad_u8& rhs) const
+      {
+         return (r == rhs.r) && (g == rhs.g) && (b == rhs.b);
+      }
+
+      inline bool operator== (const color_quad_u8& rhs) const
+      {
+         return m_u32 == rhs.m_u32;
+      }
+
+      color_quad_u8& operator+= (const color_quad_u8& other)
+      {
+         for (uint i = 0; i < 4; i++)
+            c[i] = static_cast<component_t>(clamp(c[i] + other.c[i]));
+         return *this;
+      }
+
+      color_quad_u8& operator-= (const color_quad_u8& other)
+      {
+         for (uint i = 0; i < 4; i++)
+            c[i] = static_cast<component_t>(clamp(c[i] - other.c[i]));
+         return *this;
+      }
+
+      friend color_quad_u8 operator+ (const color_quad_u8& lhs, const color_quad_u8& rhs)
+      {
+         color_quad_u8 result(lhs);
+         result += rhs;
+         return result;
+      }
+
+      friend color_quad_u8 operator- (const color_quad_u8& lhs, const color_quad_u8& rhs)
+      {
+         color_quad_u8 result(lhs);
+         result -= rhs;
+         return result;
+      }
+   }; // class color_quad_u8
+
+   struct vec3F
+   {
+      float m_s[3];
+      
+      inline vec3F() { }
+      inline vec3F(float s) { m_s[0] = s; m_s[1] = s; m_s[2] = s; }
+      inline vec3F(float x, float y, float z) { m_s[0] = x; m_s[1] = y; m_s[2] = z; }
+      
+      inline float operator[] (uint i) const { RG_ETC1_ASSERT(i < 3); return m_s[i]; }
+
+      inline vec3F& operator += (const vec3F& other) { for (uint i = 0; i < 3; i++) m_s[i] += other.m_s[i]; return *this; }
+
+      inline vec3F& operator *= (float s) { for (uint i = 0; i < 3; i++) m_s[i] *= s; return *this; }
+   };
+     
+   enum etc_constants
+   {
+      cETC1BytesPerBlock = 8U,
+
+      cETC1SelectorBits = 2U,
+      cETC1SelectorValues = 1U << cETC1SelectorBits,
+      cETC1SelectorMask = cETC1SelectorValues - 1U,
+
+      cETC1BlockShift = 2U,
+      cETC1BlockSize = 1U << cETC1BlockShift,
+
+      cETC1LSBSelectorIndicesBitOffset = 0,
+      cETC1MSBSelectorIndicesBitOffset = 16,
+
+      cETC1FlipBitOffset = 32,
+      cETC1DiffBitOffset = 33,
+
+      cETC1IntenModifierNumBits = 3,
+      cETC1IntenModifierValues = 1 << cETC1IntenModifierNumBits,
+      cETC1RightIntenModifierTableBitOffset = 34,
+      cETC1LeftIntenModifierTableBitOffset = 37,
+
+      // Base+Delta encoding (5 bit bases, 3 bit delta)
+      cETC1BaseColorCompNumBits = 5,
+      cETC1BaseColorCompMax = 1 << cETC1BaseColorCompNumBits,
+
+      cETC1DeltaColorCompNumBits = 3,
+      cETC1DeltaColorComp = 1 << cETC1DeltaColorCompNumBits,
+      cETC1DeltaColorCompMax = 1 << cETC1DeltaColorCompNumBits,
+
+      cETC1BaseColor5RBitOffset = 59,
+      cETC1BaseColor5GBitOffset = 51,
+      cETC1BaseColor5BBitOffset = 43,
+
+      cETC1DeltaColor3RBitOffset = 56,
+      cETC1DeltaColor3GBitOffset = 48,
+      cETC1DeltaColor3BBitOffset = 40,
+
+      // Absolute (non-delta) encoding (two 4-bit per component bases)
+      cETC1AbsColorCompNumBits = 4,
+      cETC1AbsColorCompMax = 1 << cETC1AbsColorCompNumBits,
+
+      cETC1AbsColor4R1BitOffset = 60,
+      cETC1AbsColor4G1BitOffset = 52,
+      cETC1AbsColor4B1BitOffset = 44,
+
+      cETC1AbsColor4R2BitOffset = 56,
+      cETC1AbsColor4G2BitOffset = 48,
+      cETC1AbsColor4B2BitOffset = 40,
+
+      cETC1ColorDeltaMin = -4,
+      cETC1ColorDeltaMax = 3,
+
+      // Delta3:
+      // 0   1   2   3   4   5   6   7
+      // 000 001 010 011 100 101 110 111
+      // 0   1   2   3   -4  -3  -2  -1
+   };
+   
+   static uint8 g_quant5_tab[256+16];
+
+
+   static const int g_etc1_inten_tables[cETC1IntenModifierValues][cETC1SelectorValues] = 
+   { 
+      { -8,  -2,   2,   8 }, { -17,  -5,  5,  17 }, { -29,  -9,   9,  29 }, {  -42, -13, 13,  42 }, 
+      { -60, -18, 18,  60 }, { -80, -24, 24,  80 }, { -106, -33, 33, 106 }, { -183, -47, 47, 183 } 
+   };
+
+   static const uint8 g_etc1_to_selector_index[cETC1SelectorValues] = { 2, 3, 1, 0 };
+   static const uint8 g_selector_index_to_etc1[cETC1SelectorValues] = { 3, 2, 0, 1 };
+      
+   // Given an ETC1 diff/inten_table/selector, and an 8-bit desired color, this table encodes the best packed_color in the low byte, and the abs error in the high byte.
+   static uint16 g_etc1_inverse_lookup[2*8*4][256];      // [diff/inten_table/selector][desired_color]
+
+   // g_color8_to_etc_block_config[color][table_index] = Supplies for each 8-bit color value a list of packed ETC1 diff/intensity table/selectors/packed_colors that map to that color.
+   // To pack: diff | (inten << 1) | (selector << 4) | (packed_c << 8)
+   static const uint16 g_color8_to_etc_block_config_0_255[2][33] =
+   {
+      { 0x0000,  0x0010,  0x0002,  0x0012,  0x0004,  0x0014,  0x0006,  0x0016,  0x0008,  0x0018,  0x000A,  0x001A,  0x000C,  0x001C,  0x000E,  0x001E,
+        0x0001,  0x0011,  0x0003,  0x0013,  0x0005,  0x0015,  0x0007,  0x0017,  0x0009,  0x0019,  0x000B,  0x001B,  0x000D,  0x001D,  0x000F,  0x001F, 0xFFFF },
+      { 0x0F20,  0x0F30,  0x0E32,  0x0F22,  0x0E34,  0x0F24,  0x0D36,  0x0F26,  0x0C38,  0x0E28,  0x0B3A,  0x0E2A,  0x093C,  0x0E2C,  0x053E,  0x0D2E,
+        0x1E31,  0x1F21,  0x1D33,  0x1F23,  0x1C35,  0x1E25,  0x1A37,  0x1E27,  0x1839,  0x1D29,  0x163B,  0x1C2B,  0x133D,  0x1B2D,  0x093F,  0x1A2F, 0xFFFF },
+   };
+
+   // Really only [254][11].
+   static const uint16 g_color8_to_etc_block_config_1_to_254[254][12] = 
+   {
+      { 0x021C, 0x0D0D, 0xFFFF }, { 0x0020, 0x0021, 0x0A0B, 0x061F, 0xFFFF }, { 0x0113, 0x0217, 0xFFFF }, { 0x0116, 0x031E,
+      0x0B0E, 0x0405, 0xFFFF }, { 0x0022, 0x0204, 0x050A, 0x0023, 0xFFFF }, { 0x0111, 0x0319, 0x0809, 0x170F, 0xFFFF }, {
+      0x0303, 0x0215, 0x0607, 0xFFFF }, { 0x0030, 0x0114, 0x0408, 0x0031, 0x0201, 0x051D, 0xFFFF }, { 0x0100, 0x0024, 0x0306,
+      0x0025, 0x041B, 0x0E0D, 0xFFFF }, { 0x021A, 0x0121, 0x0B0B, 0x071F, 0xFFFF }, { 0x0213, 0x0317, 0xFFFF }, { 0x0112,
+      0x0505, 0xFFFF }, { 0x0026, 0x070C, 0x0123, 0x0027, 0xFFFF }, { 0x0211, 0x0909, 0xFFFF }, { 0x0110, 0x0315, 0x0707,
+      0x0419, 0x180F, 0xFFFF }, { 0x0218, 0x0131, 0x0301, 0x0403, 0x061D, 0xFFFF }, { 0x0032, 0x0202, 0x0033, 0x0125, 0x051B,
+      0x0F0D, 0xFFFF }, { 0x0028, 0x031C, 0x0221, 0x0029, 0xFFFF }, { 0x0120, 0x0313, 0x0C0B, 0x081F, 0xFFFF }, { 0x0605,
+      0x0417, 0xFFFF }, { 0x0216, 0x041E, 0x0C0E, 0x0223, 0x0127, 0xFFFF }, { 0x0122, 0x0304, 0x060A, 0x0311, 0x0A09, 0xFFFF
+      }, { 0x0519, 0x190F, 0xFFFF }, { 0x002A, 0x0231, 0x0503, 0x0415, 0x0807, 0x002B, 0x071D, 0xFFFF }, { 0x0130, 0x0214,
+      0x0508, 0x0401, 0x0133, 0x0225, 0x061B, 0xFFFF }, { 0x0200, 0x0124, 0x0406, 0x0321, 0x0129, 0x100D, 0xFFFF }, { 0x031A,
+      0x0D0B, 0x091F, 0xFFFF }, { 0x0413, 0x0705, 0x0517, 0xFFFF }, { 0x0212, 0x0034, 0x0323, 0x0035, 0x0227, 0xFFFF }, {
+      0x0126, 0x080C, 0x0B09, 0xFFFF }, { 0x0411, 0x0619, 0x1A0F, 0xFFFF }, { 0x0210, 0x0331, 0x0603, 0x0515, 0x0907, 0x012B,
+      0xFFFF }, { 0x0318, 0x002C, 0x0501, 0x0233, 0x0325, 0x071B, 0x002D, 0x081D, 0xFFFF }, { 0x0132, 0x0302, 0x0229, 0x110D,
+      0xFFFF }, { 0x0128, 0x041C, 0x0421, 0x0E0B, 0x0A1F, 0xFFFF }, { 0x0220, 0x0513, 0x0617, 0xFFFF }, { 0x0135, 0x0805,
+      0x0327, 0xFFFF }, { 0x0316, 0x051E, 0x0D0E, 0x0423, 0xFFFF }, { 0x0222, 0x0404, 0x070A, 0x0511, 0x0719, 0x0C09, 0x1B0F,
+      0xFFFF }, { 0x0703, 0x0615, 0x0A07, 0x022B, 0xFFFF }, { 0x012A, 0x0431, 0x0601, 0x0333, 0x012D, 0x091D, 0xFFFF }, {
+      0x0230, 0x0314, 0x0036, 0x0608, 0x0425, 0x0037, 0x0329, 0x081B, 0x120D, 0xFFFF }, { 0x0300, 0x0224, 0x0506, 0x0521,
+      0x0F0B, 0x0B1F, 0xFFFF }, { 0x041A, 0x0613, 0x0717, 0xFFFF }, { 0x0235, 0x0905, 0xFFFF }, { 0x0312, 0x0134, 0x0523,
+      0x0427, 0xFFFF }, { 0x0226, 0x090C, 0x002E, 0x0611, 0x0D09, 0x002F, 0xFFFF }, { 0x0715, 0x0B07, 0x0819, 0x032B, 0x1C0F,
+      0xFFFF }, { 0x0310, 0x0531, 0x0701, 0x0803, 0x022D, 0x0A1D, 0xFFFF }, { 0x0418, 0x012C, 0x0433, 0x0525, 0x0137, 0x091B,
+      0x130D, 0xFFFF }, { 0x0232, 0x0402, 0x0621, 0x0429, 0xFFFF }, { 0x0228, 0x051C, 0x0713, 0x100B, 0x0C1F, 0xFFFF }, {
+      0x0320, 0x0335, 0x0A05, 0x0817, 0xFFFF }, { 0x0623, 0x0527, 0xFFFF }, { 0x0416, 0x061E, 0x0E0E, 0x0711, 0x0E09, 0x012F,
+      0xFFFF }, { 0x0322, 0x0504, 0x080A, 0x0919, 0x1D0F, 0xFFFF }, { 0x0631, 0x0903, 0x0815, 0x0C07, 0x042B, 0x032D, 0x0B1D,
+      0xFFFF }, { 0x022A, 0x0801, 0x0533, 0x0625, 0x0237, 0x0A1B, 0xFFFF }, { 0x0330, 0x0414, 0x0136, 0x0708, 0x0721, 0x0529,
+      0x140D, 0xFFFF }, { 0x0400, 0x0324, 0x0606, 0x0038, 0x0039, 0x110B, 0x0D1F, 0xFFFF }, { 0x051A, 0x0813, 0x0B05, 0x0917,
+      0xFFFF }, { 0x0723, 0x0435, 0x0627, 0xFFFF }, { 0x0412, 0x0234, 0x0F09, 0x022F, 0xFFFF }, { 0x0326, 0x0A0C, 0x012E,
+      0x0811, 0x0A19, 0x1E0F, 0xFFFF }, { 0x0731, 0x0A03, 0x0915, 0x0D07, 0x052B, 0xFFFF }, { 0x0410, 0x0901, 0x0633, 0x0725,
+      0x0337, 0x0B1B, 0x042D, 0x0C1D, 0xFFFF }, { 0x0518, 0x022C, 0x0629, 0x150D, 0xFFFF }, { 0x0332, 0x0502, 0x0821, 0x0139,
+      0x120B, 0x0E1F, 0xFFFF }, { 0x0328, 0x061C, 0x0913, 0x0A17, 0xFFFF }, { 0x0420, 0x0535, 0x0C05, 0x0727, 0xFFFF }, {
+      0x0823, 0x032F, 0xFFFF }, { 0x0516, 0x071E, 0x0F0E, 0x0911, 0x0B19, 0x1009, 0x1F0F, 0xFFFF }, { 0x0422, 0x0604, 0x090A,
+      0x0B03, 0x0A15, 0x0E07, 0x062B, 0xFFFF }, { 0x0831, 0x0A01, 0x0733, 0x052D, 0x0D1D, 0xFFFF }, { 0x032A, 0x0825, 0x0437,
+      0x0729, 0x0C1B, 0x160D, 0xFFFF }, { 0x0430, 0x0514, 0x0236, 0x0808, 0x0921, 0x0239, 0x130B, 0x0F1F, 0xFFFF }, { 0x0500,
+      0x0424, 0x0706, 0x0138, 0x0A13, 0x0B17, 0xFFFF }, { 0x061A, 0x0635, 0x0D05, 0xFFFF }, { 0x0923, 0x0827, 0xFFFF }, {
+      0x0512, 0x0334, 0x003A, 0x0A11, 0x1109, 0x003B, 0x042F, 0xFFFF }, { 0x0426, 0x0B0C, 0x022E, 0x0B15, 0x0F07, 0x0C19,
+      0x072B, 0xFFFF }, { 0x0931, 0x0B01, 0x0C03, 0x062D, 0x0E1D, 0xFFFF }, { 0x0510, 0x0833, 0x0925, 0x0537, 0x0D1B, 0x170D,
+      0xFFFF }, { 0x0618, 0x032C, 0x0A21, 0x0339, 0x0829, 0xFFFF }, { 0x0432, 0x0602, 0x0B13, 0x140B, 0x101F, 0xFFFF }, {
+      0x0428, 0x071C, 0x0735, 0x0E05, 0x0C17, 0xFFFF }, { 0x0520, 0x0A23, 0x0927, 0xFFFF }, { 0x0B11, 0x1209, 0x013B, 0x052F,
+      0xFFFF }, { 0x0616, 0x081E, 0x0D19, 0xFFFF }, { 0x0522, 0x0704, 0x0A0A, 0x0A31, 0x0D03, 0x0C15, 0x1007, 0x082B, 0x072D,
+      0x0F1D, 0xFFFF }, { 0x0C01, 0x0933, 0x0A25, 0x0637, 0x0E1B, 0xFFFF }, { 0x042A, 0x0B21, 0x0929, 0x180D, 0xFFFF }, {
+	      0x0530, 0x0614, 0x0336, 0x0908, 0x0439, 0x150B, 0x111F, 0xFFFF }, { 0x0600, 0x0524, 0x0806, 0x0238, 0x0C13, 0x0F05,
+      0x0D17, 0xFFFF }, { 0x071A, 0x0B23, 0x0835, 0x0A27, 0xFFFF }, { 0x1309, 0x023B, 0x062F, 0xFFFF }, { 0x0612, 0x0434,
+      0x013A, 0x0C11, 0x0E19, 0xFFFF }, { 0x0526, 0x0C0C, 0x032E, 0x0B31, 0x0E03, 0x0D15, 0x1107, 0x092B, 0xFFFF }, { 0x0D01,
+      0x0A33, 0x0B25, 0x0737, 0x0F1B, 0x082D, 0x101D, 0xFFFF }, { 0x0610, 0x0A29, 0x190D, 0xFFFF }, { 0x0718, 0x042C, 0x0C21,
+      0x0539, 0x160B, 0x121F, 0xFFFF }, { 0x0532, 0x0702, 0x0D13, 0x0E17, 0xFFFF }, { 0x0528, 0x081C, 0x0935, 0x1005, 0x0B27,
+      0xFFFF }, { 0x0620, 0x0C23, 0x033B, 0x072F, 0xFFFF }, { 0x0D11, 0x0F19, 0x1409, 0xFFFF }, { 0x0716, 0x003C, 0x091E,
+      0x0F03, 0x0E15, 0x1207, 0x0A2B, 0x003D, 0xFFFF }, { 0x0622, 0x0804, 0x0B0A, 0x0C31, 0x0E01, 0x0B33, 0x092D, 0x111D,
+      0xFFFF }, { 0x0C25, 0x0837, 0x0B29, 0x101B, 0x1A0D, 0xFFFF }, { 0x052A, 0x0D21, 0x0639, 0x170B, 0x131F, 0xFFFF }, {
+      0x0630, 0x0714, 0x0436, 0x0A08, 0x0E13, 0x0F17, 0xFFFF }, { 0x0700, 0x0624, 0x0906, 0x0338, 0x0A35, 0x1105, 0xFFFF }, {
+      0x081A, 0x0D23, 0x0C27, 0xFFFF }, { 0x0E11, 0x1509, 0x043B, 0x082F, 0xFFFF }, { 0x0712, 0x0534, 0x023A, 0x0F15, 0x1307,
+      0x1019, 0x0B2B, 0x013D, 0xFFFF }, { 0x0626, 0x0D0C, 0x042E, 0x0D31, 0x0F01, 0x1003, 0x0A2D, 0x121D, 0xFFFF }, { 0x0C33,
+      0x0D25, 0x0937, 0x111B, 0x1B0D, 0xFFFF }, { 0x0710, 0x0E21, 0x0739, 0x0C29, 0xFFFF }, { 0x0818, 0x052C, 0x0F13, 0x180B,
+      0x141F, 0xFFFF }, { 0x0632, 0x0802, 0x0B35, 0x1205, 0x1017, 0xFFFF }, { 0x0628, 0x091C, 0x0E23, 0x0D27, 0xFFFF }, {
+      0x0720, 0x0F11, 0x1609, 0x053B, 0x092F, 0xFFFF }, { 0x1119, 0x023D, 0xFFFF }, { 0x0816, 0x013C, 0x0A1E, 0x0E31, 0x1103,
+      0x1015, 0x1407, 0x0C2B, 0x0B2D, 0x131D, 0xFFFF }, { 0x0722, 0x0904, 0x0C0A, 0x1001, 0x0D33, 0x0E25, 0x0A37, 0x121B,
+      0xFFFF }, { 0x0F21, 0x0D29, 0x1C0D, 0xFFFF }, { 0x062A, 0x0839, 0x190B, 0x151F, 0xFFFF }, { 0x0730, 0x0814, 0x0536,
+      0x0B08, 0x1013, 0x1305, 0x1117, 0xFFFF }, { 0x0800, 0x0724, 0x0A06, 0x0438, 0x0F23, 0x0C35, 0x0E27, 0xFFFF }, { 0x091A,
+      0x1709, 0x063B, 0x0A2F, 0xFFFF }, { 0x1011, 0x1219, 0x033D, 0xFFFF }, { 0x0812, 0x0634, 0x033A, 0x0F31, 0x1203, 0x1115,
+      0x1507, 0x0D2B, 0xFFFF }, { 0x0726, 0x0E0C, 0x052E, 0x1101, 0x0E33, 0x0F25, 0x0B37, 0x131B, 0x0C2D, 0x141D, 0xFFFF }, {
+      0x0E29, 0x1D0D, 0xFFFF }, { 0x0810, 0x1021, 0x0939, 0x1A0B, 0x161F, 0xFFFF }, { 0x0918, 0x062C, 0x1113, 0x1217, 0xFFFF
+      }, { 0x0732, 0x0902, 0x0D35, 0x1405, 0x0F27, 0xFFFF }, { 0x0728, 0x0A1C, 0x1023, 0x073B, 0x0B2F, 0xFFFF }, { 0x0820,
+      0x1111, 0x1319, 0x1809, 0xFFFF }, { 0x1303, 0x1215, 0x1607, 0x0E2B, 0x043D, 0xFFFF }, { 0x0916, 0x023C, 0x0B1E, 0x1031,
+      0x1201, 0x0F33, 0x0D2D, 0x151D, 0xFFFF }, { 0x0822, 0x0A04, 0x0D0A, 0x1025, 0x0C37, 0x0F29, 0x141B, 0x1E0D, 0xFFFF }, {
+      0x1121, 0x0A39, 0x1B0B, 0x171F, 0xFFFF }, { 0x072A, 0x1213, 0x1317, 0xFFFF }, { 0x0830, 0x0914, 0x0636, 0x0C08, 0x0E35,
+      0x1505, 0xFFFF }, { 0x0900, 0x0824, 0x0B06, 0x0538, 0x1123, 0x1027, 0xFFFF }, { 0x0A1A, 0x1211, 0x1909, 0x083B, 0x0C2F,
+      0xFFFF }, { 0x1315, 0x1707, 0x1419, 0x0F2B, 0x053D, 0xFFFF }, { 0x0912, 0x0734, 0x043A, 0x1131, 0x1301, 0x1403, 0x0E2D,
+      0x161D, 0xFFFF }, { 0x0826, 0x0F0C, 0x062E, 0x1033, 0x1125, 0x0D37, 0x151B, 0x1F0D, 0xFFFF }, { 0x1221, 0x0B39, 0x1029,
+      0xFFFF }, { 0x0910, 0x1313, 0x1C0B, 0x181F, 0xFFFF }, { 0x0A18, 0x072C, 0x0F35, 0x1605, 0x1417, 0xFFFF }, { 0x0832,
+      0x0A02, 0x1223, 0x1127, 0xFFFF }, { 0x0828, 0x0B1C, 0x1311, 0x1A09, 0x093B, 0x0D2F, 0xFFFF }, { 0x0920, 0x1519, 0x063D,
+      0xFFFF }, { 0x1231, 0x1503, 0x1415, 0x1807, 0x102B, 0x0F2D, 0x171D, 0xFFFF }, { 0x0A16, 0x033C, 0x0C1E, 0x1401, 0x1133,
+      0x1225, 0x0E37, 0x161B, 0xFFFF }, { 0x0922, 0x0B04, 0x0E0A, 0x1321, 0x1129, 0xFFFF }, { 0x0C39, 0x1D0B, 0x191F, 0xFFFF
+      }, { 0x082A, 0x1413, 0x1705, 0x1517, 0xFFFF }, { 0x0930, 0x0A14, 0x0736, 0x0D08, 0x1323, 0x1035, 0x1227, 0xFFFF }, {
+      0x0A00, 0x0924, 0x0C06, 0x0638, 0x1B09, 0x0A3B, 0x0E2F, 0xFFFF }, { 0x0B1A, 0x1411, 0x1619, 0x073D, 0xFFFF }, { 0x1331,
+      0x1603, 0x1515, 0x1907, 0x112B, 0xFFFF }, { 0x0A12, 0x0834, 0x053A, 0x1501, 0x1233, 0x1325, 0x0F37, 0x171B, 0x102D,
+      0x181D, 0xFFFF }, { 0x0926, 0x072E, 0x1229, 0xFFFF }, { 0x1421, 0x0D39, 0x1E0B, 0x1A1F, 0xFFFF }, { 0x0A10, 0x1513,
+      0x1617, 0xFFFF }, { 0x0B18, 0x082C, 0x1135, 0x1805, 0x1327, 0xFFFF }, { 0x0932, 0x0B02, 0x1423, 0x0B3B, 0x0F2F, 0xFFFF
+      }, { 0x0928, 0x0C1C, 0x1511, 0x1719, 0x1C09, 0xFFFF }, { 0x0A20, 0x1703, 0x1615, 0x1A07, 0x122B, 0x083D, 0xFFFF }, {
+      0x1431, 0x1601, 0x1333, 0x112D, 0x191D, 0xFFFF }, { 0x0B16, 0x043C, 0x0D1E, 0x1425, 0x1037, 0x1329, 0x181B, 0xFFFF }, {
+      0x0A22, 0x0C04, 0x0F0A, 0x1521, 0x0E39, 0x1F0B, 0x1B1F, 0xFFFF }, { 0x1613, 0x1717, 0xFFFF }, { 0x092A, 0x1235, 0x1905,
+      0xFFFF }, { 0x0A30, 0x0B14, 0x0836, 0x0E08, 0x1523, 0x1427, 0xFFFF }, { 0x0B00, 0x0A24, 0x0D06, 0x0738, 0x1611, 0x1D09,
+      0x0C3B, 0x102F, 0xFFFF }, { 0x0C1A, 0x1715, 0x1B07, 0x1819, 0x132B, 0x093D, 0xFFFF }, { 0x1531, 0x1701, 0x1803, 0x122D,
+      0x1A1D, 0xFFFF }, { 0x0B12, 0x0934, 0x063A, 0x1433, 0x1525, 0x1137, 0x191B, 0xFFFF }, { 0x0A26, 0x003E, 0x082E, 0x1621,
+      0x0F39, 0x1429, 0x003F, 0xFFFF }, { 0x1713, 0x1C1F, 0xFFFF }, { 0x0B10, 0x1335, 0x1A05, 0x1817, 0xFFFF }, { 0x0C18,
+      0x092C, 0x1623, 0x1527, 0xFFFF }, { 0x0A32, 0x0C02, 0x1711, 0x1E09, 0x0D3B, 0x112F, 0xFFFF }, { 0x0A28, 0x0D1C, 0x1919,
+      0x0A3D, 0xFFFF }, { 0x0B20, 0x1631, 0x1903, 0x1815, 0x1C07, 0x142B, 0x132D, 0x1B1D, 0xFFFF }, { 0x1801, 0x1533, 0x1625,
+      0x1237, 0x1A1B, 0xFFFF }, { 0x0C16, 0x053C, 0x0E1E, 0x1721, 0x1529, 0x013F, 0xFFFF }, { 0x0B22, 0x0D04, 0x1039, 0x1D1F,
+      0xFFFF }, { 0x1813, 0x1B05, 0x1917, 0xFFFF }, { 0x0A2A, 0x1723, 0x1435, 0x1627, 0xFFFF }, { 0x0B30, 0x0C14, 0x0936,
+      0x0F08, 0x1F09, 0x0E3B, 0x122F, 0xFFFF }, { 0x0C00, 0x0B24, 0x0E06, 0x0838, 0x1811, 0x1A19, 0x0B3D, 0xFFFF }, { 0x0D1A,
+      0x1731, 0x1A03, 0x1915, 0x1D07, 0x152B, 0xFFFF }, { 0x1901, 0x1633, 0x1725, 0x1337, 0x1B1B, 0x142D, 0x1C1D, 0xFFFF }, {
+      0x0C12, 0x0A34, 0x073A, 0x1629, 0x023F, 0xFFFF }, { 0x0B26, 0x013E, 0x092E, 0x1821, 0x1139, 0x1E1F, 0xFFFF }, { 0x1913,
+      0x1A17, 0xFFFF }, { 0x0C10, 0x1535, 0x1C05, 0x1727, 0xFFFF }, { 0x0D18, 0x0A2C, 0x1823, 0x0F3B, 0x132F, 0xFFFF }, {
+      0x0B32, 0x0D02, 0x1911, 0x1B19, 0xFFFF }, { 0x0B28, 0x0E1C, 0x1B03, 0x1A15, 0x1E07, 0x162B, 0x0C3D, 0xFFFF }, { 0x0C20,
+      0x1831, 0x1A01, 0x1733, 0x152D, 0x1D1D, 0xFFFF }, { 0x1825, 0x1437, 0x1729, 0x1C1B, 0x033F, 0xFFFF }, { 0x0D16, 0x063C,
+      0x0F1E, 0x1921, 0x1239, 0x1F1F, 0xFFFF }, { 0x0C22, 0x0E04, 0x1A13, 0x1B17, 0xFFFF }, { 0x1635, 0x1D05, 0xFFFF }, {
+      0x0B2A, 0x1923, 0x1827, 0xFFFF }, { 0x0C30, 0x0D14, 0x0A36, 0x1A11, 0x103B, 0x142F, 0xFFFF }, { 0x0D00, 0x0C24, 0x0F06,
+      0x0938, 0x1B15, 0x1F07, 0x1C19, 0x172B, 0x0D3D, 0xFFFF }, { 0x0E1A, 0x1931, 0x1B01, 0x1C03, 0x162D, 0x1E1D, 0xFFFF }, {
+      0x1833, 0x1925, 0x1537, 0x1D1B, 0xFFFF }, { 0x0D12, 0x0B34, 0x083A, 0x1A21, 0x1339, 0x1829, 0x043F, 0xFFFF }, { 0x0C26,
+      0x023E, 0x0A2E, 0x1B13, 0xFFFF }, { 0x1735, 0x1E05, 0x1C17, 0xFFFF }, { 0x0D10, 0x1A23, 0x1927, 0xFFFF }, { 0x0E18,
+      0x0B2C, 0x1B11, 0x113B, 0x152F, 0xFFFF }, { 0x0C32, 0x0E02, 0x1D19, 0x0E3D, 0xFFFF }, { 0x0C28, 0x0F1C, 0x1A31, 0x1D03,
+      0x1C15, 0x182B, 0x172D, 0x1F1D, 0xFFFF }, { 0x0D20, 0x1C01, 0x1933, 0x1A25, 0x1637, 0x1E1B, 0xFFFF }, { 0x1B21, 0x1929,
+      0x053F, 0xFFFF }, { 0x0E16, 0x073C, 0x1439, 0xFFFF }, { 0x0D22, 0x0F04, 0x1C13, 0x1F05, 0x1D17, 0xFFFF }, { 0x1B23,
+      0x1835, 0x1A27, 0xFFFF }, { 0x0C2A, 0x123B, 0x162F, 0xFFFF }, { 0x0D30, 0x0E14, 0x0B36, 0x1C11, 0x1E19, 0x0F3D, 0xFFFF
+      }, { 0x0E00, 0x0D24, 0x0A38, 0x1B31, 0x1E03, 0x1D15, 0x192B, 0xFFFF }, { 0x0F1A, 0x1D01, 0x1A33, 0x1B25, 0x1737, 0x1F1B,
+      0x182D, 0xFFFF }, { 0x1A29, 0x063F, 0xFFFF }, { 0x0E12, 0x0C34, 0x093A, 0x1C21, 0x1539, 0xFFFF }, { 0x0D26, 0x033E,
+      0x0B2E, 0x1D13, 0x1E17, 0xFFFF }, { 0x1935, 0x1B27, 0xFFFF }, { 0x0E10, 0x1C23, 0x133B, 0x172F, 0xFFFF }, { 0x0F18,
+      0x0C2C, 0x1D11, 0x1F19, 0xFFFF }, { 0x0D32, 0x0F02, 0x1F03, 0x1E15, 0x1A2B, 0x103D, 0xFFFF }, { 0x0D28, 0x1C31, 0x1E01,
+      0x1B33, 0x192D, 0xFFFF }, { 0x0E20, 0x1C25, 0x1837, 0x1B29, 0x073F, 0xFFFF }, { 0x1D21, 0x1639, 0xFFFF }, { 0x0F16,
+      0x083C, 0x1E13, 0x1F17, 0xFFFF }, { 0x0E22, 0x1A35, 0xFFFF }, { 0x1D23, 0x1C27, 0xFFFF }, { 0x0D2A, 0x1E11, 0x143B,
+      0x182F, 0xFFFF }, { 0x0E30, 0x0F14, 0x0C36, 0x1F15, 0x1B2B, 0x113D, 0xFFFF }, { 0x0F00, 0x0E24, 0x0B38, 0x1D31, 0x1F01,
+      0x1A2D, 0xFFFF }, { 0x1C33, 0x1D25, 0x1937, 0xFFFF }, { 0x1E21, 0x1739, 0x1C29, 0x083F, 0xFFFF }, { 0x0F12, 0x0D34,
+      0x0A3A, 0x1F13, 0xFFFF }, { 0x0E26, 0x043E, 0x0C2E, 0x1B35, 0xFFFF }, { 0x1E23, 0x1D27, 0xFFFF }, { 0x0F10, 0x1F11,
+      0x153B, 0x192F, 0xFFFF }, { 0x0D2C, 0x123D, 0xFFFF },
+   };
+
+   struct etc1_block
+   {
+      // big endian uint64:
+      // bit ofs:  56  48  40  32  24  16   8   0
+      // byte ofs: b0, b1, b2, b3, b4, b5, b6, b7 
+      union 
+      {
+         uint64 m_uint64;
+         uint8 m_bytes[8];
+      };
+
+      uint8 m_low_color[2];
+      uint8 m_high_color[2];
+
+      enum { cNumSelectorBytes = 4 };
+      uint8 m_selectors[cNumSelectorBytes];
+
+      inline void clear()
+      {
+         zero_this(this);
+      }
+
+      inline uint get_byte_bits(uint ofs, uint num) const
+      {
+         RG_ETC1_ASSERT((ofs + num) <= 64U);
+         RG_ETC1_ASSERT(num && (num <= 8U));
+         RG_ETC1_ASSERT((ofs >> 3) == ((ofs + num - 1) >> 3));
+         const uint byte_ofs = 7 - (ofs >> 3);
+         const uint byte_bit_ofs = ofs & 7;
+         return (m_bytes[byte_ofs] >> byte_bit_ofs) & ((1 << num) - 1);
+      }
+
+      inline void set_byte_bits(uint ofs, uint num, uint bits)
+      {
+         RG_ETC1_ASSERT((ofs + num) <= 64U);
+         RG_ETC1_ASSERT(num && (num < 32U));
+         RG_ETC1_ASSERT((ofs >> 3) == ((ofs + num - 1) >> 3));
+         RG_ETC1_ASSERT(bits < (1U << num));
+         const uint byte_ofs = 7 - (ofs >> 3);
+         const uint byte_bit_ofs = ofs & 7;
+         const uint mask = (1 << num) - 1;
+         m_bytes[byte_ofs] &= ~(mask << byte_bit_ofs);
+         m_bytes[byte_ofs] |= (bits << byte_bit_ofs);
+      }
+
+      // false = left/right subblocks
+      // true = upper/lower subblocks
+      inline bool get_flip_bit() const 
+      {
+         return (m_bytes[3] & 1) != 0;
+      }   
+
+      inline void set_flip_bit(bool flip)
+      {
+         m_bytes[3] &= ~1;
+         m_bytes[3] |= static_cast<uint8>(flip);
+      }
+
+      inline bool get_diff_bit() const
+      {
+         return (m_bytes[3] & 2) != 0;
+      }
+
+      inline void set_diff_bit(bool diff)
+      {
+         m_bytes[3] &= ~2;
+         m_bytes[3] |= (static_cast<uint>(diff) << 1);
+      }
+
+      // Returns intensity modifier table (0-7) used by subblock subblock_id.
+      // subblock_id=0 left/top (CW 1), 1=right/bottom (CW 2)
+      inline uint get_inten_table(uint subblock_id) const
+      {
+         RG_ETC1_ASSERT(subblock_id < 2);
+         const uint ofs = subblock_id ? 2 : 5;
+         return (m_bytes[3] >> ofs) & 7;
+      }
+
+      // Sets intensity modifier table (0-7) used by subblock subblock_id (0 or 1)
+      inline void set_inten_table(uint subblock_id, uint t)
+      {
+         RG_ETC1_ASSERT(subblock_id < 2);
+         RG_ETC1_ASSERT(t < 8);
+         const uint ofs = subblock_id ? 2 : 5;
+         m_bytes[3] &= ~(7 << ofs);
+         m_bytes[3] |= (t << ofs);
+      }
+
+      // Returned selector value ranges from 0-3 and is a direct index into g_etc1_inten_tables.
+      inline uint get_selector(uint x, uint y) const
+      {
+         RG_ETC1_ASSERT((x | y) < 4);
+
+         const uint bit_index = x * 4 + y;
+         const uint byte_bit_ofs = bit_index & 7;
+         const uint8 *p = &m_bytes[7 - (bit_index >> 3)];
+         const uint lsb = (p[0] >> byte_bit_ofs) & 1;
+         const uint msb = (p[-2] >> byte_bit_ofs) & 1;
+         const uint val = lsb | (msb << 1);
+
+         return g_etc1_to_selector_index[val];
+      }
+
+      // Selector "val" ranges from 0-3 and is a direct index into g_etc1_inten_tables.
+      inline void set_selector(uint x, uint y, uint val)
+      {
+         RG_ETC1_ASSERT((x | y | val) < 4);
+         const uint bit_index = x * 4 + y;
+
+         uint8 *p = &m_bytes[7 - (bit_index >> 3)];
+
+         const uint byte_bit_ofs = bit_index & 7;
+         const uint mask = 1 << byte_bit_ofs;
+
+         const uint etc1_val = g_selector_index_to_etc1[val];
+
+         const uint lsb = etc1_val & 1;
+         const uint msb = etc1_val >> 1;
+
+         p[0] &= ~mask;
+         p[0] |= (lsb << byte_bit_ofs);
+
+         p[-2] &= ~mask;
+         p[-2] |= (msb << byte_bit_ofs);
+      }
+
+      inline void set_base4_color(uint idx, uint16 c)
+      {
+         if (idx)
+         {
+            set_byte_bits(cETC1AbsColor4R2BitOffset, 4, (c >> 8) & 15);
+            set_byte_bits(cETC1AbsColor4G2BitOffset, 4, (c >> 4) & 15);
+            set_byte_bits(cETC1AbsColor4B2BitOffset, 4, c & 15);
+         }
+         else
+         {
+            set_byte_bits(cETC1AbsColor4R1BitOffset, 4, (c >> 8) & 15);
+            set_byte_bits(cETC1AbsColor4G1BitOffset, 4, (c >> 4) & 15);
+            set_byte_bits(cETC1AbsColor4B1BitOffset, 4, c & 15);
+         }
+      }
+
+      inline uint16 get_base4_color(uint idx) const
+      {
+         uint r, g, b;
+         if (idx)
+         {
+            r = get_byte_bits(cETC1AbsColor4R2BitOffset, 4);
+            g = get_byte_bits(cETC1AbsColor4G2BitOffset, 4);
+            b = get_byte_bits(cETC1AbsColor4B2BitOffset, 4);
+         }
+         else
+         {
+            r = get_byte_bits(cETC1AbsColor4R1BitOffset, 4);
+            g = get_byte_bits(cETC1AbsColor4G1BitOffset, 4);
+            b = get_byte_bits(cETC1AbsColor4B1BitOffset, 4);
+         }
+         return static_cast<uint16>(b | (g << 4U) | (r << 8U));
+      }
+
+      inline void set_base5_color(uint16 c)
+      {
+         set_byte_bits(cETC1BaseColor5RBitOffset, 5, (c >> 10) & 31);
+         set_byte_bits(cETC1BaseColor5GBitOffset, 5, (c >> 5) & 31);
+         set_byte_bits(cETC1BaseColor5BBitOffset, 5, c & 31);
+      }
+
+      inline uint16 get_base5_color() const
+      {
+         const uint r = get_byte_bits(cETC1BaseColor5RBitOffset, 5);
+         const uint g = get_byte_bits(cETC1BaseColor5GBitOffset, 5);
+         const uint b = get_byte_bits(cETC1BaseColor5BBitOffset, 5);
+         return static_cast<uint16>(b | (g << 5U) | (r << 10U));
+      }
+
+      void set_delta3_color(uint16 c)
+      {
+         set_byte_bits(cETC1DeltaColor3RBitOffset, 3, (c >> 6) & 7);
+         set_byte_bits(cETC1DeltaColor3GBitOffset, 3, (c >> 3) & 7);
+         set_byte_bits(cETC1DeltaColor3BBitOffset, 3, c & 7);
+      }
+
+      inline uint16 get_delta3_color() const
+      {
+         const uint r = get_byte_bits(cETC1DeltaColor3RBitOffset, 3);
+         const uint g = get_byte_bits(cETC1DeltaColor3GBitOffset, 3);
+         const uint b = get_byte_bits(cETC1DeltaColor3BBitOffset, 3);
+         return static_cast<uint16>(b | (g << 3U) | (r << 6U));
+      }
+
+      // Base color 5
+      static uint16 pack_color5(const color_quad_u8& color, bool scaled, uint bias = 127U);
+      static uint16 pack_color5(uint r, uint g, uint b, bool scaled, uint bias = 127U);
+
+      static color_quad_u8 unpack_color5(uint16 packed_color5, bool scaled, uint alpha = 255U);
+      static void unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color, bool scaled);
+
+      static bool unpack_color5(color_quad_u8& result, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha = 255U);
+      static bool unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha = 255U);
+
+      // Delta color 3
+      // Inputs range from -4 to 3 (cETC1ColorDeltaMin to cETC1ColorDeltaMax)
+      static uint16 pack_delta3(int r, int g, int b);
+
+      // Results range from -4 to 3 (cETC1ColorDeltaMin to cETC1ColorDeltaMax)
+      static void unpack_delta3(int& r, int& g, int& b, uint16 packed_delta3);
+
+      // Abs color 4
+      static uint16 pack_color4(const color_quad_u8& color, bool scaled, uint bias = 127U);
+      static uint16 pack_color4(uint r, uint g, uint b, bool scaled, uint bias = 127U);
+
+      static color_quad_u8 unpack_color4(uint16 packed_color4, bool scaled, uint alpha = 255U);
+      static void unpack_color4(uint& r, uint& g, uint& b, uint16 packed_color4, bool scaled);
+
+      // subblock colors
+      static void get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint table_idx);
+      static bool get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint16 packed_delta3, uint table_idx);
+      static void get_abs_subblock_colors(color_quad_u8* pDst, uint16 packed_color4, uint table_idx);
+
+      static inline void unscaled_to_scaled_color(color_quad_u8& dst, const color_quad_u8& src, bool color4)
+      {
+         if (color4)
+         {
+            dst.r = src.r | (src.r << 4);
+            dst.g = src.g | (src.g << 4);
+            dst.b = src.b | (src.b << 4);
+         }
+         else
+         {
+            dst.r = (src.r >> 2) | (src.r << 3);
+            dst.g = (src.g >> 2) | (src.g << 3);
+            dst.b = (src.b >> 2) | (src.b << 3);
+         }
+         dst.a = src.a;
+      }
+   };
+
+   // Returns pointer to sorted array.
+   template<typename T, typename Q>
+   T* indirect_radix_sort(uint num_indices, T* pIndices0, T* pIndices1, const Q* pKeys, uint key_ofs, uint key_size, bool init_indices)
+   {  
+      RG_ETC1_ASSERT((key_ofs >= 0) && (key_ofs < sizeof(T)));
+      RG_ETC1_ASSERT((key_size >= 1) && (key_size <= 4));
+
+      if (init_indices)
+      {
+         T* p = pIndices0;
+         T* q = pIndices0 + (num_indices >> 1) * 2;
+         uint i;
+         for (i = 0; p != q; p += 2, i += 2)
+         {
+            p[0] = static_cast<T>(i);
+            p[1] = static_cast<T>(i + 1); 
+         }
+
+         if (num_indices & 1)
+            *p = static_cast<T>(i);
+      }
+
+      uint hist[256 * 4];
+
+      memset(hist, 0, sizeof(hist[0]) * 256 * key_size);
+
+#define RG_ETC1_GET_KEY(p) (*(const uint*)((const uint8*)(pKeys + *(p)) + key_ofs))
+#define RG_ETC1_GET_KEY_FROM_INDEX(i) (*(const uint*)((const uint8*)(pKeys + (i)) + key_ofs))
+
+      if (key_size == 4)
+      {
+         T* p = pIndices0;
+         T* q = pIndices0 + num_indices;
+         for ( ; p != q; p++)
+         {
+            const uint key = RG_ETC1_GET_KEY(p);
+
+            hist[        key        & 0xFF]++;
+            hist[256 + ((key >>  8) & 0xFF)]++;
+            hist[512 + ((key >> 16) & 0xFF)]++;
+            hist[768 + ((key >> 24) & 0xFF)]++;
+         }
+      }
+      else if (key_size == 3)
+      {
+         T* p = pIndices0;
+         T* q = pIndices0 + num_indices;
+         for ( ; p != q; p++)
+         {
+            const uint key = RG_ETC1_GET_KEY(p);
+
+            hist[        key        & 0xFF]++;
+            hist[256 + ((key >>  8) & 0xFF)]++;
+            hist[512 + ((key >> 16) & 0xFF)]++;
+         }
+      }   
+      else if (key_size == 2)
+      {
+         T* p = pIndices0;
+         T* q = pIndices0 + (num_indices >> 1) * 2;
+
+         for ( ; p != q; p += 2)
+         {
+            const uint key0 = RG_ETC1_GET_KEY(p);
+            const uint key1 = RG_ETC1_GET_KEY(p+1);
+
+            hist[        key0         & 0xFF]++;
+            hist[256 + ((key0 >>  8) & 0xFF)]++;
+
+            hist[        key1        & 0xFF]++;
+            hist[256 + ((key1 >>  8) & 0xFF)]++;
+         }
+
+         if (num_indices & 1)
+         {
+            const uint key = RG_ETC1_GET_KEY(p);
+
+            hist[        key        & 0xFF]++;
+            hist[256 + ((key >>  8) & 0xFF)]++;
+         }
+      }      
+      else
+      {
+         RG_ETC1_ASSERT(key_size == 1);
+         if (key_size != 1)
+            return NULL;
+
+         T* p = pIndices0;
+         T* q = pIndices0 + (num_indices >> 1) * 2;
+
+         for ( ; p != q; p += 2)
+         {
+            const uint key0 = RG_ETC1_GET_KEY(p);
+            const uint key1 = RG_ETC1_GET_KEY(p+1);
+
+            hist[key0 & 0xFF]++;
+            hist[key1 & 0xFF]++;
+         }
+
+         if (num_indices & 1)
+         {
+            const uint key = RG_ETC1_GET_KEY(p);
+
+            hist[key & 0xFF]++;
+         }
+      }      
+
+      T* pCur = pIndices0;
+      T* pNew = pIndices1;
+
+      for (uint pass = 0; pass < key_size; pass++)
+      {
+         const uint* pHist = &hist[pass << 8];
+
+         uint offsets[256];
+
+         uint cur_ofs = 0;
+         for (uint i = 0; i < 256; i += 2)
+         {
+            offsets[i] = cur_ofs;
+            cur_ofs += pHist[i];
+
+            offsets[i+1] = cur_ofs;
+            cur_ofs += pHist[i+1];
+         }
+
+         const uint pass_shift = pass << 3;
+
+         T* p = pCur;
+         T* q = pCur + (num_indices >> 1) * 2;
+
+         for ( ; p != q; p += 2)
+         {
+            uint index0 = p[0];
+            uint index1 = p[1];
+
+            uint c0 = (RG_ETC1_GET_KEY_FROM_INDEX(index0) >> pass_shift) & 0xFF;
+            uint c1 = (RG_ETC1_GET_KEY_FROM_INDEX(index1) >> pass_shift) & 0xFF;
+
+            if (c0 == c1)
+            {
+               uint dst_offset0 = offsets[c0];
+
+               offsets[c0] = dst_offset0 + 2;
+
+               pNew[dst_offset0] = static_cast<T>(index0);
+               pNew[dst_offset0 + 1] = static_cast<T>(index1);
+            }
+            else
+            {
+               uint dst_offset0 = offsets[c0]++;
+               uint dst_offset1 = offsets[c1]++;
+
+               pNew[dst_offset0] = static_cast<T>(index0);
+               pNew[dst_offset1] = static_cast<T>(index1);
+            }
+         }
+
+         if (num_indices & 1)
+         {
+            uint index = *p;
+            uint c = (RG_ETC1_GET_KEY_FROM_INDEX(index) >> pass_shift) & 0xFF;
+
+            uint dst_offset = offsets[c];
+            offsets[c] = dst_offset + 1;
+
+            pNew[dst_offset] = static_cast<T>(index);
+         }
+
+         T* t = pCur;
+         pCur = pNew;
+         pNew = t;
+      }            
+
+      return pCur;
+   }
+
+#undef RG_ETC1_GET_KEY
+#undef RG_ETC1_GET_KEY_FROM_INDEX
+
+   uint16 etc1_block::pack_color5(const color_quad_u8& color, bool scaled, uint bias)
+   {
+      return pack_color5(color.r, color.g, color.b, scaled, bias);
+   }
+   
+   uint16 etc1_block::pack_color5(uint r, uint g, uint b, bool scaled, uint bias)
+   {
+      if (scaled)
+      {
+         r = (r * 31U + bias) / 255U;
+         g = (g * 31U + bias) / 255U;
+         b = (b * 31U + bias) / 255U;
+      }
+
+      r = rg_etc1::minimum(r, 31U);
+      g = rg_etc1::minimum(g, 31U);
+      b = rg_etc1::minimum(b, 31U);
+
+      return static_cast<uint16>(b | (g << 5U) | (r << 10U));
+   }
+
+   color_quad_u8 etc1_block::unpack_color5(uint16 packed_color5, bool scaled, uint alpha)
+   {
+      uint b = packed_color5 & 31U;
+      uint g = (packed_color5 >> 5U) & 31U;
+      uint r = (packed_color5 >> 10U) & 31U;
+
+      if (scaled)
+      {
+         b = (b << 3U) | (b >> 2U);
+         g = (g << 3U) | (g >> 2U);
+         r = (r << 3U) | (r >> 2U);
+      }
+
+      return color_quad_u8(cNoClamp, r, g, b, rg_etc1::minimum(alpha, 255U));
+   }
+
+   void etc1_block::unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, bool scaled)
+   {
+      color_quad_u8 c(unpack_color5(packed_color5, scaled, 0));
+      r = c.r;
+      g = c.g;
+      b = c.b;
+   }
+
+   bool etc1_block::unpack_color5(color_quad_u8& result, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha)
+   {
+      int dc_r, dc_g, dc_b;
+      unpack_delta3(dc_r, dc_g, dc_b, packed_delta3);
+      
+      int b = (packed_color5 & 31U) + dc_b;
+      int g = ((packed_color5 >> 5U) & 31U) + dc_g;
+      int r = ((packed_color5 >> 10U) & 31U) + dc_r;
+
+      bool success = true;
+      if (static_cast<uint>(r | g | b) > 31U)
+      {
+         success = false;
+         r = rg_etc1::clamp<int>(r, 0, 31);
+         g = rg_etc1::clamp<int>(g, 0, 31);
+         b = rg_etc1::clamp<int>(b, 0, 31);
+      }
+
+      if (scaled)
+      {
+         b = (b << 3U) | (b >> 2U);
+         g = (g << 3U) | (g >> 2U);
+         r = (r << 3U) | (r >> 2U);
+      }
+
+      result.set_noclamp_rgba(r, g, b, rg_etc1::minimum(alpha, 255U));
+      return success;
+   }
+
+   bool etc1_block::unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha)
+   {
+      color_quad_u8 result;
+      const bool success = unpack_color5(result, packed_color5, packed_delta3, scaled, alpha);
+      r = result.r;
+      g = result.g;
+      b = result.b;
+      return success;
+   }
+     
+   uint16 etc1_block::pack_delta3(int r, int g, int b)
+   {
+      RG_ETC1_ASSERT((r >= cETC1ColorDeltaMin) && (r <= cETC1ColorDeltaMax));
+      RG_ETC1_ASSERT((g >= cETC1ColorDeltaMin) && (g <= cETC1ColorDeltaMax));
+      RG_ETC1_ASSERT((b >= cETC1ColorDeltaMin) && (b <= cETC1ColorDeltaMax));
+      if (r < 0) r += 8;
+      if (g < 0) g += 8;
+      if (b < 0) b += 8;
+      return static_cast<uint16>(b | (g << 3) | (r << 6));
+   }
+   
+   void etc1_block::unpack_delta3(int& r, int& g, int& b, uint16 packed_delta3)
+   {
+      r = (packed_delta3 >> 6) & 7;
+      g = (packed_delta3 >> 3) & 7;
+      b = packed_delta3 & 7;
+      if (r >= 4) r -= 8;
+      if (g >= 4) g -= 8;
+      if (b >= 4) b -= 8;
+   }
+
+   uint16 etc1_block::pack_color4(const color_quad_u8& color, bool scaled, uint bias)
+   {
+      return pack_color4(color.r, color.g, color.b, scaled, bias);
+   }
+   
+   uint16 etc1_block::pack_color4(uint r, uint g, uint b, bool scaled, uint bias)
+   {
+      if (scaled)
+      {
+         r = (r * 15U + bias) / 255U;
+         g = (g * 15U + bias) / 255U;
+         b = (b * 15U + bias) / 255U;
+      }
+
+      r = rg_etc1::minimum(r, 15U);
+      g = rg_etc1::minimum(g, 15U);
+      b = rg_etc1::minimum(b, 15U);
+
+      return static_cast<uint16>(b | (g << 4U) | (r << 8U));
+   }
+
+   color_quad_u8 etc1_block::unpack_color4(uint16 packed_color4, bool scaled, uint alpha)
+   {
+      uint b = packed_color4 & 15U;
+      uint g = (packed_color4 >> 4U) & 15U;
+      uint r = (packed_color4 >> 8U) & 15U;
+
+      if (scaled)
+      {
+         b = (b << 4U) | b;
+         g = (g << 4U) | g;
+         r = (r << 4U) | r;
+      }
+
+      return color_quad_u8(cNoClamp, r, g, b, rg_etc1::minimum(alpha, 255U));
+   }
+   
+   void etc1_block::unpack_color4(uint& r, uint& g, uint& b, uint16 packed_color4, bool scaled)
+   {
+      color_quad_u8 c(unpack_color4(packed_color4, scaled, 0));
+      r = c.r;
+      g = c.g;
+      b = c.b;
+   }
+
+   void etc1_block::get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint table_idx)
+   {
+      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
+      const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0];
+
+      uint r, g, b;
+      unpack_color5(r, g, b, packed_color5, true);
+
+      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
+
+      const int y0 = pInten_modifer_table[0];
+      pDst[0].set(ir + y0, ig + y0, ib + y0);
+
+      const int y1 = pInten_modifer_table[1];
+      pDst[1].set(ir + y1, ig + y1, ib + y1);
+
+      const int y2 = pInten_modifer_table[2];
+      pDst[2].set(ir + y2, ig + y2, ib + y2);
+
+      const int y3 = pInten_modifer_table[3];
+      pDst[3].set(ir + y3, ig + y3, ib + y3);
+   }
+   
+   bool etc1_block::get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint16 packed_delta3, uint table_idx)
+   {
+      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
+      const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0];
+
+      uint r, g, b;
+      bool success = unpack_color5(r, g, b, packed_color5, packed_delta3, true);
+
+      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
+
+      const int y0 = pInten_modifer_table[0];
+      pDst[0].set(ir + y0, ig + y0, ib + y0);
+
+      const int y1 = pInten_modifer_table[1];
+      pDst[1].set(ir + y1, ig + y1, ib + y1);
+
+      const int y2 = pInten_modifer_table[2];
+      pDst[2].set(ir + y2, ig + y2, ib + y2);
+
+      const int y3 = pInten_modifer_table[3];
+      pDst[3].set(ir + y3, ig + y3, ib + y3);
+
+      return success;
+   }
+   
+   void etc1_block::get_abs_subblock_colors(color_quad_u8* pDst, uint16 packed_color4, uint table_idx)
+   {
+      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
+      const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0];
+
+      uint r, g, b;
+      unpack_color4(r, g, b, packed_color4, true);
+      
+      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
+
+      const int y0 = pInten_modifer_table[0];
+      pDst[0].set(ir + y0, ig + y0, ib + y0);
+      
+      const int y1 = pInten_modifer_table[1];
+      pDst[1].set(ir + y1, ig + y1, ib + y1);
+
+      const int y2 = pInten_modifer_table[2];
+      pDst[2].set(ir + y2, ig + y2, ib + y2);
+
+      const int y3 = pInten_modifer_table[3];
+      pDst[3].set(ir + y3, ig + y3, ib + y3);
+   }
+      
+   bool unpack_etc1_block(const void* pETC1_block, unsigned int* pDst_pixels_rgba, bool preserve_alpha)
+   {
+      color_quad_u8* pDst = reinterpret_cast<color_quad_u8*>(pDst_pixels_rgba);
+      const etc1_block& block = *static_cast<const etc1_block*>(pETC1_block);
+
+      const bool diff_flag = block.get_diff_bit();
+      const bool flip_flag = block.get_flip_bit();
+      const uint table_index0 = block.get_inten_table(0);
+      const uint table_index1 = block.get_inten_table(1);
+
+      color_quad_u8 subblock_colors0[4];
+      color_quad_u8 subblock_colors1[4];
+      bool success = true;
+
+      if (diff_flag)
+      {
+         const uint16 base_color5 = block.get_base5_color();
+         const uint16 delta_color3 = block.get_delta3_color();
+         etc1_block::get_diff_subblock_colors(subblock_colors0, base_color5, table_index0);
+            
+         if (!etc1_block::get_diff_subblock_colors(subblock_colors1, base_color5, delta_color3, table_index1))
+            success = false;
+      }
+      else
+      {
+         const uint16 base_color4_0 = block.get_base4_color(0);
+         etc1_block::get_abs_subblock_colors(subblock_colors0, base_color4_0, table_index0);
+
+         const uint16 base_color4_1 = block.get_base4_color(1);
+         etc1_block::get_abs_subblock_colors(subblock_colors1, base_color4_1, table_index1);
+      }
+
+      if (preserve_alpha)
+      {
+         if (flip_flag)
+         {
+            for (uint y = 0; y < 2; y++)
+            {
+               pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]);
+               pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]);
+               pDst[2].set_rgb(subblock_colors0[block.get_selector(2, y)]);
+               pDst[3].set_rgb(subblock_colors0[block.get_selector(3, y)]);
+               pDst += 4;
+            }
+
+            for (uint y = 2; y < 4; y++)
+            {
+               pDst[0].set_rgb(subblock_colors1[block.get_selector(0, y)]);
+               pDst[1].set_rgb(subblock_colors1[block.get_selector(1, y)]);
+               pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]);
+               pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]);
+               pDst += 4;
+            }
+         }
+         else
+         {
+            for (uint y = 0; y < 4; y++)
+            {
+               pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]);
+               pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]);
+               pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]);
+               pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]);
+               pDst += 4;
+            }
+         }
+      }
+      else 
+      {
+         if (flip_flag)
+         {
+            // 0000
+            // 0000
+            // 1111
+            // 1111
+            for (uint y = 0; y < 2; y++)
+            {
+               pDst[0] = subblock_colors0[block.get_selector(0, y)];
+               pDst[1] = subblock_colors0[block.get_selector(1, y)];
+               pDst[2] = subblock_colors0[block.get_selector(2, y)];
+               pDst[3] = subblock_colors0[block.get_selector(3, y)];
+               pDst += 4;
+            }
+
+            for (uint y = 2; y < 4; y++)
+            {
+               pDst[0] = subblock_colors1[block.get_selector(0, y)];
+               pDst[1] = subblock_colors1[block.get_selector(1, y)];
+               pDst[2] = subblock_colors1[block.get_selector(2, y)];
+               pDst[3] = subblock_colors1[block.get_selector(3, y)];
+               pDst += 4;
+            }
+         }
+         else
+         {
+            // 0011
+            // 0011
+            // 0011
+            // 0011
+            for (uint y = 0; y < 4; y++)
+            {
+               pDst[0] = subblock_colors0[block.get_selector(0, y)];
+               pDst[1] = subblock_colors0[block.get_selector(1, y)];
+               pDst[2] = subblock_colors1[block.get_selector(2, y)];
+               pDst[3] = subblock_colors1[block.get_selector(3, y)];
+               pDst += 4;
+            }
+         }
+      }
+      
+      return success;
+   }
+
+   struct etc1_solution_coordinates
+   {
+      inline etc1_solution_coordinates() :
+      m_unscaled_color(0, 0, 0, 0),
+         m_inten_table(0),
+         m_color4(false)
+      {
+      }
+
+      inline etc1_solution_coordinates(uint r, uint g, uint b, uint inten_table, bool color4) : 
+      m_unscaled_color(r, g, b, 255),
+         m_inten_table(inten_table),
+         m_color4(color4)
+      {
+      }
+
+      inline etc1_solution_coordinates(const color_quad_u8& c, uint inten_table, bool color4) : 
+      m_unscaled_color(c),
+         m_inten_table(inten_table),
+         m_color4(color4)
+      {
+      }
+
+      inline etc1_solution_coordinates(const etc1_solution_coordinates& other)
+      {
+         *this = other;
+      }
+
+      inline etc1_solution_coordinates& operator= (const etc1_solution_coordinates& rhs)
+      {
+         m_unscaled_color = rhs.m_unscaled_color;
+         m_inten_table = rhs.m_inten_table;
+         m_color4 = rhs.m_color4;
+         return *this;
+      }
+
+      inline void clear()
+      {
+         m_unscaled_color.clear();
+         m_inten_table = 0;
+         m_color4 = false;
+      }
+
+      inline color_quad_u8 get_scaled_color() const
+      {
+         int br, bg, bb;
+         if (m_color4)
+         {
+            br = m_unscaled_color.r | (m_unscaled_color.r << 4);
+            bg = m_unscaled_color.g | (m_unscaled_color.g << 4);
+            bb = m_unscaled_color.b | (m_unscaled_color.b << 4);
+         }
+         else
+         {
+            br = (m_unscaled_color.r >> 2) | (m_unscaled_color.r << 3);
+            bg = (m_unscaled_color.g >> 2) | (m_unscaled_color.g << 3);
+            bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
+         }
+         return color_quad_u8(br, bg, bb);
+      }
+
+      inline void get_block_colors(color_quad_u8* pBlock_colors)
+      {
+         int br, bg, bb;
+         if (m_color4)
+         {
+            br = m_unscaled_color.r | (m_unscaled_color.r << 4);
+            bg = m_unscaled_color.g | (m_unscaled_color.g << 4);
+            bb = m_unscaled_color.b | (m_unscaled_color.b << 4);
+         }
+         else
+         {
+            br = (m_unscaled_color.r >> 2) | (m_unscaled_color.r << 3);
+            bg = (m_unscaled_color.g >> 2) | (m_unscaled_color.g << 3);
+            bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
+         }
+         const int* pInten_table = g_etc1_inten_tables[m_inten_table];
+         pBlock_colors[0].set(br + pInten_table[0], bg + pInten_table[0], bb + pInten_table[0]);
+         pBlock_colors[1].set(br + pInten_table[1], bg + pInten_table[1], bb + pInten_table[1]);
+         pBlock_colors[2].set(br + pInten_table[2], bg + pInten_table[2], bb + pInten_table[2]);
+         pBlock_colors[3].set(br + pInten_table[3], bg + pInten_table[3], bb + pInten_table[3]);
+      }
+
+      color_quad_u8 m_unscaled_color;
+      uint m_inten_table;
+      bool m_color4;
+   };
+
+   class etc1_optimizer
+   {
+      etc1_optimizer(const etc1_optimizer&);
+      etc1_optimizer& operator= (const etc1_optimizer&);
+
+   public:
+      etc1_optimizer()
+      {
+         clear();
+      }
+
+      void clear()
+      {
+         m_pParams = NULL;
+         m_pResult = NULL;
+         m_pSorted_luma = NULL;
+         m_pSorted_luma_indices = NULL;
+      }
+
+      struct params : etc1_pack_params
+      {
+         params()
+         {
+            clear();
+         }
+
+         params(const etc1_pack_params& base_params) : 
+         etc1_pack_params(base_params)
+         {
+            clear_optimizer_params();
+         }
+
+         void clear()
+         {
+            etc1_pack_params::clear();
+            clear_optimizer_params();
+         }
+
+         void clear_optimizer_params()
+         {
+            m_num_src_pixels = 0;
+            m_pSrc_pixels = 0;
+
+            m_use_color4 = false;
+            static const int s_default_scan_delta[] = { 0 };
+            m_pScan_deltas = s_default_scan_delta;
+            m_scan_delta_size = 1;
+
+            m_base_color5.clear();
+            m_constrain_against_base_color5 = false;
+         }
+
+         uint m_num_src_pixels;
+         const color_quad_u8* m_pSrc_pixels;
+
+         bool m_use_color4;
+         const int* m_pScan_deltas;
+         uint m_scan_delta_size;
+
+         color_quad_u8 m_base_color5;
+         bool m_constrain_against_base_color5;
+      };
+
+      struct results
+      {
+         uint64 m_error;
+         color_quad_u8 m_block_color_unscaled;
+         uint m_block_inten_table;
+         uint m_n;
+         uint8* m_pSelectors;
+         bool m_block_color4;
+
+         inline results& operator= (const results& rhs)
+         {
+            m_block_color_unscaled = rhs.m_block_color_unscaled;
+            m_block_color4 = rhs.m_block_color4;
+            m_block_inten_table = rhs.m_block_inten_table;
+            m_error = rhs.m_error;
+            RG_ETC1_ASSERT(m_n == rhs.m_n);
+            memcpy(m_pSelectors, rhs.m_pSelectors, rhs.m_n);
+            return *this;
+         }
+      };
+
+      void init(const params& params, results& result);
+      bool compute();
+
+   private:      
+      struct potential_solution
+      {
+         potential_solution() : m_coords(), m_error(cUINT64_MAX), m_valid(false)
+         {
+         }
+
+         etc1_solution_coordinates  m_coords;
+         uint8                      m_selectors[8];
+         uint64                     m_error;
+         bool                       m_valid;
+
+         void clear()
+         {
+            m_coords.clear();
+            m_error = cUINT64_MAX;
+            m_valid = false;
+         }
+      };
+
+      const params* m_pParams;
+      results* m_pResult;
+
+      int m_limit;
+
+      vec3F m_avg_color;
+      int m_br, m_bg, m_bb;
+      uint16 m_luma[8];
+      uint32 m_sorted_luma[2][8];
+      const uint32* m_pSorted_luma_indices;
+      uint32* m_pSorted_luma;
+
+      uint8 m_selectors[8];
+      uint8 m_best_selectors[8];
+
+      potential_solution m_best_solution;
+      potential_solution m_trial_solution;
+      uint8 m_temp_selectors[8];
+
+      bool evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
+      bool evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
+   };
+      
+   bool etc1_optimizer::compute()
+   {
+      const uint n = m_pParams->m_num_src_pixels;
+      const int scan_delta_size = m_pParams->m_scan_delta_size;
+      
+      // Scan through a subset of the 3D lattice centered around the avg block color trying each 3D (555 or 444) lattice point as a potential block color.
+      // Each time a better solution is found try to refine the current solution's block color based of the current selectors and intensity table index.
+      for (int zdi = 0; zdi < scan_delta_size; zdi++)
+      {
+         const int zd = m_pParams->m_pScan_deltas[zdi];
+         const int mbb = m_bb + zd;
+         if (mbb < 0) continue; else if (mbb > m_limit) break;
+         
+         for (int ydi = 0; ydi < scan_delta_size; ydi++)
+         {
+            const int yd = m_pParams->m_pScan_deltas[ydi];
+            const int mbg = m_bg + yd;
+            if (mbg < 0) continue; else if (mbg > m_limit) break;
+
+            for (int xdi = 0; xdi < scan_delta_size; xdi++)
+            {
+               const int xd = m_pParams->m_pScan_deltas[xdi];
+               const int mbr = m_br + xd;
+               if (mbr < 0) continue; else if (mbr > m_limit) break;
+      
+               etc1_solution_coordinates coords(mbr, mbg, mbb, 0, m_pParams->m_use_color4);
+               if (m_pParams->m_quality == cHighQuality)
+               {
+                  if (!evaluate_solution(coords, m_trial_solution, &m_best_solution))
+                     continue;
+               }
+               else
+               {
+                  if (!evaluate_solution_fast(coords, m_trial_solution, &m_best_solution))
+                     continue;
+               }
+               
+               // Now we have the input block, the avg. color of the input pixels, a set of trial selector indices, and the block color+intensity index.
+               // Now, for each component, attempt to refine the current solution by solving a simple linear equation. For example, for 4 colors:
+               // The goal is:
+               // pixel0 - (block_color+inten_table[selector0]) + pixel1 - (block_color+inten_table[selector1]) + pixel2 - (block_color+inten_table[selector2]) + pixel3 - (block_color+inten_table[selector3]) = 0
+               // Rearranging this:
+               // (pixel0 + pixel1 + pixel2 + pixel3) - (block_color+inten_table[selector0]) - (block_color+inten_table[selector1]) - (block_color+inten_table[selector2]) - (block_color+inten_table[selector3]) = 0
+               // (pixel0 + pixel1 + pixel2 + pixel3) - block_color - inten_table[selector0] - block_color-inten_table[selector1] - block_color-inten_table[selector2] - block_color-inten_table[selector3] = 0
+               // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - inten_table[selector0] - inten_table[selector1] - inten_table[selector2] - inten_table[selector3] = 0
+               // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3]) = 0
+               // (pixel0 + pixel1 + pixel2 + pixel3)/4 - block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4 = 0
+               // block_color = (pixel0 + pixel1 + pixel2 + pixel3)/4 - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4
+               // So what this means:
+               // optimal_block_color = avg_input - avg_inten_delta
+               // So the optimal block color can be computed by taking the average block color and subtracting the current average of the intensity delta.
+               // Unfortunately, optimal_block_color must then be quantized to 555 or 444 so it's not always possible to improve matters using this formula.
+               // Also, the above formula is for unclamped intensity deltas. The actual implementation takes into account clamping.
+
+               const uint max_refinement_trials = (m_pParams->m_quality == cLowQuality) ? 2 : (((xd | yd | zd) == 0) ? 4 : 2);
+               for (uint refinement_trial = 0; refinement_trial < max_refinement_trials; refinement_trial++)
+               {
+                  const uint8* pSelectors = m_best_solution.m_selectors;
+                  const int* pInten_table = g_etc1_inten_tables[m_best_solution.m_coords.m_inten_table];
+
+                  int delta_sum_r = 0, delta_sum_g = 0, delta_sum_b = 0;
+                  const color_quad_u8 base_color(m_best_solution.m_coords.get_scaled_color());
+                  for (uint r = 0; r < n; r++)
+                  {
+                     const uint s = *pSelectors++;
+                     const int yd = pInten_table[s];
+                     // Compute actual delta being applied to each pixel, taking into account clamping.
+                     delta_sum_r += rg_etc1::clamp<int>(base_color.r + yd, 0, 255) - base_color.r;
+                     delta_sum_g += rg_etc1::clamp<int>(base_color.g + yd, 0, 255) - base_color.g;
+                     delta_sum_b += rg_etc1::clamp<int>(base_color.b + yd, 0, 255) - base_color.b;
+                  }
+                  if ((!delta_sum_r) && (!delta_sum_g) && (!delta_sum_b))
+                     break;
+                  const float avg_delta_r_f = static_cast<float>(delta_sum_r) / n;
+                  const float avg_delta_g_f = static_cast<float>(delta_sum_g) / n;
+                  const float avg_delta_b_f = static_cast<float>(delta_sum_b) / n;
+                  const int br1 = rg_etc1::clamp<int>(static_cast<uint>((m_avg_color[0] - avg_delta_r_f) * m_limit / 255.0f + .5f), 0, m_limit);
+                  const int bg1 = rg_etc1::clamp<int>(static_cast<uint>((m_avg_color[1] - avg_delta_g_f) * m_limit / 255.0f + .5f), 0, m_limit);
+                  const int bb1 = rg_etc1::clamp<int>(static_cast<uint>((m_avg_color[2] - avg_delta_b_f) * m_limit / 255.0f + .5f), 0, m_limit);
+                  
+                  bool skip = false;
+                  
+                  if ((mbr == br1) && (mbg == bg1) && (mbb == bb1))
+                     skip = true;
+                  else if ((br1 == m_best_solution.m_coords.m_unscaled_color.r) && (bg1 == m_best_solution.m_coords.m_unscaled_color.g) && (bb1 == m_best_solution.m_coords.m_unscaled_color.b))
+                     skip = true;
+                  else if ((m_br == br1) && (m_bg == bg1) && (m_bb == bb1))
+                     skip = true;
+
+                  if (skip)
+                     break;
+
+                  etc1_solution_coordinates coords1(br1, bg1, bb1, 0, m_pParams->m_use_color4);
+                  if (m_pParams->m_quality == cHighQuality)
+                  {
+                     if (!evaluate_solution(coords1, m_trial_solution, &m_best_solution)) 
+                        break;
+                  }
+                  else
+                  {
+                     if (!evaluate_solution_fast(coords1, m_trial_solution, &m_best_solution))
+                        break;
+                  }
+
+               }  // refinement_trial
+
+            } // xdi
+         } // ydi
+      } // zdi
+
+      if (!m_best_solution.m_valid)
+      {
+         m_pResult->m_error = cUINT32_MAX;
+         return false;
+      }
+      
+      const uint8* pSelectors = m_best_solution.m_selectors;
+
+#ifdef RG_ETC1_BUILD_DEBUG
+      {
+         color_quad_u8 block_colors[4];
+         m_best_solution.m_coords.get_block_colors(block_colors);
+
+         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
+         uint64 actual_error = 0;
+         for (uint i = 0; i < n; i++)
+            actual_error += pSrc_pixels[i].squared_distance_rgb(block_colors[pSelectors[i]]);
+         
+         RG_ETC1_ASSERT(actual_error == m_best_solution.m_error);
+      }
+#endif      
+      
+      m_pResult->m_error = m_best_solution.m_error;
+
+      m_pResult->m_block_color_unscaled = m_best_solution.m_coords.m_unscaled_color;
+      m_pResult->m_block_color4 = m_best_solution.m_coords.m_color4;
+      
+      m_pResult->m_block_inten_table = m_best_solution.m_coords.m_inten_table;
+      memcpy(m_pResult->m_pSelectors, pSelectors, n);
+      m_pResult->m_n = n;
+
+      return true;
+   }
+
+   void etc1_optimizer::init(const params& p, results& r)
+   {
+      // This version is hardcoded for 8 pixel subblocks.
+      RG_ETC1_ASSERT(p.m_num_src_pixels == 8);
+      
+      m_pParams = &p;
+      m_pResult = &r;
+                  
+      const uint n = 8;
+      
+      m_limit = m_pParams->m_use_color4 ? 15 : 31;
+
+      vec3F avg_color(0.0f);
+
+      for (uint i = 0; i < n; i++)
+      {
+         const color_quad_u8& c = m_pParams->m_pSrc_pixels[i];
+         const vec3F fc(c.r, c.g, c.b);
+
+         avg_color += fc;
+
+         m_luma[i] = static_cast<uint16>(c.r + c.g + c.b);
+         m_sorted_luma[0][i] = i;
+      }
+      avg_color *= (1.0f / static_cast<float>(n));
+      m_avg_color = avg_color;
+
+      m_br = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[0] * m_limit / 255.0f + .5f), 0, m_limit);
+      m_bg = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[1] * m_limit / 255.0f + .5f), 0, m_limit);
+      m_bb = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[2] * m_limit / 255.0f + .5f), 0, m_limit);
+
+      if (m_pParams->m_quality <= cMediumQuality)
+      {
+         m_pSorted_luma_indices = indirect_radix_sort(n, m_sorted_luma[0], m_sorted_luma[1], m_luma, 0, sizeof(m_luma[0]), false);
+         m_pSorted_luma = m_sorted_luma[0];
+         if (m_pSorted_luma_indices == m_sorted_luma[0])
+            m_pSorted_luma = m_sorted_luma[1];
+      
+         for (uint i = 0; i < n; i++)
+            m_pSorted_luma[i] = m_luma[m_pSorted_luma_indices[i]];
+      }
+      
+      m_best_solution.m_coords.clear();
+      m_best_solution.m_valid = false;
+      m_best_solution.m_error = cUINT64_MAX;
+   }
+
+   bool etc1_optimizer::evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
+   {
+      trial_solution.m_valid = false;
+
+      if (m_pParams->m_constrain_against_base_color5)
+      {
+         const int dr = coords.m_unscaled_color.r - m_pParams->m_base_color5.r;
+         const int dg = coords.m_unscaled_color.g - m_pParams->m_base_color5.g;
+         const int db = coords.m_unscaled_color.b - m_pParams->m_base_color5.b;
+
+         if ((rg_etc1::minimum(dr, dg, db) < cETC1ColorDeltaMin) || (rg_etc1::maximum(dr, dg, db) > cETC1ColorDeltaMax))
+            return false;
+      }
+
+      const color_quad_u8 base_color(coords.get_scaled_color());
+      
+      const uint n = 8;
+            
+      trial_solution.m_error = cUINT64_MAX;
+            
+      for (uint inten_table = 0; inten_table < cETC1IntenModifierValues; inten_table++)
+      {
+         const int* pInten_table = g_etc1_inten_tables[inten_table];
+
+         color_quad_u8 block_colors[4];
+         for (uint s = 0; s < 4; s++)
+         {
+            const int yd = pInten_table[s];
+            block_colors[s].set(base_color.r + yd, base_color.g + yd, base_color.b + yd, 0);
+         }
+         
+         uint64 total_error = 0;
+         
+         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
+         for (uint c = 0; c < n; c++)
+         {
+            const color_quad_u8& src_pixel = *pSrc_pixels++;
+            
+            uint best_selector_index = 0;
+            uint best_error = rg_etc1::square(src_pixel.r - block_colors[0].r) + rg_etc1::square(src_pixel.g - block_colors[0].g) + rg_etc1::square(src_pixel.b - block_colors[0].b);
+
+            uint trial_error = rg_etc1::square(src_pixel.r - block_colors[1].r) + rg_etc1::square(src_pixel.g - block_colors[1].g) + rg_etc1::square(src_pixel.b - block_colors[1].b);
+            if (trial_error < best_error)
+            {
+               best_error = trial_error;
+               best_selector_index = 1;
+            }
+
+            trial_error = rg_etc1::square(src_pixel.r - block_colors[2].r) + rg_etc1::square(src_pixel.g - block_colors[2].g) + rg_etc1::square(src_pixel.b - block_colors[2].b);
+            if (trial_error < best_error)
+            {
+               best_error = trial_error;
+               best_selector_index = 2;
+            }
+
+            trial_error = rg_etc1::square(src_pixel.r - block_colors[3].r) + rg_etc1::square(src_pixel.g - block_colors[3].g) + rg_etc1::square(src_pixel.b - block_colors[3].b);
+            if (trial_error < best_error)
+            {
+               best_error = trial_error;
+               best_selector_index = 3;
+            }
+
+            m_temp_selectors[c] = static_cast<uint8>(best_selector_index);
+
+            total_error += best_error;
+            if (total_error >= trial_solution.m_error)
+               break;
+         }
+         
+         if (total_error < trial_solution.m_error)
+         {
+            trial_solution.m_error = total_error;
+            trial_solution.m_coords.m_inten_table = inten_table;
+            memcpy(trial_solution.m_selectors, m_temp_selectors, 8);
+            trial_solution.m_valid = true;
+         }
+      }
+      trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
+      trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
+
+      bool success = false;
+      if (pBest_solution)
+      {
+         if (trial_solution.m_error < pBest_solution->m_error)
+         {
+            *pBest_solution = trial_solution;
+            success = true;
+         }
+      }
+
+      return success;
+   }
+
+   bool etc1_optimizer::evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
+   {
+      if (m_pParams->m_constrain_against_base_color5)
+      {
+         const int dr = coords.m_unscaled_color.r - m_pParams->m_base_color5.r;
+         const int dg = coords.m_unscaled_color.g - m_pParams->m_base_color5.g;
+         const int db = coords.m_unscaled_color.b - m_pParams->m_base_color5.b;
+
+         if ((rg_etc1::minimum(dr, dg, db) < cETC1ColorDeltaMin) || (rg_etc1::maximum(dr, dg, db) > cETC1ColorDeltaMax))
+         {
+            trial_solution.m_valid = false;
+            return false;
+         }
+      }
+
+      const color_quad_u8 base_color(coords.get_scaled_color());
+
+      const uint n = 8;
+      
+      trial_solution.m_error = cUINT64_MAX;
+
+      for (int inten_table = cETC1IntenModifierValues - 1; inten_table >= 0; --inten_table)
+      {
+         const int* pInten_table = g_etc1_inten_tables[inten_table];
+
+         uint block_inten[4];
+         color_quad_u8 block_colors[4];
+         for (uint s = 0; s < 4; s++)
+         {
+            const int yd = pInten_table[s];
+            color_quad_u8 block_color(base_color.r + yd, base_color.g + yd, base_color.b + yd, 0);
+            block_colors[s] = block_color;
+            block_inten[s] = block_color.r + block_color.g + block_color.b;
+         }
+
+         // evaluate_solution_fast() enforces/assumesd a total ordering of the input colors along the intensity (1,1,1) axis to more quickly classify the inputs to selectors.
+         // The inputs colors have been presorted along the projection onto this axis, and ETC1 block colors are always ordered along the intensity axis, so this classification is fast.
+         // 0   1   2   3
+         //   01  12  23
+         const uint block_inten_midpoints[3] = { block_inten[0] + block_inten[1], block_inten[1] + block_inten[2], block_inten[2] + block_inten[3] };
+
+         uint64 total_error = 0;
+         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
+         if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0])
+         {
+            if (block_inten[0] > m_pSorted_luma[n - 1])
+            {
+           const uint min_error = intabs(block_inten[0] - m_pSorted_luma[n - 1]);
+               if (min_error >= trial_solution.m_error)
+                  continue;
+            }
+
+            memset(&m_temp_selectors[0], 0, n);
+
+            for (uint c = 0; c < n; c++)
+               total_error += block_colors[0].squared_distance_rgb(pSrc_pixels[c]);
+         }
+         else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2])
+         {
+            if (m_pSorted_luma[0] > block_inten[3])
+            {
+           const uint min_error = intabs(m_pSorted_luma[0] - block_inten[3]);
+               if (min_error >= trial_solution.m_error)
+                  continue;
+            }
+
+            memset(&m_temp_selectors[0], 3, n);
+
+            for (uint c = 0; c < n; c++)
+               total_error += block_colors[3].squared_distance_rgb(pSrc_pixels[c]);
+         }
+         else
+         {
+            uint cur_selector = 0, c;
+            for (c = 0; c < n; c++)
+            {
+               const uint y = m_pSorted_luma[c];
+               while ((y * 2) >= block_inten_midpoints[cur_selector])
+                  if (++cur_selector > 2)
+                     goto done;
+               const uint sorted_pixel_index = m_pSorted_luma_indices[c];
+               m_temp_selectors[sorted_pixel_index] = static_cast<uint8>(cur_selector);
+               total_error += block_colors[cur_selector].squared_distance_rgb(pSrc_pixels[sorted_pixel_index]);
+            }
+done:
+            while (c < n)
+            {
+               const uint sorted_pixel_index = m_pSorted_luma_indices[c];
+               m_temp_selectors[sorted_pixel_index] = 3;
+               total_error += block_colors[3].squared_distance_rgb(pSrc_pixels[sorted_pixel_index]);
+               ++c;
+            }
+         }
+
+         if (total_error < trial_solution.m_error)
+         {
+            trial_solution.m_error = total_error;
+            trial_solution.m_coords.m_inten_table = inten_table;
+            memcpy(trial_solution.m_selectors, m_temp_selectors, n);
+            trial_solution.m_valid = true;
+            if (!total_error)
+               break;
+         }
+      }
+      trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
+      trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
+      
+      bool success = false;
+      if (pBest_solution)
+      {
+         if (trial_solution.m_error < pBest_solution->m_error)
+         {
+            *pBest_solution = trial_solution;
+            success = true;
+         }
+      }
+
+      return success;
+   }
+         
+   static uint etc1_decode_value(uint diff, uint inten, uint selector, uint packed_c)
+   {
+      const uint limit = diff ? 32 : 16; limit;
+      RG_ETC1_ASSERT((diff < 2) && (inten < 8) && (selector < 4) && (packed_c < limit));
+      int c;
+      if (diff)
+         c = (packed_c >> 2) | (packed_c << 3);
+      else 
+         c = packed_c | (packed_c << 4);
+      c += g_etc1_inten_tables[inten][selector];
+      c = rg_etc1::clamp<int>(c, 0, 255);
+      return c;
+   }
+
+   static inline int mul_8bit(int a, int b) { int t = a*b + 128; return (t + (t >> 8)) >> 8; }
+
+   void pack_etc1_block_init()
+   {
+      for (uint diff = 0; diff < 2; diff++)
+      {
+         const uint limit = diff ? 32 : 16;
+
+         for (uint inten = 0; inten < 8; inten++)
+         {
+            for (uint selector = 0; selector < 4; selector++)
+            {
+               const uint inverse_table_index = diff + (inten << 1) + (selector << 4);
+               for (uint color = 0; color < 256; color++)
+               {
+                  uint best_error = cUINT32_MAX, best_packed_c = 0;
+                  for (uint packed_c = 0; packed_c < limit; packed_c++)
+                  {
+                     int v = etc1_decode_value(diff, inten, selector, packed_c);
+                     uint err = labs(v - static_cast<int>(color));
+		     //printf("err: %d - %u = %u\n",v,color,err);
+                     if (err < best_error)
+                     {
+                        best_error = err;
+                        best_packed_c = packed_c;
+                        if (!best_error) 
+                           break;
+                     }
+                  }
+                  RG_ETC1_ASSERT(best_error <= 255);
+                  g_etc1_inverse_lookup[inverse_table_index][color] = static_cast<uint16>(best_packed_c | (best_error << 8));
+               }
+            }
+         }
+      }
+      
+      uint expand5[32];
+      for(int i = 0; i < 32; i++)
+         expand5[i] = (i << 3) | (i >> 2);
+
+      for(int i = 0; i < 256 + 16; i++)
+      {
+         int v = clamp<int>(i - 8, 0, 255);
+         g_quant5_tab[i] = static_cast<uint8>(expand5[mul_8bit(v,31)]);
+      }
+   }
+
+   // Packs solid color blocks efficiently using a set of small precomputed tables.
+   // For random 888 inputs, MSE results are better than Erricson's ETC1 packer in "slow" mode ~9.5% of the time, is slightly worse only ~.01% of the time, and is equal the rest of the time.
+   static uint64 pack_etc1_block_solid_color(etc1_block& block, const uint8* pColor, etc1_pack_params& pack_params)
+   {
+      pack_params;
+      RG_ETC1_ASSERT(g_etc1_inverse_lookup[0][255]);
+            
+      static uint s_next_comp[4] = { 1, 2, 0, 1 };
+            
+      uint best_error = cUINT32_MAX, best_i = 0;
+      int best_x = 0, best_packed_c1 = 0, best_packed_c2 = 0;
+
+      // For each possible 8-bit value, there is a precomputed list of diff/inten/selector configurations that allow that 8-bit value to be encoded with no error.
+      for (uint i = 0; i < 3; i++)
+      {
+         const uint c1 = pColor[s_next_comp[i]], c2 = pColor[s_next_comp[i + 1]];
+
+         const int delta_range = 1;
+         for (int delta = -delta_range; delta <= delta_range; delta++)
+         {
+            const int c_plus_delta = rg_etc1::clamp<int>(pColor[i] + delta, 0, 255);
+
+            const uint16* pTable;
+            if (!c_plus_delta)
+               pTable = g_color8_to_etc_block_config_0_255[0];
+            else if (c_plus_delta == 255)
+               pTable = g_color8_to_etc_block_config_0_255[1];
+            else
+               pTable = g_color8_to_etc_block_config_1_to_254[c_plus_delta - 1];
+
+            do
+            {
+               const uint x = *pTable++;
+
+#ifdef RG_ETC1_BUILD_DEBUG
+               const uint diff = x & 1;
+               const uint inten = (x >> 1) & 7;
+               const uint selector = (x >> 4) & 3;
+               const uint p0 = (x >> 8) & 255;
+               RG_ETC1_ASSERT(etc1_decode_value(diff, inten, selector, p0) == (uint)c_plus_delta);
+#endif
+
+               const uint16* pInverse_table = g_etc1_inverse_lookup[x & 0xFF];
+               uint16 p1 = pInverse_table[c1];
+               uint16 p2 = pInverse_table[c2];
+               const uint trial_error = rg_etc1::square(c_plus_delta - pColor[i]) + rg_etc1::square(p1 >> 8) + rg_etc1::square(p2 >> 8);
+               if (trial_error < best_error)
+               {
+                  best_error = trial_error;
+                  best_x = x;
+                  best_packed_c1 = p1 & 0xFF;
+                  best_packed_c2 = p2 & 0xFF;
+                  best_i = i;
+                  if (!best_error)
+                     goto found_perfect_match;
+               }
+            } while (*pTable != 0xFFFF);
+         }
+      }
+found_perfect_match:
+
+      const uint diff = best_x & 1;
+      const uint inten = (best_x >> 1) & 7;
+
+      block.m_bytes[3] = static_cast<uint8>(((inten | (inten << 3)) << 2) | (diff << 1));
+                        
+      const uint etc1_selector = g_selector_index_to_etc1[(best_x >> 4) & 3];
+      *reinterpret_cast<uint16*>(&block.m_bytes[4]) = (etc1_selector & 2) ? 0xFFFF : 0;
+      *reinterpret_cast<uint16*>(&block.m_bytes[6]) = (etc1_selector & 1) ? 0xFFFF : 0;
+
+      const uint best_packed_c0 = (best_x >> 8) & 255;
+      if (diff)
+      {
+         block.m_bytes[best_i] = static_cast<uint8>(best_packed_c0 << 3);
+         block.m_bytes[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1 << 3);
+         block.m_bytes[s_next_comp[best_i+1]] = static_cast<uint8>(best_packed_c2 << 3);
+      }
+      else
+      {
+         block.m_bytes[best_i] = static_cast<uint8>(best_packed_c0 | (best_packed_c0 << 4));
+         block.m_bytes[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1 | (best_packed_c1 << 4));
+         block.m_bytes[s_next_comp[best_i+1]] = static_cast<uint8>(best_packed_c2 | (best_packed_c2 << 4));
+      }
+
+      return best_error;
+   }
+      
+   static uint pack_etc1_block_solid_color_constrained(
+      etc1_optimizer::results& results, 
+      uint num_colors, const uint8* pColor, 
+      etc1_pack_params& pack_params, 
+      bool use_diff,
+      const color_quad_u8* pBase_color5_unscaled)
+   {
+      RG_ETC1_ASSERT(g_etc1_inverse_lookup[0][255]);
+
+      pack_params;
+      static uint s_next_comp[4] = { 1, 2, 0, 1 };
+
+      uint best_error = cUINT32_MAX, best_i = 0;
+      int best_x = 0, best_packed_c1 = 0, best_packed_c2 = 0;
+
+      // For each possible 8-bit value, there is a precomputed list of diff/inten/selector configurations that allow that 8-bit value to be encoded with no error.
+      for (uint i = 0; i < 3; i++)
+      {
+         const uint c1 = pColor[s_next_comp[i]], c2 = pColor[s_next_comp[i + 1]];
+
+         const int delta_range = 1;
+         for (int delta = -delta_range; delta <= delta_range; delta++)
+         {
+            const int c_plus_delta = rg_etc1::clamp<int>(pColor[i] + delta, 0, 255);
+
+            const uint16* pTable;
+            if (!c_plus_delta)
+               pTable = g_color8_to_etc_block_config_0_255[0];
+            else if (c_plus_delta == 255)
+               pTable = g_color8_to_etc_block_config_0_255[1];
+            else
+               pTable = g_color8_to_etc_block_config_1_to_254[c_plus_delta - 1];
+
+            do
+            {
+               const uint x = *pTable++;
+               const uint diff = x & 1;
+               if (static_cast<uint>(use_diff) != diff)
+               {
+                  if (*pTable == 0xFFFF)
+                     break;
+                  continue;
+               }
+
+               if ((diff) && (pBase_color5_unscaled))
+               {
+                  const int p0 = (x >> 8) & 255;
+                  int delta = p0 - static_cast<int>(pBase_color5_unscaled->c[i]);
+                  if ((delta < cETC1ColorDeltaMin) || (delta > cETC1ColorDeltaMax))
+                  {
+                     if (*pTable == 0xFFFF)
+                        break;
+                     continue;
+                  }
+               }
+
+#ifdef RG_ETC1_BUILD_DEBUG
+               {
+                  const uint inten = (x >> 1) & 7;
+                  const uint selector = (x >> 4) & 3;
+                  const uint p0 = (x >> 8) & 255;
+                  RG_ETC1_ASSERT(etc1_decode_value(diff, inten, selector, p0) == (uint)c_plus_delta);
+               }
+#endif
+
+               const uint16* pInverse_table = g_etc1_inverse_lookup[x & 0xFF];
+               uint16 p1 = pInverse_table[c1];
+               uint16 p2 = pInverse_table[c2];
+
+               if ((diff) && (pBase_color5_unscaled))
+               {
+                  int delta1 = (p1 & 0xFF) - static_cast<int>(pBase_color5_unscaled->c[s_next_comp[i]]);
+                  int delta2 = (p2 & 0xFF) - static_cast<int>(pBase_color5_unscaled->c[s_next_comp[i + 1]]);
+                  if ((delta1 < cETC1ColorDeltaMin) || (delta1 > cETC1ColorDeltaMax) || (delta2 < cETC1ColorDeltaMin) || (delta2 > cETC1ColorDeltaMax))
+                  {
+                     if (*pTable == 0xFFFF)
+                        break;
+                     continue;
+                  }
+               }
+
+               const uint trial_error = rg_etc1::square(c_plus_delta - pColor[i]) + rg_etc1::square(p1 >> 8) + rg_etc1::square(p2 >> 8);
+               if (trial_error < best_error)
+               {
+                  best_error = trial_error;
+                  best_x = x;
+                  best_packed_c1 = p1 & 0xFF;
+                  best_packed_c2 = p2 & 0xFF;
+                  best_i = i;
+                  if (!best_error)
+                     goto found_perfect_match;
+               }
+            } while (*pTable != 0xFFFF);
+         }
+      }
+found_perfect_match:
+
+      if (best_error == cUINT32_MAX)
+         return best_error;
+
+      best_error *= num_colors;
+
+      results.m_n = num_colors;
+      results.m_block_color4 = !(best_x & 1);
+      results.m_block_inten_table = (best_x >> 1) & 7;
+      memset(results.m_pSelectors, (best_x >> 4) & 3, num_colors);
+
+      const uint best_packed_c0 = (best_x >> 8) & 255;
+      results.m_block_color_unscaled[best_i] = static_cast<uint8>(best_packed_c0);
+      results.m_block_color_unscaled[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1);
+      results.m_block_color_unscaled[s_next_comp[best_i + 1]] = static_cast<uint8>(best_packed_c2);
+      results.m_error = best_error;
+      
+      return best_error;
+   }
+
+   // Function originally from RYG's public domain real-time DXT1 compressor, modified for 555.
+   static void dither_block_555(color_quad_u8* dest, const color_quad_u8* block)
+   {
+      int err[8],*ep1 = err,*ep2 = err+4;
+      uint8 *quant = g_quant5_tab+8;
+
+      memset(dest, 0xFF, sizeof(color_quad_u8)*16);
+
+      // process channels seperately
+      for(int ch=0;ch<3;ch++)
+      {
+         uint8* bp = (uint8*)block;
+         uint8* dp = (uint8*)dest;
+
+         bp += ch; dp += ch;
+
+         memset(err,0, sizeof(err));
+         for(int y = 0; y < 4; y++)
+         {
+            // pixel 0
+            dp[ 0] = quant[bp[ 0] + ((3*ep2[1] + 5*ep2[0]) >> 4)];
+            ep1[0] = bp[ 0] - dp[ 0];
+
+            // pixel 1
+            dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)];
+            ep1[1] = bp[ 4] - dp[ 4];
+
+            // pixel 2
+            dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)];
+            ep1[2] = bp[ 8] - dp[ 8];
+
+            // pixel 3
+            dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)];
+            ep1[3] = bp[12] - dp[12];
+
+            // advance to next line
+            int* tmp = ep1; ep1 = ep2; ep2 = tmp;
+            bp += 16;
+            dp += 16;
+         }
+      }
+   }
+
+   unsigned int pack_etc1_block(void* pETC1_block, const unsigned int* pSrc_pixels_rgba, etc1_pack_params& pack_params)
+   {
+      const color_quad_u8* pSrc_pixels = reinterpret_cast<const color_quad_u8*>(pSrc_pixels_rgba);
+      etc1_block& dst_block = *static_cast<etc1_block*>(pETC1_block);
+
+#ifdef RG_ETC1_BUILD_DEBUG
+      // Ensure all alpha values are 0xFF.
+      for (uint i = 0; i < 16; i++)
+      {
+         RG_ETC1_ASSERT(pSrc_pixels[i].a == 255);
+      }
+#endif
+
+      color_quad_u8 src_pixel0(pSrc_pixels[0]);
+
+      // Check for solid block.
+      const uint32 first_pixel_u32 = pSrc_pixels->m_u32;
+      int r;
+      for (r = 15; r >= 1; --r)
+         if (pSrc_pixels[r].m_u32 != first_pixel_u32)
+            break;
+      if (!r)
+         return static_cast<unsigned int>(16 * pack_etc1_block_solid_color(dst_block, &pSrc_pixels[0].r, pack_params));
+      
+      color_quad_u8 dithered_pixels[16];
+      if (pack_params.m_dithering)
+      {
+         dither_block_555(dithered_pixels, pSrc_pixels);
+         pSrc_pixels = dithered_pixels;
+      }
+
+      etc1_optimizer optimizer;
+
+      uint64 best_error = cUINT64_MAX;
+      uint best_flip = false, best_use_color4 = false;
+      
+      uint8 best_selectors[2][8];
+      etc1_optimizer::results best_results[2];
+      for (uint i = 0; i < 2; i++)
+      {
+         best_results[i].m_n = 8;
+         best_results[i].m_pSelectors = best_selectors[i];
+      }
+      
+      uint8 selectors[3][8];
+      etc1_optimizer::results results[3];
+      
+      for (uint i = 0; i < 3; i++)
+      {
+         results[i].m_n = 8;
+         results[i].m_pSelectors = selectors[i];
+      }
+            
+      color_quad_u8 subblock_pixels[8];
+
+      etc1_optimizer::params params(pack_params);
+      params.m_num_src_pixels = 8;
+      params.m_pSrc_pixels = subblock_pixels;
+
+      for (uint flip = 0; flip < 2; flip++)
+      {
+         for (uint use_color4 = 0; use_color4 < 2; use_color4++)
+         {
+            uint64 trial_error = 0;
+
+            uint subblock;
+            for (subblock = 0; subblock < 2; subblock++)
+            {
+               if (flip)
+                  memcpy(subblock_pixels, pSrc_pixels + subblock * 8, sizeof(color_quad_u8) * 8);
+               else
+               {
+                  const color_quad_u8* pSrc_col = pSrc_pixels + subblock * 2;
+                  subblock_pixels[0] = pSrc_col[0]; subblock_pixels[1] = pSrc_col[4]; subblock_pixels[2] = pSrc_col[8]; subblock_pixels[3] = pSrc_col[12];
+                  subblock_pixels[4] = pSrc_col[1]; subblock_pixels[5] = pSrc_col[5]; subblock_pixels[6] = pSrc_col[9]; subblock_pixels[7] = pSrc_col[13];
+               }
+
+               results[2].m_error = cUINT64_MAX;
+               if ((params.m_quality >= cMediumQuality) && ((subblock) || (use_color4)))
+               {
+                  const uint32 subblock_pixel0_u32 = subblock_pixels[0].m_u32;
+                  for (r = 7; r >= 1; --r)
+                     if (subblock_pixels[r].m_u32 != subblock_pixel0_u32)
+                        break;
+                  if (!r)
+                  {
+                     pack_etc1_block_solid_color_constrained(results[2], 8, &subblock_pixels[0].r, pack_params, !use_color4, (subblock && !use_color4) ? &results[0].m_block_color_unscaled : NULL);
+                  }
+               }
+
+               params.m_use_color4 = (use_color4 != 0);
+               params.m_constrain_against_base_color5 = false;
+
+               if ((!use_color4) && (subblock))
+               {
+                  params.m_constrain_against_base_color5 = true;
+                  params.m_base_color5 = results[0].m_block_color_unscaled;
+               }
+                              
+               if (params.m_quality == cHighQuality)
+               {
+                  static const int s_scan_delta_0_to_4[] = { -4, -3, -2, -1, 0, 1, 2, 3, 4 };
+                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0_to_4);
+                  params.m_pScan_deltas = s_scan_delta_0_to_4;
+               }
+               else if (params.m_quality == cMediumQuality)
+               {
+                  static const int s_scan_delta_0_to_1[] = { -1, 0, 1 };
+                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0_to_1);
+                  params.m_pScan_deltas = s_scan_delta_0_to_1;
+               }
+               else
+               {
+                  static const int s_scan_delta_0[] = { 0 };
+                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0);
+                  params.m_pScan_deltas = s_scan_delta_0;
+               }
+               
+               optimizer.init(params, results[subblock]);
+               if (!optimizer.compute())
+                  break;
+                              
+               if (params.m_quality >= cMediumQuality)
+               {
+                  // TODO: Fix fairly arbitrary/unrefined thresholds that control how far away to scan for potentially better solutions.
+                  const uint refinement_error_thresh0 = 3000;
+                  const uint refinement_error_thresh1 = 6000;
+                  if (results[subblock].m_error > refinement_error_thresh0)
+                  {
+                     if (params.m_quality == cMediumQuality)
+                     {
+                        static const int s_scan_delta_2_to_3[] = { -3, -2, 2, 3 };
+                        params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_2_to_3);
+                        params.m_pScan_deltas = s_scan_delta_2_to_3;
+                     }
+                     else
+                     {
+                        static const int s_scan_delta_5_to_5[] = { -5, 5 };
+                        static const int s_scan_delta_5_to_8[] = { -8, -7, -6, -5, 5, 6, 7, 8 };
+                        if (results[subblock].m_error > refinement_error_thresh1)
+                        {
+                           params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_5_to_8);
+                           params.m_pScan_deltas = s_scan_delta_5_to_8;
+                        }
+                        else
+                        {
+                           params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_5_to_5);
+                           params.m_pScan_deltas = s_scan_delta_5_to_5;
+                        }
+                     }
+
+                     if (!optimizer.compute())
+                        break;
+                  }
+
+                  if (results[2].m_error < results[subblock].m_error)
+                     results[subblock] = results[2];
+               }
+                            
+               trial_error += results[subblock].m_error;
+               if (trial_error >= best_error)
+                  break;
+            }
+
+            if (subblock < 2)
+               continue;
+
+            best_error = trial_error;
+            best_results[0] = results[0];
+            best_results[1] = results[1];
+            best_flip = flip;
+            best_use_color4 = use_color4;
+            
+         } // use_color4
+
+      } // flip
+
+      int dr = best_results[1].m_block_color_unscaled.r - best_results[0].m_block_color_unscaled.r;
+      int dg = best_results[1].m_block_color_unscaled.g - best_results[0].m_block_color_unscaled.g;
+      int db = best_results[1].m_block_color_unscaled.b - best_results[0].m_block_color_unscaled.b;
+      RG_ETC1_ASSERT(best_use_color4 || ((rg_etc1::minimum(dr, dg, db) >= cETC1ColorDeltaMin) && (rg_etc1::maximum(dr, dg, db) <= cETC1ColorDeltaMax)));
+           
+      if (best_use_color4)
+      {
+         dst_block.m_bytes[0] = static_cast<uint8>(best_results[1].m_block_color_unscaled.r | (best_results[0].m_block_color_unscaled.r << 4));
+         dst_block.m_bytes[1] = static_cast<uint8>(best_results[1].m_block_color_unscaled.g | (best_results[0].m_block_color_unscaled.g << 4));
+         dst_block.m_bytes[2] = static_cast<uint8>(best_results[1].m_block_color_unscaled.b | (best_results[0].m_block_color_unscaled.b << 4));
+      }
+      else
+      {
+         if (dr < 0) dr += 8; dst_block.m_bytes[0] = static_cast<uint8>((best_results[0].m_block_color_unscaled.r << 3) | dr);
+         if (dg < 0) dg += 8; dst_block.m_bytes[1] = static_cast<uint8>((best_results[0].m_block_color_unscaled.g << 3) | dg);
+         if (db < 0) db += 8; dst_block.m_bytes[2] = static_cast<uint8>((best_results[0].m_block_color_unscaled.b << 3) | db);
+      }
+      
+      dst_block.m_bytes[3] = static_cast<uint8>( (best_results[1].m_block_inten_table << 2) | (best_results[0].m_block_inten_table << 5) | ((~best_use_color4 & 1) << 1) | best_flip );
+      
+      uint selector0 = 0, selector1 = 0;
+      if (best_flip)
+      {
+         // flipped:
+         // { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 },               
+         // { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 } 
+         //
+         // { 0, 2 }, { 1, 2 }, { 2, 2 }, { 3, 2 },
+         // { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 }
+         const uint8* pSelectors0 = best_results[0].m_pSelectors;
+         const uint8* pSelectors1 = best_results[1].m_pSelectors;
+         for (int x = 3; x >= 0; --x)
+         {
+            uint b;
+            b = g_selector_index_to_etc1[pSelectors1[4 + x]];
+            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+            b = g_selector_index_to_etc1[pSelectors1[x]];
+            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+            b = g_selector_index_to_etc1[pSelectors0[4 + x]];
+            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+            b = g_selector_index_to_etc1[pSelectors0[x]];
+            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+         }
+      }
+      else
+      {
+         // non-flipped:
+         // { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 },
+         // { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 }
+         //
+         // { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
+         // { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 }
+         for (int subblock = 1; subblock >= 0; --subblock)
+         {
+            const uint8* pSelectors = best_results[subblock].m_pSelectors + 4;
+            for (uint i = 0; i < 2; i++)
+            {
+               uint b;
+               b = g_selector_index_to_etc1[pSelectors[3]];
+               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+               b = g_selector_index_to_etc1[pSelectors[2]];
+               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+               b = g_selector_index_to_etc1[pSelectors[1]];
+               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+               b = g_selector_index_to_etc1[pSelectors[0]];
+               selector0 = (selector0 << 1) | (b & 1);selector1 = (selector1 << 1) | (b >> 1);
+
+               pSelectors -= 4;
+            }
+         }
+      }
+                  
+      dst_block.m_bytes[4] = static_cast<uint8>(selector1 >> 8); dst_block.m_bytes[5] = static_cast<uint8>(selector1 & 0xFF);
+      dst_block.m_bytes[6] = static_cast<uint8>(selector0 >> 8); dst_block.m_bytes[7] = static_cast<uint8>(selector0 & 0xFF);
+
+      return static_cast<unsigned int>(best_error);
+   }
+
+} // namespace rg_etc1

+ 76 - 76
drivers/etc1/rg_etc1.h

@@ -1,76 +1,76 @@
-// File: rg_etc1.h - Fast, high quality ETC1 block packer/unpacker - Rich Geldreich <[email protected]>
-// Please see ZLIB license at the end of this file.
-#pragma once
-
-namespace rg_etc1
-{
-   // Unpacks an 8-byte ETC1 compressed block to a block of 4x4 32bpp RGBA pixels.
-   // Returns false if the block is invalid. Invalid blocks will still be unpacked with clamping.
-   // This function is thread safe, and does not dynamically allocate any memory.
-   // If preserve_alpha is true, the alpha channel of the destination pixels will not be overwritten. Otherwise, alpha will be set to 255.
-   bool unpack_etc1_block(const void *pETC1_block, unsigned int* pDst_pixels_rgba, bool preserve_alpha = false);
-
-   // Quality setting = the higher the quality, the slower. 
-   // To pack large textures, it is highly recommended to call pack_etc1_block() in parallel, on different blocks, from multiple threads (particularly when using cHighQuality).
-   enum etc1_quality
-   { 
-      cLowQuality,
-      cMediumQuality,
-      cHighQuality,
-   };
-      
-   struct etc1_pack_params
-   {
-      etc1_quality m_quality;
-      bool m_dithering;
-                              
-      inline etc1_pack_params() 
-      {
-         clear();
-      }
-
-      void clear()
-      {
-         m_quality = cHighQuality;
-         m_dithering = false;
-      }
-   };
-
-   // Important: pack_etc1_block_init() must be called before calling pack_etc1_block().
-   void pack_etc1_block_init();
-
-   // Packs a 4x4 block of 32bpp RGBA pixels to an 8-byte ETC1 block.
-   // 32-bit RGBA pixels must always be arranged as (R,G,B,A) (R first, A last) in memory, independent of platform endianness. A should always be 255.
-   // Returns squared error of result.
-   // This function is thread safe, and does not dynamically allocate any memory.
-   // pack_etc1_block() does not currently support "perceptual" colorspace metrics - it primarily optimizes for RGB RMSE.
-   unsigned int pack_etc1_block(void* pETC1_block, const unsigned int* pSrc_pixels_rgba, etc1_pack_params& pack_params);
-            
-} // namespace rg_etc1
-
-//------------------------------------------------------------------------------
-//
-// rg_etc1 uses the ZLIB license:
-// http://opensource.org/licenses/Zlib
-//
-// Copyright (c) 2012 Rich Geldreich
-//
-// This software is provided 'as-is', without any express or implied
-// warranty.  In no event will the authors be held liable for any damages
-// arising from the use of this software.
-//
-// Permission is granted to anyone to use this software for any purpose,
-// including commercial applications, and to alter it and redistribute it
-// freely, subject to the following restrictions:
-//
-// 1. The origin of this software must not be misrepresented; you must not
-// claim that you wrote the original software. If you use this software
-// in a product, an acknowledgment in the product documentation would be
-// appreciated but is not required.
-//
-// 2. Altered source versions must be plainly marked as such, and must not be
-// misrepresented as being the original software.
-//
-// 3. This notice may not be removed or altered from any source distribution.
-//
-//------------------------------------------------------------------------------
+// File: rg_etc1.h - Fast, high quality ETC1 block packer/unpacker - Rich Geldreich <[email protected]>
+// Please see ZLIB license at the end of this file.
+#pragma once
+
+namespace rg_etc1
+{
+   // Unpacks an 8-byte ETC1 compressed block to a block of 4x4 32bpp RGBA pixels.
+   // Returns false if the block is invalid. Invalid blocks will still be unpacked with clamping.
+   // This function is thread safe, and does not dynamically allocate any memory.
+   // If preserve_alpha is true, the alpha channel of the destination pixels will not be overwritten. Otherwise, alpha will be set to 255.
+   bool unpack_etc1_block(const void *pETC1_block, unsigned int* pDst_pixels_rgba, bool preserve_alpha = false);
+
+   // Quality setting = the higher the quality, the slower. 
+   // To pack large textures, it is highly recommended to call pack_etc1_block() in parallel, on different blocks, from multiple threads (particularly when using cHighQuality).
+   enum etc1_quality
+   { 
+      cLowQuality,
+      cMediumQuality,
+      cHighQuality,
+   };
+      
+   struct etc1_pack_params
+   {
+      etc1_quality m_quality;
+      bool m_dithering;
+                              
+      inline etc1_pack_params() 
+      {
+         clear();
+      }
+
+      void clear()
+      {
+         m_quality = cHighQuality;
+         m_dithering = false;
+      }
+   };
+
+   // Important: pack_etc1_block_init() must be called before calling pack_etc1_block().
+   void pack_etc1_block_init();
+
+   // Packs a 4x4 block of 32bpp RGBA pixels to an 8-byte ETC1 block.
+   // 32-bit RGBA pixels must always be arranged as (R,G,B,A) (R first, A last) in memory, independent of platform endianness. A should always be 255.
+   // Returns squared error of result.
+   // This function is thread safe, and does not dynamically allocate any memory.
+   // pack_etc1_block() does not currently support "perceptual" colorspace metrics - it primarily optimizes for RGB RMSE.
+   unsigned int pack_etc1_block(void* pETC1_block, const unsigned int* pSrc_pixels_rgba, etc1_pack_params& pack_params);
+            
+} // namespace rg_etc1
+
+//------------------------------------------------------------------------------
+//
+// rg_etc1 uses the ZLIB license:
+// http://opensource.org/licenses/Zlib
+//
+// Copyright (c) 2012 Rich Geldreich
+//
+// This software is provided 'as-is', without any express or implied
+// warranty.  In no event will the authors be held liable for any damages
+// arising from the use of this software.
+//
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+//
+// 1. The origin of this software must not be misrepresented; you must not
+// claim that you wrote the original software. If you use this software
+// in a product, an acknowledgment in the product documentation would be
+// appreciated but is not required.
+//
+// 2. Altered source versions must be plainly marked as such, and must not be
+// misrepresented as being the original software.
+//
+// 3. This notice may not be removed or altered from any source distribution.
+//
+//------------------------------------------------------------------------------

+ 0 - 1
drivers/gles2/rasterizer_gles2.cpp

@@ -4160,7 +4160,6 @@ void RasterizerGLES2::begin_frame() {
 	time_delta=time-last_time;
 	last_time=time;
 	frame++;
-	clear_viewport(Color(1,0,0.5));
 
 	_rinfo.vertex_count=0;
 	_rinfo.object_count=0;

+ 5814 - 5814
drivers/nedmalloc/malloc.c.h

@@ -1,5814 +1,5814 @@
-#ifdef NEDMALLOC_ENABLED
-/*
-  This is a version (aka dlmalloc) of malloc/free/realloc written by
-  Doug Lea and released to the public domain, as explained at
-  http://creativecommons.org/licenses/publicdomain.  Send questions,
-  comments, complaints, performance data, etc to [email protected]
-
-* Version 2.8.4 Wed May 27 09:56:23 2009  Doug Lea  (dl at gee)
-
-   Note: There may be an updated version of this malloc obtainable at
-           ftp://gee.cs.oswego.edu/pub/misc/malloc.c
-         Check before installing!
-
-* Quickstart
-
-  This library is all in one file to simplify the most common usage:
-  ftp it, compile it (-O3), and link it into another program. All of
-  the compile-time options default to reasonable values for use on
-  most platforms.  You might later want to step through various
-  compile-time and dynamic tuning options.
-
-  For convenience, an include file for code using this malloc is at:
-     ftp://gee.cs.oswego.edu/pub/misc/malloc-2.8.4.h
-  You don't really need this .h file unless you call functions not
-  defined in your system include files.  The .h file contains only the
-  excerpts from this file needed for using this malloc on ANSI C/C++
-  systems, so long as you haven't changed compile-time options about
-  naming and tuning parameters.  If you do, then you can create your
-  own malloc.h that does include all settings by cutting at the point
-  indicated below. Note that you may already by default be using a C
-  library containing a malloc that is based on some version of this
-  malloc (for example in linux). You might still want to use the one
-  in this file to customize settings or to avoid overheads associated
-  with library versions.
-
-* Vital statistics:
-
-  Supported pointer/size_t representation:       4 or 8 bytes
-       size_t MUST be an unsigned type of the same width as
-       pointers. (If you are using an ancient system that declares
-       size_t as a signed type, or need it to be a different width
-       than pointers, you can use a previous release of this malloc
-       (e.g. 2.7.2) supporting these.)
-
-  Alignment:                                     8 bytes (default)
-       This suffices for nearly all current machines and C compilers.
-       However, you can define MALLOC_ALIGNMENT to be wider than this
-       if necessary (up to 128bytes), at the expense of using more space.
-
-  Minimum overhead per allocated chunk:   4 or  8 bytes (if 4byte sizes)
-                                          8 or 16 bytes (if 8byte sizes)
-       Each malloced chunk has a hidden word of overhead holding size
-       and status information, and additional cross-check word
-       if FOOTERS is defined.
-
-  Minimum allocated size: 4-byte ptrs:  16 bytes    (including overhead)
-                          8-byte ptrs:  32 bytes    (including overhead)
-
-       Even a request for zero bytes (i.e., malloc(0)) returns a
-       pointer to something of the minimum allocatable size.
-       The maximum overhead wastage (i.e., number of extra bytes
-       allocated than were requested in malloc) is less than or equal
-       to the minimum size, except for requests >= mmap_threshold that
-       are serviced via mmap(), where the worst case wastage is about
-       32 bytes plus the remainder from a system page (the minimal
-       mmap unit); typically 4096 or 8192 bytes.
-
-  Security: static-safe; optionally more or less
-       The "security" of malloc refers to the ability of malicious
-       code to accentuate the effects of errors (for example, freeing
-       space that is not currently malloc'ed or overwriting past the
-       ends of chunks) in code that calls malloc.  This malloc
-       guarantees not to modify any memory locations below the base of
-       heap, i.e., static variables, even in the presence of usage
-       errors.  The routines additionally detect most improper frees
-       and reallocs.  All this holds as long as the static bookkeeping
-       for malloc itself is not corrupted by some other means.  This
-       is only one aspect of security -- these checks do not, and
-       cannot, detect all possible programming errors.
-
-       If FOOTERS is defined nonzero, then each allocated chunk
-       carries an additional check word to verify that it was malloced
-       from its space.  These check words are the same within each
-       execution of a program using malloc, but differ across
-       executions, so externally crafted fake chunks cannot be
-       freed. This improves security by rejecting frees/reallocs that
-       could corrupt heap memory, in addition to the checks preventing
-       writes to statics that are always on.  This may further improve
-       security at the expense of time and space overhead.  (Note that
-       FOOTERS may also be worth using with MSPACES.)
-
-       By default detected errors cause the program to abort (calling
-       "abort()"). You can override this to instead proceed past
-       errors by defining PROCEED_ON_ERROR.  In this case, a bad free
-       has no effect, and a malloc that encounters a bad address
-       caused by user overwrites will ignore the bad address by
-       dropping pointers and indices to all known memory. This may
-       be appropriate for programs that should continue if at all
-       possible in the face of programming errors, although they may
-       run out of memory because dropped memory is never reclaimed.
-
-       If you don't like either of these options, you can define
-       CORRUPTION_ERROR_ACTION and USAGE_ERROR_ACTION to do anything
-       else. And if if you are sure that your program using malloc has
-       no errors or vulnerabilities, you can define INSECURE to 1,
-       which might (or might not) provide a small performance improvement.
-
-  Thread-safety: NOT thread-safe unless USE_LOCKS defined
-       When USE_LOCKS is defined, each public call to malloc, free,
-       etc is surrounded with either a pthread mutex or a win32
-       spinlock (depending on WIN32). This is not especially fast, and
-       can be a major bottleneck.  It is designed only to provide
-       minimal protection in concurrent environments, and to provide a
-       basis for extensions.  If you are using malloc in a concurrent
-       program, consider instead using nedmalloc
-       (http://www.nedprod.com/programs/portable/nedmalloc/) or
-       ptmalloc (See http://www.malloc.de), which are derived
-       from versions of this malloc.
-
-  System requirements: Any combination of MORECORE and/or MMAP/MUNMAP
-       This malloc can use unix sbrk or any emulation (invoked using
-       the CALL_MORECORE macro) and/or mmap/munmap or any emulation
-       (invoked using CALL_MMAP/CALL_MUNMAP) to get and release system
-       memory.  On most unix systems, it tends to work best if both
-       MORECORE and MMAP are enabled.  On Win32, it uses emulations
-       based on VirtualAlloc. It also uses common C library functions
-       like memset.
-
-  Compliance: I believe it is compliant with the Single Unix Specification
-       (See http://www.unix.org). Also SVID/XPG, ANSI C, and probably
-       others as well.
-
-* Overview of algorithms
-
-  This is not the fastest, most space-conserving, most portable, or
-  most tunable malloc ever written. However it is among the fastest
-  while also being among the most space-conserving, portable and
-  tunable.  Consistent balance across these factors results in a good
-  general-purpose allocator for malloc-intensive programs.
-
-  In most ways, this malloc is a best-fit allocator. Generally, it
-  chooses the best-fitting existing chunk for a request, with ties
-  broken in approximately least-recently-used order. (This strategy
-  normally maintains low fragmentation.) However, for requests less
-  than 256bytes, it deviates from best-fit when there is not an
-  exactly fitting available chunk by preferring to use space adjacent
-  to that used for the previous small request, as well as by breaking
-  ties in approximately most-recently-used order. (These enhance
-  locality of series of small allocations.)  And for very large requests
-  (>= 256Kb by default), it relies on system memory mapping
-  facilities, if supported.  (This helps avoid carrying around and
-  possibly fragmenting memory used only for large chunks.)
-
-  All operations (except malloc_stats and mallinfo) have execution
-  times that are bounded by a constant factor of the number of bits in
-  a size_t, not counting any clearing in calloc or copying in realloc,
-  or actions surrounding MORECORE and MMAP that have times
-  proportional to the number of non-contiguous regions returned by
-  system allocation routines, which is often just 1. In real-time
-  applications, you can optionally suppress segment traversals using
-  NO_SEGMENT_TRAVERSAL, which assures bounded execution even when
-  system allocators return non-contiguous spaces, at the typical
-  expense of carrying around more memory and increased fragmentation.
-
-  The implementation is not very modular and seriously overuses
-  macros. Perhaps someday all C compilers will do as good a job
-  inlining modular code as can now be done by brute-force expansion,
-  but now, enough of them seem not to.
-
-  Some compilers issue a lot of warnings about code that is
-  dead/unreachable only on some platforms, and also about intentional
-  uses of negation on unsigned types. All known cases of each can be
-  ignored.
-
-  For a longer but out of date high-level description, see
-     http://gee.cs.oswego.edu/dl/html/malloc.html
-
-* MSPACES
-  If MSPACES is defined, then in addition to malloc, free, etc.,
-  this file also defines mspace_malloc, mspace_free, etc. These
-  are versions of malloc routines that take an "mspace" argument
-  obtained using create_mspace, to control all internal bookkeeping.
-  If ONLY_MSPACES is defined, only these versions are compiled.
-  So if you would like to use this allocator for only some allocations,
-  and your system malloc for others, you can compile with
-  ONLY_MSPACES and then do something like...
-    static mspace mymspace = create_mspace(0,0); // for example
-    #define mymalloc(bytes)  mspace_malloc(mymspace, bytes)
-
-  (Note: If you only need one instance of an mspace, you can instead
-  use "USE_DL_PREFIX" to relabel the global malloc.)
-
-  You can similarly create thread-local allocators by storing
-  mspaces as thread-locals. For example:
-    static __thread mspace tlms = 0;
-    void*  tlmalloc(size_t bytes) {
-      if (tlms == 0) tlms = create_mspace(0, 0);
-      return mspace_malloc(tlms, bytes);
-    }
-    void  tlfree(void* mem) { mspace_free(tlms, mem); }
-
-  Unless FOOTERS is defined, each mspace is completely independent.
-  You cannot allocate from one and free to another (although
-  conformance is only weakly checked, so usage errors are not always
-  caught). If FOOTERS is defined, then each chunk carries around a tag
-  indicating its originating mspace, and frees are directed to their
-  originating spaces.
-
- -------------------------  Compile-time options ---------------------------
-
-Be careful in setting #define values for numerical constants of type
-size_t. On some systems, literal values are not automatically extended
-to size_t precision unless they are explicitly casted. You can also
-use the symbolic values MAX_SIZE_T, SIZE_T_ONE, etc below.
-
-WIN32                    default: defined if _WIN32 defined
-  Defining WIN32 sets up defaults for MS environment and compilers.
-  Otherwise defaults are for unix. Beware that there seem to be some
-  cases where this malloc might not be a pure drop-in replacement for
-  Win32 malloc: Random-looking failures from Win32 GDI API's (eg;
-  SetDIBits()) may be due to bugs in some video driver implementations
-  when pixel buffers are malloc()ed, and the region spans more than
-  one VirtualAlloc()ed region. Because dlmalloc uses a small (64Kb)
-  default granularity, pixel buffers may straddle virtual allocation
-  regions more often than when using the Microsoft allocator.  You can
-  avoid this by using VirtualAlloc() and VirtualFree() for all pixel
-  buffers rather than using malloc().  If this is not possible,
-  recompile this malloc with a larger DEFAULT_GRANULARITY.
-
-MALLOC_ALIGNMENT         default: (size_t)8
-  Controls the minimum alignment for malloc'ed chunks.  It must be a
-  power of two and at least 8, even on machines for which smaller
-  alignments would suffice. It may be defined as larger than this
-  though. Note however that code and data structures are optimized for
-  the case of 8-byte alignment.
-
-MSPACES                  default: 0 (false)
-  If true, compile in support for independent allocation spaces.
-  This is only supported if HAVE_MMAP is true.
-
-ONLY_MSPACES             default: 0 (false)
-  If true, only compile in mspace versions, not regular versions.
-
-USE_LOCKS                default: 0 (false)
-  Causes each call to each public routine to be surrounded with
-  pthread or WIN32 mutex lock/unlock. (If set true, this can be
-  overridden on a per-mspace basis for mspace versions.) If set to a
-  non-zero value other than 1, locks are used, but their
-  implementation is left out, so lock functions must be supplied manually,
-  as described below.
-
-USE_SPIN_LOCKS           default: 1 iff USE_LOCKS and on x86 using gcc or MSC
-  If true, uses custom spin locks for locking. This is currently
-  supported only for x86 platforms using gcc or recent MS compilers.
-  Otherwise, posix locks or win32 critical sections are used.
-
-FOOTERS                  default: 0
-  If true, provide extra checking and dispatching by placing
-  information in the footers of allocated chunks. This adds
-  space and time overhead.
-
-INSECURE                 default: 0
-  If true, omit checks for usage errors and heap space overwrites.
-
-USE_DL_PREFIX            default: NOT defined
-  Causes compiler to prefix all public routines with the string 'dl'.
-  This can be useful when you only want to use this malloc in one part
-  of a program, using your regular system malloc elsewhere.
-
-ABORT                    default: defined as abort()
-  Defines how to abort on failed checks.  On most systems, a failed
-  check cannot die with an "assert" or even print an informative
-  message, because the underlying print routines in turn call malloc,
-  which will fail again.  Generally, the best policy is to simply call
-  abort(). It's not very useful to do more than this because many
-  errors due to overwriting will show up as address faults (null, odd
-  addresses etc) rather than malloc-triggered checks, so will also
-  abort.  Also, most compilers know that abort() does not return, so
-  can better optimize code conditionally calling it.
-
-PROCEED_ON_ERROR           default: defined as 0 (false)
-  Controls whether detected bad addresses cause them to bypassed
-  rather than aborting. If set, detected bad arguments to free and
-  realloc are ignored. And all bookkeeping information is zeroed out
-  upon a detected overwrite of freed heap space, thus losing the
-  ability to ever return it from malloc again, but enabling the
-  application to proceed. If PROCEED_ON_ERROR is defined, the
-  static variable malloc_corruption_error_count is compiled in
-  and can be examined to see if errors have occurred. This option
-  generates slower code than the default abort policy.
-
-DEBUG                    default: NOT defined
-  The DEBUG setting is mainly intended for people trying to modify
-  this code or diagnose problems when porting to new platforms.
-  However, it may also be able to better isolate user errors than just
-  using runtime checks.  The assertions in the check routines spell
-  out in more detail the assumptions and invariants underlying the
-  algorithms.  The checking is fairly extensive, and will slow down
-  execution noticeably. Calling malloc_stats or mallinfo with DEBUG
-  set will attempt to check every non-mmapped allocated and free chunk
-  in the course of computing the summaries.
-
-ABORT_ON_ASSERT_FAILURE   default: defined as 1 (true)
-  Debugging assertion failures can be nearly impossible if your
-  version of the assert macro causes malloc to be called, which will
-  lead to a cascade of further failures, blowing the runtime stack.
-  ABORT_ON_ASSERT_FAILURE cause assertions failures to call abort(),
-  which will usually make debugging easier.
-
-MALLOC_FAILURE_ACTION     default: sets errno to ENOMEM, or no-op on win32
-  The action to take before "return 0" when malloc fails to be able to
-  return memory because there is none available.
-
-HAVE_MORECORE             default: 1 (true) unless win32 or ONLY_MSPACES
-  True if this system supports sbrk or an emulation of it.
-
-MORECORE                  default: sbrk
-  The name of the sbrk-style system routine to call to obtain more
-  memory.  See below for guidance on writing custom MORECORE
-  functions. The type of the argument to sbrk/MORECORE varies across
-  systems.  It cannot be size_t, because it supports negative
-  arguments, so it is normally the signed type of the same width as
-  size_t (sometimes declared as "intptr_t").  It doesn't much matter
-  though. Internally, we only call it with arguments less than half
-  the max value of a size_t, which should work across all reasonable
-  possibilities, although sometimes generating compiler warnings.
-
-MORECORE_CONTIGUOUS       default: 1 (true) if HAVE_MORECORE
-  If true, take advantage of fact that consecutive calls to MORECORE
-  with positive arguments always return contiguous increasing
-  addresses.  This is true of unix sbrk. It does not hurt too much to
-  set it true anyway, since malloc copes with non-contiguities.
-  Setting it false when definitely non-contiguous saves time
-  and possibly wasted space it would take to discover this though.
-
-MORECORE_CANNOT_TRIM      default: NOT defined
-  True if MORECORE cannot release space back to the system when given
-  negative arguments. This is generally necessary only if you are
-  using a hand-crafted MORECORE function that cannot handle negative
-  arguments.
-
-NO_SEGMENT_TRAVERSAL       default: 0
-  If non-zero, suppresses traversals of memory segments
-  returned by either MORECORE or CALL_MMAP. This disables
-  merging of segments that are contiguous, and selectively
-  releasing them to the OS if unused, but bounds execution times.
-
-HAVE_MMAP                 default: 1 (true)
-  True if this system supports mmap or an emulation of it.  If so, and
-  HAVE_MORECORE is not true, MMAP is used for all system
-  allocation. If set and HAVE_MORECORE is true as well, MMAP is
-  primarily used to directly allocate very large blocks. It is also
-  used as a backup strategy in cases where MORECORE fails to provide
-  space from system. Note: A single call to MUNMAP is assumed to be
-  able to unmap memory that may have be allocated using multiple calls
-  to MMAP, so long as they are adjacent.
-
-HAVE_MREMAP               default: 1 on linux, else 0
-  If true realloc() uses mremap() to re-allocate large blocks and
-  extend or shrink allocation spaces.
-
-MMAP_CLEARS               default: 1 except on WINCE.
-  True if mmap clears memory so calloc doesn't need to. This is true
-  for standard unix mmap using /dev/zero and on WIN32 except for WINCE.
-
-USE_BUILTIN_FFS            default: 0 (i.e., not used)
-  Causes malloc to use the builtin ffs() function to compute indices.
-  Some compilers may recognize and intrinsify ffs to be faster than the
-  supplied C version. Also, the case of x86 using gcc is special-cased
-  to an asm instruction, so is already as fast as it can be, and so
-  this setting has no effect. Similarly for Win32 under recent MS compilers.
-  (On most x86s, the asm version is only slightly faster than the C version.)
-
-malloc_getpagesize         default: derive from system includes, or 4096.
-  The system page size. To the extent possible, this malloc manages
-  memory from the system in page-size units.  This may be (and
-  usually is) a function rather than a constant. This is ignored
-  if WIN32, where page size is determined using getSystemInfo during
-  initialization. This may be several megabytes if ENABLE_LARGE_PAGES
-  is enabled.
-
-ENABLE_LARGE_PAGES         default: NOT defined
-  Causes the system page size to be the value of GetLargePageMinimum()
-  if that function is available (Windows Server 2003/Vista or later).
-  This allows the use of large page entries in the MMU which can
-  significantly improve performance in large working set applications
-  as TLB cache load is reduced by a factor of three. Note that enabling
-  this option is equal to locking the process' memory in current
-  implementations of Windows and requires the SE_LOCK_MEMORY_PRIVILEGE
-  to be held by the process in order to succeed.
-
-USE_DEV_RANDOM             default: 0 (i.e., not used)
-  Causes malloc to use /dev/random to initialize secure magic seed for
-  stamping footers. Otherwise, the current time is used.
-
-NO_MALLINFO                default: 0
-  If defined, don't compile "mallinfo". This can be a simple way
-  of dealing with mismatches between system declarations and
-  those in this file.
-
-MALLINFO_FIELD_TYPE        default: size_t
-  The type of the fields in the mallinfo struct. This was originally
-  defined as "int" in SVID etc, but is more usefully defined as
-  size_t. The value is used only if  HAVE_USR_INCLUDE_MALLOC_H is not set
-
-REALLOC_ZERO_BYTES_FREES    default: not defined
-  This should be set if a call to realloc with zero bytes should
-  be the same as a call to free. Some people think it should. Otherwise,
-  since this malloc returns a unique pointer for malloc(0), so does
-  realloc(p, 0).
-
-LACKS_UNISTD_H, LACKS_FCNTL_H, LACKS_SYS_PARAM_H, LACKS_SYS_MMAN_H
-LACKS_STRINGS_H, LACKS_STRING_H, LACKS_SYS_TYPES_H,  LACKS_ERRNO_H
-LACKS_STDLIB_H                default: NOT defined unless on WIN32
-  Define these if your system does not have these header files.
-  You might need to manually insert some of the declarations they provide.
-
-DEFAULT_GRANULARITY        default: page size if MORECORE_CONTIGUOUS,
-                                system_info.dwAllocationGranularity in WIN32,
-                                GetLargePageMinimum() if ENABLE_LARGE_PAGES,
-                                otherwise 64K.
-      Also settable using mallopt(M_GRANULARITY, x)
-  The unit for allocating and deallocating memory from the system.  On
-  most systems with contiguous MORECORE, there is no reason to
-  make this more than a page. However, systems with MMAP tend to
-  either require or encourage larger granularities.  You can increase
-  this value to prevent system allocation functions to be called so
-  often, especially if they are slow.  The value must be at least one
-  page and must be a power of two.  Setting to 0 causes initialization
-  to either page size or win32 region size.  (Note: In previous
-  versions of malloc, the equivalent of this option was called
-  "TOP_PAD")
-
-DEFAULT_GRANULARITY_ALIGNED default: undefined (which means page size)
-  Whether to enforce alignment when allocating and deallocating memory
-  from the system i.e. the base address of all allocations will be
-  aligned to DEFAULT_GRANULARITY if it is set. Note that enabling this carries
-  some overhead as multiple calls must now be made when probing for a valid
-  aligned value, however it does greatly ease the checking for whether
-  a given memory pointer was allocated by this allocator rather than
-  some other.
-
-DEFAULT_TRIM_THRESHOLD    default: 2MB
-      Also settable using mallopt(M_TRIM_THRESHOLD, x)
-  The maximum amount of unused top-most memory to keep before
-  releasing via malloc_trim in free().  Automatic trimming is mainly
-  useful in long-lived programs using contiguous MORECORE.  Because
-  trimming via sbrk can be slow on some systems, and can sometimes be
-  wasteful (in cases where programs immediately afterward allocate
-  more large chunks) the value should be high enough so that your
-  overall system performance would improve by releasing this much
-  memory.  As a rough guide, you might set to a value close to the
-  average size of a process (program) running on your system.
-  Releasing this much memory would allow such a process to run in
-  memory.  Generally, it is worth tuning trim thresholds when a
-  program undergoes phases where several large chunks are allocated
-  and released in ways that can reuse each other's storage, perhaps
-  mixed with phases where there are no such chunks at all. The trim
-  value must be greater than page size to have any useful effect.  To
-  disable trimming completely, you can set to MAX_SIZE_T. Note that the trick
-  some people use of mallocing a huge space and then freeing it at
-  program startup, in an attempt to reserve system memory, doesn't
-  have the intended effect under automatic trimming, since that memory
-  will immediately be returned to the system.
-
-DEFAULT_MMAP_THRESHOLD       default: 256K
-      Also settable using mallopt(M_MMAP_THRESHOLD, x)
-  The request size threshold for using MMAP to directly service a
-  request. Requests of at least this size that cannot be allocated
-  using already-existing space will be serviced via mmap.  (If enough
-  normal freed space already exists it is used instead.)  Using mmap
-  segregates relatively large chunks of memory so that they can be
-  individually obtained and released from the host system. A request
-  serviced through mmap is never reused by any other request (at least
-  not directly; the system may just so happen to remap successive
-  requests to the same locations).  Segregating space in this way has
-  the benefits that: Mmapped space can always be individually released
-  back to the system, which helps keep the system level memory demands
-  of a long-lived program low.  Also, mapped memory doesn't become
-  `locked' between other chunks, as can happen with normally allocated
-  chunks, which means that even trimming via malloc_trim would not
-  release them.  However, it has the disadvantage that the space
-  cannot be reclaimed, consolidated, and then used to service later
-  requests, as happens with normal chunks.  The advantages of mmap
-  nearly always outweigh disadvantages for "large" chunks, but the
-  value of "large" may vary across systems.  The default is an
-  empirically derived value that works well in most systems. You can
-  disable mmap by setting to MAX_SIZE_T.
-
-MAX_RELEASE_CHECK_RATE   default: 4095 unless not HAVE_MMAP
-  The number of consolidated frees between checks to release
-  unused segments when freeing. When using non-contiguous segments,
-  especially with multiple mspaces, checking only for topmost space
-  doesn't always suffice to trigger trimming. To compensate for this,
-  free() will, with a period of MAX_RELEASE_CHECK_RATE (or the
-  current number of segments, if greater) try to release unused
-  segments to the OS when freeing chunks that result in
-  consolidation. The best value for this parameter is a compromise
-  between slowing down frees with relatively costly checks that
-  rarely trigger versus holding on to unused memory. To effectively
-  disable, set to MAX_SIZE_T. This may lead to a very slight speed
-  improvement at the expense of carrying around more memory.
-*/
-
-/* Version identifier to allow people to support multiple versions */
-#ifndef DLMALLOC_VERSION
-#define DLMALLOC_VERSION 20804
-#endif /* DLMALLOC_VERSION */
-
-#ifndef WIN32
-#ifdef _WIN32
-#define WIN32 1
-#endif  /* _WIN32 */
-#ifdef _WIN32_WCE
-#define LACKS_FCNTL_H
-#define WIN32 1
-#endif /* _WIN32_WCE */
-#endif  /* WIN32 */
-#ifdef WIN32
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-#include <tchar.h>
-#define HAVE_MMAP 1
-#define HAVE_MORECORE 0
-#define LACKS_UNISTD_H
-#define LACKS_SYS_PARAM_H
-#define LACKS_SYS_MMAN_H
-#define LACKS_STRING_H
-#define LACKS_STRINGS_H
-#define LACKS_SYS_TYPES_H
-#define LACKS_ERRNO_H
-#ifndef MALLOC_FAILURE_ACTION
-#define MALLOC_FAILURE_ACTION
-#endif /* MALLOC_FAILURE_ACTION */
-#ifdef _WIN32_WCE /* WINCE reportedly does not clear */
-#define MMAP_CLEARS 0
-#else
-#define MMAP_CLEARS 1
-#endif /* _WIN32_WCE */
-#endif  /* WIN32 */
-
-#if defined(DARWIN) || defined(_DARWIN)
-/* Mac OSX docs advise not to use sbrk; it seems better to use mmap */
-#ifndef HAVE_MORECORE
-#define HAVE_MORECORE 0
-#define HAVE_MMAP 1
-/* OSX allocators provide 16 byte alignment */
-#ifndef MALLOC_ALIGNMENT
-#define MALLOC_ALIGNMENT ((size_t)16U)
-#endif
-#endif  /* HAVE_MORECORE */
-#endif  /* DARWIN */
-
-#ifndef LACKS_SYS_TYPES_H
-#include <sys/types.h>  /* For size_t */
-#endif  /* LACKS_SYS_TYPES_H */
-
-#if (defined(__GNUC__) && ((defined(__i386__) || defined(__x86_64__)))) || (defined(_MSC_VER) && _MSC_VER>=1310)
-#define SPIN_LOCKS_AVAILABLE 1
-#else
-#define SPIN_LOCKS_AVAILABLE 0
-#endif
-
-/* The maximum possible size_t value has all bits set */
-#define MAX_SIZE_T           (~(size_t)0)
-
-#ifndef ONLY_MSPACES
-#define ONLY_MSPACES 0     /* define to a value */
-#else
-#define ONLY_MSPACES 1
-#endif  /* ONLY_MSPACES */
-#ifndef MSPACES
-#if ONLY_MSPACES
-#define MSPACES 1
-#else   /* ONLY_MSPACES */
-#define MSPACES 0
-#endif  /* ONLY_MSPACES */
-#endif  /* MSPACES */
-#ifndef MALLOC_ALIGNMENT
-#define MALLOC_ALIGNMENT ((size_t)8U)
-#endif  /* MALLOC_ALIGNMENT */
-#ifndef FOOTERS
-#define FOOTERS 0
-#endif  /* FOOTERS */
-#ifndef ABORT
-#define ABORT  abort()
-#endif  /* ABORT */
-#ifndef ABORT_ON_ASSERT_FAILURE
-#define ABORT_ON_ASSERT_FAILURE 1
-#endif  /* ABORT_ON_ASSERT_FAILURE */
-#ifndef PROCEED_ON_ERROR
-#define PROCEED_ON_ERROR 0
-#endif  /* PROCEED_ON_ERROR */
-#ifndef USE_LOCKS
-#define USE_LOCKS 0
-#endif  /* USE_LOCKS */
-#ifndef USE_SPIN_LOCKS
-#if USE_LOCKS && SPIN_LOCKS_AVAILABLE
-#define USE_SPIN_LOCKS 1
-#else
-#define USE_SPIN_LOCKS 0
-#endif /* USE_LOCKS && SPIN_LOCKS_AVAILABLE. */
-#endif /* USE_SPIN_LOCKS */
-#ifndef INSECURE
-#define INSECURE 0
-#endif  /* INSECURE */
-#ifndef HAVE_MMAP
-#define HAVE_MMAP 1
-#endif  /* HAVE_MMAP */
-#ifndef MMAP_CLEARS
-#define MMAP_CLEARS 1
-#endif  /* MMAP_CLEARS */
-#ifndef HAVE_MREMAP
-#ifdef linux
-#define HAVE_MREMAP 1
-#else   /* linux */
-#define HAVE_MREMAP 0
-#endif  /* linux */
-#endif  /* HAVE_MREMAP */
-#ifndef MALLOC_FAILURE_ACTION
-#define MALLOC_FAILURE_ACTION  errno = ENOMEM;
-#endif  /* MALLOC_FAILURE_ACTION */
-#ifndef HAVE_MORECORE
-#if ONLY_MSPACES
-#define HAVE_MORECORE 0
-#else   /* ONLY_MSPACES */
-#define HAVE_MORECORE 1
-#endif  /* ONLY_MSPACES */
-#endif  /* HAVE_MORECORE */
-#if !HAVE_MORECORE
-#define MORECORE_CONTIGUOUS 0
-#else   /* !HAVE_MORECORE */
-#define MORECORE_DEFAULT sbrk
-#ifndef MORECORE_CONTIGUOUS
-#define MORECORE_CONTIGUOUS 1
-#endif  /* MORECORE_CONTIGUOUS */
-#endif  /* HAVE_MORECORE */
-#ifndef DEFAULT_GRANULARITY
-#if (MORECORE_CONTIGUOUS || defined(WIN32))
-#define DEFAULT_GRANULARITY (0)  /* 0 means to compute in init_mparams */
-#else   /* MORECORE_CONTIGUOUS */
-#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U)
-#endif  /* MORECORE_CONTIGUOUS */
-#endif  /* DEFAULT_GRANULARITY */
-#ifndef DEFAULT_TRIM_THRESHOLD
-#ifndef MORECORE_CANNOT_TRIM
-#define DEFAULT_TRIM_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U)
-#else   /* MORECORE_CANNOT_TRIM */
-#define DEFAULT_TRIM_THRESHOLD MAX_SIZE_T
-#endif  /* MORECORE_CANNOT_TRIM */
-#endif  /* DEFAULT_TRIM_THRESHOLD */
-#ifndef DEFAULT_MMAP_THRESHOLD
-#if HAVE_MMAP
-#define DEFAULT_MMAP_THRESHOLD ((size_t)256U * (size_t)1024U)
-#else   /* HAVE_MMAP */
-#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T
-#endif  /* HAVE_MMAP */
-#endif  /* DEFAULT_MMAP_THRESHOLD */
-#ifndef MAX_RELEASE_CHECK_RATE
-#if HAVE_MMAP
-#define MAX_RELEASE_CHECK_RATE 4095
-#else
-#define MAX_RELEASE_CHECK_RATE MAX_SIZE_T
-#endif /* HAVE_MMAP */
-#endif /* MAX_RELEASE_CHECK_RATE */
-#ifndef USE_BUILTIN_FFS
-#define USE_BUILTIN_FFS 0
-#endif  /* USE_BUILTIN_FFS */
-#ifndef USE_DEV_RANDOM
-#define USE_DEV_RANDOM 0
-#endif  /* USE_DEV_RANDOM */
-#ifndef NO_MALLINFO
-#define NO_MALLINFO 0
-#endif  /* NO_MALLINFO */
-#ifndef MALLINFO_FIELD_TYPE
-#define MALLINFO_FIELD_TYPE size_t
-#endif  /* MALLINFO_FIELD_TYPE */
-#ifndef NO_SEGMENT_TRAVERSAL
-#define NO_SEGMENT_TRAVERSAL 0
-#endif /* NO_SEGMENT_TRAVERSAL */
-
-/*
-  mallopt tuning options.  SVID/XPG defines four standard parameter
-  numbers for mallopt, normally defined in malloc.h.  None of these
-  are used in this malloc, so setting them has no effect. But this
-  malloc does support the following options.
-*/
-
-#define M_TRIM_THRESHOLD     (-1)
-#define M_GRANULARITY        (-2)
-#define M_MMAP_THRESHOLD     (-3)
-
-/* ------------------------ Mallinfo declarations ------------------------ */
-
-#if !NO_MALLINFO
-/*
-  This version of malloc supports the standard SVID/XPG mallinfo
-  routine that returns a struct containing usage properties and
-  statistics. It should work on any system that has a
-  /usr/include/malloc.h defining struct mallinfo.  The main
-  declaration needed is the mallinfo struct that is returned (by-copy)
-  by mallinfo().  The malloinfo struct contains a bunch of fields that
-  are not even meaningful in this version of malloc.  These fields are
-  are instead filled by mallinfo() with other numbers that might be of
-  interest.
-
-  HAVE_USR_INCLUDE_MALLOC_H should be set if you have a
-  /usr/include/malloc.h file that includes a declaration of struct
-  mallinfo.  If so, it is included; else a compliant version is
-  declared below.  These must be precisely the same for mallinfo() to
-  work.  The original SVID version of this struct, defined on most
-  systems with mallinfo, declares all fields as ints. But some others
-  define as unsigned long. If your system defines the fields using a
-  type of different width than listed here, you MUST #include your
-  system version and #define HAVE_USR_INCLUDE_MALLOC_H.
-*/
-
-/* #define HAVE_USR_INCLUDE_MALLOC_H */
-
-#ifdef HAVE_USR_INCLUDE_MALLOC_H
-#include "/usr/include/malloc.h"
-#else /* HAVE_USR_INCLUDE_MALLOC_H */
-#ifndef STRUCT_MALLINFO_DECLARED
-#define STRUCT_MALLINFO_DECLARED 1
-struct mallinfo {
-  MALLINFO_FIELD_TYPE arena;    /* non-mmapped space allocated from system */
-  MALLINFO_FIELD_TYPE ordblks;  /* number of free chunks */
-  MALLINFO_FIELD_TYPE smblks;   /* always 0 */
-  MALLINFO_FIELD_TYPE hblks;    /* always 0 */
-  MALLINFO_FIELD_TYPE hblkhd;   /* space in mmapped regions */
-  MALLINFO_FIELD_TYPE usmblks;  /* maximum total allocated space */
-  MALLINFO_FIELD_TYPE fsmblks;  /* always 0 */
-  MALLINFO_FIELD_TYPE uordblks; /* total allocated space */
-  MALLINFO_FIELD_TYPE fordblks; /* total free space */
-  MALLINFO_FIELD_TYPE keepcost; /* releasable (via malloc_trim) space */
-};
-#endif /* STRUCT_MALLINFO_DECLARED */
-#endif /* HAVE_USR_INCLUDE_MALLOC_H */
-#endif /* NO_MALLINFO */
-
-/*
-  Try to persuade compilers to inline. The most critical functions for
-  inlining are defined as macros, so these aren't used for them.
-*/
-
-#ifndef FORCEINLINE
-  #if defined(__GNUC__)
-#define FORCEINLINE __inline __attribute__ ((always_inline))
-  #elif defined(_MSC_VER)
-    #define FORCEINLINE __forceinline
-  #endif
-#endif
-#ifndef NOINLINE
-  #if defined(__GNUC__)
-    #define NOINLINE __attribute__ ((noinline))
-  #elif defined(_MSC_VER)
-    #define NOINLINE __declspec(noinline)
-  #else
-    #define NOINLINE
-  #endif
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#ifndef FORCEINLINE
- #define FORCEINLINE inline
-#endif
-#endif /* __cplusplus */
-#ifndef FORCEINLINE
- #define FORCEINLINE
-#endif
-
-#if !ONLY_MSPACES
-
-/* ------------------- Declarations of public routines ------------------- */
-
-#ifndef USE_DL_PREFIX
-#define dlcalloc               calloc
-#define dlfree                 free
-#define dlmalloc               malloc
-#define dlmemalign             memalign
-#define dlrealloc              realloc
-#define dlvalloc               valloc
-#define dlpvalloc              pvalloc
-#define dlmallinfo             mallinfo
-#define dlmallopt              mallopt
-#define dlmalloc_trim          malloc_trim
-#define dlmalloc_stats         malloc_stats
-#define dlmalloc_usable_size   malloc_usable_size
-#define dlmalloc_footprint     malloc_footprint
-#define dlmalloc_max_footprint malloc_max_footprint
-#define dlindependent_calloc   independent_calloc
-#define dlindependent_comalloc independent_comalloc
-#endif /* USE_DL_PREFIX */
-
-
-/*
-  malloc(size_t n)
-  Returns a pointer to a newly allocated chunk of at least n bytes, or
-  null if no space is available, in which case errno is set to ENOMEM
-  on ANSI C systems.
-
-  If n is zero, malloc returns a minimum-sized chunk. (The minimum
-  size is 16 bytes on most 32bit systems, and 32 bytes on 64bit
-  systems.)  Note that size_t is an unsigned type, so calls with
-  arguments that would be negative if signed are interpreted as
-  requests for huge amounts of space, which will often fail. The
-  maximum supported value of n differs across systems, but is in all
-  cases less than the maximum representable value of a size_t.
-*/
-void* dlmalloc(size_t);
-
-/*
-  free(void* p)
-  Releases the chunk of memory pointed to by p, that had been previously
-  allocated using malloc or a related routine such as realloc.
-  It has no effect if p is null. If p was not malloced or already
-  freed, free(p) will by default cause the current program to abort.
-*/
-void  dlfree(void*);
-
-/*
-  calloc(size_t n_elements, size_t element_size);
-  Returns a pointer to n_elements * element_size bytes, with all locations
-  set to zero.
-*/
-void* dlcalloc(size_t, size_t);
-
-/*
-  realloc(void* p, size_t n)
-  Returns a pointer to a chunk of size n that contains the same data
-  as does chunk p up to the minimum of (n, p's size) bytes, or null
-  if no space is available.
-
-  The returned pointer may or may not be the same as p. The algorithm
-  prefers extending p in most cases when possible, otherwise it
-  employs the equivalent of a malloc-copy-free sequence.
-
-  If p is null, realloc is equivalent to malloc.
-
-  If space is not available, realloc returns null, errno is set (if on
-  ANSI) and p is NOT freed.
-
-  if n is for fewer bytes than already held by p, the newly unused
-  space is lopped off and freed if possible.  realloc with a size
-  argument of zero (re)allocates a minimum-sized chunk.
-
-  The old unix realloc convention of allowing the last-free'd chunk
-  to be used as an argument to realloc is not supported.
-*/
-
-void* dlrealloc(void*, size_t);
-
-/*
-  memalign(size_t alignment, size_t n);
-  Returns a pointer to a newly allocated chunk of n bytes, aligned
-  in accord with the alignment argument.
-
-  The alignment argument should be a power of two. If the argument is
-  not a power of two, the nearest greater power is used.
-  8-byte alignment is guaranteed by normal malloc calls, so don't
-  bother calling memalign with an argument of 8 or less.
-
-  Overreliance on memalign is a sure way to fragment space.
-*/
-void* dlmemalign(size_t, size_t);
-
-/*
-  valloc(size_t n);
-  Equivalent to memalign(pagesize, n), where pagesize is the page
-  size of the system. If the pagesize is unknown, 4096 is used.
-*/
-void* dlvalloc(size_t);
-
-/*
-  mallopt(int parameter_number, int parameter_value)
-  Sets tunable parameters The format is to provide a
-  (parameter-number, parameter-value) pair.  mallopt then sets the
-  corresponding parameter to the argument value if it can (i.e., so
-  long as the value is meaningful), and returns 1 if successful else
-  0.  To workaround the fact that mallopt is specified to use int,
-  not size_t parameters, the value -1 is specially treated as the
-  maximum unsigned size_t value.
-
-  SVID/XPG/ANSI defines four standard param numbers for mallopt,
-  normally defined in malloc.h.  None of these are use in this malloc,
-  so setting them has no effect. But this malloc also supports other
-  options in mallopt. See below for details.  Briefly, supported
-  parameters are as follows (listed defaults are for "typical"
-  configurations).
-
-  Symbol            param #  default    allowed param values
-  M_TRIM_THRESHOLD     -1   2*1024*1024   any   (-1 disables)
-  M_GRANULARITY        -2     page size   any power of 2 >= page size
-  M_MMAP_THRESHOLD     -3      256*1024   any   (or 0 if no MMAP support)
-*/
-int dlmallopt(int, int);
-
-/*
-  malloc_footprint();
-  Returns the number of bytes obtained from the system.  The total
-  number of bytes allocated by malloc, realloc etc., is less than this
-  value. Unlike mallinfo, this function returns only a precomputed
-  result, so can be called frequently to monitor memory consumption.
-  Even if locks are otherwise defined, this function does not use them,
-  so results might not be up to date.
-*/
-size_t dlmalloc_footprint(void);
-
-/*
-  malloc_max_footprint();
-  Returns the maximum number of bytes obtained from the system. This
-  value will be greater than current footprint if deallocated space
-  has been reclaimed by the system. The peak number of bytes allocated
-  by malloc, realloc etc., is less than this value. Unlike mallinfo,
-  this function returns only a precomputed result, so can be called
-  frequently to monitor memory consumption.  Even if locks are
-  otherwise defined, this function does not use them, so results might
-  not be up to date.
-*/
-size_t dlmalloc_max_footprint(void);
-
-#if !NO_MALLINFO
-/*
-  mallinfo()
-  Returns (by copy) a struct containing various summary statistics:
-
-  arena:     current total non-mmapped bytes allocated from system
-  ordblks:   the number of free chunks
-  smblks:    always zero.
-  hblks:     current number of mmapped regions
-  hblkhd:    total bytes held in mmapped regions
-  usmblks:   the maximum total allocated space. This will be greater
-                than current total if trimming has occurred.
-  fsmblks:   always zero
-  uordblks:  current total allocated space (normal or mmapped)
-  fordblks:  total free space
-  keepcost:  the maximum number of bytes that could ideally be released
-               back to system via malloc_trim. ("ideally" means that
-               it ignores page restrictions etc.)
-
-  Because these fields are ints, but internal bookkeeping may
-  be kept as longs, the reported values may wrap around zero and
-  thus be inaccurate.
-*/
-struct mallinfo dlmallinfo(void);
-#endif /* NO_MALLINFO */
-
-/*
-  independent_calloc(size_t n_elements, size_t element_size, void* chunks[]);
-
-  independent_calloc is similar to calloc, but instead of returning a
-  single cleared space, it returns an array of pointers to n_elements
-  independent elements that can hold contents of size elem_size, each
-  of which starts out cleared, and can be independently freed,
-  realloc'ed etc. The elements are guaranteed to be adjacently
-  allocated (this is not guaranteed to occur with multiple callocs or
-  mallocs), which may also improve cache locality in some
-  applications.
-
-  The "chunks" argument is optional (i.e., may be null, which is
-  probably the most typical usage). If it is null, the returned array
-  is itself dynamically allocated and should also be freed when it is
-  no longer needed. Otherwise, the chunks array must be of at least
-  n_elements in length. It is filled in with the pointers to the
-  chunks.
-
-  In either case, independent_calloc returns this pointer array, or
-  null if the allocation failed.  If n_elements is zero and "chunks"
-  is null, it returns a chunk representing an array with zero elements
-  (which should be freed if not wanted).
-
-  Each element must be individually freed when it is no longer
-  needed. If you'd like to instead be able to free all at once, you
-  should instead use regular calloc and assign pointers into this
-  space to represent elements.  (In this case though, you cannot
-  independently free elements.)
-
-  independent_calloc simplifies and speeds up implementations of many
-  kinds of pools.  It may also be useful when constructing large data
-  structures that initially have a fixed number of fixed-sized nodes,
-  but the number is not known at compile time, and some of the nodes
-  may later need to be freed. For example:
-
-  struct Node { int item; struct Node* next; };
-
-  struct Node* build_list() {
-    struct Node** pool;
-    int n = read_number_of_nodes_needed();
-    if (n <= 0) return 0;
-    pool = (struct Node**)(independent_calloc(n, sizeof(struct Node), 0);
-    if (pool == 0) die();
-    // organize into a linked list...
-    struct Node* first = pool[0];
-    for (i = 0; i < n-1; ++i)
-      pool[i]->next = pool[i+1];
-    free(pool);     // Can now free the array (or not, if it is needed later)
-    return first;
-  }
-*/
-void** dlindependent_calloc(size_t, size_t, void**);
-
-/*
-  independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]);
-
-  independent_comalloc allocates, all at once, a set of n_elements
-  chunks with sizes indicated in the "sizes" array.    It returns
-  an array of pointers to these elements, each of which can be
-  independently freed, realloc'ed etc. The elements are guaranteed to
-  be adjacently allocated (this is not guaranteed to occur with
-  multiple callocs or mallocs), which may also improve cache locality
-  in some applications.
-
-  The "chunks" argument is optional (i.e., may be null). If it is null
-  the returned array is itself dynamically allocated and should also
-  be freed when it is no longer needed. Otherwise, the chunks array
-  must be of at least n_elements in length. It is filled in with the
-  pointers to the chunks.
-
-  In either case, independent_comalloc returns this pointer array, or
-  null if the allocation failed.  If n_elements is zero and chunks is
-  null, it returns a chunk representing an array with zero elements
-  (which should be freed if not wanted).
-
-  Each element must be individually freed when it is no longer
-  needed. If you'd like to instead be able to free all at once, you
-  should instead use a single regular malloc, and assign pointers at
-  particular offsets in the aggregate space. (In this case though, you
-  cannot independently free elements.)
-
-  independent_comallac differs from independent_calloc in that each
-  element may have a different size, and also that it does not
-  automatically clear elements.
-
-  independent_comalloc can be used to speed up allocation in cases
-  where several structs or objects must always be allocated at the
-  same time.  For example:
-
-  struct Head { ... }
-  struct Foot { ... }
-
-  void send_message(char* msg) {
-    int msglen = strlen(msg);
-    size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) };
-    void* chunks[3];
-    if (independent_comalloc(3, sizes, chunks) == 0)
-      die();
-    struct Head* head = (struct Head*)(chunks[0]);
-    char*        body = (char*)(chunks[1]);
-    struct Foot* foot = (struct Foot*)(chunks[2]);
-    // ...
-  }
-
-  In general though, independent_comalloc is worth using only for
-  larger values of n_elements. For small values, you probably won't
-  detect enough difference from series of malloc calls to bother.
-
-  Overuse of independent_comalloc can increase overall memory usage,
-  since it cannot reuse existing noncontiguous small chunks that
-  might be available for some of the elements.
-*/
-void** dlindependent_comalloc(size_t, size_t*, void**);
-
-
-/*
-  pvalloc(size_t n);
-  Equivalent to valloc(minimum-page-that-holds(n)), that is,
-  round up n to nearest pagesize.
- */
-void*  dlpvalloc(size_t);
-
-/*
-  malloc_trim(size_t pad);
-
-  If possible, gives memory back to the system (via negative arguments
-  to sbrk) if there is unused memory at the `high' end of the malloc
-  pool or in unused MMAP segments. You can call this after freeing
-  large blocks of memory to potentially reduce the system-level memory
-  requirements of a program. However, it cannot guarantee to reduce
-  memory. Under some allocation patterns, some large free blocks of
-  memory will be locked between two used chunks, so they cannot be
-  given back to the system.
-
-  The `pad' argument to malloc_trim represents the amount of free
-  trailing space to leave untrimmed. If this argument is zero, only
-  the minimum amount of memory to maintain internal data structures
-  will be left. Non-zero arguments can be supplied to maintain enough
-  trailing space to service future expected allocations without having
-  to re-obtain memory from the system.
-
-  Malloc_trim returns 1 if it actually released any memory, else 0.
-*/
-int  dlmalloc_trim(size_t);
-
-/*
-  malloc_stats();
-  Prints on stderr the amount of space obtained from the system (both
-  via sbrk and mmap), the maximum amount (which may be more than
-  current if malloc_trim and/or munmap got called), and the current
-  number of bytes allocated via malloc (or realloc, etc) but not yet
-  freed. Note that this is the number of bytes allocated, not the
-  number requested. It will be larger than the number requested
-  because of alignment and bookkeeping overhead. Because it includes
-  alignment wastage as being in use, this figure may be greater than
-  zero even when no user-level chunks are allocated.
-
-  The reported current and maximum system memory can be inaccurate if
-  a program makes other calls to system memory allocation functions
-  (normally sbrk) outside of malloc.
-
-  malloc_stats prints only the most commonly interesting statistics.
-  More information can be obtained by calling mallinfo.
-*/
-void  dlmalloc_stats(void);
-
-#endif /* ONLY_MSPACES */
-
-/*
-  malloc_usable_size(void* p);
-
-  Returns the number of bytes you can actually use in
-  an allocated chunk, which may be more than you requested (although
-  often not) due to alignment and minimum size constraints.
-  You can use this many bytes without worrying about
-  overwriting other allocated objects. This is not a particularly great
-  programming practice. malloc_usable_size can be more useful in
-  debugging and assertions, for example:
-
-  p = malloc(n);
-  assert(malloc_usable_size(p) >= 256);
-*/
-size_t dlmalloc_usable_size(void*);
-
-
-#if MSPACES
-
-/*
-  mspace is an opaque type representing an independent
-  region of space that supports mspace_malloc, etc.
-*/
-typedef void* mspace;
-
-/*
-  create_mspace creates and returns a new independent space with the
-  given initial capacity, or, if 0, the default granularity size.  It
-  returns null if there is no system memory available to create the
-  space.  If argument locked is non-zero, the space uses a separate
-  lock to control access. The capacity of the space will grow
-  dynamically as needed to service mspace_malloc requests.  You can
-  control the sizes of incremental increases of this space by
-  compiling with a different DEFAULT_GRANULARITY or dynamically
-  setting with mallopt(M_GRANULARITY, value).
-*/
-mspace create_mspace(size_t capacity, int locked);
-
-/*
-  destroy_mspace destroys the given space, and attempts to return all
-  of its memory back to the system, returning the total number of
-  bytes freed. After destruction, the results of access to all memory
-  used by the space become undefined.
-*/
-size_t destroy_mspace(mspace msp);
-
-/*
-  create_mspace_with_base uses the memory supplied as the initial base
-  of a new mspace. Part (less than 128*sizeof(size_t) bytes) of this
-  space is used for bookkeeping, so the capacity must be at least this
-  large. (Otherwise 0 is returned.) When this initial space is
-  exhausted, additional memory will be obtained from the system.
-  Destroying this space will deallocate all additionally allocated
-  space (if possible) but not the initial base.
-*/
-mspace create_mspace_with_base(void* base, size_t capacity, int locked);
-
-/*
-  mspace_track_large_chunks controls whether requests for large chunks
-  are allocated in their own untracked mmapped regions, separate from
-  others in this mspace. By default large chunks are not tracked,
-  which reduces fragmentation. However, such chunks are not
-  necessarily released to the system upon destroy_mspace.  Enabling
-  tracking by setting to true may increase fragmentation, but avoids
-  leakage when relying on destroy_mspace to release all memory
-  allocated using this space.  The function returns the previous
-  setting.
-*/
-int mspace_track_large_chunks(mspace msp, int enable);
-
-
-/*
-  mspace_malloc behaves as malloc, but operates within
-  the given space.
-*/
-void* mspace_malloc(mspace msp, size_t bytes);
-
-/*
-  mspace_free behaves as free, but operates within
-  the given space.
-
-  If compiled with FOOTERS==1, mspace_free is not actually needed.
-  free may be called instead of mspace_free because freed chunks from
-  any space are handled by their originating spaces.
-*/
-void mspace_free(mspace msp, void* mem);
-
-/*
-  mspace_realloc behaves as realloc, but operates within
-  the given space.
-
-  If compiled with FOOTERS==1, mspace_realloc is not actually
-  needed.  realloc may be called instead of mspace_realloc because
-  realloced chunks from any space are handled by their originating
-  spaces.
-*/
-void* mspace_realloc(mspace msp, void* mem, size_t newsize);
-
-/*
-  mspace_calloc behaves as calloc, but operates within
-  the given space.
-*/
-void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size);
-
-/*
-  mspace_memalign behaves as memalign, but operates within
-  the given space.
-*/
-void* mspace_memalign(mspace msp, size_t alignment, size_t bytes);
-
-/*
-  mspace_independent_calloc behaves as independent_calloc, but
-  operates within the given space.
-*/
-void** mspace_independent_calloc(mspace msp, size_t n_elements,
-                                 size_t elem_size, void* chunks[]);
-
-/*
-  mspace_independent_comalloc behaves as independent_comalloc, but
-  operates within the given space.
-*/
-void** mspace_independent_comalloc(mspace msp, size_t n_elements,
-                                   size_t sizes[], void* chunks[]);
-
-/*
-  mspace_footprint() returns the number of bytes obtained from the
-  system for this space.
-*/
-size_t mspace_footprint(mspace msp);
-
-/*
-  mspace_max_footprint() returns the peak number of bytes obtained from the
-  system for this space.
-*/
-size_t mspace_max_footprint(mspace msp);
-
-
-#if !NO_MALLINFO
-/*
-  mspace_mallinfo behaves as mallinfo, but reports properties of
-  the given space.
-*/
-struct mallinfo mspace_mallinfo(mspace msp);
-#endif /* NO_MALLINFO */
-
-/*
-  malloc_usable_size(void* p) behaves the same as malloc_usable_size;
-*/
-  size_t mspace_usable_size(void* mem);
-
-/*
-  mspace_malloc_stats behaves as malloc_stats, but reports
-  properties of the given space.
-*/
-void mspace_malloc_stats(mspace msp);
-
-/*
-  mspace_trim behaves as malloc_trim, but
-  operates within the given space.
-*/
-int mspace_trim(mspace msp, size_t pad);
-
-/*
-  An alias for mallopt.
-*/
-int mspace_mallopt(int, int);
-
-#endif /* MSPACES */
-
-#ifdef __cplusplus
-}  /* end of extern "C" */
-#endif /* __cplusplus */
-
-/*
-  ========================================================================
-  To make a fully customizable malloc.h header file, cut everything
-  above this line, put into file malloc.h, edit to suit, and #include it
-  on the next line, as well as in programs that use this malloc.
-  ========================================================================
-*/
-
-/* #include "malloc.h" */
-
-/*------------------------------ internal #includes ---------------------- */
-
-#ifdef WIN32
-#pragma warning( disable : 4146 ) /* no "unsigned" warnings */
-#endif /* WIN32 */
-
-#include <stdio.h>       /* for printing in malloc_stats */
-
-#ifndef LACKS_ERRNO_H
-#include <errno.h>       /* for MALLOC_FAILURE_ACTION */
-#endif /* LACKS_ERRNO_H */
-#if FOOTERS || DEBUG
-#include <time.h>        /* for magic initialization */
-#endif /* FOOTERS */
-#ifndef LACKS_STDLIB_H
-#include <stdlib.h>      /* for abort() */
-#endif /* LACKS_STDLIB_H */
-#ifdef DEBUG
-#if ABORT_ON_ASSERT_FAILURE
-#undef assert
-#define assert(x) if(!(x)) ABORT
-#else /* ABORT_ON_ASSERT_FAILURE */
-#include <assert.h>
-#endif /* ABORT_ON_ASSERT_FAILURE */
-#else  /* DEBUG */
-#ifndef assert
-#define assert(x)
-#endif
-#define DEBUG 0
-#endif /* DEBUG */
-#ifndef LACKS_STRING_H
-#include <string.h>      /* for memset etc */
-#endif  /* LACKS_STRING_H */
-#if USE_BUILTIN_FFS
-#ifndef LACKS_STRINGS_H
-#include <strings.h>     /* for ffs */
-#endif /* LACKS_STRINGS_H */
-#endif /* USE_BUILTIN_FFS */
-#if HAVE_MMAP
-#ifndef LACKS_SYS_MMAN_H
-/* On some versions of linux, mremap decl in mman.h needs __USE_GNU set */
-#if (defined(linux) && !defined(__USE_GNU))
-#define __USE_GNU 1
-#include <sys/mman.h>    /* for mmap */
-#undef __USE_GNU
-#else
-#include <sys/mman.h>    /* for mmap */
-#endif /* linux */
-#endif /* LACKS_SYS_MMAN_H */
-#ifndef LACKS_FCNTL_H
-#include <fcntl.h>
-#endif /* LACKS_FCNTL_H */
-#endif /* HAVE_MMAP */
-#ifndef LACKS_UNISTD_H
-#include <unistd.h>     /* for sbrk, sysconf */
-#else /* LACKS_UNISTD_H */
-#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
-extern void*     sbrk(ptrdiff_t);
-#endif /* FreeBSD etc */
-#endif /* LACKS_UNISTD_H */
-
-/* Declarations for locking */
-#if USE_LOCKS
-#ifndef WIN32
-#include <pthread.h>
-#if defined (__SVR4) && defined (__sun)  /* solaris */
-#include <thread.h>
-#endif /* solaris */
-#else
-#ifndef _M_AMD64
-/* These are already defined on AMD64 builds */
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-LONG __cdecl _InterlockedCompareExchange(LONG volatile *Dest, LONG Exchange, LONG Comp);
-LONG __cdecl _InterlockedExchange(LONG volatile *Target, LONG Value);
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-#endif /* _M_AMD64 */
-#pragma intrinsic (_InterlockedCompareExchange)
-#pragma intrinsic (_InterlockedExchange)
-#define interlockedcompareexchange _InterlockedCompareExchange
-#define interlockedexchange _InterlockedExchange
-#endif /* Win32 */
-#endif /* USE_LOCKS */
-
-/* Declarations for bit scanning on win32 */
-#if defined(_MSC_VER) && _MSC_VER>=1300
-#ifndef BitScanForward	/* Try to avoid pulling in WinNT.h */
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-unsigned char _BitScanForward(unsigned long *index, unsigned long mask);
-unsigned char _BitScanReverse(unsigned long *index, unsigned long mask);
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-
-#define BitScanForward _BitScanForward
-#define BitScanReverse _BitScanReverse
-#pragma intrinsic(_BitScanForward)
-#pragma intrinsic(_BitScanReverse)
-#endif /* BitScanForward */
-#endif /* defined(_MSC_VER) && _MSC_VER>=1300 */
-
-#ifndef WIN32
-#ifndef malloc_getpagesize
-#  ifdef _SC_PAGESIZE         /* some SVR4 systems omit an underscore */
-#    ifndef _SC_PAGE_SIZE
-#      define _SC_PAGE_SIZE _SC_PAGESIZE
-#    endif
-#  endif
-#  ifdef _SC_PAGE_SIZE
-#    define malloc_getpagesize sysconf(_SC_PAGE_SIZE)
-#  else
-#    if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE)
-       extern size_t getpagesize();
-#      define malloc_getpagesize getpagesize()
-#    else
-#      ifdef WIN32 /* use supplied emulation of getpagesize */
-#        define malloc_getpagesize getpagesize()
-#      else
-#        ifndef LACKS_SYS_PARAM_H
-#          include <sys/param.h>
-#        endif
-#        ifdef EXEC_PAGESIZE
-#          define malloc_getpagesize EXEC_PAGESIZE
-#        else
-#          ifdef NBPG
-#            ifndef CLSIZE
-#              define malloc_getpagesize NBPG
-#            else
-#              define malloc_getpagesize (NBPG * CLSIZE)
-#            endif
-#          else
-#            ifdef NBPC
-#              define malloc_getpagesize NBPC
-#            else
-#              ifdef PAGESIZE
-#                define malloc_getpagesize PAGESIZE
-#              else /* just guess */
-#                define malloc_getpagesize ((size_t)4096U)
-#              endif
-#            endif
-#          endif
-#        endif
-#      endif
-#    endif
-#  endif
-#endif
-#endif
-
-
-
-/* ------------------- size_t and alignment properties -------------------- */
-
-/* The byte and bit size of a size_t */
-#define SIZE_T_SIZE         (sizeof(size_t))
-#define SIZE_T_BITSIZE      (sizeof(size_t) << 3)
-
-/* Some constants coerced to size_t */
-/* Annoying but necessary to avoid errors on some platforms */
-#define SIZE_T_ZERO         ((size_t)0)
-#define SIZE_T_ONE          ((size_t)1)
-#define SIZE_T_TWO          ((size_t)2)
-#define SIZE_T_FOUR         ((size_t)4)
-#define TWO_SIZE_T_SIZES    (SIZE_T_SIZE<<1)
-#define FOUR_SIZE_T_SIZES   (SIZE_T_SIZE<<2)
-#define SIX_SIZE_T_SIZES    (FOUR_SIZE_T_SIZES+TWO_SIZE_T_SIZES)
-#define HALF_MAX_SIZE_T     (MAX_SIZE_T / 2U)
-
-/* The bit mask value corresponding to MALLOC_ALIGNMENT */
-#define CHUNK_ALIGN_MASK    (MALLOC_ALIGNMENT - SIZE_T_ONE)
-
-/* True if address a has acceptable alignment */
-#define is_aligned(A)       (((size_t)((A)) & (CHUNK_ALIGN_MASK)) == 0)
-
-/* the number of bytes to offset an address to align it */
-#define align_offset(A)\
- ((((size_t)(A) & CHUNK_ALIGN_MASK) == 0)? 0 :\
-  ((MALLOC_ALIGNMENT - ((size_t)(A) & CHUNK_ALIGN_MASK)) & CHUNK_ALIGN_MASK))
-
-/*
-  malloc_params holds global properties, including those that can be
-  dynamically set using mallopt. There is a single instance, mparams,
-  initialized in init_mparams. Note that the non-zeroness of "magic"
-  also serves as an initialization flag.
-*/
-typedef unsigned int flag_t;
-struct malloc_params {
-  volatile size_t magic;
-  size_t page_size;
-  size_t granularity;
-  size_t mmap_threshold;
-  size_t trim_threshold;
-  flag_t default_mflags;
-};
-
-static struct malloc_params mparams;
-
-/* Ensure mparams initialized */
-#define ensure_initialization() (void)(mparams.magic != 0 || init_mparams())
-
-/* -------------------------- MMAP preliminaries ------------------------- */
-
-/*
-   If HAVE_MORECORE or HAVE_MMAP are false, we just define calls and
-   checks to fail so compiler optimizer can delete code rather than
-   using so many "#if"s.
-*/
-
-
-/* MORECORE and MMAP must return MFAIL on failure */
-#define MFAIL                ((void*)(MAX_SIZE_T))
-#define CMFAIL               ((char*)(MFAIL)) /* defined for convenience */
-
-#if HAVE_MMAP
-
-#ifndef WIN32
-#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
-#define MAP_ANONYMOUS        MAP_ANON
-#endif /* MAP_ANON */
-#ifdef DEFAULT_GRANULARITY_ALIGNED
-#define MMAP_IMPL mmap_aligned
-static void* lastAlignedmmap; /* Used as a hint */
-static void* mmap_aligned(void *start, size_t length, int prot, int flags, int fd, off_t offset) {
-  void* baseaddress = 0;
-  void* ptr = 0;
-  if(!start) {
-    baseaddress = lastAlignedmmap;
-    for(;;) {
-      if(baseaddress) flags|=MAP_FIXED;
-      ptr = mmap(baseaddress, length, prot, flags, fd, offset);
-      if(!ptr)
-        baseaddress = (void*)((size_t)baseaddress + mparams.granularity);
-      else if((size_t)ptr & (mparams.granularity - SIZE_T_ONE)) {
-        munmap(ptr, length);
-        baseaddress = (void*)(((size_t)ptr + mparams.granularity) & ~(mparams.granularity - SIZE_T_ONE));
-      }
-      else break;
-    }
-  }
-  else ptr = mmap(start, length, prot, flags, fd, offset);
-  if(ptr) lastAlignedmmap = (void*)((size_t) ptr + mparams.granularity);
-  return ptr;
-}
-#else
-#define MMAP_IMPL mmap
-#endif /* DEFAULT_GRANULARITY_ALIGNED */
-#define MUNMAP_DEFAULT(a, s)  munmap((a), (s))
-#define MMAP_PROT            (PROT_READ|PROT_WRITE)
-#ifdef MAP_ANONYMOUS
-#define MMAP_FLAGS           (MAP_PRIVATE|MAP_ANONYMOUS)
-#define MMAP_DEFAULT(s)       MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, -1, 0)
-#else /* MAP_ANONYMOUS */
-/*
-   Nearly all versions of mmap support MAP_ANONYMOUS, so the following
-   is unlikely to be needed, but is supplied just in case.
-*/
-#define MMAP_FLAGS           (MAP_PRIVATE)
-static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */
-#define MMAP_DEFAULT(s) ((dev_zero_fd < 0) ? \
-           (dev_zero_fd = open("/dev/zero", O_RDWR), \
-            MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) : \
-            MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0))
-#endif /* MAP_ANONYMOUS */
-
-#define DIRECT_MMAP_DEFAULT(s) MMAP_DEFAULT(s)
-
-#else /* WIN32 */
-
-/* Win32 MMAP via VirtualAlloc */
-#ifdef DEFAULT_GRANULARITY_ALIGNED
-static void* lastWin32mmap; /* Used as a hint */
-#endif /* DEFAULT_GRANULARITY_ALIGNED */
-#ifdef ENABLE_LARGE_PAGES
-static int largepagesavailable = 1;
-#endif /* ENABLE_LARGE_PAGES */
-static FORCEINLINE void* win32mmap(size_t size) {
-  void* baseaddress = 0;
-  void* ptr = 0;
-#ifdef ENABLE_LARGE_PAGES
-  /* Note that large pages are *always* allocated on a large page boundary.
-  If however granularity is small then don't waste a kernel call if size
-  isn't around the size of a large page */
-  if(largepagesavailable && size >= 1*1024*1024) {
-    ptr = VirtualAlloc(baseaddress, size, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
-    if(!ptr && ERROR_PRIVILEGE_NOT_HELD==GetLastError()) largepagesavailable=0;
-  }
-#endif
-  if(!ptr) {
-#ifdef DEFAULT_GRANULARITY_ALIGNED
-    /* We try to avoid overhead by speculatively reserving at aligned
-    addresses until we succeed */
-    baseaddress = lastWin32mmap;
-    for(;;) {
-      void* reserveaddr = VirtualAlloc(baseaddress, size, MEM_RESERVE, PAGE_READWRITE);
-      if(!reserveaddr)
-        baseaddress = (void*)((size_t)baseaddress + mparams.granularity);
-      else if((size_t)reserveaddr & (mparams.granularity - SIZE_T_ONE)) {
-        VirtualFree(reserveaddr, 0, MEM_RELEASE);
-        baseaddress = (void*)(((size_t)reserveaddr + mparams.granularity) & ~(mparams.granularity - SIZE_T_ONE));
-      }
-      else break;
-    }
-#endif
-    if(!ptr) ptr = VirtualAlloc(baseaddress, size, baseaddress ? MEM_COMMIT : MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
-#if DEBUG
-    if(lastWin32mmap && ptr!=lastWin32mmap) printf("Non-contiguous VirtualAlloc between %p and %p\n", ptr, lastWin32mmap);
-#endif
-#ifdef DEFAULT_GRANULARITY_ALIGNED
-    if(ptr) lastWin32mmap = (void*)((size_t) ptr + mparams.granularity);
-#endif
-  }
-#if DEBUG
-#ifdef ENABLE_LARGE_PAGES
-  printf("VirtualAlloc returns %p size %u. LargePagesAvailable=%d\n", ptr, size, largepagesavailable);
-#else
-  printf("VirtualAlloc returns %p size %u\n", ptr, size);
-#endif
-#endif
-  return (ptr != 0)? ptr: MFAIL;
-}
-
-/* For direct MMAP, use MEM_TOP_DOWN to minimize interference */
-static FORCEINLINE void* win32direct_mmap(size_t size) {
-  void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN,
-                           PAGE_READWRITE);
-  return (ptr != 0)? ptr: MFAIL;
-}
-
-/* This function supports releasing coalesed segments */
-static FORCEINLINE int win32munmap(void* ptr, size_t size) {
-  MEMORY_BASIC_INFORMATION minfo;
-  char* cptr = (char*)ptr;
-  while (size) {
-    if (VirtualQuery(cptr, &minfo, sizeof(minfo)) == 0)
-      return -1;
-    if (minfo.BaseAddress != cptr || minfo.AllocationBase != cptr ||
-        minfo.State != MEM_COMMIT || minfo.RegionSize > size)
-      return -1;
-    if (VirtualFree(cptr, 0, MEM_RELEASE) == 0)
-      return -1;
-    cptr += minfo.RegionSize;
-    size -= minfo.RegionSize;
-  }
-  return 0;
-}
-
-#define MMAP_DEFAULT(s)             win32mmap(s)
-#define MUNMAP_DEFAULT(a, s)        win32munmap((a), (s))
-#define DIRECT_MMAP_DEFAULT(s)      win32direct_mmap(s)
-#endif /* WIN32 */
-#endif /* HAVE_MMAP */
-
-#if HAVE_MREMAP
-#ifndef WIN32
-#define MREMAP_DEFAULT(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv))
-#endif /* WIN32 */
-#endif /* HAVE_MREMAP */
-
-
-/**
- * Define CALL_MORECORE
- */
-#if HAVE_MORECORE
-    #ifdef MORECORE
-        #define CALL_MORECORE(S)    MORECORE(S)
-    #else  /* MORECORE */
-        #define CALL_MORECORE(S)    MORECORE_DEFAULT(S)
-    #endif /* MORECORE */
-#else  /* HAVE_MORECORE */
-    #define CALL_MORECORE(S)        MFAIL
-#endif /* HAVE_MORECORE */
-
-/**
- * Define CALL_MMAP/CALL_MUNMAP/CALL_DIRECT_MMAP
- */
-#if HAVE_MMAP
-    #define USE_MMAP_BIT            (SIZE_T_ONE)
-
-    #ifdef MMAP
-        #define CALL_MMAP(s)        MMAP(s)
-    #else /* MMAP */
-        #define CALL_MMAP(s)        MMAP_DEFAULT(s)
-    #endif /* MMAP */
-    #ifdef MUNMAP
-        #define CALL_MUNMAP(a, s)   MUNMAP((a), (s))
-    #else /* MUNMAP */
-        #define CALL_MUNMAP(a, s)   MUNMAP_DEFAULT((a), (s))
-    #endif /* MUNMAP */
-    #ifdef DIRECT_MMAP
-        #define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s)
-    #else /* DIRECT_MMAP */
-        #define CALL_DIRECT_MMAP(s) DIRECT_MMAP_DEFAULT(s)
-    #endif /* DIRECT_MMAP */
-#else  /* HAVE_MMAP */
-    #define USE_MMAP_BIT            (SIZE_T_ZERO)
-
-    #define MMAP(s)                 MFAIL
-    #define MUNMAP(a, s)            (-1)
-    #define DIRECT_MMAP(s)          MFAIL
-    #define CALL_DIRECT_MMAP(s)     DIRECT_MMAP(s)
-    #define CALL_MMAP(s)            MMAP(s)
-    #define CALL_MUNMAP(a, s)       MUNMAP((a), (s))
-#endif /* HAVE_MMAP */
-
-/**
- * Define CALL_MREMAP
- */
-#if HAVE_MMAP && HAVE_MREMAP
-    #ifdef MREMAP
-        #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP((addr), (osz), (nsz), (mv))
-    #else /* MREMAP */
-        #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP_DEFAULT((addr), (osz), (nsz), (mv))
-    #endif /* MREMAP */
-#else  /* HAVE_MMAP && HAVE_MREMAP */
-    #define CALL_MREMAP(addr, osz, nsz, mv)     MFAIL
-#endif /* HAVE_MMAP && HAVE_MREMAP */
-
-/* mstate bit set if continguous morecore disabled or failed */
-#define USE_NONCONTIGUOUS_BIT (4U)
-
-/* segment bit set in create_mspace_with_base */
-#define EXTERN_BIT            (8U)
-
-
-/* --------------------------- Lock preliminaries ------------------------ */
-
-/*
-  When locks are defined, there is one global lock, plus
-  one per-mspace lock.
-
-  The global lock_ensures that mparams.magic and other unique
-  mparams values are initialized only once. It also protects
-  sequences of calls to MORECORE.  In many cases sys_alloc requires
-  two calls, that should not be interleaved with calls by other
-  threads.  This does not protect against direct calls to MORECORE
-  by other threads not using this lock, so there is still code to
-  cope the best we can on interference.
-
-  Per-mspace locks surround calls to malloc, free, etc.  To enable use
-  in layered extensions, per-mspace locks are reentrant.
-
-  Because lock-protected regions generally have bounded times, it is
-  OK to use the supplied simple spinlocks in the custom versions for
-  x86. Spinlocks are likely to improve performance for lightly
-  contended applications, but worsen performance under heavy
-  contention.
-
-  If USE_LOCKS is > 1, the definitions of lock routines here are
-  bypassed, in which case you will need to define the type MLOCK_T,
-  and at least INITIAL_LOCK, ACQUIRE_LOCK, RELEASE_LOCK and possibly
-  TRY_LOCK (which is not used in this malloc, but commonly needed in
-  extensions.)  You must also declare a
-    static MLOCK_T malloc_global_mutex = { initialization values };.
-
-*/
-
-#if USE_LOCKS == 1
-
-#if USE_SPIN_LOCKS && SPIN_LOCKS_AVAILABLE
-#ifndef WIN32
-
-/* Custom pthread-style spin locks on x86 and x64 for gcc */
-struct pthread_mlock_t {
-  volatile unsigned int l;
-  char cachelinepadding[64];
-  unsigned int c;
-  pthread_t threadid;
-};
-#define MLOCK_T               struct pthread_mlock_t
-#define CURRENT_THREAD        pthread_self()
-#define INITIAL_LOCK(sl)      ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0)
-#define ACQUIRE_LOCK(sl)      pthread_acquire_lock(sl)
-#define RELEASE_LOCK(sl)      pthread_release_lock(sl)
-#define TRY_LOCK(sl)          pthread_try_lock(sl)
-#define SPINS_PER_YIELD       63
-
-static MLOCK_T malloc_global_mutex = { 0, "", 0, 0};
-
-static FORCEINLINE int pthread_acquire_lock (MLOCK_T *sl) {
-  int spins = 0;
-  volatile unsigned int* lp = &sl->l;
-  for (;;) {
-    if (*lp != 0) {
-      if (sl->threadid == CURRENT_THREAD) {
-        ++sl->c;
-        return 0;
-      }
-    }
-    else {
-      /* place args to cmpxchgl in locals to evade oddities in some gccs */
-      int cmp = 0;
-      int val = 1;
-      int ret;
-      __asm__ __volatile__  ("lock; cmpxchgl %1, %2"
-                             : "=a" (ret)
-                             : "r" (val), "m" (*(lp)), "0"(cmp)
-                             : "memory", "cc");
-      if (!ret) {
-        assert(!sl->threadid);
-        sl->threadid = CURRENT_THREAD;
-        sl->c = 1;
-        return 0;
-      }
-    }
-    if ((++spins & SPINS_PER_YIELD) == 0) {
-#if defined (__SVR4) && defined (__sun) /* solaris */
-      thr_yield();
-#else
-#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)
-      sched_yield();
-#else  /* no-op yield on unknown systems */
-      ;
-#endif /* __linux__ || __FreeBSD__ || __APPLE__ */
-#endif /* solaris */
-    }
-  }
-}
-
-static FORCEINLINE void pthread_release_lock (MLOCK_T *sl) {
-  volatile unsigned int* lp = &sl->l;
-  assert(*lp != 0);
-  assert(sl->threadid == CURRENT_THREAD);
-  if (--sl->c == 0) {
-    sl->threadid = 0;
-    int prev = 0;
-    int ret;
-    __asm__ __volatile__ ("lock; xchgl %0, %1"
-                          : "=r" (ret)
-                          : "m" (*(lp)), "0"(prev)
-                          : "memory");
-  }
-}
-
-static FORCEINLINE int pthread_try_lock (MLOCK_T *sl) {
-  volatile unsigned int* lp = &sl->l;
-  if (*lp != 0) {
-    if (sl->threadid == CURRENT_THREAD) {
-      ++sl->c;
-      return 1;
-    }
-  }
-  else {
-    int cmp = 0;
-    int val = 1;
-    int ret;
-    __asm__ __volatile__  ("lock; cmpxchgl %1, %2"
-                           : "=a" (ret)
-                           : "r" (val), "m" (*(lp)), "0"(cmp)
-                           : "memory", "cc");
-    if (!ret) {
-      assert(!sl->threadid);
-      sl->threadid = CURRENT_THREAD;
-      sl->c = 1;
-      return 1;
-    }
-  }
-  return 0;
-}
-
-
-#else /* WIN32 */
-/* Custom win32-style spin locks on x86 and x64 for MSC */
-struct win32_mlock_t {
-  volatile long l;
-  char cachelinepadding[64];
-  unsigned int c;
-  long threadid;
-};
-
-#define MLOCK_T               struct win32_mlock_t
-#define CURRENT_THREAD        ((long)GetCurrentThreadId())
-#define INITIAL_LOCK(sl)      ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0)
-#define ACQUIRE_LOCK(sl)      win32_acquire_lock(sl)
-#define RELEASE_LOCK(sl)      win32_release_lock(sl)
-#define TRY_LOCK(sl)          win32_try_lock(sl)
-#define SPINS_PER_YIELD       63
-
-static MLOCK_T malloc_global_mutex = { 0, 0, 0};
-
-static FORCEINLINE int win32_acquire_lock (MLOCK_T *sl) {
-  int spins = 0;
-  for (;;) {
-    if (sl->l != 0) {
-      if (sl->threadid == CURRENT_THREAD) {
-        ++sl->c;
-        return 0;
-      }
-    }
-    else {
-      if (!interlockedexchange(&sl->l, 1)) {
-        assert(!sl->threadid);
-        sl->threadid = CURRENT_THREAD;
-        sl->c = 1;
-        return 0;
-      }
-    }
-    if ((++spins & SPINS_PER_YIELD) == 0)
-      SleepEx(0, FALSE);
-  }
-}
-
-static FORCEINLINE void win32_release_lock (MLOCK_T *sl) {
-  assert(sl->threadid == CURRENT_THREAD);
-  assert(sl->l != 0);
-  if (--sl->c == 0) {
-    sl->threadid = 0;
-    interlockedexchange (&sl->l, 0);
-  }
-}
-
-static FORCEINLINE int win32_try_lock (MLOCK_T *sl) {
-  if (sl->l != 0) {
-    if (sl->threadid == CURRENT_THREAD) {
-      ++sl->c;
-      return 1;
-    }
-  }
-  else {
-    if (!interlockedexchange(&sl->l, 1)){
-      assert(!sl->threadid);
-      sl->threadid = CURRENT_THREAD;
-      sl->c = 1;
-      return 1;
-    }
-  }
-  return 0;
-}
-
-#endif /* WIN32 */
-#else /* USE_SPIN_LOCKS */
-
-#ifndef WIN32
-/* pthreads-based locks */
-
-#define MLOCK_T               pthread_mutex_t
-#define CURRENT_THREAD        pthread_self()
-#define INITIAL_LOCK(sl)      pthread_init_lock(sl)
-#define ACQUIRE_LOCK(sl)      pthread_mutex_lock(sl)
-#define RELEASE_LOCK(sl)      pthread_mutex_unlock(sl)
-#define TRY_LOCK(sl)          (!pthread_mutex_trylock(sl))
-
-static MLOCK_T malloc_global_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-/* Cope with old-style linux recursive lock initialization by adding */
-/* skipped internal declaration from pthread.h */
-#ifdef linux
-#ifndef PTHREAD_MUTEX_RECURSIVE
-extern int pthread_mutexattr_setkind_np __P ((pthread_mutexattr_t *__attr,
-					   int __kind));
-#define PTHREAD_MUTEX_RECURSIVE PTHREAD_MUTEX_RECURSIVE_NP
-#define pthread_mutexattr_settype(x,y) pthread_mutexattr_setkind_np(x,y)
-#endif
-#endif
-
-static int pthread_init_lock (MLOCK_T *sl) {
-  pthread_mutexattr_t attr;
-  if (pthread_mutexattr_init(&attr)) return 1;
-  if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) return 1;
-  if (pthread_mutex_init(sl, &attr)) return 1;
-  if (pthread_mutexattr_destroy(&attr)) return 1;
-  return 0;
-}
-
-#else /* WIN32 */
-/* Win32 critical sections */
-#define MLOCK_T               CRITICAL_SECTION
-#define CURRENT_THREAD        GetCurrentThreadId()
-#define INITIAL_LOCK(s)       (!InitializeCriticalSectionAndSpinCount((s), 0x80000000|4000))
-#define ACQUIRE_LOCK(s)       (EnterCriticalSection(sl), 0)
-#define RELEASE_LOCK(s)       LeaveCriticalSection(sl)
-#define TRY_LOCK(s)           TryEnterCriticalSection(sl)
-#define NEED_GLOBAL_LOCK_INIT
-
-static MLOCK_T malloc_global_mutex;
-static volatile long malloc_global_mutex_status;
-
-/* Use spin loop to initialize global lock */
-static void init_malloc_global_mutex() {
-  for (;;) {
-    long stat = malloc_global_mutex_status;
-    if (stat > 0)
-      return;
-    /* transition to < 0 while initializing, then to > 0) */
-    if (stat == 0 &&
-        interlockedcompareexchange(&malloc_global_mutex_status, -1, 0) == 0) {
-      InitializeCriticalSection(&malloc_global_mutex);
-      interlockedexchange(&malloc_global_mutex_status,1);
-      return;
-    }
-    SleepEx(0, FALSE);
-  }
-}
-
-#endif /* WIN32 */
-#endif /* USE_SPIN_LOCKS */
-#endif /* USE_LOCKS == 1 */
-
-/* -----------------------  User-defined locks ------------------------ */
-
-#if USE_LOCKS > 1
-/* Define your own lock implementation here */
-/* #define INITIAL_LOCK(sl)  ... */
-/* #define ACQUIRE_LOCK(sl)  ... */
-/* #define RELEASE_LOCK(sl)  ... */
-/* #define TRY_LOCK(sl) ... */
-/* static MLOCK_T malloc_global_mutex = ... */
-#endif /* USE_LOCKS > 1 */
-
-/* -----------------------  Lock-based state ------------------------ */
-
-#if USE_LOCKS
-#define USE_LOCK_BIT               (2U)
-#else  /* USE_LOCKS */
-#define USE_LOCK_BIT               (0U)
-#define INITIAL_LOCK(l)
-#endif /* USE_LOCKS */
-
-#if USE_LOCKS
-#ifndef ACQUIRE_MALLOC_GLOBAL_LOCK
-#define ACQUIRE_MALLOC_GLOBAL_LOCK()  ACQUIRE_LOCK(&malloc_global_mutex);
-#endif
-#ifndef RELEASE_MALLOC_GLOBAL_LOCK
-#define RELEASE_MALLOC_GLOBAL_LOCK()  RELEASE_LOCK(&malloc_global_mutex);
-#endif
-#else  /* USE_LOCKS */
-#define ACQUIRE_MALLOC_GLOBAL_LOCK()
-#define RELEASE_MALLOC_GLOBAL_LOCK()
-#endif /* USE_LOCKS */
-
-
-/* -----------------------  Chunk representations ------------------------ */
-
-/*
-  (The following includes lightly edited explanations by Colin Plumb.)
-
-  The malloc_chunk declaration below is misleading (but accurate and
-  necessary).  It declares a "view" into memory allowing access to
-  necessary fields at known offsets from a given base.
-
-  Chunks of memory are maintained using a `boundary tag' method as
-  originally described by Knuth.  (See the paper by Paul Wilson
-  ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a survey of such
-  techniques.)  Sizes of free chunks are stored both in the front of
-  each chunk and at the end.  This makes consolidating fragmented
-  chunks into bigger chunks fast.  The head fields also hold bits
-  representing whether chunks are free or in use.
-
-  Here are some pictures to make it clearer.  They are "exploded" to
-  show that the state of a chunk can be thought of as extending from
-  the high 31 bits of the head field of its header through the
-  prev_foot and PINUSE_BIT bit of the following chunk header.
-
-  A chunk that's in use looks like:
-
-   chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-           | Size of previous chunk (if P = 0)                             |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P|
-         | Size of this chunk                                         1| +-+
-   mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         |                                                               |
-         +-                                                             -+
-         |                                                               |
-         +-                                                             -+
-         |                                                               :
-         +-      size - sizeof(size_t) available payload bytes          -+
-         :                                                               |
- chunk-> +-                                                             -+
-         |                                                               |
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |1|
-       | Size of next chunk (may or may not be in use)               | +-+
- mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-    And if it's free, it looks like this:
-
-   chunk-> +-                                                             -+
-           | User payload (must be in use, or we would have merged!)       |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P|
-         | Size of this chunk                                         0| +-+
-   mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         | Next pointer                                                  |
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         | Prev pointer                                                  |
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         |                                                               :
-         +-      size - sizeof(struct chunk) unused bytes               -+
-         :                                                               |
- chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         | Size of this chunk                                            |
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |0|
-       | Size of next chunk (must be in use, or we would have merged)| +-+
- mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-       |                                                               :
-       +- User payload                                                -+
-       :                                                               |
-       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-                                                                     |0|
-                                                                     +-+
-  Note that since we always merge adjacent free chunks, the chunks
-  adjacent to a free chunk must be in use.
-
-  Given a pointer to a chunk (which can be derived trivially from the
-  payload pointer) we can, in O(1) time, find out whether the adjacent
-  chunks are free, and if so, unlink them from the lists that they
-  are on and merge them with the current chunk.
-
-  Chunks always begin on even word boundaries, so the mem portion
-  (which is returned to the user) is also on an even word boundary, and
-  thus at least double-word aligned.
-
-  The P (PINUSE_BIT) bit, stored in the unused low-order bit of the
-  chunk size (which is always a multiple of two words), is an in-use
-  bit for the *previous* chunk.  If that bit is *clear*, then the
-  word before the current chunk size contains the previous chunk
-  size, and can be used to find the front of the previous chunk.
-  The very first chunk allocated always has this bit set, preventing
-  access to non-existent (or non-owned) memory. If pinuse is set for
-  any given chunk, then you CANNOT determine the size of the
-  previous chunk, and might even get a memory addressing fault when
-  trying to do so.
-
-  The C (CINUSE_BIT) bit, stored in the unused second-lowest bit of
-  the chunk size redundantly records whether the current chunk is
-  inuse (unless the chunk is mmapped). This redundancy enables usage
-  checks within free and realloc, and reduces indirection when freeing
-  and consolidating chunks.
-
-  Each freshly allocated chunk must have both cinuse and pinuse set.
-  That is, each allocated chunk borders either a previously allocated
-  and still in-use chunk, or the base of its memory arena. This is
-  ensured by making all allocations from the the `lowest' part of any
-  found chunk.  Further, no free chunk physically borders another one,
-  so each free chunk is known to be preceded and followed by either
-  inuse chunks or the ends of memory.
-
-  Note that the `foot' of the current chunk is actually represented
-  as the prev_foot of the NEXT chunk. This makes it easier to
-  deal with alignments etc but can be very confusing when trying
-  to extend or adapt this code.
-
-  The exceptions to all this are
-
-     1. The special chunk `top' is the top-most available chunk (i.e.,
-        the one bordering the end of available memory). It is treated
-        specially.  Top is never included in any bin, is used only if
-        no other chunk is available, and is released back to the
-        system if it is very large (see M_TRIM_THRESHOLD).  In effect,
-        the top chunk is treated as larger (and thus less well
-        fitting) than any other available chunk.  The top chunk
-        doesn't update its trailing size field since there is no next
-        contiguous chunk that would have to index off it. However,
-        space is still allocated for it (TOP_FOOT_SIZE) to enable
-        separation or merging when space is extended.
-
-     3. Chunks allocated via mmap, have both cinuse and pinuse bits
-        cleared in their head fields.  Because they are allocated
-        one-by-one, each must carry its own prev_foot field, which is
-        also used to hold the offset this chunk has within its mmapped
-        region, which is needed to preserve alignment. Each mmapped
-        chunk is trailed by the first two fields of a fake next-chunk
-        for sake of usage checks.
-
-*/
-
-struct malloc_chunk {
-  size_t               prev_foot;  /* Size of previous chunk (if free).  */
-  size_t               head;       /* Size and inuse bits. */
-  struct malloc_chunk* fd;         /* double links -- used only if free. */
-  struct malloc_chunk* bk;
-};
-
-typedef struct malloc_chunk  mchunk;
-typedef struct malloc_chunk* mchunkptr;
-typedef struct malloc_chunk* sbinptr;  /* The type of bins of chunks */
-typedef unsigned int bindex_t;         /* Described below */
-typedef unsigned int binmap_t;         /* Described below */
-
-/* ------------------- Chunks sizes and alignments ----------------------- */
-
-#define MCHUNK_SIZE         (sizeof(mchunk))
-
-#if FOOTERS
-#define CHUNK_OVERHEAD      (TWO_SIZE_T_SIZES)
-#else /* FOOTERS */
-#define CHUNK_OVERHEAD      (SIZE_T_SIZE)
-#endif /* FOOTERS */
-
-/* MMapped chunks need a second word of overhead ... */
-#define MMAP_CHUNK_OVERHEAD (TWO_SIZE_T_SIZES)
-/* ... and additional padding for fake next-chunk at foot */
-#define MMAP_FOOT_PAD       (FOUR_SIZE_T_SIZES)
-
-/* The smallest size we can malloc is an aligned minimal chunk */
-#define MIN_CHUNK_SIZE\
-  ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
-
-/* conversion from malloc headers to user pointers, and back */
-#define chunk2mem(p)        ((void*)((char*)(p)       + TWO_SIZE_T_SIZES))
-#define mem2chunk(mem)      ((mchunkptr)((char*)(mem) - TWO_SIZE_T_SIZES))
-/* chunk associated with aligned address A */
-#define align_as_chunk(A)   (mchunkptr)((A) + align_offset(chunk2mem(A)))
-
-/* Bounds on request (not chunk) sizes. */
-#define MAX_REQUEST         ((-MIN_CHUNK_SIZE) << 2)
-#define MIN_REQUEST         (MIN_CHUNK_SIZE - CHUNK_OVERHEAD - SIZE_T_ONE)
-
-/* pad request bytes into a usable size */
-#define pad_request(req) \
-   (((req) + CHUNK_OVERHEAD + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
-
-/* pad request, checking for minimum (but not maximum) */
-#define request2size(req) \
-  (((req) < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(req))
-
-
-/* ------------------ Operations on head and foot fields ----------------- */
-
-/*
-  The head field of a chunk is or'ed with PINUSE_BIT when previous
-  adjacent chunk in use, and or'ed with CINUSE_BIT if this chunk is in
-  use, unless mmapped, in which case both bits are cleared.
-
-  FLAG4_BIT is not used by this malloc, but might be useful in extensions.
-*/
-
-#define PINUSE_BIT          (SIZE_T_ONE)
-#define CINUSE_BIT          (SIZE_T_TWO)
-#define FLAG4_BIT           (SIZE_T_FOUR)
-#define INUSE_BITS          (PINUSE_BIT|CINUSE_BIT)
-#define FLAG_BITS           (PINUSE_BIT|CINUSE_BIT|FLAG4_BIT)
-
-/* Head value for fenceposts */
-#define FENCEPOST_HEAD      (INUSE_BITS|SIZE_T_SIZE)
-
-/* extraction of fields from head words */
-#define cinuse(p)           ((p)->head & CINUSE_BIT)
-#define pinuse(p)           ((p)->head & PINUSE_BIT)
-#define is_inuse(p)         (((p)->head & INUSE_BITS) != PINUSE_BIT)
-#define is_mmapped(p)       (((p)->head & INUSE_BITS) == 0)
-
-#define chunksize(p)        ((p)->head & ~(FLAG_BITS))
-
-#define clear_pinuse(p)     ((p)->head &= ~PINUSE_BIT)
-
-/* Treat space at ptr +/- offset as a chunk */
-#define chunk_plus_offset(p, s)  ((mchunkptr)(((char*)(p)) + (s)))
-#define chunk_minus_offset(p, s) ((mchunkptr)(((char*)(p)) - (s)))
-
-/* Ptr to next or previous physical malloc_chunk. */
-#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->head & ~FLAG_BITS)))
-#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_foot) ))
-
-/* extract next chunk's pinuse bit */
-#define next_pinuse(p)  ((next_chunk(p)->head) & PINUSE_BIT)
-
-/* Get/set size at footer */
-#define get_foot(p, s)  (((mchunkptr)((char*)(p) + (s)))->prev_foot)
-#define set_foot(p, s)  (((mchunkptr)((char*)(p) + (s)))->prev_foot = (s))
-
-/* Set size, pinuse bit, and foot */
-#define set_size_and_pinuse_of_free_chunk(p, s)\
-  ((p)->head = (s|PINUSE_BIT), set_foot(p, s))
-
-/* Set size, pinuse bit, foot, and clear next pinuse */
-#define set_free_with_pinuse(p, s, n)\
-  (clear_pinuse(n), set_size_and_pinuse_of_free_chunk(p, s))
-
-/* Get the internal overhead associated with chunk p */
-#define overhead_for(p)\
- (is_mmapped(p)? MMAP_CHUNK_OVERHEAD : CHUNK_OVERHEAD)
-
-/* Return true if malloced space is not necessarily cleared */
-#if MMAP_CLEARS
-#define calloc_must_clear(p) (!is_mmapped(p))
-#else /* MMAP_CLEARS */
-#define calloc_must_clear(p) (1)
-#endif /* MMAP_CLEARS */
-
-/* ---------------------- Overlaid data structures ----------------------- */
-
-/*
-  When chunks are not in use, they are treated as nodes of either
-  lists or trees.
-
-  "Small"  chunks are stored in circular doubly-linked lists, and look
-  like this:
-
-    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Size of previous chunk                            |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    `head:' |             Size of chunk, in bytes                         |P|
-      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Forward pointer to next chunk in list             |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Back pointer to previous chunk in list            |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Unused space (may be 0 bytes long)                .
-            .                                                               .
-            .                                                               |
-nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    `foot:' |             Size of chunk, in bytes                           |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-  Larger chunks are kept in a form of bitwise digital trees (aka
-  tries) keyed on chunksizes.  Because malloc_tree_chunks are only for
-  free chunks greater than 256 bytes, their size doesn't impose any
-  constraints on user chunk sizes.  Each node looks like:
-
-    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Size of previous chunk                            |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    `head:' |             Size of chunk, in bytes                         |P|
-      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Forward pointer to next chunk of same size        |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Back pointer to previous chunk of same size       |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Pointer to left child (child[0])                  |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Pointer to right child (child[1])                 |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Pointer to parent                                 |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             bin index of this chunk                           |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Unused space                                      .
-            .                                                               |
-nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    `foot:' |             Size of chunk, in bytes                           |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-  Each tree holding treenodes is a tree of unique chunk sizes.  Chunks
-  of the same size are arranged in a circularly-linked list, with only
-  the oldest chunk (the next to be used, in our FIFO ordering)
-  actually in the tree.  (Tree members are distinguished by a non-null
-  parent pointer.)  If a chunk with the same size an an existing node
-  is inserted, it is linked off the existing node using pointers that
-  work in the same way as fd/bk pointers of small chunks.
-
-  Each tree contains a power of 2 sized range of chunk sizes (the
-  smallest is 0x100 <= x < 0x180), which is is divided in half at each
-  tree level, with the chunks in the smaller half of the range (0x100
-  <= x < 0x140 for the top nose) in the left subtree and the larger
-  half (0x140 <= x < 0x180) in the right subtree.  This is, of course,
-  done by inspecting individual bits.
-
-  Using these rules, each node's left subtree contains all smaller
-  sizes than its right subtree.  However, the node at the root of each
-  subtree has no particular ordering relationship to either.  (The
-  dividing line between the subtree sizes is based on trie relation.)
-  If we remove the last chunk of a given size from the interior of the
-  tree, we need to replace it with a leaf node.  The tree ordering
-  rules permit a node to be replaced by any leaf below it.
-
-  The smallest chunk in a tree (a common operation in a best-fit
-  allocator) can be found by walking a path to the leftmost leaf in
-  the tree.  Unlike a usual binary tree, where we follow left child
-  pointers until we reach a null, here we follow the right child
-  pointer any time the left one is null, until we reach a leaf with
-  both child pointers null. The smallest chunk in the tree will be
-  somewhere along that path.
-
-  The worst case number of steps to add, find, or remove a node is
-  bounded by the number of bits differentiating chunks within
-  bins. Under current bin calculations, this ranges from 6 up to 21
-  (for 32 bit sizes) or up to 53 (for 64 bit sizes). The typical case
-  is of course much better.
-*/
-
-struct malloc_tree_chunk {
-  /* The first four fields must be compatible with malloc_chunk */
-  size_t                    prev_foot;
-  size_t                    head;
-  struct malloc_tree_chunk* fd;
-  struct malloc_tree_chunk* bk;
-
-  struct malloc_tree_chunk* child[2];
-  struct malloc_tree_chunk* parent;
-  bindex_t                  index;
-};
-
-typedef struct malloc_tree_chunk  tchunk;
-typedef struct malloc_tree_chunk* tchunkptr;
-typedef struct malloc_tree_chunk* tbinptr; /* The type of bins of trees */
-
-/* A little helper macro for trees */
-#define leftmost_child(t) ((t)->child[0] != 0? (t)->child[0] : (t)->child[1])
-
-/* ----------------------------- Segments -------------------------------- */
-
-/*
-  Each malloc space may include non-contiguous segments, held in a
-  list headed by an embedded malloc_segment record representing the
-  top-most space. Segments also include flags holding properties of
-  the space. Large chunks that are directly allocated by mmap are not
-  included in this list. They are instead independently created and
-  destroyed without otherwise keeping track of them.
-
-  Segment management mainly comes into play for spaces allocated by
-  MMAP.  Any call to MMAP might or might not return memory that is
-  adjacent to an existing segment.  MORECORE normally contiguously
-  extends the current space, so this space is almost always adjacent,
-  which is simpler and faster to deal with. (This is why MORECORE is
-  used preferentially to MMAP when both are available -- see
-  sys_alloc.)  When allocating using MMAP, we don't use any of the
-  hinting mechanisms (inconsistently) supported in various
-  implementations of unix mmap, or distinguish reserving from
-  committing memory. Instead, we just ask for space, and exploit
-  contiguity when we get it.  It is probably possible to do
-  better than this on some systems, but no general scheme seems
-  to be significantly better.
-
-  Management entails a simpler variant of the consolidation scheme
-  used for chunks to reduce fragmentation -- new adjacent memory is
-  normally prepended or appended to an existing segment. However,
-  there are limitations compared to chunk consolidation that mostly
-  reflect the fact that segment processing is relatively infrequent
-  (occurring only when getting memory from system) and that we
-  don't expect to have huge numbers of segments:
-
-  * Segments are not indexed, so traversal requires linear scans.  (It
-    would be possible to index these, but is not worth the extra
-    overhead and complexity for most programs on most platforms.)
-  * New segments are only appended to old ones when holding top-most
-    memory; if they cannot be prepended to others, they are held in
-    different segments.
-
-  Except for the top-most segment of an mstate, each segment record
-  is kept at the tail of its segment. Segments are added by pushing
-  segment records onto the list headed by &mstate.seg for the
-  containing mstate.
-
-  Segment flags control allocation/merge/deallocation policies:
-  * If EXTERN_BIT set, then we did not allocate this segment,
-    and so should not try to deallocate or merge with others.
-    (This currently holds only for the initial segment passed
-    into create_mspace_with_base.)
-  * If USE_MMAP_BIT set, the segment may be merged with
-    other surrounding mmapped segments and trimmed/de-allocated
-    using munmap.
-  * If neither bit is set, then the segment was obtained using
-    MORECORE so can be merged with surrounding MORECORE'd segments
-    and deallocated/trimmed using MORECORE with negative arguments.
-*/
-
-struct malloc_segment {
-  char*        base;             /* base address */
-  size_t       size;             /* allocated size */
-  struct malloc_segment* next;   /* ptr to next segment */
-  flag_t       sflags;           /* mmap and extern flag */
-};
-
-#define is_mmapped_segment(S)  ((S)->sflags & USE_MMAP_BIT)
-#define is_extern_segment(S)   ((S)->sflags & EXTERN_BIT)
-
-typedef struct malloc_segment  msegment;
-typedef struct malloc_segment* msegmentptr;
-
-/* ---------------------------- malloc_state ----------------------------- */
-
-/*
-   A malloc_state holds all of the bookkeeping for a space.
-   The main fields are:
-
-  Top
-    The topmost chunk of the currently active segment. Its size is
-    cached in topsize.  The actual size of topmost space is
-    topsize+TOP_FOOT_SIZE, which includes space reserved for adding
-    fenceposts and segment records if necessary when getting more
-    space from the system.  The size at which to autotrim top is
-    cached from mparams in trim_check, except that it is disabled if
-    an autotrim fails.
-
-  Designated victim (dv)
-    This is the preferred chunk for servicing small requests that
-    don't have exact fits.  It is normally the chunk split off most
-    recently to service another small request.  Its size is cached in
-    dvsize. The link fields of this chunk are not maintained since it
-    is not kept in a bin.
-
-  SmallBins
-    An array of bin headers for free chunks.  These bins hold chunks
-    with sizes less than MIN_LARGE_SIZE bytes. Each bin contains
-    chunks of all the same size, spaced 8 bytes apart.  To simplify
-    use in double-linked lists, each bin header acts as a malloc_chunk
-    pointing to the real first node, if it exists (else pointing to
-    itself).  This avoids special-casing for headers.  But to avoid
-    waste, we allocate only the fd/bk pointers of bins, and then use
-    repositioning tricks to treat these as the fields of a chunk.
-
-  TreeBins
-    Treebins are pointers to the roots of trees holding a range of
-    sizes. There are 2 equally spaced treebins for each power of two
-    from TREE_SHIFT to TREE_SHIFT+16. The last bin holds anything
-    larger.
-
-  Bin maps
-    There is one bit map for small bins ("smallmap") and one for
-    treebins ("treemap).  Each bin sets its bit when non-empty, and
-    clears the bit when empty.  Bit operations are then used to avoid
-    bin-by-bin searching -- nearly all "search" is done without ever
-    looking at bins that won't be selected.  The bit maps
-    conservatively use 32 bits per map word, even if on 64bit system.
-    For a good description of some of the bit-based techniques used
-    here, see Henry S. Warren Jr's book "Hacker's Delight" (and
-    supplement at http://hackersdelight.org/). Many of these are
-    intended to reduce the branchiness of paths through malloc etc, as
-    well as to reduce the number of memory locations read or written.
-
-  Segments
-    A list of segments headed by an embedded malloc_segment record
-    representing the initial space.
-
-  Address check support
-    The least_addr field is the least address ever obtained from
-    MORECORE or MMAP. Attempted frees and reallocs of any address less
-    than this are trapped (unless INSECURE is defined).
-
-  Magic tag
-    A cross-check field that should always hold same value as mparams.magic.
-
-  Flags
-    Bits recording whether to use MMAP, locks, or contiguous MORECORE
-
-  Statistics
-    Each space keeps track of current and maximum system memory
-    obtained via MORECORE or MMAP.
-
-  Trim support
-    Fields holding the amount of unused topmost memory that should trigger
-    timming, and a counter to force periodic scanning to release unused
-    non-topmost segments.
-
-  Locking
-    If USE_LOCKS is defined, the "mutex" lock is acquired and released
-    around every public call using this mspace.
-
-  Extension support
-    A void* pointer and a size_t field that can be used to help implement
-    extensions to this malloc.
-*/
-
-/* Bin types, widths and sizes */
-#define NSMALLBINS        (32U)
-#define NTREEBINS         (32U)
-#define SMALLBIN_SHIFT    (3U)
-#define SMALLBIN_WIDTH    (SIZE_T_ONE << SMALLBIN_SHIFT)
-#define TREEBIN_SHIFT     (8U)
-#define MIN_LARGE_SIZE    (SIZE_T_ONE << TREEBIN_SHIFT)
-#define MAX_SMALL_SIZE    (MIN_LARGE_SIZE - SIZE_T_ONE)
-#define MAX_SMALL_REQUEST (MAX_SMALL_SIZE - CHUNK_ALIGN_MASK - CHUNK_OVERHEAD)
-
-struct malloc_state {
-  binmap_t   smallmap;
-  binmap_t   treemap;
-  size_t     dvsize;
-  size_t     topsize;
-  char*      least_addr;
-  mchunkptr  dv;
-  mchunkptr  top;
-  size_t     trim_check;
-  size_t     release_checks;
-  size_t     magic;
-  mchunkptr  smallbins[(NSMALLBINS+1)*2];
-  tbinptr    treebins[NTREEBINS];
-  size_t     footprint;
-  size_t     max_footprint;
-  flag_t     mflags;
-  msegment   seg;
-#if USE_LOCKS
-  MLOCK_T    mutex;     /* locate lock among fields that rarely change */
-#endif /* USE_LOCKS */
-  void*      extp;      /* Unused but available for extensions */
-  size_t     exts;
-};
-
-typedef struct malloc_state*    mstate;
-
-/* ------------- Global malloc_state and malloc_params ------------------- */
-
-#if !ONLY_MSPACES
-
-/* The global malloc_state used for all non-"mspace" calls */
-static struct malloc_state _gm_;
-#define gm                 (&_gm_)
-#define is_global(M)       ((M) == &_gm_)
-
-#endif /* !ONLY_MSPACES */
-
-#define is_initialized(M)  ((M)->top != 0)
-
-/* -------------------------- system alloc setup ------------------------- */
-
-/* Operations on mflags */
-
-#define use_lock(M)           ((M)->mflags &   USE_LOCK_BIT)
-#define enable_lock(M)        ((M)->mflags |=  USE_LOCK_BIT)
-#define disable_lock(M)       ((M)->mflags &= ~USE_LOCK_BIT)
-
-#define use_mmap(M)           ((M)->mflags &   USE_MMAP_BIT)
-#define enable_mmap(M)        ((M)->mflags |=  USE_MMAP_BIT)
-#define disable_mmap(M)       ((M)->mflags &= ~USE_MMAP_BIT)
-
-#define use_noncontiguous(M)  ((M)->mflags &   USE_NONCONTIGUOUS_BIT)
-#define disable_contiguous(M) ((M)->mflags |=  USE_NONCONTIGUOUS_BIT)
-
-#define set_lock(M,L)\
- ((M)->mflags = (L)?\
-  ((M)->mflags | USE_LOCK_BIT) :\
-  ((M)->mflags & ~USE_LOCK_BIT))
-
-/* page-align a size */
-#define page_align(S)\
- (((S) + (mparams.page_size - SIZE_T_ONE)) & ~(mparams.page_size - SIZE_T_ONE))
-
-/* granularity-align a size */
-#define granularity_align(S)\
-  (((S) + (mparams.granularity - SIZE_T_ONE))\
-   & ~(mparams.granularity - SIZE_T_ONE))
-
-
-/* For mmap, use granularity alignment on windows, else page-align */
-#ifdef WIN32
-#define mmap_align(S) granularity_align(S)
-#else
-#define mmap_align(S) page_align(S)
-#endif
-
-/* For sys_alloc, enough padding to ensure can malloc request on success */
-#define SYS_ALLOC_PADDING (TOP_FOOT_SIZE + MALLOC_ALIGNMENT)
-
-#define is_page_aligned(S)\
-   (((size_t)(S) & (mparams.page_size - SIZE_T_ONE)) == 0)
-#define is_granularity_aligned(S)\
-   (((size_t)(S) & (mparams.granularity - SIZE_T_ONE)) == 0)
-
-/*  True if segment S holds address A */
-#define segment_holds(S, A)\
-  ((char*)(A) >= S->base && (char*)(A) < S->base + S->size)
-
-/* Return segment holding given address */
-static msegmentptr segment_holding(mstate m, char* addr) {
-  msegmentptr sp = &m->seg;
-  for (;;) {
-    if (addr >= sp->base && addr < sp->base + sp->size)
-      return sp;
-    if ((sp = sp->next) == 0)
-      return 0;
-  }
-}
-
-/* Return true if segment contains a segment link */
-static int has_segment_link(mstate m, msegmentptr ss) {
-  msegmentptr sp = &m->seg;
-  for (;;) {
-    if ((char*)sp >= ss->base && (char*)sp < ss->base + ss->size)
-      return 1;
-    if ((sp = sp->next) == 0)
-      return 0;
-  }
-}
-
-#ifndef MORECORE_CANNOT_TRIM
-#define should_trim(M,s)  ((s) > (M)->trim_check)
-#else  /* MORECORE_CANNOT_TRIM */
-#define should_trim(M,s)  (0)
-#endif /* MORECORE_CANNOT_TRIM */
-
-/*
-  TOP_FOOT_SIZE is padding at the end of a segment, including space
-  that may be needed to place segment records and fenceposts when new
-  noncontiguous segments are added.
-*/
-#define TOP_FOOT_SIZE\
-  (align_offset(chunk2mem(0))+pad_request(sizeof(struct malloc_segment))+MIN_CHUNK_SIZE)
-
-
-/* -------------------------------  Hooks -------------------------------- */
-
-/*
-  PREACTION should be defined to return 0 on success, and nonzero on
-  failure. If you are not using locking, you can redefine these to do
-  anything you like.
-*/
-
-#if USE_LOCKS
-
-#define PREACTION(M)  ((use_lock(M))? ACQUIRE_LOCK(&(M)->mutex) : 0)
-#define POSTACTION(M) { if (use_lock(M)) RELEASE_LOCK(&(M)->mutex); }
-#else /* USE_LOCKS */
-
-#ifndef PREACTION
-#define PREACTION(M) (0)
-#endif  /* PREACTION */
-
-#ifndef POSTACTION
-#define POSTACTION(M)
-#endif  /* POSTACTION */
-
-#endif /* USE_LOCKS */
-
-/*
-  CORRUPTION_ERROR_ACTION is triggered upon detected bad addresses.
-  USAGE_ERROR_ACTION is triggered on detected bad frees and
-  reallocs. The argument p is an address that might have triggered the
-  fault. It is ignored by the two predefined actions, but might be
-  useful in custom actions that try to help diagnose errors.
-*/
-
-#if PROCEED_ON_ERROR
-
-/* A count of the number of corruption errors causing resets */
-int malloc_corruption_error_count;
-
-/* default corruption action */
-static void reset_on_error(mstate m);
-
-#define CORRUPTION_ERROR_ACTION(m)  reset_on_error(m)
-#define USAGE_ERROR_ACTION(m, p)
-
-#else /* PROCEED_ON_ERROR */
-
-#ifndef CORRUPTION_ERROR_ACTION
-#define CORRUPTION_ERROR_ACTION(m) ABORT
-#endif /* CORRUPTION_ERROR_ACTION */
-
-#ifndef USAGE_ERROR_ACTION
-#define USAGE_ERROR_ACTION(m,p) ABORT
-#endif /* USAGE_ERROR_ACTION */
-
-#endif /* PROCEED_ON_ERROR */
-
-/* -------------------------- Debugging setup ---------------------------- */
-
-#if ! DEBUG
-
-#define check_free_chunk(M,P)
-#define check_inuse_chunk(M,P)
-#define check_malloced_chunk(M,P,N)
-#define check_mmapped_chunk(M,P)
-#define check_malloc_state(M)
-#define check_top_chunk(M,P)
-
-#else /* DEBUG */
-#define check_free_chunk(M,P)       do_check_free_chunk(M,P)
-#define check_inuse_chunk(M,P)      do_check_inuse_chunk(M,P)
-#define check_top_chunk(M,P)        do_check_top_chunk(M,P)
-#define check_malloced_chunk(M,P,N) do_check_malloced_chunk(M,P,N)
-#define check_mmapped_chunk(M,P)    do_check_mmapped_chunk(M,P)
-#define check_malloc_state(M)       do_check_malloc_state(M)
-
-static void   do_check_any_chunk(mstate m, mchunkptr p);
-static void   do_check_top_chunk(mstate m, mchunkptr p);
-static void   do_check_mmapped_chunk(mstate m, mchunkptr p);
-static void   do_check_inuse_chunk(mstate m, mchunkptr p);
-static void   do_check_free_chunk(mstate m, mchunkptr p);
-static void   do_check_malloced_chunk(mstate m, void* mem, size_t s);
-static void   do_check_tree(mstate m, tchunkptr t);
-static void   do_check_treebin(mstate m, bindex_t i);
-static void   do_check_smallbin(mstate m, bindex_t i);
-static void   do_check_malloc_state(mstate m);
-static int    bin_find(mstate m, mchunkptr x);
-static size_t traverse_and_check(mstate m);
-#endif /* DEBUG */
-
-/* ---------------------------- Indexing Bins ---------------------------- */
-
-#define is_small(s)         (((s) >> SMALLBIN_SHIFT) < NSMALLBINS)
-#define small_index(s)      (bindex_t)((s)  >> SMALLBIN_SHIFT)
-#define small_index2size(i) ((i)  << SMALLBIN_SHIFT)
-#define MIN_SMALL_INDEX     (small_index(MIN_CHUNK_SIZE))
-
-/* addressing by index. See above about smallbin repositioning */
-#define smallbin_at(M, i)   ((sbinptr)((char*)&((M)->smallbins[(i)<<1])))
-#define treebin_at(M,i)     (&((M)->treebins[i]))
-
-/* assign tree index for size S to variable I. Use x86 asm if possible  */
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-#define compute_tree_index(S, I)\
-{\
-  unsigned int X = S >> TREEBIN_SHIFT;\
-  if (X == 0)\
-    I = 0;\
-  else if (X > 0xFFFF)\
-    I = NTREEBINS-1;\
-  else {\
-    unsigned int K;\
-    __asm__("bsrl\t%1, %0\n\t" : "=r" (K) : "g"  (X));\
-    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
-  }\
-}
-
-#elif defined (__INTEL_COMPILER)
-#define compute_tree_index(S, I)\
-{\
-  size_t X = S >> TREEBIN_SHIFT;\
-  if (X == 0)\
-    I = 0;\
-  else if (X > 0xFFFF)\
-    I = NTREEBINS-1;\
-  else {\
-    unsigned int K = _bit_scan_reverse (X); \
-    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
-  }\
-}
-
-#elif defined(_MSC_VER) && _MSC_VER>=1300
-#define compute_tree_index(S, I)\
-{\
-  size_t X = S >> TREEBIN_SHIFT;\
-  if (X == 0)\
-    I = 0;\
-  else if (X > 0xFFFF)\
-    I = NTREEBINS-1;\
-  else {\
-    unsigned int K;\
-    _BitScanReverse((DWORD *) &K, (DWORD) X);\
-    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
-  }\
-}
-
-#else /* GNUC */
-#define compute_tree_index(S, I)\
-{\
-  size_t X = S >> TREEBIN_SHIFT;\
-  if (X == 0)\
-    I = 0;\
-  else if (X > 0xFFFF)\
-    I = NTREEBINS-1;\
-  else {\
-    unsigned int Y = (unsigned int)X;\
-    unsigned int N = ((Y - 0x100) >> 16) & 8;\
-    unsigned int K = (((Y <<= N) - 0x1000) >> 16) & 4;\
-    N += K;\
-    N += K = (((Y <<= K) - 0x4000) >> 16) & 2;\
-    K = 14 - N + ((Y <<= K) >> 15);\
-    I = (K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1));\
-  }\
-}
-#endif /* GNUC */
-
-/* Bit representing maximum resolved size in a treebin at i */
-#define bit_for_tree_index(i) \
-   (i == NTREEBINS-1)? (SIZE_T_BITSIZE-1) : (((i) >> 1) + TREEBIN_SHIFT - 2)
-
-/* Shift placing maximum resolved bit in a treebin at i as sign bit */
-#define leftshift_for_tree_index(i) \
-   ((i == NTREEBINS-1)? 0 : \
-    ((SIZE_T_BITSIZE-SIZE_T_ONE) - (((i) >> 1) + TREEBIN_SHIFT - 2)))
-
-/* The size of the smallest chunk held in bin with index i */
-#define minsize_for_tree_index(i) \
-   ((SIZE_T_ONE << (((i) >> 1) + TREEBIN_SHIFT)) |  \
-   (((size_t)((i) & SIZE_T_ONE)) << (((i) >> 1) + TREEBIN_SHIFT - 1)))
-
-
-/* ------------------------ Operations on bin maps ----------------------- */
-
-/* bit corresponding to given index */
-#define idx2bit(i)              ((binmap_t)(1) << (i))
-
-/* Mark/Clear bits with given index */
-#define mark_smallmap(M,i)      ((M)->smallmap |=  idx2bit(i))
-#define clear_smallmap(M,i)     ((M)->smallmap &= ~idx2bit(i))
-#define smallmap_is_marked(M,i) ((M)->smallmap &   idx2bit(i))
-
-#define mark_treemap(M,i)       ((M)->treemap  |=  idx2bit(i))
-#define clear_treemap(M,i)      ((M)->treemap  &= ~idx2bit(i))
-#define treemap_is_marked(M,i)  ((M)->treemap  &   idx2bit(i))
-
-/* isolate the least set bit of a bitmap */
-#define least_bit(x)         ((x) & -(x))
-
-/* mask with all bits to left of least bit of x on */
-#define left_bits(x)         ((x<<1) | -(x<<1))
-
-/* mask with all bits to left of or equal to least bit of x on */
-#define same_or_left_bits(x) ((x) | -(x))
-
-/* index corresponding to given bit. Use x86 asm if possible */
-
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int J;\
-  __asm__("bsfl\t%1, %0\n\t" : "=r" (J) : "g" (X));\
-  I = (bindex_t)J;\
-}
-
-#elif defined (__INTEL_COMPILER)
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int J;\
-  J = _bit_scan_forward (X); \
-  I = (bindex_t)J;\
-}
-
-#elif defined(_MSC_VER) && _MSC_VER>=1300
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int J;\
-  _BitScanForward((DWORD *) &J, X);\
-  I = (bindex_t)J;\
-}
-
-#elif USE_BUILTIN_FFS
-#define compute_bit2idx(X, I) I = ffs(X)-1
-
-#else
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int Y = X - 1;\
-  unsigned int K = Y >> (16-4) & 16;\
-  unsigned int N = K;        Y >>= K;\
-  N += K = Y >> (8-3) &  8;  Y >>= K;\
-  N += K = Y >> (4-2) &  4;  Y >>= K;\
-  N += K = Y >> (2-1) &  2;  Y >>= K;\
-  N += K = Y >> (1-0) &  1;  Y >>= K;\
-  I = (bindex_t)(N + Y);\
-}
-#endif /* GNUC */
-
-
-/* ----------------------- Runtime Check Support ------------------------- */
-
-/*
-  For security, the main invariant is that malloc/free/etc never
-  writes to a static address other than malloc_state, unless static
-  malloc_state itself has been corrupted, which cannot occur via
-  malloc (because of these checks). In essence this means that we
-  believe all pointers, sizes, maps etc held in malloc_state, but
-  check all of those linked or offsetted from other embedded data
-  structures.  These checks are interspersed with main code in a way
-  that tends to minimize their run-time cost.
-
-  When FOOTERS is defined, in addition to range checking, we also
-  verify footer fields of inuse chunks, which can be used guarantee
-  that the mstate controlling malloc/free is intact.  This is a
-  streamlined version of the approach described by William Robertson
-  et al in "Run-time Detection of Heap-based Overflows" LISA'03
-  http://www.usenix.org/events/lisa03/tech/robertson.html The footer
-  of an inuse chunk holds the xor of its mstate and a random seed,
-  that is checked upon calls to free() and realloc().  This is
-  (probablistically) unguessable from outside the program, but can be
-  computed by any code successfully malloc'ing any chunk, so does not
-  itself provide protection against code that has already broken
-  security through some other means.  Unlike Robertson et al, we
-  always dynamically check addresses of all offset chunks (previous,
-  next, etc). This turns out to be cheaper than relying on hashes.
-*/
-
-#if !INSECURE
-/* Check if address a is at least as high as any from MORECORE or MMAP */
-#define ok_address(M, a) ((char*)(a) >= (M)->least_addr)
-/* Check if address of next chunk n is higher than base chunk p */
-#define ok_next(p, n)    ((char*)(p) < (char*)(n))
-/* Check if p has inuse status */
-#define ok_inuse(p)     is_inuse(p)
-/* Check if p has its pinuse bit on */
-#define ok_pinuse(p)     pinuse(p)
-
-#else /* !INSECURE */
-#define ok_address(M, a) (1)
-#define ok_next(b, n)    (1)
-#define ok_inuse(p)      (1)
-#define ok_pinuse(p)     (1)
-#endif /* !INSECURE */
-
-#if (FOOTERS && !INSECURE)
-/* Check if (alleged) mstate m has expected magic field */
-#define ok_magic(M)      ((M)->magic == mparams.magic)
-#else  /* (FOOTERS && !INSECURE) */
-#define ok_magic(M)      (1)
-#endif /* (FOOTERS && !INSECURE) */
-
-
-/* In gcc, use __builtin_expect to minimize impact of checks */
-#if !INSECURE
-#if defined(__GNUC__) && __GNUC__ >= 3
-#define RTCHECK(e)  __builtin_expect(e, 1)
-#else /* GNUC */
-#define RTCHECK(e)  (e)
-#endif /* GNUC */
-#else /* !INSECURE */
-#define RTCHECK(e)  (1)
-#endif /* !INSECURE */
-
-/* macros to set up inuse chunks with or without footers */
-
-#if !FOOTERS
-
-#define mark_inuse_foot(M,p,s)
-
-/* Macros for setting head/foot of non-mmapped chunks */
-
-/* Set cinuse bit and pinuse bit of next chunk */
-#define set_inuse(M,p,s)\
-  ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\
-  ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT)
-
-/* Set cinuse and pinuse of this chunk and pinuse of next chunk */
-#define set_inuse_and_pinuse(M,p,s)\
-  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
-  ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT)
-
-/* Set size, cinuse and pinuse bit of this chunk */
-#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
-  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT))
-
-#else /* FOOTERS */
-
-/* Set foot of inuse chunk to be xor of mstate and seed */
-#define mark_inuse_foot(M,p,s)\
-  (((mchunkptr)((char*)(p) + (s)))->prev_foot = ((size_t)(M) ^ mparams.magic))
-
-#define get_mstate_for(p)\
-  ((mstate)(((mchunkptr)((char*)(p) +\
-    (chunksize(p))))->prev_foot ^ mparams.magic))
-
-#define set_inuse(M,p,s)\
-  ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\
-  (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT), \
-  mark_inuse_foot(M,p,s))
-
-#define set_inuse_and_pinuse(M,p,s)\
-  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
-  (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT),\
- mark_inuse_foot(M,p,s))
-
-#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
-  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
-  mark_inuse_foot(M, p, s))
-
-#endif /* !FOOTERS */
-
-/* ---------------------------- setting mparams -------------------------- */
-
-#ifdef ENABLE_LARGE_PAGES
-typedef size_t (WINAPI *GetLargePageMinimum_t)(void);
-#endif
-
-/* Initialize mparams */
-static int init_mparams(void) {
-#ifdef NEED_GLOBAL_LOCK_INIT
-  if (malloc_global_mutex_status <= 0)
-    init_malloc_global_mutex();
-#endif
-
-  ACQUIRE_MALLOC_GLOBAL_LOCK();
-  if (mparams.magic == 0) {
-    size_t magic;
-    size_t psize;
-    size_t gsize;
-
-#ifndef WIN32
-    psize = malloc_getpagesize;
-    gsize = ((DEFAULT_GRANULARITY != 0)? DEFAULT_GRANULARITY : psize);
-#else /* WIN32 */
-    {
-      SYSTEM_INFO system_info;
-      GetSystemInfo(&system_info);
-      psize = system_info.dwPageSize;
-      gsize = ((DEFAULT_GRANULARITY != 0)?
-               DEFAULT_GRANULARITY : system_info.dwAllocationGranularity);
-#ifdef ENABLE_LARGE_PAGES
-      { 
-          GetLargePageMinimum_t GetLargePageMinimum_ = (GetLargePageMinimum_t) GetProcAddress(GetModuleHandle(__T("kernel32.dll")), "GetLargePageMinimum");
-          if(GetLargePageMinimum_) {
-              size_t largepagesize = GetLargePageMinimum_();
-              if(largepagesize) {
-                  psize = largepagesize;
-                  gsize = ((DEFAULT_GRANULARITY != 0)?
-                           DEFAULT_GRANULARITY : largepagesize);
-                  if(gsize < largepagesize) gsize = largepagesize;
-              }
-          }
-      }
-#endif
-    }
-#endif /* WIN32 */
-
-    /* Sanity-check configuration:
-       size_t must be unsigned and as wide as pointer type.
-       ints must be at least 4 bytes.
-       alignment must be at least 8.
-       Alignment, min chunk size, and page size must all be powers of 2.
-    */
-    if ((sizeof(size_t) != sizeof(char*)) ||
-        (MAX_SIZE_T < MIN_CHUNK_SIZE)  ||
-        (sizeof(int) < 4)  ||
-        (MALLOC_ALIGNMENT < (size_t)8U) ||
-        ((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-SIZE_T_ONE)) != 0) ||
-        ((MCHUNK_SIZE      & (MCHUNK_SIZE-SIZE_T_ONE))      != 0) ||
-        ((gsize            & (gsize-SIZE_T_ONE))            != 0) ||
-        ((psize            & (psize-SIZE_T_ONE))            != 0))
-      ABORT;
-
-    mparams.granularity = gsize;
-    mparams.page_size = psize;
-    mparams.mmap_threshold = DEFAULT_MMAP_THRESHOLD;
-    mparams.trim_threshold = DEFAULT_TRIM_THRESHOLD;
-#if MORECORE_CONTIGUOUS
-    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT;
-#else  /* MORECORE_CONTIGUOUS */
-    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT|USE_NONCONTIGUOUS_BIT;
-#endif /* MORECORE_CONTIGUOUS */
-
-#if !ONLY_MSPACES
-    /* Set up lock for main malloc area */
-    gm->mflags = mparams.default_mflags;
-    INITIAL_LOCK(&gm->mutex);
-#endif
-
-    {
-#if USE_DEV_RANDOM
-      int fd;
-      unsigned char buf[sizeof(size_t)];
-      /* Try to use /dev/urandom, else fall back on using time */
-      if ((fd = open("/dev/urandom", O_RDONLY)) >= 0 &&
-          read(fd, buf, sizeof(buf)) == sizeof(buf)) {
-        magic = *((size_t *) buf);
-        close(fd);
-      }
-      else
-#endif /* USE_DEV_RANDOM */
-#ifdef WIN32
-        magic = (size_t)(GetTickCount() ^ (size_t)0x55555555U);
-#else
-        magic = (size_t)(time(0) ^ (size_t)0x55555555U);
-#endif
-      magic |= (size_t)8U;    /* ensure nonzero */
-      magic &= ~(size_t)7U;   /* improve chances of fault for bad values */
-      mparams.magic = magic;
-    }
-  }
-
-  RELEASE_MALLOC_GLOBAL_LOCK();
-  return 1;
-}
-
-/* support for mallopt */
-static int change_mparam(int param_number, int value) {
-  size_t val;
-  ensure_initialization();
-  val = (value == -1)? MAX_SIZE_T : (size_t)value;
-  switch(param_number) {
-  case M_TRIM_THRESHOLD:
-    mparams.trim_threshold = val;
-    return 1;
-  case M_GRANULARITY:
-    if (val >= mparams.page_size && ((val & (val-1)) == 0)) {
-      mparams.granularity = val;
-      return 1;
-    }
-    else
-      return 0;
-  case M_MMAP_THRESHOLD:
-    mparams.mmap_threshold = val;
-    return 1;
-  default:
-    return 0;
-  }
-}
-
-#if DEBUG
-/* ------------------------- Debugging Support --------------------------- */
-
-/* Check properties of any chunk, whether free, inuse, mmapped etc  */
-static void do_check_any_chunk(mstate m, mchunkptr p) {
-  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
-  assert(ok_address(m, p));
-}
-
-/* Check properties of top chunk */
-static void do_check_top_chunk(mstate m, mchunkptr p) {
-  msegmentptr sp = segment_holding(m, (char*)p);
-  size_t  sz = p->head & ~INUSE_BITS; /* third-lowest bit can be set! */
-  assert(sp != 0);
-  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
-  assert(ok_address(m, p));
-  assert(sz == m->topsize);
-  assert(sz > 0);
-  assert(sz == ((sp->base + sp->size) - (char*)p) - TOP_FOOT_SIZE);
-  assert(pinuse(p));
-  assert(!pinuse(chunk_plus_offset(p, sz)));
-}
-
-/* Check properties of (inuse) mmapped chunks */
-static void do_check_mmapped_chunk(mstate m, mchunkptr p) {
-  size_t  sz = chunksize(p);
-  size_t len = (sz + (p->prev_foot) + MMAP_FOOT_PAD);
-  assert(is_mmapped(p));
-  assert(use_mmap(m));
-  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
-  assert(ok_address(m, p));
-  assert(!is_small(sz));
-  assert((len & (mparams.page_size-SIZE_T_ONE)) == 0);
-  assert(chunk_plus_offset(p, sz)->head == FENCEPOST_HEAD);
-  assert(chunk_plus_offset(p, sz+SIZE_T_SIZE)->head == 0);
-}
-
-/* Check properties of inuse chunks */
-static void do_check_inuse_chunk(mstate m, mchunkptr p) {
-  do_check_any_chunk(m, p);
-  assert(is_inuse(p));
-  assert(next_pinuse(p));
-  /* If not pinuse and not mmapped, previous chunk has OK offset */
-  assert(is_mmapped(p) || pinuse(p) || next_chunk(prev_chunk(p)) == p);
-  if (is_mmapped(p))
-    do_check_mmapped_chunk(m, p);
-}
-
-/* Check properties of free chunks */
-static void do_check_free_chunk(mstate m, mchunkptr p) {
-  size_t sz = chunksize(p);
-  mchunkptr next = chunk_plus_offset(p, sz);
-  do_check_any_chunk(m, p);
-  assert(!is_inuse(p));
-  assert(!next_pinuse(p));
-  assert (!is_mmapped(p));
-  if (p != m->dv && p != m->top) {
-    if (sz >= MIN_CHUNK_SIZE) {
-      assert((sz & CHUNK_ALIGN_MASK) == 0);
-      assert(is_aligned(chunk2mem(p)));
-      assert(next->prev_foot == sz);
-      assert(pinuse(p));
-      assert (next == m->top || is_inuse(next));
-      assert(p->fd->bk == p);
-      assert(p->bk->fd == p);
-    }
-    else  /* markers are always of size SIZE_T_SIZE */
-      assert(sz == SIZE_T_SIZE);
-  }
-}
-
-/* Check properties of malloced chunks at the point they are malloced */
-static void do_check_malloced_chunk(mstate m, void* mem, size_t s) {
-  if (mem != 0) {
-    mchunkptr p = mem2chunk(mem);
-    size_t sz = p->head & ~INUSE_BITS;
-    do_check_inuse_chunk(m, p);
-    assert((sz & CHUNK_ALIGN_MASK) == 0);
-    assert(sz >= MIN_CHUNK_SIZE);
-    assert(sz >= s);
-    /* unless mmapped, size is less than MIN_CHUNK_SIZE more than request */
-    assert(is_mmapped(p) || sz < (s + MIN_CHUNK_SIZE));
-  }
-}
-
-/* Check a tree and its subtrees.  */
-static void do_check_tree(mstate m, tchunkptr t) {
-  tchunkptr head = 0;
-  tchunkptr u = t;
-  bindex_t tindex = t->index;
-  size_t tsize = chunksize(t);
-  bindex_t idx;
-  compute_tree_index(tsize, idx);
-  assert(tindex == idx);
-  assert(tsize >= MIN_LARGE_SIZE);
-  assert(tsize >= minsize_for_tree_index(idx));
-  assert((idx == NTREEBINS-1) || (tsize < minsize_for_tree_index((idx+1))));
-
-  do { /* traverse through chain of same-sized nodes */
-    do_check_any_chunk(m, ((mchunkptr)u));
-    assert(u->index == tindex);
-    assert(chunksize(u) == tsize);
-    assert(!is_inuse(u));
-    assert(!next_pinuse(u));
-    assert(u->fd->bk == u);
-    assert(u->bk->fd == u);
-    if (u->parent == 0) {
-      assert(u->child[0] == 0);
-      assert(u->child[1] == 0);
-    }
-    else {
-      assert(head == 0); /* only one node on chain has parent */
-      head = u;
-      assert(u->parent != u);
-      assert (u->parent->child[0] == u ||
-              u->parent->child[1] == u ||
-              *((tbinptr*)(u->parent)) == u);
-      if (u->child[0] != 0) {
-        assert(u->child[0]->parent == u);
-        assert(u->child[0] != u);
-        do_check_tree(m, u->child[0]);
-      }
-      if (u->child[1] != 0) {
-        assert(u->child[1]->parent == u);
-        assert(u->child[1] != u);
-        do_check_tree(m, u->child[1]);
-      }
-      if (u->child[0] != 0 && u->child[1] != 0) {
-        assert(chunksize(u->child[0]) < chunksize(u->child[1]));
-      }
-    }
-    u = u->fd;
-  } while (u != t);
-  assert(head != 0);
-}
-
-/*  Check all the chunks in a treebin.  */
-static void do_check_treebin(mstate m, bindex_t i) {
-  tbinptr* tb = treebin_at(m, i);
-  tchunkptr t = *tb;
-  int empty = (m->treemap & (1U << i)) == 0;
-  if (t == 0)
-    assert(empty);
-  if (!empty)
-    do_check_tree(m, t);
-}
-
-/*  Check all the chunks in a smallbin.  */
-static void do_check_smallbin(mstate m, bindex_t i) {
-  sbinptr b = smallbin_at(m, i);
-  mchunkptr p = b->bk;
-  unsigned int empty = (m->smallmap & (1U << i)) == 0;
-  if (p == b)
-    assert(empty);
-  if (!empty) {
-    for (; p != b; p = p->bk) {
-      size_t size = chunksize(p);
-      mchunkptr q;
-      /* each chunk claims to be free */
-      do_check_free_chunk(m, p);
-      /* chunk belongs in bin */
-      assert(small_index(size) == i);
-      assert(p->bk == b || chunksize(p->bk) == chunksize(p));
-      /* chunk is followed by an inuse chunk */
-      q = next_chunk(p);
-      if (q->head != FENCEPOST_HEAD)
-        do_check_inuse_chunk(m, q);
-    }
-  }
-}
-
-/* Find x in a bin. Used in other check functions. */
-static int bin_find(mstate m, mchunkptr x) {
-  size_t size = chunksize(x);
-  if (is_small(size)) {
-    bindex_t sidx = small_index(size);
-    sbinptr b = smallbin_at(m, sidx);
-    if (smallmap_is_marked(m, sidx)) {
-      mchunkptr p = b;
-      do {
-        if (p == x)
-          return 1;
-      } while ((p = p->fd) != b);
-    }
-  }
-  else {
-    bindex_t tidx;
-    compute_tree_index(size, tidx);
-    if (treemap_is_marked(m, tidx)) {
-      tchunkptr t = *treebin_at(m, tidx);
-      size_t sizebits = size << leftshift_for_tree_index(tidx);
-      while (t != 0 && chunksize(t) != size) {
-        t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
-        sizebits <<= 1;
-      }
-      if (t != 0) {
-        tchunkptr u = t;
-        do {
-          if (u == (tchunkptr)x)
-            return 1;
-        } while ((u = u->fd) != t);
-      }
-    }
-  }
-  return 0;
-}
-
-/* Traverse each chunk and check it; return total */
-static size_t traverse_and_check(mstate m) {
-  size_t sum = 0;
-  if (is_initialized(m)) {
-    msegmentptr s = &m->seg;
-    sum += m->topsize + TOP_FOOT_SIZE;
-    while (s != 0) {
-      mchunkptr q = align_as_chunk(s->base);
-      mchunkptr lastq = 0;
-      assert(pinuse(q));
-      while (segment_holds(s, q) &&
-             q != m->top && q->head != FENCEPOST_HEAD) {
-        sum += chunksize(q);
-        if (is_inuse(q)) {
-          assert(!bin_find(m, q));
-          do_check_inuse_chunk(m, q);
-        }
-        else {
-          assert(q == m->dv || bin_find(m, q));
-          assert(lastq == 0 || is_inuse(lastq)); /* Not 2 consecutive free */
-          do_check_free_chunk(m, q);
-        }
-        lastq = q;
-        q = next_chunk(q);
-      }
-      s = s->next;
-    }
-  }
-  return sum;
-}
-
-/* Check all properties of malloc_state. */
-static void do_check_malloc_state(mstate m) {
-  bindex_t i;
-  size_t total;
-  /* check bins */
-  for (i = 0; i < NSMALLBINS; ++i)
-    do_check_smallbin(m, i);
-  for (i = 0; i < NTREEBINS; ++i)
-    do_check_treebin(m, i);
-
-  if (m->dvsize != 0) { /* check dv chunk */
-    do_check_any_chunk(m, m->dv);
-    assert(m->dvsize == chunksize(m->dv));
-    assert(m->dvsize >= MIN_CHUNK_SIZE);
-    assert(bin_find(m, m->dv) == 0);
-  }
-
-  if (m->top != 0) {   /* check top chunk */
-    do_check_top_chunk(m, m->top);
-    /*assert(m->topsize == chunksize(m->top)); redundant */
-    assert(m->topsize > 0);
-    assert(bin_find(m, m->top) == 0);
-  }
-
-  total = traverse_and_check(m);
-  assert(total <= m->footprint);
-  assert(m->footprint <= m->max_footprint);
-}
-#endif /* DEBUG */
-
-/* ----------------------------- statistics ------------------------------ */
-
-#if !NO_MALLINFO
-static struct mallinfo internal_mallinfo(mstate m) {
-  struct mallinfo nm = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-  ensure_initialization();
-  if (!PREACTION(m)) {
-    check_malloc_state(m);
-    if (is_initialized(m)) {
-      size_t nfree = SIZE_T_ONE; /* top always free */
-      size_t mfree = m->topsize + TOP_FOOT_SIZE;
-      size_t sum = mfree;
-      msegmentptr s = &m->seg;
-      while (s != 0) {
-        mchunkptr q = align_as_chunk(s->base);
-        while (segment_holds(s, q) &&
-               q != m->top && q->head != FENCEPOST_HEAD) {
-          size_t sz = chunksize(q);
-          sum += sz;
-          if (!is_inuse(q)) {
-            mfree += sz;
-            ++nfree;
-          }
-          q = next_chunk(q);
-        }
-        s = s->next;
-      }
-
-      nm.arena    = sum;
-      nm.ordblks  = nfree;
-      nm.hblkhd   = m->footprint - sum;
-      nm.usmblks  = m->max_footprint;
-      nm.uordblks = m->footprint - mfree;
-      nm.fordblks = mfree;
-      nm.keepcost = m->topsize;
-    }
-
-    POSTACTION(m);
-  }
-  return nm;
-}
-#endif /* !NO_MALLINFO */
-
-static void internal_malloc_stats(mstate m) {
-  ensure_initialization();
-  if (!PREACTION(m)) {
-    size_t maxfp = 0;
-    size_t fp = 0;
-    size_t used = 0;
-    check_malloc_state(m);
-    if (is_initialized(m)) {
-      msegmentptr s = &m->seg;
-      maxfp = m->max_footprint;
-      fp = m->footprint;
-      used = fp - (m->topsize + TOP_FOOT_SIZE);
-
-      while (s != 0) {
-        mchunkptr q = align_as_chunk(s->base);
-        while (segment_holds(s, q) &&
-               q != m->top && q->head != FENCEPOST_HEAD) {
-          if (!is_inuse(q))
-            used -= chunksize(q);
-          q = next_chunk(q);
-        }
-        s = s->next;
-      }
-    }
-
-    fprintf(stderr, "max system bytes = %10lu\n", (unsigned long)(maxfp));
-    fprintf(stderr, "system bytes     = %10lu\n", (unsigned long)(fp));
-    fprintf(stderr, "in use bytes     = %10lu\n", (unsigned long)(used));
-
-    POSTACTION(m);
-  }
-}
-
-/* ----------------------- Operations on smallbins ----------------------- */
-
-/*
-  Various forms of linking and unlinking are defined as macros.  Even
-  the ones for trees, which are very long but have very short typical
-  paths.  This is ugly but reduces reliance on inlining support of
-  compilers.
-*/
-
-/* Link a free chunk into a smallbin  */
-#define insert_small_chunk(M, P, S) {\
-  bindex_t I  = small_index(S);\
-  mchunkptr B = smallbin_at(M, I);\
-  mchunkptr F = B;\
-  assert(S >= MIN_CHUNK_SIZE);\
-  if (!smallmap_is_marked(M, I))\
-    mark_smallmap(M, I);\
-  else if (RTCHECK(ok_address(M, B->fd)))\
-    F = B->fd;\
-  else {\
-    CORRUPTION_ERROR_ACTION(M);\
-  }\
-  B->fd = P;\
-  F->bk = P;\
-  P->fd = F;\
-  P->bk = B;\
-}
-
-/* Unlink a chunk from a smallbin  */
-#define unlink_small_chunk(M, P, S) {\
-  mchunkptr F = P->fd;\
-  mchunkptr B = P->bk;\
-  bindex_t I = small_index(S);\
-  assert(P != B);\
-  assert(P != F);\
-  assert(chunksize(P) == small_index2size(I));\
-  if (F == B)\
-    clear_smallmap(M, I);\
-  else if (RTCHECK((F == smallbin_at(M,I) || ok_address(M, F)) &&\
-                   (B == smallbin_at(M,I) || ok_address(M, B)))) {\
-    F->bk = B;\
-    B->fd = F;\
-  }\
-  else {\
-    CORRUPTION_ERROR_ACTION(M);\
-  }\
-}
-
-/* Unlink the first chunk from a smallbin */
-#define unlink_first_small_chunk(M, B, P, I) {\
-  mchunkptr F = P->fd;\
-  assert(P != B);\
-  assert(P != F);\
-  assert(chunksize(P) == small_index2size(I));\
-  if (B == F)\
-    clear_smallmap(M, I);\
-  else if (RTCHECK(ok_address(M, F))) {\
-    B->fd = F;\
-    F->bk = B;\
-  }\
-  else {\
-    CORRUPTION_ERROR_ACTION(M);\
-  }\
-}
-
-
-
-/* Replace dv node, binning the old one */
-/* Used only when dvsize known to be small */
-#define replace_dv(M, P, S) {\
-  size_t DVS = M->dvsize;\
-  if (DVS != 0) {\
-    mchunkptr DV = M->dv;\
-    assert(is_small(DVS));\
-    insert_small_chunk(M, DV, DVS);\
-  }\
-  M->dvsize = S;\
-  M->dv = P;\
-}
-
-/* ------------------------- Operations on trees ------------------------- */
-
-/* Insert chunk into tree */
-#define insert_large_chunk(M, X, S) {\
-  tbinptr* H;\
-  bindex_t I;\
-  compute_tree_index(S, I);\
-  H = treebin_at(M, I);\
-  X->index = I;\
-  X->child[0] = X->child[1] = 0;\
-  if (!treemap_is_marked(M, I)) {\
-    mark_treemap(M, I);\
-    *H = X;\
-    X->parent = (tchunkptr)H;\
-    X->fd = X->bk = X;\
-  }\
-  else {\
-    tchunkptr T = *H;\
-    size_t K = S << leftshift_for_tree_index(I);\
-    for (;;) {\
-      if (chunksize(T) != S) {\
-        tchunkptr* C = &(T->child[(K >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]);\
-        K <<= 1;\
-        if (*C != 0)\
-          T = *C;\
-        else if (RTCHECK(ok_address(M, C))) {\
-          *C = X;\
-          X->parent = T;\
-          X->fd = X->bk = X;\
-          break;\
-        }\
-        else {\
-          CORRUPTION_ERROR_ACTION(M);\
-          break;\
-        }\
-      }\
-      else {\
-        tchunkptr F = T->fd;\
-        if (RTCHECK(ok_address(M, T) && ok_address(M, F))) {\
-          T->fd = F->bk = X;\
-          X->fd = F;\
-          X->bk = T;\
-          X->parent = 0;\
-          break;\
-        }\
-        else {\
-          CORRUPTION_ERROR_ACTION(M);\
-          break;\
-        }\
-      }\
-    }\
-  }\
-}
-
-/*
-  Unlink steps:
-
-  1. If x is a chained node, unlink it from its same-sized fd/bk links
-     and choose its bk node as its replacement.
-  2. If x was the last node of its size, but not a leaf node, it must
-     be replaced with a leaf node (not merely one with an open left or
-     right), to make sure that lefts and rights of descendents
-     correspond properly to bit masks.  We use the rightmost descendent
-     of x.  We could use any other leaf, but this is easy to locate and
-     tends to counteract removal of leftmosts elsewhere, and so keeps
-     paths shorter than minimally guaranteed.  This doesn't loop much
-     because on average a node in a tree is near the bottom.
-  3. If x is the base of a chain (i.e., has parent links) relink
-     x's parent and children to x's replacement (or null if none).
-*/
-
-#define unlink_large_chunk(M, X) {\
-  tchunkptr XP = X->parent;\
-  tchunkptr R;\
-  if (X->bk != X) {\
-    tchunkptr F = X->fd;\
-    R = X->bk;\
-    if (RTCHECK(ok_address(M, F))) {\
-      F->bk = R;\
-      R->fd = F;\
-    }\
-    else {\
-      CORRUPTION_ERROR_ACTION(M);\
-    }\
-  }\
-  else {\
-    tchunkptr* RP;\
-    if (((R = *(RP = &(X->child[1]))) != 0) ||\
-        ((R = *(RP = &(X->child[0]))) != 0)) {\
-      tchunkptr* CP;\
-      while ((*(CP = &(R->child[1])) != 0) ||\
-             (*(CP = &(R->child[0])) != 0)) {\
-        R = *(RP = CP);\
-      }\
-      if (RTCHECK(ok_address(M, RP)))\
-        *RP = 0;\
-      else {\
-        CORRUPTION_ERROR_ACTION(M);\
-      }\
-    }\
-  }\
-  if (XP != 0) {\
-    tbinptr* H = treebin_at(M, X->index);\
-    if (X == *H) {\
-      if ((*H = R) == 0) \
-        clear_treemap(M, X->index);\
-    }\
-    else if (RTCHECK(ok_address(M, XP))) {\
-      if (XP->child[0] == X) \
-        XP->child[0] = R;\
-      else \
-        XP->child[1] = R;\
-    }\
-    else\
-      CORRUPTION_ERROR_ACTION(M);\
-    if (R != 0) {\
-      if (RTCHECK(ok_address(M, R))) {\
-        tchunkptr C0, C1;\
-        R->parent = XP;\
-        if ((C0 = X->child[0]) != 0) {\
-          if (RTCHECK(ok_address(M, C0))) {\
-            R->child[0] = C0;\
-            C0->parent = R;\
-          }\
-          else\
-            CORRUPTION_ERROR_ACTION(M);\
-        }\
-        if ((C1 = X->child[1]) != 0) {\
-          if (RTCHECK(ok_address(M, C1))) {\
-            R->child[1] = C1;\
-            C1->parent = R;\
-          }\
-          else\
-            CORRUPTION_ERROR_ACTION(M);\
-        }\
-      }\
-      else\
-        CORRUPTION_ERROR_ACTION(M);\
-    }\
-  }\
-}
-
-/* Relays to large vs small bin operations */
-
-#define insert_chunk(M, P, S)\
-  if (is_small(S)) insert_small_chunk(M, P, S)\
-  else { tchunkptr TP = (tchunkptr)(P); insert_large_chunk(M, TP, S); }
-
-#define unlink_chunk(M, P, S)\
-  if (is_small(S)) unlink_small_chunk(M, P, S)\
-  else { tchunkptr TP = (tchunkptr)(P); unlink_large_chunk(M, TP); }
-
-
-/* Relays to internal calls to malloc/free from realloc, memalign etc */
-
-#if ONLY_MSPACES
-#define internal_malloc(m, b) mspace_malloc(m, b)
-#define internal_free(m, mem) mspace_free(m,mem);
-#else /* ONLY_MSPACES */
-#if MSPACES
-#define internal_malloc(m, b)\
-   (m == gm)? dlmalloc(b) : mspace_malloc(m, b)
-#define internal_free(m, mem)\
-   if (m == gm) dlfree(mem); else mspace_free(m,mem);
-#else /* MSPACES */
-#define internal_malloc(m, b) dlmalloc(b)
-#define internal_free(m, mem) dlfree(mem)
-#endif /* MSPACES */
-#endif /* ONLY_MSPACES */
-
-/* -----------------------  Direct-mmapping chunks ----------------------- */
-
-/*
-  Directly mmapped chunks are set up with an offset to the start of
-  the mmapped region stored in the prev_foot field of the chunk. This
-  allows reconstruction of the required argument to MUNMAP when freed,
-  and also allows adjustment of the returned chunk to meet alignment
-  requirements (especially in memalign).
-*/
-
-/* Malloc using mmap */
-static void* mmap_alloc(mstate m, size_t nb) {
-  size_t mmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
-  if (mmsize > nb) {     /* Check for wrap around 0 */
-    char* mm = (char*)(CALL_DIRECT_MMAP(mmsize));
-    if (mm != CMFAIL) {
-      size_t offset = align_offset(chunk2mem(mm));
-      size_t psize = mmsize - offset - MMAP_FOOT_PAD;
-      mchunkptr p = (mchunkptr)(mm + offset);
-      p->prev_foot = offset;
-      p->head = psize;
-      mark_inuse_foot(m, p, psize);
-      chunk_plus_offset(p, psize)->head = FENCEPOST_HEAD;
-      chunk_plus_offset(p, psize+SIZE_T_SIZE)->head = 0;
-
-      if (m->least_addr == 0 || mm < m->least_addr)
-        m->least_addr = mm;
-      if ((m->footprint += mmsize) > m->max_footprint)
-        m->max_footprint = m->footprint;
-      assert(is_aligned(chunk2mem(p)));
-      check_mmapped_chunk(m, p);
-      return chunk2mem(p);
-    }
-  }
-  return 0;
-}
-
-/* Realloc using mmap */
-static mchunkptr mmap_resize(mstate m, mchunkptr oldp, size_t nb) {
-  size_t oldsize = chunksize(oldp);
-  if (is_small(nb)) /* Can't shrink mmap regions below small size */
-    return 0;
-  /* Keep old chunk if big enough but not too big */
-  if (oldsize >= nb + SIZE_T_SIZE &&
-      (oldsize - nb) <= (mparams.granularity << 1))
-    return oldp;
-  else {
-    size_t offset = oldp->prev_foot;
-    size_t oldmmsize = oldsize + offset + MMAP_FOOT_PAD;
-    size_t newmmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
-    char* cp = (char*)CALL_MREMAP((char*)oldp - offset,
-                                  oldmmsize, newmmsize, 1);
-    if (cp != CMFAIL) {
-      mchunkptr newp = (mchunkptr)(cp + offset);
-      size_t psize = newmmsize - offset - MMAP_FOOT_PAD;
-      newp->head = psize;
-      mark_inuse_foot(m, newp, psize);
-      chunk_plus_offset(newp, psize)->head = FENCEPOST_HEAD;
-      chunk_plus_offset(newp, psize+SIZE_T_SIZE)->head = 0;
-
-      if (cp < m->least_addr)
-        m->least_addr = cp;
-      if ((m->footprint += newmmsize - oldmmsize) > m->max_footprint)
-        m->max_footprint = m->footprint;
-      check_mmapped_chunk(m, newp);
-      return newp;
-    }
-  }
-  return 0;
-}
-
-/* -------------------------- mspace management -------------------------- */
-
-/* Initialize top chunk and its size */
-static void init_top(mstate m, mchunkptr p, size_t psize) {
-  /* Ensure alignment */
-  size_t offset = align_offset(chunk2mem(p));
-  p = (mchunkptr)((char*)p + offset);
-  psize -= offset;
-
-  m->top = p;
-  m->topsize = psize;
-  p->head = psize | PINUSE_BIT;
-  /* set size of fake trailing chunk holding overhead space only once */
-  chunk_plus_offset(p, psize)->head = TOP_FOOT_SIZE;
-  m->trim_check = mparams.trim_threshold; /* reset on each update */
-}
-
-/* Initialize bins for a new mstate that is otherwise zeroed out */
-static void init_bins(mstate m) {
-  /* Establish circular links for smallbins */
-  bindex_t i;
-  for (i = 0; i < NSMALLBINS; ++i) {
-    sbinptr bin = smallbin_at(m,i);
-    bin->fd = bin->bk = bin;
-  }
-}
-
-#if PROCEED_ON_ERROR
-
-/* default corruption action */
-static void reset_on_error(mstate m) {
-  int i;
-  ++malloc_corruption_error_count;
-  /* Reinitialize fields to forget about all memory */
-  m->smallbins = m->treebins = 0;
-  m->dvsize = m->topsize = 0;
-  m->seg.base = 0;
-  m->seg.size = 0;
-  m->seg.next = 0;
-  m->top = m->dv = 0;
-  for (i = 0; i < NTREEBINS; ++i)
-    *treebin_at(m, i) = 0;
-  init_bins(m);
-}
-#endif /* PROCEED_ON_ERROR */
-
-/* Allocate chunk and prepend remainder with chunk in successor base. */
-static void* prepend_alloc(mstate m, char* newbase, char* oldbase,
-                           size_t nb) {
-  mchunkptr p = align_as_chunk(newbase);
-  mchunkptr oldfirst = align_as_chunk(oldbase);
-  size_t psize = (char*)oldfirst - (char*)p;
-  mchunkptr q = chunk_plus_offset(p, nb);
-  size_t qsize = psize - nb;
-  set_size_and_pinuse_of_inuse_chunk(m, p, nb);
-
-  assert((char*)oldfirst > (char*)q);
-  assert(pinuse(oldfirst));
-  assert(qsize >= MIN_CHUNK_SIZE);
-
-  /* consolidate remainder with first chunk of old base */
-  if (oldfirst == m->top) {
-    size_t tsize = m->topsize += qsize;
-    m->top = q;
-    q->head = tsize | PINUSE_BIT;
-    check_top_chunk(m, q);
-  }
-  else if (oldfirst == m->dv) {
-    size_t dsize = m->dvsize += qsize;
-    m->dv = q;
-    set_size_and_pinuse_of_free_chunk(q, dsize);
-  }
-  else {
-    if (!is_inuse(oldfirst)) {
-      size_t nsize = chunksize(oldfirst);
-      unlink_chunk(m, oldfirst, nsize);
-      oldfirst = chunk_plus_offset(oldfirst, nsize);
-      qsize += nsize;
-    }
-    set_free_with_pinuse(q, qsize, oldfirst);
-    insert_chunk(m, q, qsize);
-    check_free_chunk(m, q);
-  }
-
-  check_malloced_chunk(m, chunk2mem(p), nb);
-  return chunk2mem(p);
-}
-
-/* Add a segment to hold a new noncontiguous region */
-static void add_segment(mstate m, char* tbase, size_t tsize, flag_t mmapped) {
-  /* Determine locations and sizes of segment, fenceposts, old top */
-  char* old_top = (char*)m->top;
-  msegmentptr oldsp = segment_holding(m, old_top);
-  char* old_end = oldsp->base + oldsp->size;
-  size_t ssize = pad_request(sizeof(struct malloc_segment));
-  char* rawsp = old_end - (ssize + FOUR_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
-  size_t offset = align_offset(chunk2mem(rawsp));
-  char* asp = rawsp + offset;
-  char* csp = (asp < (old_top + MIN_CHUNK_SIZE))? old_top : asp;
-  mchunkptr sp = (mchunkptr)csp;
-  msegmentptr ss = (msegmentptr)(chunk2mem(sp));
-  mchunkptr tnext = chunk_plus_offset(sp, ssize);
-  mchunkptr p = tnext;
-  int nfences = 0;
-
-  /* reset top to new space */
-  init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
-
-  /* Set up segment record */
-  assert(is_aligned(ss));
-  set_size_and_pinuse_of_inuse_chunk(m, sp, ssize);
-  *ss = m->seg; /* Push current record */
-  m->seg.base = tbase;
-  m->seg.size = tsize;
-  m->seg.sflags = mmapped;
-  m->seg.next = ss;
-
-  /* Insert trailing fenceposts */
-  for (;;) {
-    mchunkptr nextp = chunk_plus_offset(p, SIZE_T_SIZE);
-    p->head = FENCEPOST_HEAD;
-    ++nfences;
-    if ((char*)(&(nextp->head)) < old_end)
-      p = nextp;
-    else
-      break;
-  }
-  assert(nfences >= 2);
-
-  /* Insert the rest of old top into a bin as an ordinary free chunk */
-  if (csp != old_top) {
-    mchunkptr q = (mchunkptr)old_top;
-    size_t psize = csp - old_top;
-    mchunkptr tn = chunk_plus_offset(q, psize);
-    set_free_with_pinuse(q, psize, tn);
-    insert_chunk(m, q, psize);
-  }
-
-  check_top_chunk(m, m->top);
-}
-
-/* -------------------------- System allocation -------------------------- */
-
-/* Get memory from system using MORECORE or MMAP */
-static void* sys_alloc(mstate m, size_t nb) {
-  char* tbase = CMFAIL;
-  size_t tsize = 0;
-  flag_t mmap_flag = 0;
-
-  ensure_initialization();
-
-  /* Directly map large chunks, but only if already initialized */
-  if (use_mmap(m) && nb >= mparams.mmap_threshold && m->topsize != 0) {
-    void* mem = mmap_alloc(m, nb);
-    if (mem != 0)
-      return mem;
-  }
-
-  /*
-    Try getting memory in any of three ways (in most-preferred to
-    least-preferred order):
-    1. A call to MORECORE that can normally contiguously extend memory.
-       (disabled if not MORECORE_CONTIGUOUS or not HAVE_MORECORE or
-       or main space is mmapped or a previous contiguous call failed)
-    2. A call to MMAP new space (disabled if not HAVE_MMAP).
-       Note that under the default settings, if MORECORE is unable to
-       fulfill a request, and HAVE_MMAP is true, then mmap is
-       used as a noncontiguous system allocator. This is a useful backup
-       strategy for systems with holes in address spaces -- in this case
-       sbrk cannot contiguously expand the heap, but mmap may be able to
-       find space.
-    3. A call to MORECORE that cannot usually contiguously extend memory.
-       (disabled if not HAVE_MORECORE)
-
-   In all cases, we need to request enough bytes from system to ensure
-   we can malloc nb bytes upon success, so pad with enough space for
-   top_foot, plus alignment-pad to make sure we don't lose bytes if
-   not on boundary, and round this up to a granularity unit.
-  */
-
-  if (MORECORE_CONTIGUOUS && !use_noncontiguous(m)) {
-    char* br = CMFAIL;
-    msegmentptr ss = (m->top == 0)? 0 : segment_holding(m, (char*)m->top);
-    size_t asize = 0;
-    ACQUIRE_MALLOC_GLOBAL_LOCK();
-
-    if (ss == 0) {  /* First time through or recovery */
-      char* base = (char*)CALL_MORECORE(0);
-      if (base != CMFAIL) {
-        asize = granularity_align(nb + SYS_ALLOC_PADDING);
-        /* Adjust to end on a page boundary */
-        if (!is_page_aligned(base))
-          asize += (page_align((size_t)base) - (size_t)base);
-        /* Can't call MORECORE if size is negative when treated as signed */
-        if (asize < HALF_MAX_SIZE_T &&
-            (br = (char*)(CALL_MORECORE(asize))) == base) {
-          tbase = base;
-          tsize = asize;
-        }
-      }
-    }
-    else {
-      /* Subtract out existing available top space from MORECORE request. */
-      asize = granularity_align(nb - m->topsize + SYS_ALLOC_PADDING);
-      /* Use mem here only if it did continuously extend old space */
-      if (asize < HALF_MAX_SIZE_T &&
-          (br = (char*)(CALL_MORECORE(asize))) == ss->base+ss->size) {
-        tbase = br;
-        tsize = asize;
-      }
-    }
-
-    if (tbase == CMFAIL) {    /* Cope with partial failure */
-      if (br != CMFAIL) {    /* Try to use/extend the space we did get */
-        if (asize < HALF_MAX_SIZE_T &&
-            asize < nb + SYS_ALLOC_PADDING) {
-          size_t esize = granularity_align(nb + SYS_ALLOC_PADDING - asize);
-          if (esize < HALF_MAX_SIZE_T) {
-            char* end = (char*)CALL_MORECORE(esize);
-            if (end != CMFAIL)
-              asize += esize;
-            else {            /* Can't use; try to release */
-              (void) CALL_MORECORE(-asize);
-              br = CMFAIL;
-            }
-          }
-        }
-      }
-      if (br != CMFAIL) {    /* Use the space we did get */
-        tbase = br;
-        tsize = asize;
-      }
-      else
-        disable_contiguous(m); /* Don't try contiguous path in the future */
-    }
-
-    RELEASE_MALLOC_GLOBAL_LOCK();
-  }
-
-  if (HAVE_MMAP && tbase == CMFAIL) {  /* Try MMAP */
-    size_t rsize = granularity_align(nb + SYS_ALLOC_PADDING);
-    if (rsize > nb) { /* Fail if wraps around zero */
-      char* mp = (char*)(CALL_MMAP(rsize));
-      if (mp != CMFAIL) {
-        tbase = mp;
-        tsize = rsize;
-        mmap_flag = USE_MMAP_BIT;
-      }
-    }
-  }
-
-  if (HAVE_MORECORE && tbase == CMFAIL) { /* Try noncontiguous MORECORE */
-    size_t asize = granularity_align(nb + SYS_ALLOC_PADDING);
-    if (asize < HALF_MAX_SIZE_T) {
-      char* br = CMFAIL;
-      char* end = CMFAIL;
-      ACQUIRE_MALLOC_GLOBAL_LOCK();
-      br = (char*)(CALL_MORECORE(asize));
-      end = (char*)(CALL_MORECORE(0));
-      RELEASE_MALLOC_GLOBAL_LOCK();
-      if (br != CMFAIL && end != CMFAIL && br < end) {
-        size_t ssize = end - br;
-        if (ssize > nb + TOP_FOOT_SIZE) {
-          tbase = br;
-          tsize = ssize;
-        }
-      }
-    }
-  }
-
-  if (tbase != CMFAIL) {
-
-    if ((m->footprint += tsize) > m->max_footprint)
-      m->max_footprint = m->footprint;
-
-    if (!is_initialized(m)) { /* first-time initialization */
-      if (m->least_addr == 0 || tbase < m->least_addr)
-        m->least_addr = tbase;
-      m->seg.base = tbase;
-      m->seg.size = tsize;
-      m->seg.sflags = mmap_flag;
-      m->magic = mparams.magic;
-      m->release_checks = MAX_RELEASE_CHECK_RATE;
-      init_bins(m);
-#if !ONLY_MSPACES
-      if (is_global(m))
-        init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
-      else
-#endif
-      {
-        /* Offset top by embedded malloc_state */
-        mchunkptr mn = next_chunk(mem2chunk(m));
-        init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) -TOP_FOOT_SIZE);
-      }
-    }
-
-    else {
-      /* Try to merge with an existing segment */
-      msegmentptr sp = &m->seg;
-      /* Only consider most recent segment if traversal suppressed */
-      while (sp != 0 && tbase != sp->base + sp->size)
-        sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
-      if (sp != 0 &&
-          !is_extern_segment(sp) &&
-          (sp->sflags & USE_MMAP_BIT) == mmap_flag &&
-          segment_holds(sp, m->top)) { /* append */
-        sp->size += tsize;
-        init_top(m, m->top, m->topsize + tsize);
-      }
-      else {
-        if (tbase < m->least_addr)
-          m->least_addr = tbase;
-        sp = &m->seg;
-        while (sp != 0 && sp->base != tbase + tsize)
-          sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
-        if (sp != 0 &&
-            !is_extern_segment(sp) &&
-            (sp->sflags & USE_MMAP_BIT) == mmap_flag) {
-          char* oldbase = sp->base;
-          sp->base = tbase;
-          sp->size += tsize;
-          return prepend_alloc(m, tbase, oldbase, nb);
-        }
-        else
-          add_segment(m, tbase, tsize, mmap_flag);
-      }
-    }
-
-    if (nb < m->topsize) { /* Allocate from new or extended top space */
-      size_t rsize = m->topsize -= nb;
-      mchunkptr p = m->top;
-      mchunkptr r = m->top = chunk_plus_offset(p, nb);
-      r->head = rsize | PINUSE_BIT;
-      set_size_and_pinuse_of_inuse_chunk(m, p, nb);
-      check_top_chunk(m, m->top);
-      check_malloced_chunk(m, chunk2mem(p), nb);
-      return chunk2mem(p);
-    }
-  }
-
-  MALLOC_FAILURE_ACTION;
-  return 0;
-}
-
-/* -----------------------  system deallocation -------------------------- */
-
-/* Unmap and unlink any mmapped segments that don't contain used chunks */
-static size_t release_unused_segments(mstate m) {
-  size_t released = 0;
-  int nsegs = 0;
-  msegmentptr pred = &m->seg;
-  msegmentptr sp = pred->next;
-  while (sp != 0) {
-    char* base = sp->base;
-    size_t size = sp->size;
-    msegmentptr next = sp->next;
-    ++nsegs;
-    if (is_mmapped_segment(sp) && !is_extern_segment(sp)) {
-      mchunkptr p = align_as_chunk(base);
-      size_t psize = chunksize(p);
-      /* Can unmap if first chunk holds entire segment and not pinned */
-      if (!is_inuse(p) && (char*)p + psize >= base + size - TOP_FOOT_SIZE) {
-        tchunkptr tp = (tchunkptr)p;
-        assert(segment_holds(sp, (char*)sp));
-        if (p == m->dv) {
-          m->dv = 0;
-          m->dvsize = 0;
-        }
-        else {
-          unlink_large_chunk(m, tp);
-        }
-        if (CALL_MUNMAP(base, size) == 0) {
-          released += size;
-          m->footprint -= size;
-          /* unlink obsoleted record */
-          sp = pred;
-          sp->next = next;
-        }
-        else { /* back out if cannot unmap */
-          insert_large_chunk(m, tp, psize);
-        }
-      }
-    }
-    if (NO_SEGMENT_TRAVERSAL) /* scan only first segment */
-      break;
-    pred = sp;
-    sp = next;
-  }
-  /* Reset check counter */
-  m->release_checks = ((nsegs > MAX_RELEASE_CHECK_RATE)?
-                       nsegs : MAX_RELEASE_CHECK_RATE);
-  return released;
-}
-
-static int sys_trim(mstate m, size_t pad) {
-  size_t released = 0;
-  ensure_initialization();
-  if (pad < MAX_REQUEST && is_initialized(m)) {
-    pad += TOP_FOOT_SIZE; /* ensure enough room for segment overhead */
-
-    if (m->topsize > pad) {
-      /* Shrink top space in granularity-size units, keeping at least one */
-      size_t unit = mparams.granularity;
-      size_t extra = ((m->topsize - pad + (unit - SIZE_T_ONE)) / unit -
-                      SIZE_T_ONE) * unit;
-      msegmentptr sp = segment_holding(m, (char*)m->top);
-
-      if (!is_extern_segment(sp)) {
-        if (is_mmapped_segment(sp)) {
-          if (HAVE_MMAP &&
-              sp->size >= extra &&
-              !has_segment_link(m, sp)) { /* can't shrink if pinned */
-            size_t newsize = sp->size - extra;
-            /* Prefer mremap, fall back to munmap */
-            if ((CALL_MREMAP(sp->base, sp->size, newsize, 0) != MFAIL) ||
-                (CALL_MUNMAP(sp->base + newsize, extra) == 0)) {
-              released = extra;
-            }
-          }
-        }
-        else if (HAVE_MORECORE) {
-          if (extra >= HALF_MAX_SIZE_T) /* Avoid wrapping negative */
-            extra = (HALF_MAX_SIZE_T) + SIZE_T_ONE - unit;
-          ACQUIRE_MALLOC_GLOBAL_LOCK();
-          {
-            /* Make sure end of memory is where we last set it. */
-            char* old_br = (char*)(CALL_MORECORE(0));
-            if (old_br == sp->base + sp->size) {
-              char* rel_br = (char*)(CALL_MORECORE(-extra));
-              char* new_br = (char*)(CALL_MORECORE(0));
-              if (rel_br != CMFAIL && new_br < old_br)
-                released = old_br - new_br;
-            }
-          }
-          RELEASE_MALLOC_GLOBAL_LOCK();
-        }
-      }
-
-      if (released != 0) {
-        sp->size -= released;
-        m->footprint -= released;
-        init_top(m, m->top, m->topsize - released);
-        check_top_chunk(m, m->top);
-      }
-    }
-
-    /* Unmap any unused mmapped segments */
-    if (HAVE_MMAP)
-      released += release_unused_segments(m);
-
-    /* On failure, disable autotrim to avoid repeated failed future calls */
-    if (released == 0 && m->topsize > m->trim_check)
-      m->trim_check = MAX_SIZE_T;
-  }
-
-  return (released != 0)? 1 : 0;
-}
-
-
-/* ---------------------------- malloc support --------------------------- */
-
-/* allocate a large request from the best fitting chunk in a treebin */
-static void* tmalloc_large(mstate m, size_t nb) {
-  tchunkptr v = 0;
-  size_t rsize = -nb; /* Unsigned negation */
-  tchunkptr t;
-  bindex_t idx;
-  compute_tree_index(nb, idx);
-  if ((t = *treebin_at(m, idx)) != 0) {
-    /* Traverse tree for this bin looking for node with size == nb */
-    size_t sizebits = nb << leftshift_for_tree_index(idx);
-    tchunkptr rst = 0;  /* The deepest untaken right subtree */
-    for (;;) {
-      tchunkptr rt;
-      size_t trem = chunksize(t) - nb;
-      if (trem < rsize) {
-        v = t;
-        if ((rsize = trem) == 0)
-          break;
-      }
-      rt = t->child[1];
-      t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
-      if (rt != 0 && rt != t)
-        rst = rt;
-      if (t == 0) {
-        t = rst; /* set t to least subtree holding sizes > nb */
-        break;
-      }
-      sizebits <<= 1;
-    }
-  }
-  if (t == 0 && v == 0) { /* set t to root of next non-empty treebin */
-    binmap_t leftbits = left_bits(idx2bit(idx)) & m->treemap;
-    if (leftbits != 0) {
-      bindex_t i;
-      binmap_t leastbit = least_bit(leftbits);
-      compute_bit2idx(leastbit, i);
-      t = *treebin_at(m, i);
-    }
-  }
-
-  while (t != 0) { /* find smallest of tree or subtree */
-    size_t trem = chunksize(t) - nb;
-    if (trem < rsize) {
-      rsize = trem;
-      v = t;
-    }
-    t = leftmost_child(t);
-  }
-
-  /*  If dv is a better fit, return 0 so malloc will use it */
-  if (v != 0 && rsize < (size_t)(m->dvsize - nb)) {
-    if (RTCHECK(ok_address(m, v))) { /* split */
-      mchunkptr r = chunk_plus_offset(v, nb);
-      assert(chunksize(v) == rsize + nb);
-      if (RTCHECK(ok_next(v, r))) {
-        unlink_large_chunk(m, v);
-        if (rsize < MIN_CHUNK_SIZE)
-          set_inuse_and_pinuse(m, v, (rsize + nb));
-        else {
-          set_size_and_pinuse_of_inuse_chunk(m, v, nb);
-          set_size_and_pinuse_of_free_chunk(r, rsize);
-          insert_chunk(m, r, rsize);
-        }
-        return chunk2mem(v);
-      }
-    }
-    CORRUPTION_ERROR_ACTION(m);
-  }
-  return 0;
-}
-
-/* allocate a small request from the best fitting chunk in a treebin */
-static void* tmalloc_small(mstate m, size_t nb) {
-  tchunkptr t, v;
-  size_t rsize;
-  bindex_t i;
-  binmap_t leastbit = least_bit(m->treemap);
-  compute_bit2idx(leastbit, i);
-  v = t = *treebin_at(m, i);
-  rsize = chunksize(t) - nb;
-
-  while ((t = leftmost_child(t)) != 0) {
-    size_t trem = chunksize(t) - nb;
-    if (trem < rsize) {
-      rsize = trem;
-      v = t;
-    }
-  }
-
-  if (RTCHECK(ok_address(m, v))) {
-    mchunkptr r = chunk_plus_offset(v, nb);
-    assert(chunksize(v) == rsize + nb);
-    if (RTCHECK(ok_next(v, r))) {
-      unlink_large_chunk(m, v);
-      if (rsize < MIN_CHUNK_SIZE)
-        set_inuse_and_pinuse(m, v, (rsize + nb));
-      else {
-        set_size_and_pinuse_of_inuse_chunk(m, v, nb);
-        set_size_and_pinuse_of_free_chunk(r, rsize);
-        replace_dv(m, r, rsize);
-      }
-      return chunk2mem(v);
-    }
-  }
-
-  CORRUPTION_ERROR_ACTION(m);
-  return 0;
-}
-
-/* --------------------------- realloc support --------------------------- */
-
-static void* internal_realloc(mstate m, void* oldmem, size_t bytes) {
-  if (bytes >= MAX_REQUEST) {
-    MALLOC_FAILURE_ACTION;
-    return 0;
-  }
-  if (!PREACTION(m)) {
-    mchunkptr oldp = mem2chunk(oldmem);
-    size_t oldsize = chunksize(oldp);
-    mchunkptr next = chunk_plus_offset(oldp, oldsize);
-    mchunkptr newp = 0;
-    void* extra = 0;
-
-    /* Try to either shrink or extend into top. Else malloc-copy-free */
-
-    if (RTCHECK(ok_address(m, oldp) && ok_inuse(oldp) &&
-                ok_next(oldp, next) && ok_pinuse(next))) {
-      size_t nb = request2size(bytes);
-      if (is_mmapped(oldp))
-        newp = mmap_resize(m, oldp, nb);
-      else if (oldsize >= nb) { /* already big enough */
-        size_t rsize = oldsize - nb;
-        newp = oldp;
-        if (rsize >= MIN_CHUNK_SIZE) {
-          mchunkptr remainder = chunk_plus_offset(newp, nb);
-          set_inuse(m, newp, nb);
-          set_inuse_and_pinuse(m, remainder, rsize);
-          extra = chunk2mem(remainder);
-        }
-      }
-      else if (next == m->top && oldsize + m->topsize > nb) {
-        /* Expand into top */
-        size_t newsize = oldsize + m->topsize;
-        size_t newtopsize = newsize - nb;
-        mchunkptr newtop = chunk_plus_offset(oldp, nb);
-        set_inuse(m, oldp, nb);
-        newtop->head = newtopsize |PINUSE_BIT;
-        m->top = newtop;
-        m->topsize = newtopsize;
-        newp = oldp;
-      }
-    }
-    else {
-      USAGE_ERROR_ACTION(m, oldmem);
-      POSTACTION(m);
-      return 0;
-    }
-#if DEBUG
-    if (newp != 0) {
-      check_inuse_chunk(m, newp); /* Check requires lock */
-    }
-#endif
-
-    POSTACTION(m);
-
-    if (newp != 0) {
-      if (extra != 0) {
-        internal_free(m, extra);
-      }
-      return chunk2mem(newp);
-    }
-    else {
-      void* newmem = internal_malloc(m, bytes);
-      if (newmem != 0) {
-        size_t oc = oldsize - overhead_for(oldp);
-        memcpy(newmem, oldmem, (oc < bytes)? oc : bytes);
-        internal_free(m, oldmem);
-      }
-      return newmem;
-    }
-  }
-  return 0;
-}
-
-/* --------------------------- memalign support -------------------------- */
-
-static void* internal_memalign(mstate m, size_t alignment, size_t bytes) {
-  if (alignment <= MALLOC_ALIGNMENT)    /* Can just use malloc */
-    return internal_malloc(m, bytes);
-  if (alignment <  MIN_CHUNK_SIZE) /* must be at least a minimum chunk size */
-    alignment = MIN_CHUNK_SIZE;
-  if ((alignment & (alignment-SIZE_T_ONE)) != 0) {/* Ensure a power of 2 */
-    size_t a = MALLOC_ALIGNMENT << 1;
-    while (a < alignment) a <<= 1;
-    alignment = a;
-  }
-
-  if (bytes >= MAX_REQUEST - alignment) {
-    if (m != 0)  { /* Test isn't needed but avoids compiler warning */
-      MALLOC_FAILURE_ACTION;
-    }
-  }
-  else {
-    size_t nb = request2size(bytes);
-    size_t req = nb + alignment + MIN_CHUNK_SIZE - CHUNK_OVERHEAD;
-    char* mem = (char*)internal_malloc(m, req);
-    if (mem != 0) {
-      void* leader = 0;
-      void* trailer = 0;
-      mchunkptr p = mem2chunk(mem);
-
-      if (PREACTION(m)) return 0;
-      if ((((size_t)(mem)) % alignment) != 0) { /* misaligned */
-        /*
-          Find an aligned spot inside chunk.  Since we need to give
-          back leading space in a chunk of at least MIN_CHUNK_SIZE, if
-          the first calculation places us at a spot with less than
-          MIN_CHUNK_SIZE leader, we can move to the next aligned spot.
-          We've allocated enough total room so that this is always
-          possible.
-        */
-        char* br = (char*)mem2chunk((size_t)(((size_t)(mem +
-                                                       alignment -
-                                                       SIZE_T_ONE)) &
-                                             -alignment));
-        char* pos = ((size_t)(br - (char*)(p)) >= MIN_CHUNK_SIZE)?
-          br : br+alignment;
-        mchunkptr newp = (mchunkptr)pos;
-        size_t leadsize = pos - (char*)(p);
-        size_t newsize = chunksize(p) - leadsize;
-
-        if (is_mmapped(p)) { /* For mmapped chunks, just adjust offset */
-          newp->prev_foot = p->prev_foot + leadsize;
-          newp->head = newsize;
-        }
-        else { /* Otherwise, give back leader, use the rest */
-          set_inuse(m, newp, newsize);
-          set_inuse(m, p, leadsize);
-          leader = chunk2mem(p);
-        }
-        p = newp;
-      }
-
-      /* Give back spare room at the end */
-      if (!is_mmapped(p)) {
-        size_t size = chunksize(p);
-        if (size > nb + MIN_CHUNK_SIZE) {
-          size_t remainder_size = size - nb;
-          mchunkptr remainder = chunk_plus_offset(p, nb);
-          set_inuse(m, p, nb);
-          set_inuse(m, remainder, remainder_size);
-          trailer = chunk2mem(remainder);
-        }
-      }
-
-      assert (chunksize(p) >= nb);
-      assert((((size_t)(chunk2mem(p))) % alignment) == 0);
-      check_inuse_chunk(m, p);
-      POSTACTION(m);
-      if (leader != 0) {
-        internal_free(m, leader);
-      }
-      if (trailer != 0) {
-        internal_free(m, trailer);
-      }
-      return chunk2mem(p);
-    }
-  }
-  return 0;
-}
-
-/* ------------------------ comalloc/coalloc support --------------------- */
-
-static void** ialloc(mstate m,
-                     size_t n_elements,
-                     size_t* sizes,
-                     int opts,
-                     void* chunks[]) {
-  /*
-    This provides common support for independent_X routines, handling
-    all of the combinations that can result.
-
-    The opts arg has:
-    bit 0 set if all elements are same size (using sizes[0])
-    bit 1 set if elements should be zeroed
-  */
-
-  size_t    element_size;   /* chunksize of each element, if all same */
-  size_t    contents_size;  /* total size of elements */
-  size_t    array_size;     /* request size of pointer array */
-  void*     mem;            /* malloced aggregate space */
-  mchunkptr p;              /* corresponding chunk */
-  size_t    remainder_size; /* remaining bytes while splitting */
-  void**    marray;         /* either "chunks" or malloced ptr array */
-  mchunkptr array_chunk;    /* chunk for malloced ptr array */
-  flag_t    was_enabled;    /* to disable mmap */
-  size_t    size;
-  size_t    i;
-
-  ensure_initialization();
-  /* compute array length, if needed */
-  if (chunks != 0) {
-    if (n_elements == 0)
-      return chunks; /* nothing to do */
-    marray = chunks;
-    array_size = 0;
-  }
-  else {
-    /* if empty req, must still return chunk representing empty array */
-    if (n_elements == 0)
-      return (void**)internal_malloc(m, 0);
-    marray = 0;
-    array_size = request2size(n_elements * (sizeof(void*)));
-  }
-
-  /* compute total element size */
-  if (opts & 0x1) { /* all-same-size */
-    element_size = request2size(*sizes);
-    contents_size = n_elements * element_size;
-  }
-  else { /* add up all the sizes */
-    element_size = 0;
-    contents_size = 0;
-    for (i = 0; i != n_elements; ++i)
-      contents_size += request2size(sizes[i]);
-  }
-
-  size = contents_size + array_size;
-
-  /*
-     Allocate the aggregate chunk.  First disable direct-mmapping so
-     malloc won't use it, since we would not be able to later
-     free/realloc space internal to a segregated mmap region.
-  */
-  was_enabled = use_mmap(m);
-  disable_mmap(m);
-  mem = internal_malloc(m, size - CHUNK_OVERHEAD);
-  if (was_enabled)
-    enable_mmap(m);
-  if (mem == 0)
-    return 0;
-
-  if (PREACTION(m)) return 0;
-  p = mem2chunk(mem);
-  remainder_size = chunksize(p);
-
-  assert(!is_mmapped(p));
-
-  if (opts & 0x2) {       /* optionally clear the elements */
-    memset((size_t*)mem, 0, remainder_size - SIZE_T_SIZE - array_size);
-  }
-
-  /* If not provided, allocate the pointer array as final part of chunk */
-  if (marray == 0) {
-    size_t  array_chunk_size;
-    array_chunk = chunk_plus_offset(p, contents_size);
-    array_chunk_size = remainder_size - contents_size;
-    marray = (void**) (chunk2mem(array_chunk));
-    set_size_and_pinuse_of_inuse_chunk(m, array_chunk, array_chunk_size);
-    remainder_size = contents_size;
-  }
-
-  /* split out elements */
-  for (i = 0; ; ++i) {
-    marray[i] = chunk2mem(p);
-    if (i != n_elements-1) {
-      if (element_size != 0)
-        size = element_size;
-      else
-        size = request2size(sizes[i]);
-      remainder_size -= size;
-      set_size_and_pinuse_of_inuse_chunk(m, p, size);
-      p = chunk_plus_offset(p, size);
-    }
-    else { /* the final element absorbs any overallocation slop */
-      set_size_and_pinuse_of_inuse_chunk(m, p, remainder_size);
-      break;
-    }
-  }
-
-#if DEBUG
-  if (marray != chunks) {
-    /* final element must have exactly exhausted chunk */
-    if (element_size != 0) {
-      assert(remainder_size == element_size);
-    }
-    else {
-      assert(remainder_size == request2size(sizes[i]));
-    }
-    check_inuse_chunk(m, mem2chunk(marray));
-  }
-  for (i = 0; i != n_elements; ++i)
-    check_inuse_chunk(m, mem2chunk(marray[i]));
-
-#endif /* DEBUG */
-
-  POSTACTION(m);
-  return marray;
-}
-
-
-/* -------------------------- public routines ---------------------------- */
-
-#if !ONLY_MSPACES
-
-void* dlmalloc(size_t bytes) {
-  /*
-     Basic algorithm:
-     If a small request (< 256 bytes minus per-chunk overhead):
-       1. If one exists, use a remainderless chunk in associated smallbin.
-          (Remainderless means that there are too few excess bytes to
-          represent as a chunk.)
-       2. If it is big enough, use the dv chunk, which is normally the
-          chunk adjacent to the one used for the most recent small request.
-       3. If one exists, split the smallest available chunk in a bin,
-          saving remainder in dv.
-       4. If it is big enough, use the top chunk.
-       5. If available, get memory from system and use it
-     Otherwise, for a large request:
-       1. Find the smallest available binned chunk that fits, and use it
-          if it is better fitting than dv chunk, splitting if necessary.
-       2. If better fitting than any binned chunk, use the dv chunk.
-       3. If it is big enough, use the top chunk.
-       4. If request size >= mmap threshold, try to directly mmap this chunk.
-       5. If available, get memory from system and use it
-
-     The ugly goto's here ensure that postaction occurs along all paths.
-  */
-
-#if USE_LOCKS
-  ensure_initialization(); /* initialize in sys_alloc if not using locks */
-#endif
-
-  if (!PREACTION(gm)) {
-    void* mem;
-    size_t nb;
-    if (bytes <= MAX_SMALL_REQUEST) {
-      bindex_t idx;
-      binmap_t smallbits;
-      nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
-      idx = small_index(nb);
-      smallbits = gm->smallmap >> idx;
-
-      if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
-        mchunkptr b, p;
-        idx += ~smallbits & 1;       /* Uses next bin if idx empty */
-        b = smallbin_at(gm, idx);
-        p = b->fd;
-        assert(chunksize(p) == small_index2size(idx));
-        unlink_first_small_chunk(gm, b, p, idx);
-        set_inuse_and_pinuse(gm, p, small_index2size(idx));
-        mem = chunk2mem(p);
-        check_malloced_chunk(gm, mem, nb);
-        goto postaction;
-      }
-
-      else if (nb > gm->dvsize) {
-        if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
-          mchunkptr b, p, r;
-          size_t rsize;
-          bindex_t i;
-          binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
-          binmap_t leastbit = least_bit(leftbits);
-          compute_bit2idx(leastbit, i);
-          b = smallbin_at(gm, i);
-          p = b->fd;
-          assert(chunksize(p) == small_index2size(i));
-          unlink_first_small_chunk(gm, b, p, i);
-          rsize = small_index2size(i) - nb;
-          /* Fit here cannot be remainderless if 4byte sizes */
-          if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
-            set_inuse_and_pinuse(gm, p, small_index2size(i));
-          else {
-            set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
-            r = chunk_plus_offset(p, nb);
-            set_size_and_pinuse_of_free_chunk(r, rsize);
-            replace_dv(gm, r, rsize);
-          }
-          mem = chunk2mem(p);
-          check_malloced_chunk(gm, mem, nb);
-          goto postaction;
-        }
-
-        else if (gm->treemap != 0 && (mem = tmalloc_small(gm, nb)) != 0) {
-          check_malloced_chunk(gm, mem, nb);
-          goto postaction;
-        }
-      }
-    }
-    else if (bytes >= MAX_REQUEST)
-      nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
-    else {
-      nb = pad_request(bytes);
-      if (gm->treemap != 0 && (mem = tmalloc_large(gm, nb)) != 0) {
-        check_malloced_chunk(gm, mem, nb);
-        goto postaction;
-      }
-    }
-
-    if (nb <= gm->dvsize) {
-      size_t rsize = gm->dvsize - nb;
-      mchunkptr p = gm->dv;
-      if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
-        mchunkptr r = gm->dv = chunk_plus_offset(p, nb);
-        gm->dvsize = rsize;
-        set_size_and_pinuse_of_free_chunk(r, rsize);
-        set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
-      }
-      else { /* exhaust dv */
-        size_t dvs = gm->dvsize;
-        gm->dvsize = 0;
-        gm->dv = 0;
-        set_inuse_and_pinuse(gm, p, dvs);
-      }
-      mem = chunk2mem(p);
-      check_malloced_chunk(gm, mem, nb);
-      goto postaction;
-    }
-
-    else if (nb < gm->topsize) { /* Split top */
-      size_t rsize = gm->topsize -= nb;
-      mchunkptr p = gm->top;
-      mchunkptr r = gm->top = chunk_plus_offset(p, nb);
-      r->head = rsize | PINUSE_BIT;
-      set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
-      mem = chunk2mem(p);
-      check_top_chunk(gm, gm->top);
-      check_malloced_chunk(gm, mem, nb);
-      goto postaction;
-    }
-
-    mem = sys_alloc(gm, nb);
-
-  postaction:
-    POSTACTION(gm);
-    return mem;
-  }
-
-  return 0;
-}
-
-void dlfree(void* mem) {
-  /*
-     Consolidate freed chunks with preceeding or succeeding bordering
-     free chunks, if they exist, and then place in a bin.  Intermixed
-     with special cases for top, dv, mmapped chunks, and usage errors.
-  */
-
-  if (mem != 0) {
-    mchunkptr p  = mem2chunk(mem);
-#if FOOTERS
-    mstate fm = get_mstate_for(p);
-    if (!ok_magic(fm)) {
-      USAGE_ERROR_ACTION(fm, p);
-      return;
-    }
-#else /* FOOTERS */
-#define fm gm
-#endif /* FOOTERS */
-    if (!PREACTION(fm)) {
-      check_inuse_chunk(fm, p);
-      if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {
-        size_t psize = chunksize(p);
-        mchunkptr next = chunk_plus_offset(p, psize);
-        if (!pinuse(p)) {
-          size_t prevsize = p->prev_foot;
-          if (is_mmapped(p)) {
-            psize += prevsize + MMAP_FOOT_PAD;
-            if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
-              fm->footprint -= psize;
-            goto postaction;
-          }
-          else {
-            mchunkptr prev = chunk_minus_offset(p, prevsize);
-            psize += prevsize;
-            p = prev;
-            if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
-              if (p != fm->dv) {
-                unlink_chunk(fm, p, prevsize);
-              }
-              else if ((next->head & INUSE_BITS) == INUSE_BITS) {
-                fm->dvsize = psize;
-                set_free_with_pinuse(p, psize, next);
-                goto postaction;
-              }
-            }
-            else
-              goto erroraction;
-          }
-        }
-
-        if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
-          if (!cinuse(next)) {  /* consolidate forward */
-            if (next == fm->top) {
-              size_t tsize = fm->topsize += psize;
-              fm->top = p;
-              p->head = tsize | PINUSE_BIT;
-              if (p == fm->dv) {
-                fm->dv = 0;
-                fm->dvsize = 0;
-              }
-              if (should_trim(fm, tsize))
-                sys_trim(fm, 0);
-              goto postaction;
-            }
-            else if (next == fm->dv) {
-              size_t dsize = fm->dvsize += psize;
-              fm->dv = p;
-              set_size_and_pinuse_of_free_chunk(p, dsize);
-              goto postaction;
-            }
-            else {
-              size_t nsize = chunksize(next);
-              psize += nsize;
-              unlink_chunk(fm, next, nsize);
-              set_size_and_pinuse_of_free_chunk(p, psize);
-              if (p == fm->dv) {
-                fm->dvsize = psize;
-                goto postaction;
-              }
-            }
-          }
-          else
-            set_free_with_pinuse(p, psize, next);
-
-          if (is_small(psize)) {
-            insert_small_chunk(fm, p, psize);
-            check_free_chunk(fm, p);
-          }
-          else {
-            tchunkptr tp = (tchunkptr)p;
-            insert_large_chunk(fm, tp, psize);
-            check_free_chunk(fm, p);
-            if (--fm->release_checks == 0)
-              release_unused_segments(fm);
-          }
-          goto postaction;
-        }
-      }
-    erroraction:
-      USAGE_ERROR_ACTION(fm, p);
-    postaction:
-      POSTACTION(fm);
-    }
-  }
-#if !FOOTERS
-#undef fm
-#endif /* FOOTERS */
-}
-
-void* dlcalloc(size_t n_elements, size_t elem_size) {
-  void* mem;
-  size_t req = 0;
-  if (n_elements != 0) {
-    req = n_elements * elem_size;
-    if (((n_elements | elem_size) & ~(size_t)0xffff) &&
-        (req / n_elements != elem_size))
-      req = MAX_SIZE_T; /* force downstream failure on overflow */
-  }
-  mem = dlmalloc(req);
-  if (mem != 0 && calloc_must_clear(mem2chunk(mem)))
-    memset(mem, 0, req);
-  return mem;
-}
-
-void* dlrealloc(void* oldmem, size_t bytes) {
-  if (oldmem == 0)
-    return dlmalloc(bytes);
-#ifdef REALLOC_ZERO_BYTES_FREES
-  if (bytes == 0) {
-    dlfree(oldmem);
-    return 0;
-  }
-#endif /* REALLOC_ZERO_BYTES_FREES */
-  else {
-#if ! FOOTERS
-    mstate m = gm;
-#else /* FOOTERS */
-    mstate m = get_mstate_for(mem2chunk(oldmem));
-    if (!ok_magic(m)) {
-      USAGE_ERROR_ACTION(m, oldmem);
-      return 0;
-    }
-#endif /* FOOTERS */
-    return internal_realloc(m, oldmem, bytes);
-  }
-}
-
-void* dlmemalign(size_t alignment, size_t bytes) {
-  return internal_memalign(gm, alignment, bytes);
-}
-
-void** dlindependent_calloc(size_t n_elements, size_t elem_size,
-                                 void* chunks[]) {
-  size_t sz = elem_size; /* serves as 1-element array */
-  return ialloc(gm, n_elements, &sz, 3, chunks);
-}
-
-void** dlindependent_comalloc(size_t n_elements, size_t sizes[],
-                                   void* chunks[]) {
-  return ialloc(gm, n_elements, sizes, 0, chunks);
-}
-
-void* dlvalloc(size_t bytes) {
-  size_t pagesz;
-  ensure_initialization();
-  pagesz = mparams.page_size;
-  return dlmemalign(pagesz, bytes);
-}
-
-void* dlpvalloc(size_t bytes) {
-  size_t pagesz;
-  ensure_initialization();
-  pagesz = mparams.page_size;
-  return dlmemalign(pagesz, (bytes + pagesz - SIZE_T_ONE) & ~(pagesz - SIZE_T_ONE));
-}
-
-int dlmalloc_trim(size_t pad) {
-  int result = 0;
-  ensure_initialization();
-  if (!PREACTION(gm)) {
-    result = sys_trim(gm, pad);
-    POSTACTION(gm);
-  }
-  return result;
-}
-
-size_t dlmalloc_footprint(void) {
-  return gm->footprint;
-}
-
-size_t dlmalloc_max_footprint(void) {
-  return gm->max_footprint;
-}
-
-#if !NO_MALLINFO
-struct mallinfo dlmallinfo(void) {
-  return internal_mallinfo(gm);
-}
-#endif /* NO_MALLINFO */
-
-void dlmalloc_stats() {
-  internal_malloc_stats(gm);
-}
-
-int dlmallopt(int param_number, int value) {
-  return change_mparam(param_number, value);
-}
-
-#endif /* !ONLY_MSPACES */
-
-size_t dlmalloc_usable_size(void* mem) {
-  if (mem != 0) {
-    mchunkptr p = mem2chunk(mem);
-    if (is_inuse(p))
-      return chunksize(p) - overhead_for(p);
-  }
-  return 0;
-}
-
-/* ----------------------------- user mspaces ---------------------------- */
-
-#if MSPACES
-
-static mstate init_user_mstate(char* tbase, size_t tsize) {
-  size_t msize = pad_request(sizeof(struct malloc_state));
-  mchunkptr mn;
-  mchunkptr msp = align_as_chunk(tbase);
-  mstate m = (mstate)(chunk2mem(msp));
-  memset(m, 0, msize);
-  INITIAL_LOCK(&m->mutex);
-  msp->head = (msize|INUSE_BITS);
-  m->seg.base = m->least_addr = tbase;
-  m->seg.size = m->footprint = m->max_footprint = tsize;
-  m->magic = mparams.magic;
-  m->release_checks = MAX_RELEASE_CHECK_RATE;
-  m->mflags = mparams.default_mflags;
-  m->extp = 0;
-  m->exts = 0;
-  disable_contiguous(m);
-  init_bins(m);
-  mn = next_chunk(mem2chunk(m));
-  init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) - TOP_FOOT_SIZE);
-  check_top_chunk(m, m->top);
-  return m;
-}
-
-mspace create_mspace(size_t capacity, int locked) {
-  mstate m = 0;
-  size_t msize;
-  ensure_initialization();
-  msize = pad_request(sizeof(struct malloc_state));
-  if (capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
-    size_t rs = ((capacity == 0)? mparams.granularity :
-                 (capacity + TOP_FOOT_SIZE + msize));
-    size_t tsize = granularity_align(rs);
-    char* tbase = (char*)(CALL_MMAP(tsize));
-    if (tbase != CMFAIL) {
-      m = init_user_mstate(tbase, tsize);
-      m->seg.sflags = USE_MMAP_BIT;
-      set_lock(m, locked);
-    }
-  }
-  return (mspace)m;
-}
-
-mspace create_mspace_with_base(void* base, size_t capacity, int locked) {
-  mstate m = 0;
-  size_t msize;
-  ensure_initialization();
-  msize = pad_request(sizeof(struct malloc_state));
-  if (capacity > msize + TOP_FOOT_SIZE &&
-      capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
-    m = init_user_mstate((char*)base, capacity);
-    m->seg.sflags = EXTERN_BIT;
-    set_lock(m, locked);
-  }
-  return (mspace)m;
-}
-
-int mspace_track_large_chunks(mspace msp, int enable) {
-  int ret = 0;
-  mstate ms = (mstate)msp;
-  if (!PREACTION(ms)) {
-    if (!use_mmap(ms))
-      ret = 1;
-    if (!enable)
-      enable_mmap(ms);
-    else
-      disable_mmap(ms);
-    POSTACTION(ms);
-  }
-  return ret;
-}
-
-size_t destroy_mspace(mspace msp) {
-  size_t freed = 0;
-  mstate ms = (mstate)msp;
-  if (ok_magic(ms)) {
-    msegmentptr sp = &ms->seg;
-    while (sp != 0) {
-      char* base = sp->base;
-      size_t size = sp->size;
-      flag_t flag = sp->sflags;
-      sp = sp->next;
-      if ((flag & USE_MMAP_BIT) && !(flag & EXTERN_BIT) &&
-          CALL_MUNMAP(base, size) == 0)
-        freed += size;
-    }
-  }
-  else {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-  return freed;
-}
-
-/*
-  mspace versions of routines are near-clones of the global
-  versions. This is not so nice but better than the alternatives.
-*/
-
-
-void* mspace_malloc(mspace msp, size_t bytes) {
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-    return 0;
-  }
-  if (!PREACTION(ms)) {
-    void* mem;
-    size_t nb;
-    if (bytes <= MAX_SMALL_REQUEST) {
-      bindex_t idx;
-      binmap_t smallbits;
-      nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
-      idx = small_index(nb);
-      smallbits = ms->smallmap >> idx;
-
-      if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
-        mchunkptr b, p;
-        idx += ~smallbits & 1;       /* Uses next bin if idx empty */
-        b = smallbin_at(ms, idx);
-        p = b->fd;
-        assert(chunksize(p) == small_index2size(idx));
-        unlink_first_small_chunk(ms, b, p, idx);
-        set_inuse_and_pinuse(ms, p, small_index2size(idx));
-        mem = chunk2mem(p);
-        check_malloced_chunk(ms, mem, nb);
-        goto postaction;
-      }
-
-      else if (nb > ms->dvsize) {
-        if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
-          mchunkptr b, p, r;
-          size_t rsize;
-          bindex_t i;
-          binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
-          binmap_t leastbit = least_bit(leftbits);
-          compute_bit2idx(leastbit, i);
-          b = smallbin_at(ms, i);
-          p = b->fd;
-          assert(chunksize(p) == small_index2size(i));
-          unlink_first_small_chunk(ms, b, p, i);
-          rsize = small_index2size(i) - nb;
-          /* Fit here cannot be remainderless if 4byte sizes */
-          if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
-            set_inuse_and_pinuse(ms, p, small_index2size(i));
-          else {
-            set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
-            r = chunk_plus_offset(p, nb);
-            set_size_and_pinuse_of_free_chunk(r, rsize);
-            replace_dv(ms, r, rsize);
-          }
-          mem = chunk2mem(p);
-          check_malloced_chunk(ms, mem, nb);
-          goto postaction;
-        }
-
-        else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) {
-          check_malloced_chunk(ms, mem, nb);
-          goto postaction;
-        }
-      }
-    }
-    else if (bytes >= MAX_REQUEST)
-      nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
-    else {
-      nb = pad_request(bytes);
-      if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) {
-        check_malloced_chunk(ms, mem, nb);
-        goto postaction;
-      }
-    }
-
-    if (nb <= ms->dvsize) {
-      size_t rsize = ms->dvsize - nb;
-      mchunkptr p = ms->dv;
-      if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
-        mchunkptr r = ms->dv = chunk_plus_offset(p, nb);
-        ms->dvsize = rsize;
-        set_size_and_pinuse_of_free_chunk(r, rsize);
-        set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
-      }
-      else { /* exhaust dv */
-        size_t dvs = ms->dvsize;
-        ms->dvsize = 0;
-        ms->dv = 0;
-        set_inuse_and_pinuse(ms, p, dvs);
-      }
-      mem = chunk2mem(p);
-      check_malloced_chunk(ms, mem, nb);
-      goto postaction;
-    }
-
-    else if (nb < ms->topsize) { /* Split top */
-      size_t rsize = ms->topsize -= nb;
-      mchunkptr p = ms->top;
-      mchunkptr r = ms->top = chunk_plus_offset(p, nb);
-      r->head = rsize | PINUSE_BIT;
-      set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
-      mem = chunk2mem(p);
-      check_top_chunk(ms, ms->top);
-      check_malloced_chunk(ms, mem, nb);
-      goto postaction;
-    }
-
-    mem = sys_alloc(ms, nb);
-
-  postaction:
-    POSTACTION(ms);
-    return mem;
-  }
-
-  return 0;
-}
-
-void mspace_free(mspace msp, void* mem) {
-  if (mem != 0) {
-    mchunkptr p  = mem2chunk(mem);
-#if FOOTERS
-    mstate fm = get_mstate_for(p);
-    msp = msp; /* placate people compiling -Wunused */
-#else /* FOOTERS */
-    mstate fm = (mstate)msp;
-#endif /* FOOTERS */
-    if (!ok_magic(fm)) {
-      USAGE_ERROR_ACTION(fm, p);
-      return;
-    }
-    if (!PREACTION(fm)) {
-      check_inuse_chunk(fm, p);
-      if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {
-        size_t psize = chunksize(p);
-        mchunkptr next = chunk_plus_offset(p, psize);
-        if (!pinuse(p)) {
-          size_t prevsize = p->prev_foot;
-          if (is_mmapped(p)) {
-            psize += prevsize + MMAP_FOOT_PAD;
-            if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
-              fm->footprint -= psize;
-            goto postaction;
-          }
-          else {
-            mchunkptr prev = chunk_minus_offset(p, prevsize);
-            psize += prevsize;
-            p = prev;
-            if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
-              if (p != fm->dv) {
-                unlink_chunk(fm, p, prevsize);
-              }
-              else if ((next->head & INUSE_BITS) == INUSE_BITS) {
-                fm->dvsize = psize;
-                set_free_with_pinuse(p, psize, next);
-                goto postaction;
-              }
-            }
-            else
-              goto erroraction;
-          }
-        }
-
-        if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
-          if (!cinuse(next)) {  /* consolidate forward */
-            if (next == fm->top) {
-              size_t tsize = fm->topsize += psize;
-              fm->top = p;
-              p->head = tsize | PINUSE_BIT;
-              if (p == fm->dv) {
-                fm->dv = 0;
-                fm->dvsize = 0;
-              }
-              if (should_trim(fm, tsize))
-                sys_trim(fm, 0);
-              goto postaction;
-            }
-            else if (next == fm->dv) {
-              size_t dsize = fm->dvsize += psize;
-              fm->dv = p;
-              set_size_and_pinuse_of_free_chunk(p, dsize);
-              goto postaction;
-            }
-            else {
-              size_t nsize = chunksize(next);
-              psize += nsize;
-              unlink_chunk(fm, next, nsize);
-              set_size_and_pinuse_of_free_chunk(p, psize);
-              if (p == fm->dv) {
-                fm->dvsize = psize;
-                goto postaction;
-              }
-            }
-          }
-          else
-            set_free_with_pinuse(p, psize, next);
-
-          if (is_small(psize)) {
-            insert_small_chunk(fm, p, psize);
-            check_free_chunk(fm, p);
-          }
-          else {
-            tchunkptr tp = (tchunkptr)p;
-            insert_large_chunk(fm, tp, psize);
-            check_free_chunk(fm, p);
-            if (--fm->release_checks == 0)
-              release_unused_segments(fm);
-          }
-          goto postaction;
-        }
-      }
-    erroraction:
-      USAGE_ERROR_ACTION(fm, p);
-    postaction:
-      POSTACTION(fm);
-    }
-  }
-}
-
-void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size) {
-  void* mem;
-  size_t req = 0;
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-    return 0;
-  }
-  if (n_elements != 0) {
-    req = n_elements * elem_size;
-    if (((n_elements | elem_size) & ~(size_t)0xffff) &&
-        (req / n_elements != elem_size))
-      req = MAX_SIZE_T; /* force downstream failure on overflow */
-  }
-  mem = internal_malloc(ms, req);
-  if (mem != 0 && calloc_must_clear(mem2chunk(mem)))
-    memset(mem, 0, req);
-  return mem;
-}
-
-void* mspace_realloc(mspace msp, void* oldmem, size_t bytes) {
-  if (oldmem == 0)
-    return mspace_malloc(msp, bytes);
-#ifdef REALLOC_ZERO_BYTES_FREES
-  if (bytes == 0) {
-    mspace_free(msp, oldmem);
-    return 0;
-  }
-#endif /* REALLOC_ZERO_BYTES_FREES */
-  else {
-#if FOOTERS
-    mchunkptr p  = mem2chunk(oldmem);
-    mstate ms = get_mstate_for(p);
-#else /* FOOTERS */
-    mstate ms = (mstate)msp;
-#endif /* FOOTERS */
-    if (!ok_magic(ms)) {
-      USAGE_ERROR_ACTION(ms,ms);
-      return 0;
-    }
-    return internal_realloc(ms, oldmem, bytes);
-  }
-}
-
-void* mspace_memalign(mspace msp, size_t alignment, size_t bytes) {
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-    return 0;
-  }
-  return internal_memalign(ms, alignment, bytes);
-}
-
-void** mspace_independent_calloc(mspace msp, size_t n_elements,
-                                 size_t elem_size, void* chunks[]) {
-  size_t sz = elem_size; /* serves as 1-element array */
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-    return 0;
-  }
-  return ialloc(ms, n_elements, &sz, 3, chunks);
-}
-
-void** mspace_independent_comalloc(mspace msp, size_t n_elements,
-                                   size_t sizes[], void* chunks[]) {
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-    return 0;
-  }
-  return ialloc(ms, n_elements, sizes, 0, chunks);
-}
-
-int mspace_trim(mspace msp, size_t pad) {
-  int result = 0;
-  mstate ms = (mstate)msp;
-  if (ok_magic(ms)) {
-    if (!PREACTION(ms)) {
-      result = sys_trim(ms, pad);
-      POSTACTION(ms);
-    }
-  }
-  else {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-  return result;
-}
-
-void mspace_malloc_stats(mspace msp) {
-  mstate ms = (mstate)msp;
-  if (ok_magic(ms)) {
-    internal_malloc_stats(ms);
-  }
-  else {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-}
-
-size_t mspace_footprint(mspace msp) {
-  size_t result = 0;
-  mstate ms = (mstate)msp;
-  if (ok_magic(ms)) {
-    result = ms->footprint;
-  }
-  else {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-  return result;
-}
-
-
-size_t mspace_max_footprint(mspace msp) {
-  size_t result = 0;
-  mstate ms = (mstate)msp;
-  if (ok_magic(ms)) {
-    result = ms->max_footprint;
-  }
-  else {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-  return result;
-}
-
-
-#if !NO_MALLINFO
-struct mallinfo mspace_mallinfo(mspace msp) {
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-  return internal_mallinfo(ms);
-}
-#endif /* NO_MALLINFO */
-
-size_t mspace_usable_size(void* mem) {
-  if (mem != 0) {
-    mchunkptr p = mem2chunk(mem);
-    if (is_inuse(p))
-      return chunksize(p) - overhead_for(p);
-  }
-  return 0;
-}
-
-int mspace_mallopt(int param_number, int value) {
-  return change_mparam(param_number, value);
-}
-
-#endif /* MSPACES */
-
-
-/* -------------------- Alternative MORECORE functions ------------------- */
-
-/*
-  Guidelines for creating a custom version of MORECORE:
-
-  * For best performance, MORECORE should allocate in multiples of pagesize.
-  * MORECORE may allocate more memory than requested. (Or even less,
-      but this will usually result in a malloc failure.)
-  * MORECORE must not allocate memory when given argument zero, but
-      instead return one past the end address of memory from previous
-      nonzero call.
-  * For best performance, consecutive calls to MORECORE with positive
-      arguments should return increasing addresses, indicating that
-      space has been contiguously extended.
-  * Even though consecutive calls to MORECORE need not return contiguous
-      addresses, it must be OK for malloc'ed chunks to span multiple
-      regions in those cases where they do happen to be contiguous.
-  * MORECORE need not handle negative arguments -- it may instead
-      just return MFAIL when given negative arguments.
-      Negative arguments are always multiples of pagesize. MORECORE
-      must not misinterpret negative args as large positive unsigned
-      args. You can suppress all such calls from even occurring by defining
-      MORECORE_CANNOT_TRIM,
-
-  As an example alternative MORECORE, here is a custom allocator
-  kindly contributed for pre-OSX macOS.  It uses virtually but not
-  necessarily physically contiguous non-paged memory (locked in,
-  present and won't get swapped out).  You can use it by uncommenting
-  this section, adding some #includes, and setting up the appropriate
-  defines above:
-
-      #define MORECORE osMoreCore
-
-  There is also a shutdown routine that should somehow be called for
-  cleanup upon program exit.
-
-  #define MAX_POOL_ENTRIES 100
-  #define MINIMUM_MORECORE_SIZE  (64 * 1024U)
-  static int next_os_pool;
-  void *our_os_pools[MAX_POOL_ENTRIES];
-
-  void *osMoreCore(int size)
-  {
-    void *ptr = 0;
-    static void *sbrk_top = 0;
-
-    if (size > 0)
-    {
-      if (size < MINIMUM_MORECORE_SIZE)
-         size = MINIMUM_MORECORE_SIZE;
-      if (CurrentExecutionLevel() == kTaskLevel)
-         ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0);
-      if (ptr == 0)
-      {
-        return (void *) MFAIL;
-      }
-      // save ptrs so they can be freed during cleanup
-      our_os_pools[next_os_pool] = ptr;
-      next_os_pool++;
-      ptr = (void *) ((((size_t) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK);
-      sbrk_top = (char *) ptr + size;
-      return ptr;
-    }
-    else if (size < 0)
-    {
-      // we don't currently support shrink behavior
-      return (void *) MFAIL;
-    }
-    else
-    {
-      return sbrk_top;
-    }
-  }
-
-  // cleanup any allocated memory pools
-  // called as last thing before shutting down driver
-
-  void osCleanupMem(void)
-  {
-    void **ptr;
-
-    for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++)
-      if (*ptr)
-      {
-         PoolDeallocate(*ptr);
-         *ptr = 0;
-      }
-  }
-
-*/
-
-
-/* -----------------------------------------------------------------------
-History:
-    V2.8.4 Wed May 27 09:56:23 2009  Doug Lea  (dl at gee)
-      * Use zeros instead of prev foot for is_mmapped
-      * Add mspace_track_large_chunks; thanks to Jean Brouwers
-      * Fix set_inuse in internal_realloc; thanks to Jean Brouwers
-      * Fix insufficient sys_alloc padding when using 16byte alignment
-      * Fix bad error check in mspace_footprint
-      * Adaptations for ptmalloc; thanks to Wolfram Gloger.
-      * Reentrant spin locks; thanks to Earl Chew and others
-      * Win32 improvements; thanks to Niall Douglas and Earl Chew
-      * Add NO_SEGMENT_TRAVERSAL and MAX_RELEASE_CHECK_RATE options
-      * Extension hook in malloc_state
-      * Various small adjustments to reduce warnings on some compilers
-      * Various configuration extensions/changes for more platforms. Thanks
-         to all who contributed these.
-
-    V2.8.3 Thu Sep 22 11:16:32 2005  Doug Lea  (dl at gee)
-      * Add max_footprint functions
-      * Ensure all appropriate literals are size_t
-      * Fix conditional compilation problem for some #define settings
-      * Avoid concatenating segments with the one provided
-        in create_mspace_with_base
-      * Rename some variables to avoid compiler shadowing warnings
-      * Use explicit lock initialization.
-      * Better handling of sbrk interference.
-      * Simplify and fix segment insertion, trimming and mspace_destroy
-      * Reinstate REALLOC_ZERO_BYTES_FREES option from 2.7.x
-      * Thanks especially to Dennis Flanagan for help on these.
-
-    V2.8.2 Sun Jun 12 16:01:10 2005  Doug Lea  (dl at gee)
-      * Fix memalign brace error.
-
-    V2.8.1 Wed Jun  8 16:11:46 2005  Doug Lea  (dl at gee)
-      * Fix improper #endif nesting in C++
-      * Add explicit casts needed for C++
-
-    V2.8.0 Mon May 30 14:09:02 2005  Doug Lea  (dl at gee)
-      * Use trees for large bins
-      * Support mspaces
-      * Use segments to unify sbrk-based and mmap-based system allocation,
-        removing need for emulation on most platforms without sbrk.
-      * Default safety checks
-      * Optional footer checks. Thanks to William Robertson for the idea.
-      * Internal code refactoring
-      * Incorporate suggestions and platform-specific changes.
-        Thanks to Dennis Flanagan, Colin Plumb, Niall Douglas,
-        Aaron Bachmann,  Emery Berger, and others.
-      * Speed up non-fastbin processing enough to remove fastbins.
-      * Remove useless cfree() to avoid conflicts with other apps.
-      * Remove internal memcpy, memset. Compilers handle builtins better.
-      * Remove some options that no one ever used and rename others.
-
-    V2.7.2 Sat Aug 17 09:07:30 2002  Doug Lea  (dl at gee)
-      * Fix malloc_state bitmap array misdeclaration
-
-    V2.7.1 Thu Jul 25 10:58:03 2002  Doug Lea  (dl at gee)
-      * Allow tuning of FIRST_SORTED_BIN_SIZE
-      * Use PTR_UINT as type for all ptr->int casts. Thanks to John Belmonte.
-      * Better detection and support for non-contiguousness of MORECORE.
-        Thanks to Andreas Mueller, Conal Walsh, and Wolfram Gloger
-      * Bypass most of malloc if no frees. Thanks To Emery Berger.
-      * Fix freeing of old top non-contiguous chunk im sysmalloc.
-      * Raised default trim and map thresholds to 256K.
-      * Fix mmap-related #defines. Thanks to Lubos Lunak.
-      * Fix copy macros; added LACKS_FCNTL_H. Thanks to Neal Walfield.
-      * Branch-free bin calculation
-      * Default trim and mmap thresholds now 256K.
-
-    V2.7.0 Sun Mar 11 14:14:06 2001  Doug Lea  (dl at gee)
-      * Introduce independent_comalloc and independent_calloc.
-        Thanks to Michael Pachos for motivation and help.
-      * Make optional .h file available
-      * Allow > 2GB requests on 32bit systems.
-      * new WIN32 sbrk, mmap, munmap, lock code from <[email protected]>.
-        Thanks also to Andreas Mueller <a.mueller at paradatec.de>,
-        and Anonymous.
-      * Allow override of MALLOC_ALIGNMENT (Thanks to Ruud Waij for
-        helping test this.)
-      * memalign: check alignment arg
-      * realloc: don't try to shift chunks backwards, since this
-        leads to  more fragmentation in some programs and doesn't
-        seem to help in any others.
-      * Collect all cases in malloc requiring system memory into sysmalloc
-      * Use mmap as backup to sbrk
-      * Place all internal state in malloc_state
-      * Introduce fastbins (although similar to 2.5.1)
-      * Many minor tunings and cosmetic improvements
-      * Introduce USE_PUBLIC_MALLOC_WRAPPERS, USE_MALLOC_LOCK
-      * Introduce MALLOC_FAILURE_ACTION, MORECORE_CONTIGUOUS
-        Thanks to Tony E. Bennett <[email protected]> and others.
-      * Include errno.h to support default failure action.
-
-    V2.6.6 Sun Dec  5 07:42:19 1999  Doug Lea  (dl at gee)
-      * return null for negative arguments
-      * Added Several WIN32 cleanups from Martin C. Fong <mcfong at yahoo.com>
-         * Add 'LACKS_SYS_PARAM_H' for those systems without 'sys/param.h'
-          (e.g. WIN32 platforms)
-         * Cleanup header file inclusion for WIN32 platforms
-         * Cleanup code to avoid Microsoft Visual C++ compiler complaints
-         * Add 'USE_DL_PREFIX' to quickly allow co-existence with existing
-           memory allocation routines
-         * Set 'malloc_getpagesize' for WIN32 platforms (needs more work)
-         * Use 'assert' rather than 'ASSERT' in WIN32 code to conform to
-           usage of 'assert' in non-WIN32 code
-         * Improve WIN32 'sbrk()' emulation's 'findRegion()' routine to
-           avoid infinite loop
-      * Always call 'fREe()' rather than 'free()'
-
-    V2.6.5 Wed Jun 17 15:57:31 1998  Doug Lea  (dl at gee)
-      * Fixed ordering problem with boundary-stamping
-
-    V2.6.3 Sun May 19 08:17:58 1996  Doug Lea  (dl at gee)
-      * Added pvalloc, as recommended by H.J. Liu
-      * Added 64bit pointer support mainly from Wolfram Gloger
-      * Added anonymously donated WIN32 sbrk emulation
-      * Malloc, calloc, getpagesize: add optimizations from Raymond Nijssen
-      * malloc_extend_top: fix mask error that caused wastage after
-        foreign sbrks
-      * Add linux mremap support code from HJ Liu
-
-    V2.6.2 Tue Dec  5 06:52:55 1995  Doug Lea  (dl at gee)
-      * Integrated most documentation with the code.
-      * Add support for mmap, with help from
-        Wolfram Gloger ([email protected]).
-      * Use last_remainder in more cases.
-      * Pack bins using idea from  [email protected]
-      * Use ordered bins instead of best-fit threshhold
-      * Eliminate block-local decls to simplify tracing and debugging.
-      * Support another case of realloc via move into top
-      * Fix error occuring when initial sbrk_base not word-aligned.
-      * Rely on page size for units instead of SBRK_UNIT to
-        avoid surprises about sbrk alignment conventions.
-      * Add mallinfo, mallopt. Thanks to Raymond Nijssen
-        ([email protected]) for the suggestion.
-      * Add `pad' argument to malloc_trim and top_pad mallopt parameter.
-      * More precautions for cases where other routines call sbrk,
-        courtesy of Wolfram Gloger ([email protected]).
-      * Added macros etc., allowing use in linux libc from
-        H.J. Lu ([email protected])
-      * Inverted this history list
-
-    V2.6.1 Sat Dec  2 14:10:57 1995  Doug Lea  (dl at gee)
-      * Re-tuned and fixed to behave more nicely with V2.6.0 changes.
-      * Removed all preallocation code since under current scheme
-        the work required to undo bad preallocations exceeds
-        the work saved in good cases for most test programs.
-      * No longer use return list or unconsolidated bins since
-        no scheme using them consistently outperforms those that don't
-        given above changes.
-      * Use best fit for very large chunks to prevent some worst-cases.
-      * Added some support for debugging
-
-    V2.6.0 Sat Nov  4 07:05:23 1995  Doug Lea  (dl at gee)
-      * Removed footers when chunks are in use. Thanks to
-        Paul Wilson ([email protected]) for the suggestion.
-
-    V2.5.4 Wed Nov  1 07:54:51 1995  Doug Lea  (dl at gee)
-      * Added malloc_trim, with help from Wolfram Gloger
-        ([email protected]).
-
-    V2.5.3 Tue Apr 26 10:16:01 1994  Doug Lea  (dl at g)
-
-    V2.5.2 Tue Apr  5 16:20:40 1994  Doug Lea  (dl at g)
-      * realloc: try to expand in both directions
-      * malloc: swap order of clean-bin strategy;
-      * realloc: only conditionally expand backwards
-      * Try not to scavenge used bins
-      * Use bin counts as a guide to preallocation
-      * Occasionally bin return list chunks in first scan
-      * Add a few optimizations from [email protected]
-
-    V2.5.1 Sat Aug 14 15:40:43 1993  Doug Lea  (dl at g)
-      * faster bin computation & slightly different binning
-      * merged all consolidations to one part of malloc proper
-         (eliminating old malloc_find_space & malloc_clean_bin)
-      * Scan 2 returns chunks (not just 1)
-      * Propagate failure in realloc if malloc returns 0
-      * Add stuff to allow compilation on non-ANSI compilers
-          from [email protected]
-
-    V2.5 Sat Aug  7 07:41:59 1993  Doug Lea  (dl at g.oswego.edu)
-      * removed potential for odd address access in prev_chunk
-      * removed dependency on getpagesize.h
-      * misc cosmetics and a bit more internal documentation
-      * anticosmetics: mangled names in macros to evade debugger strangeness
-      * tested on sparc, hp-700, dec-mips, rs6000
-          with gcc & native cc (hp, dec only) allowing
-          Detlefs & Zorn comparison study (in SIGPLAN Notices.)
-
-    Trial version Fri Aug 28 13:14:29 1992  Doug Lea  (dl at g.oswego.edu)
-      * Based loosely on libg++-1.2X malloc. (It retains some of the overall
-         structure of old version,  but most details differ.)
-
-*/
-
-#endif
+#ifdef NEDMALLOC_ENABLED
+/*
+  This is a version (aka dlmalloc) of malloc/free/realloc written by
+  Doug Lea and released to the public domain, as explained at
+  http://creativecommons.org/licenses/publicdomain.  Send questions,
+  comments, complaints, performance data, etc to [email protected]
+
+* Version 2.8.4 Wed May 27 09:56:23 2009  Doug Lea  (dl at gee)
+
+   Note: There may be an updated version of this malloc obtainable at
+           ftp://gee.cs.oswego.edu/pub/misc/malloc.c
+         Check before installing!
+
+* Quickstart
+
+  This library is all in one file to simplify the most common usage:
+  ftp it, compile it (-O3), and link it into another program. All of
+  the compile-time options default to reasonable values for use on
+  most platforms.  You might later want to step through various
+  compile-time and dynamic tuning options.
+
+  For convenience, an include file for code using this malloc is at:
+     ftp://gee.cs.oswego.edu/pub/misc/malloc-2.8.4.h
+  You don't really need this .h file unless you call functions not
+  defined in your system include files.  The .h file contains only the
+  excerpts from this file needed for using this malloc on ANSI C/C++
+  systems, so long as you haven't changed compile-time options about
+  naming and tuning parameters.  If you do, then you can create your
+  own malloc.h that does include all settings by cutting at the point
+  indicated below. Note that you may already by default be using a C
+  library containing a malloc that is based on some version of this
+  malloc (for example in linux). You might still want to use the one
+  in this file to customize settings or to avoid overheads associated
+  with library versions.
+
+* Vital statistics:
+
+  Supported pointer/size_t representation:       4 or 8 bytes
+       size_t MUST be an unsigned type of the same width as
+       pointers. (If you are using an ancient system that declares
+       size_t as a signed type, or need it to be a different width
+       than pointers, you can use a previous release of this malloc
+       (e.g. 2.7.2) supporting these.)
+
+  Alignment:                                     8 bytes (default)
+       This suffices for nearly all current machines and C compilers.
+       However, you can define MALLOC_ALIGNMENT to be wider than this
+       if necessary (up to 128bytes), at the expense of using more space.
+
+  Minimum overhead per allocated chunk:   4 or  8 bytes (if 4byte sizes)
+                                          8 or 16 bytes (if 8byte sizes)
+       Each malloced chunk has a hidden word of overhead holding size
+       and status information, and additional cross-check word
+       if FOOTERS is defined.
+
+  Minimum allocated size: 4-byte ptrs:  16 bytes    (including overhead)
+                          8-byte ptrs:  32 bytes    (including overhead)
+
+       Even a request for zero bytes (i.e., malloc(0)) returns a
+       pointer to something of the minimum allocatable size.
+       The maximum overhead wastage (i.e., number of extra bytes
+       allocated than were requested in malloc) is less than or equal
+       to the minimum size, except for requests >= mmap_threshold that
+       are serviced via mmap(), where the worst case wastage is about
+       32 bytes plus the remainder from a system page (the minimal
+       mmap unit); typically 4096 or 8192 bytes.
+
+  Security: static-safe; optionally more or less
+       The "security" of malloc refers to the ability of malicious
+       code to accentuate the effects of errors (for example, freeing
+       space that is not currently malloc'ed or overwriting past the
+       ends of chunks) in code that calls malloc.  This malloc
+       guarantees not to modify any memory locations below the base of
+       heap, i.e., static variables, even in the presence of usage
+       errors.  The routines additionally detect most improper frees
+       and reallocs.  All this holds as long as the static bookkeeping
+       for malloc itself is not corrupted by some other means.  This
+       is only one aspect of security -- these checks do not, and
+       cannot, detect all possible programming errors.
+
+       If FOOTERS is defined nonzero, then each allocated chunk
+       carries an additional check word to verify that it was malloced
+       from its space.  These check words are the same within each
+       execution of a program using malloc, but differ across
+       executions, so externally crafted fake chunks cannot be
+       freed. This improves security by rejecting frees/reallocs that
+       could corrupt heap memory, in addition to the checks preventing
+       writes to statics that are always on.  This may further improve
+       security at the expense of time and space overhead.  (Note that
+       FOOTERS may also be worth using with MSPACES.)
+
+       By default detected errors cause the program to abort (calling
+       "abort()"). You can override this to instead proceed past
+       errors by defining PROCEED_ON_ERROR.  In this case, a bad free
+       has no effect, and a malloc that encounters a bad address
+       caused by user overwrites will ignore the bad address by
+       dropping pointers and indices to all known memory. This may
+       be appropriate for programs that should continue if at all
+       possible in the face of programming errors, although they may
+       run out of memory because dropped memory is never reclaimed.
+
+       If you don't like either of these options, you can define
+       CORRUPTION_ERROR_ACTION and USAGE_ERROR_ACTION to do anything
+       else. And if if you are sure that your program using malloc has
+       no errors or vulnerabilities, you can define INSECURE to 1,
+       which might (or might not) provide a small performance improvement.
+
+  Thread-safety: NOT thread-safe unless USE_LOCKS defined
+       When USE_LOCKS is defined, each public call to malloc, free,
+       etc is surrounded with either a pthread mutex or a win32
+       spinlock (depending on WIN32). This is not especially fast, and
+       can be a major bottleneck.  It is designed only to provide
+       minimal protection in concurrent environments, and to provide a
+       basis for extensions.  If you are using malloc in a concurrent
+       program, consider instead using nedmalloc
+       (http://www.nedprod.com/programs/portable/nedmalloc/) or
+       ptmalloc (See http://www.malloc.de), which are derived
+       from versions of this malloc.
+
+  System requirements: Any combination of MORECORE and/or MMAP/MUNMAP
+       This malloc can use unix sbrk or any emulation (invoked using
+       the CALL_MORECORE macro) and/or mmap/munmap or any emulation
+       (invoked using CALL_MMAP/CALL_MUNMAP) to get and release system
+       memory.  On most unix systems, it tends to work best if both
+       MORECORE and MMAP are enabled.  On Win32, it uses emulations
+       based on VirtualAlloc. It also uses common C library functions
+       like memset.
+
+  Compliance: I believe it is compliant with the Single Unix Specification
+       (See http://www.unix.org). Also SVID/XPG, ANSI C, and probably
+       others as well.
+
+* Overview of algorithms
+
+  This is not the fastest, most space-conserving, most portable, or
+  most tunable malloc ever written. However it is among the fastest
+  while also being among the most space-conserving, portable and
+  tunable.  Consistent balance across these factors results in a good
+  general-purpose allocator for malloc-intensive programs.
+
+  In most ways, this malloc is a best-fit allocator. Generally, it
+  chooses the best-fitting existing chunk for a request, with ties
+  broken in approximately least-recently-used order. (This strategy
+  normally maintains low fragmentation.) However, for requests less
+  than 256bytes, it deviates from best-fit when there is not an
+  exactly fitting available chunk by preferring to use space adjacent
+  to that used for the previous small request, as well as by breaking
+  ties in approximately most-recently-used order. (These enhance
+  locality of series of small allocations.)  And for very large requests
+  (>= 256Kb by default), it relies on system memory mapping
+  facilities, if supported.  (This helps avoid carrying around and
+  possibly fragmenting memory used only for large chunks.)
+
+  All operations (except malloc_stats and mallinfo) have execution
+  times that are bounded by a constant factor of the number of bits in
+  a size_t, not counting any clearing in calloc or copying in realloc,
+  or actions surrounding MORECORE and MMAP that have times
+  proportional to the number of non-contiguous regions returned by
+  system allocation routines, which is often just 1. In real-time
+  applications, you can optionally suppress segment traversals using
+  NO_SEGMENT_TRAVERSAL, which assures bounded execution even when
+  system allocators return non-contiguous spaces, at the typical
+  expense of carrying around more memory and increased fragmentation.
+
+  The implementation is not very modular and seriously overuses
+  macros. Perhaps someday all C compilers will do as good a job
+  inlining modular code as can now be done by brute-force expansion,
+  but now, enough of them seem not to.
+
+  Some compilers issue a lot of warnings about code that is
+  dead/unreachable only on some platforms, and also about intentional
+  uses of negation on unsigned types. All known cases of each can be
+  ignored.
+
+  For a longer but out of date high-level description, see
+     http://gee.cs.oswego.edu/dl/html/malloc.html
+
+* MSPACES
+  If MSPACES is defined, then in addition to malloc, free, etc.,
+  this file also defines mspace_malloc, mspace_free, etc. These
+  are versions of malloc routines that take an "mspace" argument
+  obtained using create_mspace, to control all internal bookkeeping.
+  If ONLY_MSPACES is defined, only these versions are compiled.
+  So if you would like to use this allocator for only some allocations,
+  and your system malloc for others, you can compile with
+  ONLY_MSPACES and then do something like...
+    static mspace mymspace = create_mspace(0,0); // for example
+    #define mymalloc(bytes)  mspace_malloc(mymspace, bytes)
+
+  (Note: If you only need one instance of an mspace, you can instead
+  use "USE_DL_PREFIX" to relabel the global malloc.)
+
+  You can similarly create thread-local allocators by storing
+  mspaces as thread-locals. For example:
+    static __thread mspace tlms = 0;
+    void*  tlmalloc(size_t bytes) {
+      if (tlms == 0) tlms = create_mspace(0, 0);
+      return mspace_malloc(tlms, bytes);
+    }
+    void  tlfree(void* mem) { mspace_free(tlms, mem); }
+
+  Unless FOOTERS is defined, each mspace is completely independent.
+  You cannot allocate from one and free to another (although
+  conformance is only weakly checked, so usage errors are not always
+  caught). If FOOTERS is defined, then each chunk carries around a tag
+  indicating its originating mspace, and frees are directed to their
+  originating spaces.
+
+ -------------------------  Compile-time options ---------------------------
+
+Be careful in setting #define values for numerical constants of type
+size_t. On some systems, literal values are not automatically extended
+to size_t precision unless they are explicitly casted. You can also
+use the symbolic values MAX_SIZE_T, SIZE_T_ONE, etc below.
+
+WIN32                    default: defined if _WIN32 defined
+  Defining WIN32 sets up defaults for MS environment and compilers.
+  Otherwise defaults are for unix. Beware that there seem to be some
+  cases where this malloc might not be a pure drop-in replacement for
+  Win32 malloc: Random-looking failures from Win32 GDI API's (eg;
+  SetDIBits()) may be due to bugs in some video driver implementations
+  when pixel buffers are malloc()ed, and the region spans more than
+  one VirtualAlloc()ed region. Because dlmalloc uses a small (64Kb)
+  default granularity, pixel buffers may straddle virtual allocation
+  regions more often than when using the Microsoft allocator.  You can
+  avoid this by using VirtualAlloc() and VirtualFree() for all pixel
+  buffers rather than using malloc().  If this is not possible,
+  recompile this malloc with a larger DEFAULT_GRANULARITY.
+
+MALLOC_ALIGNMENT         default: (size_t)8
+  Controls the minimum alignment for malloc'ed chunks.  It must be a
+  power of two and at least 8, even on machines for which smaller
+  alignments would suffice. It may be defined as larger than this
+  though. Note however that code and data structures are optimized for
+  the case of 8-byte alignment.
+
+MSPACES                  default: 0 (false)
+  If true, compile in support for independent allocation spaces.
+  This is only supported if HAVE_MMAP is true.
+
+ONLY_MSPACES             default: 0 (false)
+  If true, only compile in mspace versions, not regular versions.
+
+USE_LOCKS                default: 0 (false)
+  Causes each call to each public routine to be surrounded with
+  pthread or WIN32 mutex lock/unlock. (If set true, this can be
+  overridden on a per-mspace basis for mspace versions.) If set to a
+  non-zero value other than 1, locks are used, but their
+  implementation is left out, so lock functions must be supplied manually,
+  as described below.
+
+USE_SPIN_LOCKS           default: 1 iff USE_LOCKS and on x86 using gcc or MSC
+  If true, uses custom spin locks for locking. This is currently
+  supported only for x86 platforms using gcc or recent MS compilers.
+  Otherwise, posix locks or win32 critical sections are used.
+
+FOOTERS                  default: 0
+  If true, provide extra checking and dispatching by placing
+  information in the footers of allocated chunks. This adds
+  space and time overhead.
+
+INSECURE                 default: 0
+  If true, omit checks for usage errors and heap space overwrites.
+
+USE_DL_PREFIX            default: NOT defined
+  Causes compiler to prefix all public routines with the string 'dl'.
+  This can be useful when you only want to use this malloc in one part
+  of a program, using your regular system malloc elsewhere.
+
+ABORT                    default: defined as abort()
+  Defines how to abort on failed checks.  On most systems, a failed
+  check cannot die with an "assert" or even print an informative
+  message, because the underlying print routines in turn call malloc,
+  which will fail again.  Generally, the best policy is to simply call
+  abort(). It's not very useful to do more than this because many
+  errors due to overwriting will show up as address faults (null, odd
+  addresses etc) rather than malloc-triggered checks, so will also
+  abort.  Also, most compilers know that abort() does not return, so
+  can better optimize code conditionally calling it.
+
+PROCEED_ON_ERROR           default: defined as 0 (false)
+  Controls whether detected bad addresses cause them to bypassed
+  rather than aborting. If set, detected bad arguments to free and
+  realloc are ignored. And all bookkeeping information is zeroed out
+  upon a detected overwrite of freed heap space, thus losing the
+  ability to ever return it from malloc again, but enabling the
+  application to proceed. If PROCEED_ON_ERROR is defined, the
+  static variable malloc_corruption_error_count is compiled in
+  and can be examined to see if errors have occurred. This option
+  generates slower code than the default abort policy.
+
+DEBUG                    default: NOT defined
+  The DEBUG setting is mainly intended for people trying to modify
+  this code or diagnose problems when porting to new platforms.
+  However, it may also be able to better isolate user errors than just
+  using runtime checks.  The assertions in the check routines spell
+  out in more detail the assumptions and invariants underlying the
+  algorithms.  The checking is fairly extensive, and will slow down
+  execution noticeably. Calling malloc_stats or mallinfo with DEBUG
+  set will attempt to check every non-mmapped allocated and free chunk
+  in the course of computing the summaries.
+
+ABORT_ON_ASSERT_FAILURE   default: defined as 1 (true)
+  Debugging assertion failures can be nearly impossible if your
+  version of the assert macro causes malloc to be called, which will
+  lead to a cascade of further failures, blowing the runtime stack.
+  ABORT_ON_ASSERT_FAILURE cause assertions failures to call abort(),
+  which will usually make debugging easier.
+
+MALLOC_FAILURE_ACTION     default: sets errno to ENOMEM, or no-op on win32
+  The action to take before "return 0" when malloc fails to be able to
+  return memory because there is none available.
+
+HAVE_MORECORE             default: 1 (true) unless win32 or ONLY_MSPACES
+  True if this system supports sbrk or an emulation of it.
+
+MORECORE                  default: sbrk
+  The name of the sbrk-style system routine to call to obtain more
+  memory.  See below for guidance on writing custom MORECORE
+  functions. The type of the argument to sbrk/MORECORE varies across
+  systems.  It cannot be size_t, because it supports negative
+  arguments, so it is normally the signed type of the same width as
+  size_t (sometimes declared as "intptr_t").  It doesn't much matter
+  though. Internally, we only call it with arguments less than half
+  the max value of a size_t, which should work across all reasonable
+  possibilities, although sometimes generating compiler warnings.
+
+MORECORE_CONTIGUOUS       default: 1 (true) if HAVE_MORECORE
+  If true, take advantage of fact that consecutive calls to MORECORE
+  with positive arguments always return contiguous increasing
+  addresses.  This is true of unix sbrk. It does not hurt too much to
+  set it true anyway, since malloc copes with non-contiguities.
+  Setting it false when definitely non-contiguous saves time
+  and possibly wasted space it would take to discover this though.
+
+MORECORE_CANNOT_TRIM      default: NOT defined
+  True if MORECORE cannot release space back to the system when given
+  negative arguments. This is generally necessary only if you are
+  using a hand-crafted MORECORE function that cannot handle negative
+  arguments.
+
+NO_SEGMENT_TRAVERSAL       default: 0
+  If non-zero, suppresses traversals of memory segments
+  returned by either MORECORE or CALL_MMAP. This disables
+  merging of segments that are contiguous, and selectively
+  releasing them to the OS if unused, but bounds execution times.
+
+HAVE_MMAP                 default: 1 (true)
+  True if this system supports mmap or an emulation of it.  If so, and
+  HAVE_MORECORE is not true, MMAP is used for all system
+  allocation. If set and HAVE_MORECORE is true as well, MMAP is
+  primarily used to directly allocate very large blocks. It is also
+  used as a backup strategy in cases where MORECORE fails to provide
+  space from system. Note: A single call to MUNMAP is assumed to be
+  able to unmap memory that may have be allocated using multiple calls
+  to MMAP, so long as they are adjacent.
+
+HAVE_MREMAP               default: 1 on linux, else 0
+  If true realloc() uses mremap() to re-allocate large blocks and
+  extend or shrink allocation spaces.
+
+MMAP_CLEARS               default: 1 except on WINCE.
+  True if mmap clears memory so calloc doesn't need to. This is true
+  for standard unix mmap using /dev/zero and on WIN32 except for WINCE.
+
+USE_BUILTIN_FFS            default: 0 (i.e., not used)
+  Causes malloc to use the builtin ffs() function to compute indices.
+  Some compilers may recognize and intrinsify ffs to be faster than the
+  supplied C version. Also, the case of x86 using gcc is special-cased
+  to an asm instruction, so is already as fast as it can be, and so
+  this setting has no effect. Similarly for Win32 under recent MS compilers.
+  (On most x86s, the asm version is only slightly faster than the C version.)
+
+malloc_getpagesize         default: derive from system includes, or 4096.
+  The system page size. To the extent possible, this malloc manages
+  memory from the system in page-size units.  This may be (and
+  usually is) a function rather than a constant. This is ignored
+  if WIN32, where page size is determined using getSystemInfo during
+  initialization. This may be several megabytes if ENABLE_LARGE_PAGES
+  is enabled.
+
+ENABLE_LARGE_PAGES         default: NOT defined
+  Causes the system page size to be the value of GetLargePageMinimum()
+  if that function is available (Windows Server 2003/Vista or later).
+  This allows the use of large page entries in the MMU which can
+  significantly improve performance in large working set applications
+  as TLB cache load is reduced by a factor of three. Note that enabling
+  this option is equal to locking the process' memory in current
+  implementations of Windows and requires the SE_LOCK_MEMORY_PRIVILEGE
+  to be held by the process in order to succeed.
+
+USE_DEV_RANDOM             default: 0 (i.e., not used)
+  Causes malloc to use /dev/random to initialize secure magic seed for
+  stamping footers. Otherwise, the current time is used.
+
+NO_MALLINFO                default: 0
+  If defined, don't compile "mallinfo". This can be a simple way
+  of dealing with mismatches between system declarations and
+  those in this file.
+
+MALLINFO_FIELD_TYPE        default: size_t
+  The type of the fields in the mallinfo struct. This was originally
+  defined as "int" in SVID etc, but is more usefully defined as
+  size_t. The value is used only if  HAVE_USR_INCLUDE_MALLOC_H is not set
+
+REALLOC_ZERO_BYTES_FREES    default: not defined
+  This should be set if a call to realloc with zero bytes should
+  be the same as a call to free. Some people think it should. Otherwise,
+  since this malloc returns a unique pointer for malloc(0), so does
+  realloc(p, 0).
+
+LACKS_UNISTD_H, LACKS_FCNTL_H, LACKS_SYS_PARAM_H, LACKS_SYS_MMAN_H
+LACKS_STRINGS_H, LACKS_STRING_H, LACKS_SYS_TYPES_H,  LACKS_ERRNO_H
+LACKS_STDLIB_H                default: NOT defined unless on WIN32
+  Define these if your system does not have these header files.
+  You might need to manually insert some of the declarations they provide.
+
+DEFAULT_GRANULARITY        default: page size if MORECORE_CONTIGUOUS,
+                                system_info.dwAllocationGranularity in WIN32,
+                                GetLargePageMinimum() if ENABLE_LARGE_PAGES,
+                                otherwise 64K.
+      Also settable using mallopt(M_GRANULARITY, x)
+  The unit for allocating and deallocating memory from the system.  On
+  most systems with contiguous MORECORE, there is no reason to
+  make this more than a page. However, systems with MMAP tend to
+  either require or encourage larger granularities.  You can increase
+  this value to prevent system allocation functions to be called so
+  often, especially if they are slow.  The value must be at least one
+  page and must be a power of two.  Setting to 0 causes initialization
+  to either page size or win32 region size.  (Note: In previous
+  versions of malloc, the equivalent of this option was called
+  "TOP_PAD")
+
+DEFAULT_GRANULARITY_ALIGNED default: undefined (which means page size)
+  Whether to enforce alignment when allocating and deallocating memory
+  from the system i.e. the base address of all allocations will be
+  aligned to DEFAULT_GRANULARITY if it is set. Note that enabling this carries
+  some overhead as multiple calls must now be made when probing for a valid
+  aligned value, however it does greatly ease the checking for whether
+  a given memory pointer was allocated by this allocator rather than
+  some other.
+
+DEFAULT_TRIM_THRESHOLD    default: 2MB
+      Also settable using mallopt(M_TRIM_THRESHOLD, x)
+  The maximum amount of unused top-most memory to keep before
+  releasing via malloc_trim in free().  Automatic trimming is mainly
+  useful in long-lived programs using contiguous MORECORE.  Because
+  trimming via sbrk can be slow on some systems, and can sometimes be
+  wasteful (in cases where programs immediately afterward allocate
+  more large chunks) the value should be high enough so that your
+  overall system performance would improve by releasing this much
+  memory.  As a rough guide, you might set to a value close to the
+  average size of a process (program) running on your system.
+  Releasing this much memory would allow such a process to run in
+  memory.  Generally, it is worth tuning trim thresholds when a
+  program undergoes phases where several large chunks are allocated
+  and released in ways that can reuse each other's storage, perhaps
+  mixed with phases where there are no such chunks at all. The trim
+  value must be greater than page size to have any useful effect.  To
+  disable trimming completely, you can set to MAX_SIZE_T. Note that the trick
+  some people use of mallocing a huge space and then freeing it at
+  program startup, in an attempt to reserve system memory, doesn't
+  have the intended effect under automatic trimming, since that memory
+  will immediately be returned to the system.
+
+DEFAULT_MMAP_THRESHOLD       default: 256K
+      Also settable using mallopt(M_MMAP_THRESHOLD, x)
+  The request size threshold for using MMAP to directly service a
+  request. Requests of at least this size that cannot be allocated
+  using already-existing space will be serviced via mmap.  (If enough
+  normal freed space already exists it is used instead.)  Using mmap
+  segregates relatively large chunks of memory so that they can be
+  individually obtained and released from the host system. A request
+  serviced through mmap is never reused by any other request (at least
+  not directly; the system may just so happen to remap successive
+  requests to the same locations).  Segregating space in this way has
+  the benefits that: Mmapped space can always be individually released
+  back to the system, which helps keep the system level memory demands
+  of a long-lived program low.  Also, mapped memory doesn't become
+  `locked' between other chunks, as can happen with normally allocated
+  chunks, which means that even trimming via malloc_trim would not
+  release them.  However, it has the disadvantage that the space
+  cannot be reclaimed, consolidated, and then used to service later
+  requests, as happens with normal chunks.  The advantages of mmap
+  nearly always outweigh disadvantages for "large" chunks, but the
+  value of "large" may vary across systems.  The default is an
+  empirically derived value that works well in most systems. You can
+  disable mmap by setting to MAX_SIZE_T.
+
+MAX_RELEASE_CHECK_RATE   default: 4095 unless not HAVE_MMAP
+  The number of consolidated frees between checks to release
+  unused segments when freeing. When using non-contiguous segments,
+  especially with multiple mspaces, checking only for topmost space
+  doesn't always suffice to trigger trimming. To compensate for this,
+  free() will, with a period of MAX_RELEASE_CHECK_RATE (or the
+  current number of segments, if greater) try to release unused
+  segments to the OS when freeing chunks that result in
+  consolidation. The best value for this parameter is a compromise
+  between slowing down frees with relatively costly checks that
+  rarely trigger versus holding on to unused memory. To effectively
+  disable, set to MAX_SIZE_T. This may lead to a very slight speed
+  improvement at the expense of carrying around more memory.
+*/
+
+/* Version identifier to allow people to support multiple versions */
+#ifndef DLMALLOC_VERSION
+#define DLMALLOC_VERSION 20804
+#endif /* DLMALLOC_VERSION */
+
+#ifndef WIN32
+#ifdef _WIN32
+#define WIN32 1
+#endif  /* _WIN32 */
+#ifdef _WIN32_WCE
+#define LACKS_FCNTL_H
+#define WIN32 1
+#endif /* _WIN32_WCE */
+#endif  /* WIN32 */
+#ifdef WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <tchar.h>
+#define HAVE_MMAP 1
+#define HAVE_MORECORE 0
+#define LACKS_UNISTD_H
+#define LACKS_SYS_PARAM_H
+#define LACKS_SYS_MMAN_H
+#define LACKS_STRING_H
+#define LACKS_STRINGS_H
+#define LACKS_SYS_TYPES_H
+#define LACKS_ERRNO_H
+#ifndef MALLOC_FAILURE_ACTION
+#define MALLOC_FAILURE_ACTION
+#endif /* MALLOC_FAILURE_ACTION */
+#ifdef _WIN32_WCE /* WINCE reportedly does not clear */
+#define MMAP_CLEARS 0
+#else
+#define MMAP_CLEARS 1
+#endif /* _WIN32_WCE */
+#endif  /* WIN32 */
+
+#if defined(DARWIN) || defined(_DARWIN)
+/* Mac OSX docs advise not to use sbrk; it seems better to use mmap */
+#ifndef HAVE_MORECORE
+#define HAVE_MORECORE 0
+#define HAVE_MMAP 1
+/* OSX allocators provide 16 byte alignment */
+#ifndef MALLOC_ALIGNMENT
+#define MALLOC_ALIGNMENT ((size_t)16U)
+#endif
+#endif  /* HAVE_MORECORE */
+#endif  /* DARWIN */
+
+#ifndef LACKS_SYS_TYPES_H
+#include <sys/types.h>  /* For size_t */
+#endif  /* LACKS_SYS_TYPES_H */
+
+#if (defined(__GNUC__) && ((defined(__i386__) || defined(__x86_64__)))) || (defined(_MSC_VER) && _MSC_VER>=1310)
+#define SPIN_LOCKS_AVAILABLE 1
+#else
+#define SPIN_LOCKS_AVAILABLE 0
+#endif
+
+/* The maximum possible size_t value has all bits set */
+#define MAX_SIZE_T           (~(size_t)0)
+
+#ifndef ONLY_MSPACES
+#define ONLY_MSPACES 0     /* define to a value */
+#else
+#define ONLY_MSPACES 1
+#endif  /* ONLY_MSPACES */
+#ifndef MSPACES
+#if ONLY_MSPACES
+#define MSPACES 1
+#else   /* ONLY_MSPACES */
+#define MSPACES 0
+#endif  /* ONLY_MSPACES */
+#endif  /* MSPACES */
+#ifndef MALLOC_ALIGNMENT
+#define MALLOC_ALIGNMENT ((size_t)8U)
+#endif  /* MALLOC_ALIGNMENT */
+#ifndef FOOTERS
+#define FOOTERS 0
+#endif  /* FOOTERS */
+#ifndef ABORT
+#define ABORT  abort()
+#endif  /* ABORT */
+#ifndef ABORT_ON_ASSERT_FAILURE
+#define ABORT_ON_ASSERT_FAILURE 1
+#endif  /* ABORT_ON_ASSERT_FAILURE */
+#ifndef PROCEED_ON_ERROR
+#define PROCEED_ON_ERROR 0
+#endif  /* PROCEED_ON_ERROR */
+#ifndef USE_LOCKS
+#define USE_LOCKS 0
+#endif  /* USE_LOCKS */
+#ifndef USE_SPIN_LOCKS
+#if USE_LOCKS && SPIN_LOCKS_AVAILABLE
+#define USE_SPIN_LOCKS 1
+#else
+#define USE_SPIN_LOCKS 0
+#endif /* USE_LOCKS && SPIN_LOCKS_AVAILABLE. */
+#endif /* USE_SPIN_LOCKS */
+#ifndef INSECURE
+#define INSECURE 0
+#endif  /* INSECURE */
+#ifndef HAVE_MMAP
+#define HAVE_MMAP 1
+#endif  /* HAVE_MMAP */
+#ifndef MMAP_CLEARS
+#define MMAP_CLEARS 1
+#endif  /* MMAP_CLEARS */
+#ifndef HAVE_MREMAP
+#ifdef linux
+#define HAVE_MREMAP 1
+#else   /* linux */
+#define HAVE_MREMAP 0
+#endif  /* linux */
+#endif  /* HAVE_MREMAP */
+#ifndef MALLOC_FAILURE_ACTION
+#define MALLOC_FAILURE_ACTION  errno = ENOMEM;
+#endif  /* MALLOC_FAILURE_ACTION */
+#ifndef HAVE_MORECORE
+#if ONLY_MSPACES
+#define HAVE_MORECORE 0
+#else   /* ONLY_MSPACES */
+#define HAVE_MORECORE 1
+#endif  /* ONLY_MSPACES */
+#endif  /* HAVE_MORECORE */
+#if !HAVE_MORECORE
+#define MORECORE_CONTIGUOUS 0
+#else   /* !HAVE_MORECORE */
+#define MORECORE_DEFAULT sbrk
+#ifndef MORECORE_CONTIGUOUS
+#define MORECORE_CONTIGUOUS 1
+#endif  /* MORECORE_CONTIGUOUS */
+#endif  /* HAVE_MORECORE */
+#ifndef DEFAULT_GRANULARITY
+#if (MORECORE_CONTIGUOUS || defined(WIN32))
+#define DEFAULT_GRANULARITY (0)  /* 0 means to compute in init_mparams */
+#else   /* MORECORE_CONTIGUOUS */
+#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U)
+#endif  /* MORECORE_CONTIGUOUS */
+#endif  /* DEFAULT_GRANULARITY */
+#ifndef DEFAULT_TRIM_THRESHOLD
+#ifndef MORECORE_CANNOT_TRIM
+#define DEFAULT_TRIM_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U)
+#else   /* MORECORE_CANNOT_TRIM */
+#define DEFAULT_TRIM_THRESHOLD MAX_SIZE_T
+#endif  /* MORECORE_CANNOT_TRIM */
+#endif  /* DEFAULT_TRIM_THRESHOLD */
+#ifndef DEFAULT_MMAP_THRESHOLD
+#if HAVE_MMAP
+#define DEFAULT_MMAP_THRESHOLD ((size_t)256U * (size_t)1024U)
+#else   /* HAVE_MMAP */
+#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T
+#endif  /* HAVE_MMAP */
+#endif  /* DEFAULT_MMAP_THRESHOLD */
+#ifndef MAX_RELEASE_CHECK_RATE
+#if HAVE_MMAP
+#define MAX_RELEASE_CHECK_RATE 4095
+#else
+#define MAX_RELEASE_CHECK_RATE MAX_SIZE_T
+#endif /* HAVE_MMAP */
+#endif /* MAX_RELEASE_CHECK_RATE */
+#ifndef USE_BUILTIN_FFS
+#define USE_BUILTIN_FFS 0
+#endif  /* USE_BUILTIN_FFS */
+#ifndef USE_DEV_RANDOM
+#define USE_DEV_RANDOM 0
+#endif  /* USE_DEV_RANDOM */
+#ifndef NO_MALLINFO
+#define NO_MALLINFO 0
+#endif  /* NO_MALLINFO */
+#ifndef MALLINFO_FIELD_TYPE
+#define MALLINFO_FIELD_TYPE size_t
+#endif  /* MALLINFO_FIELD_TYPE */
+#ifndef NO_SEGMENT_TRAVERSAL
+#define NO_SEGMENT_TRAVERSAL 0
+#endif /* NO_SEGMENT_TRAVERSAL */
+
+/*
+  mallopt tuning options.  SVID/XPG defines four standard parameter
+  numbers for mallopt, normally defined in malloc.h.  None of these
+  are used in this malloc, so setting them has no effect. But this
+  malloc does support the following options.
+*/
+
+#define M_TRIM_THRESHOLD     (-1)
+#define M_GRANULARITY        (-2)
+#define M_MMAP_THRESHOLD     (-3)
+
+/* ------------------------ Mallinfo declarations ------------------------ */
+
+#if !NO_MALLINFO
+/*
+  This version of malloc supports the standard SVID/XPG mallinfo
+  routine that returns a struct containing usage properties and
+  statistics. It should work on any system that has a
+  /usr/include/malloc.h defining struct mallinfo.  The main
+  declaration needed is the mallinfo struct that is returned (by-copy)
+  by mallinfo().  The malloinfo struct contains a bunch of fields that
+  are not even meaningful in this version of malloc.  These fields are
+  are instead filled by mallinfo() with other numbers that might be of
+  interest.
+
+  HAVE_USR_INCLUDE_MALLOC_H should be set if you have a
+  /usr/include/malloc.h file that includes a declaration of struct
+  mallinfo.  If so, it is included; else a compliant version is
+  declared below.  These must be precisely the same for mallinfo() to
+  work.  The original SVID version of this struct, defined on most
+  systems with mallinfo, declares all fields as ints. But some others
+  define as unsigned long. If your system defines the fields using a
+  type of different width than listed here, you MUST #include your
+  system version and #define HAVE_USR_INCLUDE_MALLOC_H.
+*/
+
+/* #define HAVE_USR_INCLUDE_MALLOC_H */
+
+#ifdef HAVE_USR_INCLUDE_MALLOC_H
+#include "/usr/include/malloc.h"
+#else /* HAVE_USR_INCLUDE_MALLOC_H */
+#ifndef STRUCT_MALLINFO_DECLARED
+#define STRUCT_MALLINFO_DECLARED 1
+struct mallinfo {
+  MALLINFO_FIELD_TYPE arena;    /* non-mmapped space allocated from system */
+  MALLINFO_FIELD_TYPE ordblks;  /* number of free chunks */
+  MALLINFO_FIELD_TYPE smblks;   /* always 0 */
+  MALLINFO_FIELD_TYPE hblks;    /* always 0 */
+  MALLINFO_FIELD_TYPE hblkhd;   /* space in mmapped regions */
+  MALLINFO_FIELD_TYPE usmblks;  /* maximum total allocated space */
+  MALLINFO_FIELD_TYPE fsmblks;  /* always 0 */
+  MALLINFO_FIELD_TYPE uordblks; /* total allocated space */
+  MALLINFO_FIELD_TYPE fordblks; /* total free space */
+  MALLINFO_FIELD_TYPE keepcost; /* releasable (via malloc_trim) space */
+};
+#endif /* STRUCT_MALLINFO_DECLARED */
+#endif /* HAVE_USR_INCLUDE_MALLOC_H */
+#endif /* NO_MALLINFO */
+
+/*
+  Try to persuade compilers to inline. The most critical functions for
+  inlining are defined as macros, so these aren't used for them.
+*/
+
+#ifndef FORCEINLINE
+  #if defined(__GNUC__)
+#define FORCEINLINE __inline __attribute__ ((always_inline))
+  #elif defined(_MSC_VER)
+    #define FORCEINLINE __forceinline
+  #endif
+#endif
+#ifndef NOINLINE
+  #if defined(__GNUC__)
+    #define NOINLINE __attribute__ ((noinline))
+  #elif defined(_MSC_VER)
+    #define NOINLINE __declspec(noinline)
+  #else
+    #define NOINLINE
+  #endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#ifndef FORCEINLINE
+ #define FORCEINLINE inline
+#endif
+#endif /* __cplusplus */
+#ifndef FORCEINLINE
+ #define FORCEINLINE
+#endif
+
+#if !ONLY_MSPACES
+
+/* ------------------- Declarations of public routines ------------------- */
+
+#ifndef USE_DL_PREFIX
+#define dlcalloc               calloc
+#define dlfree                 free
+#define dlmalloc               malloc
+#define dlmemalign             memalign
+#define dlrealloc              realloc
+#define dlvalloc               valloc
+#define dlpvalloc              pvalloc
+#define dlmallinfo             mallinfo
+#define dlmallopt              mallopt
+#define dlmalloc_trim          malloc_trim
+#define dlmalloc_stats         malloc_stats
+#define dlmalloc_usable_size   malloc_usable_size
+#define dlmalloc_footprint     malloc_footprint
+#define dlmalloc_max_footprint malloc_max_footprint
+#define dlindependent_calloc   independent_calloc
+#define dlindependent_comalloc independent_comalloc
+#endif /* USE_DL_PREFIX */
+
+
+/*
+  malloc(size_t n)
+  Returns a pointer to a newly allocated chunk of at least n bytes, or
+  null if no space is available, in which case errno is set to ENOMEM
+  on ANSI C systems.
+
+  If n is zero, malloc returns a minimum-sized chunk. (The minimum
+  size is 16 bytes on most 32bit systems, and 32 bytes on 64bit
+  systems.)  Note that size_t is an unsigned type, so calls with
+  arguments that would be negative if signed are interpreted as
+  requests for huge amounts of space, which will often fail. The
+  maximum supported value of n differs across systems, but is in all
+  cases less than the maximum representable value of a size_t.
+*/
+void* dlmalloc(size_t);
+
+/*
+  free(void* p)
+  Releases the chunk of memory pointed to by p, that had been previously
+  allocated using malloc or a related routine such as realloc.
+  It has no effect if p is null. If p was not malloced or already
+  freed, free(p) will by default cause the current program to abort.
+*/
+void  dlfree(void*);
+
+/*
+  calloc(size_t n_elements, size_t element_size);
+  Returns a pointer to n_elements * element_size bytes, with all locations
+  set to zero.
+*/
+void* dlcalloc(size_t, size_t);
+
+/*
+  realloc(void* p, size_t n)
+  Returns a pointer to a chunk of size n that contains the same data
+  as does chunk p up to the minimum of (n, p's size) bytes, or null
+  if no space is available.
+
+  The returned pointer may or may not be the same as p. The algorithm
+  prefers extending p in most cases when possible, otherwise it
+  employs the equivalent of a malloc-copy-free sequence.
+
+  If p is null, realloc is equivalent to malloc.
+
+  If space is not available, realloc returns null, errno is set (if on
+  ANSI) and p is NOT freed.
+
+  if n is for fewer bytes than already held by p, the newly unused
+  space is lopped off and freed if possible.  realloc with a size
+  argument of zero (re)allocates a minimum-sized chunk.
+
+  The old unix realloc convention of allowing the last-free'd chunk
+  to be used as an argument to realloc is not supported.
+*/
+
+void* dlrealloc(void*, size_t);
+
+/*
+  memalign(size_t alignment, size_t n);
+  Returns a pointer to a newly allocated chunk of n bytes, aligned
+  in accord with the alignment argument.
+
+  The alignment argument should be a power of two. If the argument is
+  not a power of two, the nearest greater power is used.
+  8-byte alignment is guaranteed by normal malloc calls, so don't
+  bother calling memalign with an argument of 8 or less.
+
+  Overreliance on memalign is a sure way to fragment space.
+*/
+void* dlmemalign(size_t, size_t);
+
+/*
+  valloc(size_t n);
+  Equivalent to memalign(pagesize, n), where pagesize is the page
+  size of the system. If the pagesize is unknown, 4096 is used.
+*/
+void* dlvalloc(size_t);
+
+/*
+  mallopt(int parameter_number, int parameter_value)
+  Sets tunable parameters The format is to provide a
+  (parameter-number, parameter-value) pair.  mallopt then sets the
+  corresponding parameter to the argument value if it can (i.e., so
+  long as the value is meaningful), and returns 1 if successful else
+  0.  To workaround the fact that mallopt is specified to use int,
+  not size_t parameters, the value -1 is specially treated as the
+  maximum unsigned size_t value.
+
+  SVID/XPG/ANSI defines four standard param numbers for mallopt,
+  normally defined in malloc.h.  None of these are use in this malloc,
+  so setting them has no effect. But this malloc also supports other
+  options in mallopt. See below for details.  Briefly, supported
+  parameters are as follows (listed defaults are for "typical"
+  configurations).
+
+  Symbol            param #  default    allowed param values
+  M_TRIM_THRESHOLD     -1   2*1024*1024   any   (-1 disables)
+  M_GRANULARITY        -2     page size   any power of 2 >= page size
+  M_MMAP_THRESHOLD     -3      256*1024   any   (or 0 if no MMAP support)
+*/
+int dlmallopt(int, int);
+
+/*
+  malloc_footprint();
+  Returns the number of bytes obtained from the system.  The total
+  number of bytes allocated by malloc, realloc etc., is less than this
+  value. Unlike mallinfo, this function returns only a precomputed
+  result, so can be called frequently to monitor memory consumption.
+  Even if locks are otherwise defined, this function does not use them,
+  so results might not be up to date.
+*/
+size_t dlmalloc_footprint(void);
+
+/*
+  malloc_max_footprint();
+  Returns the maximum number of bytes obtained from the system. This
+  value will be greater than current footprint if deallocated space
+  has been reclaimed by the system. The peak number of bytes allocated
+  by malloc, realloc etc., is less than this value. Unlike mallinfo,
+  this function returns only a precomputed result, so can be called
+  frequently to monitor memory consumption.  Even if locks are
+  otherwise defined, this function does not use them, so results might
+  not be up to date.
+*/
+size_t dlmalloc_max_footprint(void);
+
+#if !NO_MALLINFO
+/*
+  mallinfo()
+  Returns (by copy) a struct containing various summary statistics:
+
+  arena:     current total non-mmapped bytes allocated from system
+  ordblks:   the number of free chunks
+  smblks:    always zero.
+  hblks:     current number of mmapped regions
+  hblkhd:    total bytes held in mmapped regions
+  usmblks:   the maximum total allocated space. This will be greater
+                than current total if trimming has occurred.
+  fsmblks:   always zero
+  uordblks:  current total allocated space (normal or mmapped)
+  fordblks:  total free space
+  keepcost:  the maximum number of bytes that could ideally be released
+               back to system via malloc_trim. ("ideally" means that
+               it ignores page restrictions etc.)
+
+  Because these fields are ints, but internal bookkeeping may
+  be kept as longs, the reported values may wrap around zero and
+  thus be inaccurate.
+*/
+struct mallinfo dlmallinfo(void);
+#endif /* NO_MALLINFO */
+
+/*
+  independent_calloc(size_t n_elements, size_t element_size, void* chunks[]);
+
+  independent_calloc is similar to calloc, but instead of returning a
+  single cleared space, it returns an array of pointers to n_elements
+  independent elements that can hold contents of size elem_size, each
+  of which starts out cleared, and can be independently freed,
+  realloc'ed etc. The elements are guaranteed to be adjacently
+  allocated (this is not guaranteed to occur with multiple callocs or
+  mallocs), which may also improve cache locality in some
+  applications.
+
+  The "chunks" argument is optional (i.e., may be null, which is
+  probably the most typical usage). If it is null, the returned array
+  is itself dynamically allocated and should also be freed when it is
+  no longer needed. Otherwise, the chunks array must be of at least
+  n_elements in length. It is filled in with the pointers to the
+  chunks.
+
+  In either case, independent_calloc returns this pointer array, or
+  null if the allocation failed.  If n_elements is zero and "chunks"
+  is null, it returns a chunk representing an array with zero elements
+  (which should be freed if not wanted).
+
+  Each element must be individually freed when it is no longer
+  needed. If you'd like to instead be able to free all at once, you
+  should instead use regular calloc and assign pointers into this
+  space to represent elements.  (In this case though, you cannot
+  independently free elements.)
+
+  independent_calloc simplifies and speeds up implementations of many
+  kinds of pools.  It may also be useful when constructing large data
+  structures that initially have a fixed number of fixed-sized nodes,
+  but the number is not known at compile time, and some of the nodes
+  may later need to be freed. For example:
+
+  struct Node { int item; struct Node* next; };
+
+  struct Node* build_list() {
+    struct Node** pool;
+    int n = read_number_of_nodes_needed();
+    if (n <= 0) return 0;
+    pool = (struct Node**)(independent_calloc(n, sizeof(struct Node), 0);
+    if (pool == 0) die();
+    // organize into a linked list...
+    struct Node* first = pool[0];
+    for (i = 0; i < n-1; ++i)
+      pool[i]->next = pool[i+1];
+    free(pool);     // Can now free the array (or not, if it is needed later)
+    return first;
+  }
+*/
+void** dlindependent_calloc(size_t, size_t, void**);
+
+/*
+  independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]);
+
+  independent_comalloc allocates, all at once, a set of n_elements
+  chunks with sizes indicated in the "sizes" array.    It returns
+  an array of pointers to these elements, each of which can be
+  independently freed, realloc'ed etc. The elements are guaranteed to
+  be adjacently allocated (this is not guaranteed to occur with
+  multiple callocs or mallocs), which may also improve cache locality
+  in some applications.
+
+  The "chunks" argument is optional (i.e., may be null). If it is null
+  the returned array is itself dynamically allocated and should also
+  be freed when it is no longer needed. Otherwise, the chunks array
+  must be of at least n_elements in length. It is filled in with the
+  pointers to the chunks.
+
+  In either case, independent_comalloc returns this pointer array, or
+  null if the allocation failed.  If n_elements is zero and chunks is
+  null, it returns a chunk representing an array with zero elements
+  (which should be freed if not wanted).
+
+  Each element must be individually freed when it is no longer
+  needed. If you'd like to instead be able to free all at once, you
+  should instead use a single regular malloc, and assign pointers at
+  particular offsets in the aggregate space. (In this case though, you
+  cannot independently free elements.)
+
+  independent_comallac differs from independent_calloc in that each
+  element may have a different size, and also that it does not
+  automatically clear elements.
+
+  independent_comalloc can be used to speed up allocation in cases
+  where several structs or objects must always be allocated at the
+  same time.  For example:
+
+  struct Head { ... }
+  struct Foot { ... }
+
+  void send_message(char* msg) {
+    int msglen = strlen(msg);
+    size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) };
+    void* chunks[3];
+    if (independent_comalloc(3, sizes, chunks) == 0)
+      die();
+    struct Head* head = (struct Head*)(chunks[0]);
+    char*        body = (char*)(chunks[1]);
+    struct Foot* foot = (struct Foot*)(chunks[2]);
+    // ...
+  }
+
+  In general though, independent_comalloc is worth using only for
+  larger values of n_elements. For small values, you probably won't
+  detect enough difference from series of malloc calls to bother.
+
+  Overuse of independent_comalloc can increase overall memory usage,
+  since it cannot reuse existing noncontiguous small chunks that
+  might be available for some of the elements.
+*/
+void** dlindependent_comalloc(size_t, size_t*, void**);
+
+
+/*
+  pvalloc(size_t n);
+  Equivalent to valloc(minimum-page-that-holds(n)), that is,
+  round up n to nearest pagesize.
+ */
+void*  dlpvalloc(size_t);
+
+/*
+  malloc_trim(size_t pad);
+
+  If possible, gives memory back to the system (via negative arguments
+  to sbrk) if there is unused memory at the `high' end of the malloc
+  pool or in unused MMAP segments. You can call this after freeing
+  large blocks of memory to potentially reduce the system-level memory
+  requirements of a program. However, it cannot guarantee to reduce
+  memory. Under some allocation patterns, some large free blocks of
+  memory will be locked between two used chunks, so they cannot be
+  given back to the system.
+
+  The `pad' argument to malloc_trim represents the amount of free
+  trailing space to leave untrimmed. If this argument is zero, only
+  the minimum amount of memory to maintain internal data structures
+  will be left. Non-zero arguments can be supplied to maintain enough
+  trailing space to service future expected allocations without having
+  to re-obtain memory from the system.
+
+  Malloc_trim returns 1 if it actually released any memory, else 0.
+*/
+int  dlmalloc_trim(size_t);
+
+/*
+  malloc_stats();
+  Prints on stderr the amount of space obtained from the system (both
+  via sbrk and mmap), the maximum amount (which may be more than
+  current if malloc_trim and/or munmap got called), and the current
+  number of bytes allocated via malloc (or realloc, etc) but not yet
+  freed. Note that this is the number of bytes allocated, not the
+  number requested. It will be larger than the number requested
+  because of alignment and bookkeeping overhead. Because it includes
+  alignment wastage as being in use, this figure may be greater than
+  zero even when no user-level chunks are allocated.
+
+  The reported current and maximum system memory can be inaccurate if
+  a program makes other calls to system memory allocation functions
+  (normally sbrk) outside of malloc.
+
+  malloc_stats prints only the most commonly interesting statistics.
+  More information can be obtained by calling mallinfo.
+*/
+void  dlmalloc_stats(void);
+
+#endif /* ONLY_MSPACES */
+
+/*
+  malloc_usable_size(void* p);
+
+  Returns the number of bytes you can actually use in
+  an allocated chunk, which may be more than you requested (although
+  often not) due to alignment and minimum size constraints.
+  You can use this many bytes without worrying about
+  overwriting other allocated objects. This is not a particularly great
+  programming practice. malloc_usable_size can be more useful in
+  debugging and assertions, for example:
+
+  p = malloc(n);
+  assert(malloc_usable_size(p) >= 256);
+*/
+size_t dlmalloc_usable_size(void*);
+
+
+#if MSPACES
+
+/*
+  mspace is an opaque type representing an independent
+  region of space that supports mspace_malloc, etc.
+*/
+typedef void* mspace;
+
+/*
+  create_mspace creates and returns a new independent space with the
+  given initial capacity, or, if 0, the default granularity size.  It
+  returns null if there is no system memory available to create the
+  space.  If argument locked is non-zero, the space uses a separate
+  lock to control access. The capacity of the space will grow
+  dynamically as needed to service mspace_malloc requests.  You can
+  control the sizes of incremental increases of this space by
+  compiling with a different DEFAULT_GRANULARITY or dynamically
+  setting with mallopt(M_GRANULARITY, value).
+*/
+mspace create_mspace(size_t capacity, int locked);
+
+/*
+  destroy_mspace destroys the given space, and attempts to return all
+  of its memory back to the system, returning the total number of
+  bytes freed. After destruction, the results of access to all memory
+  used by the space become undefined.
+*/
+size_t destroy_mspace(mspace msp);
+
+/*
+  create_mspace_with_base uses the memory supplied as the initial base
+  of a new mspace. Part (less than 128*sizeof(size_t) bytes) of this
+  space is used for bookkeeping, so the capacity must be at least this
+  large. (Otherwise 0 is returned.) When this initial space is
+  exhausted, additional memory will be obtained from the system.
+  Destroying this space will deallocate all additionally allocated
+  space (if possible) but not the initial base.
+*/
+mspace create_mspace_with_base(void* base, size_t capacity, int locked);
+
+/*
+  mspace_track_large_chunks controls whether requests for large chunks
+  are allocated in their own untracked mmapped regions, separate from
+  others in this mspace. By default large chunks are not tracked,
+  which reduces fragmentation. However, such chunks are not
+  necessarily released to the system upon destroy_mspace.  Enabling
+  tracking by setting to true may increase fragmentation, but avoids
+  leakage when relying on destroy_mspace to release all memory
+  allocated using this space.  The function returns the previous
+  setting.
+*/
+int mspace_track_large_chunks(mspace msp, int enable);
+
+
+/*
+  mspace_malloc behaves as malloc, but operates within
+  the given space.
+*/
+void* mspace_malloc(mspace msp, size_t bytes);
+
+/*
+  mspace_free behaves as free, but operates within
+  the given space.
+
+  If compiled with FOOTERS==1, mspace_free is not actually needed.
+  free may be called instead of mspace_free because freed chunks from
+  any space are handled by their originating spaces.
+*/
+void mspace_free(mspace msp, void* mem);
+
+/*
+  mspace_realloc behaves as realloc, but operates within
+  the given space.
+
+  If compiled with FOOTERS==1, mspace_realloc is not actually
+  needed.  realloc may be called instead of mspace_realloc because
+  realloced chunks from any space are handled by their originating
+  spaces.
+*/
+void* mspace_realloc(mspace msp, void* mem, size_t newsize);
+
+/*
+  mspace_calloc behaves as calloc, but operates within
+  the given space.
+*/
+void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size);
+
+/*
+  mspace_memalign behaves as memalign, but operates within
+  the given space.
+*/
+void* mspace_memalign(mspace msp, size_t alignment, size_t bytes);
+
+/*
+  mspace_independent_calloc behaves as independent_calloc, but
+  operates within the given space.
+*/
+void** mspace_independent_calloc(mspace msp, size_t n_elements,
+                                 size_t elem_size, void* chunks[]);
+
+/*
+  mspace_independent_comalloc behaves as independent_comalloc, but
+  operates within the given space.
+*/
+void** mspace_independent_comalloc(mspace msp, size_t n_elements,
+                                   size_t sizes[], void* chunks[]);
+
+/*
+  mspace_footprint() returns the number of bytes obtained from the
+  system for this space.
+*/
+size_t mspace_footprint(mspace msp);
+
+/*
+  mspace_max_footprint() returns the peak number of bytes obtained from the
+  system for this space.
+*/
+size_t mspace_max_footprint(mspace msp);
+
+
+#if !NO_MALLINFO
+/*
+  mspace_mallinfo behaves as mallinfo, but reports properties of
+  the given space.
+*/
+struct mallinfo mspace_mallinfo(mspace msp);
+#endif /* NO_MALLINFO */
+
+/*
+  malloc_usable_size(void* p) behaves the same as malloc_usable_size;
+*/
+  size_t mspace_usable_size(void* mem);
+
+/*
+  mspace_malloc_stats behaves as malloc_stats, but reports
+  properties of the given space.
+*/
+void mspace_malloc_stats(mspace msp);
+
+/*
+  mspace_trim behaves as malloc_trim, but
+  operates within the given space.
+*/
+int mspace_trim(mspace msp, size_t pad);
+
+/*
+  An alias for mallopt.
+*/
+int mspace_mallopt(int, int);
+
+#endif /* MSPACES */
+
+#ifdef __cplusplus
+}  /* end of extern "C" */
+#endif /* __cplusplus */
+
+/*
+  ========================================================================
+  To make a fully customizable malloc.h header file, cut everything
+  above this line, put into file malloc.h, edit to suit, and #include it
+  on the next line, as well as in programs that use this malloc.
+  ========================================================================
+*/
+
+/* #include "malloc.h" */
+
+/*------------------------------ internal #includes ---------------------- */
+
+#ifdef WIN32
+#pragma warning( disable : 4146 ) /* no "unsigned" warnings */
+#endif /* WIN32 */
+
+#include <stdio.h>       /* for printing in malloc_stats */
+
+#ifndef LACKS_ERRNO_H
+#include <errno.h>       /* for MALLOC_FAILURE_ACTION */
+#endif /* LACKS_ERRNO_H */
+#if FOOTERS || DEBUG
+#include <time.h>        /* for magic initialization */
+#endif /* FOOTERS */
+#ifndef LACKS_STDLIB_H
+#include <stdlib.h>      /* for abort() */
+#endif /* LACKS_STDLIB_H */
+#ifdef DEBUG
+#if ABORT_ON_ASSERT_FAILURE
+#undef assert
+#define assert(x) if(!(x)) ABORT
+#else /* ABORT_ON_ASSERT_FAILURE */
+#include <assert.h>
+#endif /* ABORT_ON_ASSERT_FAILURE */
+#else  /* DEBUG */
+#ifndef assert
+#define assert(x)
+#endif
+#define DEBUG 0
+#endif /* DEBUG */
+#ifndef LACKS_STRING_H
+#include <string.h>      /* for memset etc */
+#endif  /* LACKS_STRING_H */
+#if USE_BUILTIN_FFS
+#ifndef LACKS_STRINGS_H
+#include <strings.h>     /* for ffs */
+#endif /* LACKS_STRINGS_H */
+#endif /* USE_BUILTIN_FFS */
+#if HAVE_MMAP
+#ifndef LACKS_SYS_MMAN_H
+/* On some versions of linux, mremap decl in mman.h needs __USE_GNU set */
+#if (defined(linux) && !defined(__USE_GNU))
+#define __USE_GNU 1
+#include <sys/mman.h>    /* for mmap */
+#undef __USE_GNU
+#else
+#include <sys/mman.h>    /* for mmap */
+#endif /* linux */
+#endif /* LACKS_SYS_MMAN_H */
+#ifndef LACKS_FCNTL_H
+#include <fcntl.h>
+#endif /* LACKS_FCNTL_H */
+#endif /* HAVE_MMAP */
+#ifndef LACKS_UNISTD_H
+#include <unistd.h>     /* for sbrk, sysconf */
+#else /* LACKS_UNISTD_H */
+#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
+extern void*     sbrk(ptrdiff_t);
+#endif /* FreeBSD etc */
+#endif /* LACKS_UNISTD_H */
+
+/* Declarations for locking */
+#if USE_LOCKS
+#ifndef WIN32
+#include <pthread.h>
+#if defined (__SVR4) && defined (__sun)  /* solaris */
+#include <thread.h>
+#endif /* solaris */
+#else
+#ifndef _M_AMD64
+/* These are already defined on AMD64 builds */
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+LONG __cdecl _InterlockedCompareExchange(LONG volatile *Dest, LONG Exchange, LONG Comp);
+LONG __cdecl _InterlockedExchange(LONG volatile *Target, LONG Value);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* _M_AMD64 */
+#pragma intrinsic (_InterlockedCompareExchange)
+#pragma intrinsic (_InterlockedExchange)
+#define interlockedcompareexchange _InterlockedCompareExchange
+#define interlockedexchange _InterlockedExchange
+#endif /* Win32 */
+#endif /* USE_LOCKS */
+
+/* Declarations for bit scanning on win32 */
+#if defined(_MSC_VER) && _MSC_VER>=1300
+#ifndef BitScanForward	/* Try to avoid pulling in WinNT.h */
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+unsigned char _BitScanForward(unsigned long *index, unsigned long mask);
+unsigned char _BitScanReverse(unsigned long *index, unsigned long mask);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#define BitScanForward _BitScanForward
+#define BitScanReverse _BitScanReverse
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+#endif /* BitScanForward */
+#endif /* defined(_MSC_VER) && _MSC_VER>=1300 */
+
+#ifndef WIN32
+#ifndef malloc_getpagesize
+#  ifdef _SC_PAGESIZE         /* some SVR4 systems omit an underscore */
+#    ifndef _SC_PAGE_SIZE
+#      define _SC_PAGE_SIZE _SC_PAGESIZE
+#    endif
+#  endif
+#  ifdef _SC_PAGE_SIZE
+#    define malloc_getpagesize sysconf(_SC_PAGE_SIZE)
+#  else
+#    if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE)
+       extern size_t getpagesize();
+#      define malloc_getpagesize getpagesize()
+#    else
+#      ifdef WIN32 /* use supplied emulation of getpagesize */
+#        define malloc_getpagesize getpagesize()
+#      else
+#        ifndef LACKS_SYS_PARAM_H
+#          include <sys/param.h>
+#        endif
+#        ifdef EXEC_PAGESIZE
+#          define malloc_getpagesize EXEC_PAGESIZE
+#        else
+#          ifdef NBPG
+#            ifndef CLSIZE
+#              define malloc_getpagesize NBPG
+#            else
+#              define malloc_getpagesize (NBPG * CLSIZE)
+#            endif
+#          else
+#            ifdef NBPC
+#              define malloc_getpagesize NBPC
+#            else
+#              ifdef PAGESIZE
+#                define malloc_getpagesize PAGESIZE
+#              else /* just guess */
+#                define malloc_getpagesize ((size_t)4096U)
+#              endif
+#            endif
+#          endif
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+#endif
+
+
+
+/* ------------------- size_t and alignment properties -------------------- */
+
+/* The byte and bit size of a size_t */
+#define SIZE_T_SIZE         (sizeof(size_t))
+#define SIZE_T_BITSIZE      (sizeof(size_t) << 3)
+
+/* Some constants coerced to size_t */
+/* Annoying but necessary to avoid errors on some platforms */
+#define SIZE_T_ZERO         ((size_t)0)
+#define SIZE_T_ONE          ((size_t)1)
+#define SIZE_T_TWO          ((size_t)2)
+#define SIZE_T_FOUR         ((size_t)4)
+#define TWO_SIZE_T_SIZES    (SIZE_T_SIZE<<1)
+#define FOUR_SIZE_T_SIZES   (SIZE_T_SIZE<<2)
+#define SIX_SIZE_T_SIZES    (FOUR_SIZE_T_SIZES+TWO_SIZE_T_SIZES)
+#define HALF_MAX_SIZE_T     (MAX_SIZE_T / 2U)
+
+/* The bit mask value corresponding to MALLOC_ALIGNMENT */
+#define CHUNK_ALIGN_MASK    (MALLOC_ALIGNMENT - SIZE_T_ONE)
+
+/* True if address a has acceptable alignment */
+#define is_aligned(A)       (((size_t)((A)) & (CHUNK_ALIGN_MASK)) == 0)
+
+/* the number of bytes to offset an address to align it */
+#define align_offset(A)\
+ ((((size_t)(A) & CHUNK_ALIGN_MASK) == 0)? 0 :\
+  ((MALLOC_ALIGNMENT - ((size_t)(A) & CHUNK_ALIGN_MASK)) & CHUNK_ALIGN_MASK))
+
+/*
+  malloc_params holds global properties, including those that can be
+  dynamically set using mallopt. There is a single instance, mparams,
+  initialized in init_mparams. Note that the non-zeroness of "magic"
+  also serves as an initialization flag.
+*/
+typedef unsigned int flag_t;
+struct malloc_params {
+  volatile size_t magic;
+  size_t page_size;
+  size_t granularity;
+  size_t mmap_threshold;
+  size_t trim_threshold;
+  flag_t default_mflags;
+};
+
+static struct malloc_params mparams;
+
+/* Ensure mparams initialized */
+#define ensure_initialization() (void)(mparams.magic != 0 || init_mparams())
+
+/* -------------------------- MMAP preliminaries ------------------------- */
+
+/*
+   If HAVE_MORECORE or HAVE_MMAP are false, we just define calls and
+   checks to fail so compiler optimizer can delete code rather than
+   using so many "#if"s.
+*/
+
+
+/* MORECORE and MMAP must return MFAIL on failure */
+#define MFAIL                ((void*)(MAX_SIZE_T))
+#define CMFAIL               ((char*)(MFAIL)) /* defined for convenience */
+
+#if HAVE_MMAP
+
+#ifndef WIN32
+#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
+#define MAP_ANONYMOUS        MAP_ANON
+#endif /* MAP_ANON */
+#ifdef DEFAULT_GRANULARITY_ALIGNED
+#define MMAP_IMPL mmap_aligned
+static void* lastAlignedmmap; /* Used as a hint */
+static void* mmap_aligned(void *start, size_t length, int prot, int flags, int fd, off_t offset) {
+  void* baseaddress = 0;
+  void* ptr = 0;
+  if(!start) {
+    baseaddress = lastAlignedmmap;
+    for(;;) {
+      if(baseaddress) flags|=MAP_FIXED;
+      ptr = mmap(baseaddress, length, prot, flags, fd, offset);
+      if(!ptr)
+        baseaddress = (void*)((size_t)baseaddress + mparams.granularity);
+      else if((size_t)ptr & (mparams.granularity - SIZE_T_ONE)) {
+        munmap(ptr, length);
+        baseaddress = (void*)(((size_t)ptr + mparams.granularity) & ~(mparams.granularity - SIZE_T_ONE));
+      }
+      else break;
+    }
+  }
+  else ptr = mmap(start, length, prot, flags, fd, offset);
+  if(ptr) lastAlignedmmap = (void*)((size_t) ptr + mparams.granularity);
+  return ptr;
+}
+#else
+#define MMAP_IMPL mmap
+#endif /* DEFAULT_GRANULARITY_ALIGNED */
+#define MUNMAP_DEFAULT(a, s)  munmap((a), (s))
+#define MMAP_PROT            (PROT_READ|PROT_WRITE)
+#ifdef MAP_ANONYMOUS
+#define MMAP_FLAGS           (MAP_PRIVATE|MAP_ANONYMOUS)
+#define MMAP_DEFAULT(s)       MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, -1, 0)
+#else /* MAP_ANONYMOUS */
+/*
+   Nearly all versions of mmap support MAP_ANONYMOUS, so the following
+   is unlikely to be needed, but is supplied just in case.
+*/
+#define MMAP_FLAGS           (MAP_PRIVATE)
+static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */
+#define MMAP_DEFAULT(s) ((dev_zero_fd < 0) ? \
+           (dev_zero_fd = open("/dev/zero", O_RDWR), \
+            MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) : \
+            MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0))
+#endif /* MAP_ANONYMOUS */
+
+#define DIRECT_MMAP_DEFAULT(s) MMAP_DEFAULT(s)
+
+#else /* WIN32 */
+
+/* Win32 MMAP via VirtualAlloc */
+#ifdef DEFAULT_GRANULARITY_ALIGNED
+static void* lastWin32mmap; /* Used as a hint */
+#endif /* DEFAULT_GRANULARITY_ALIGNED */
+#ifdef ENABLE_LARGE_PAGES
+static int largepagesavailable = 1;
+#endif /* ENABLE_LARGE_PAGES */
+static FORCEINLINE void* win32mmap(size_t size) {
+  void* baseaddress = 0;
+  void* ptr = 0;
+#ifdef ENABLE_LARGE_PAGES
+  /* Note that large pages are *always* allocated on a large page boundary.
+  If however granularity is small then don't waste a kernel call if size
+  isn't around the size of a large page */
+  if(largepagesavailable && size >= 1*1024*1024) {
+    ptr = VirtualAlloc(baseaddress, size, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
+    if(!ptr && ERROR_PRIVILEGE_NOT_HELD==GetLastError()) largepagesavailable=0;
+  }
+#endif
+  if(!ptr) {
+#ifdef DEFAULT_GRANULARITY_ALIGNED
+    /* We try to avoid overhead by speculatively reserving at aligned
+    addresses until we succeed */
+    baseaddress = lastWin32mmap;
+    for(;;) {
+      void* reserveaddr = VirtualAlloc(baseaddress, size, MEM_RESERVE, PAGE_READWRITE);
+      if(!reserveaddr)
+        baseaddress = (void*)((size_t)baseaddress + mparams.granularity);
+      else if((size_t)reserveaddr & (mparams.granularity - SIZE_T_ONE)) {
+        VirtualFree(reserveaddr, 0, MEM_RELEASE);
+        baseaddress = (void*)(((size_t)reserveaddr + mparams.granularity) & ~(mparams.granularity - SIZE_T_ONE));
+      }
+      else break;
+    }
+#endif
+    if(!ptr) ptr = VirtualAlloc(baseaddress, size, baseaddress ? MEM_COMMIT : MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
+#if DEBUG
+    if(lastWin32mmap && ptr!=lastWin32mmap) printf("Non-contiguous VirtualAlloc between %p and %p\n", ptr, lastWin32mmap);
+#endif
+#ifdef DEFAULT_GRANULARITY_ALIGNED
+    if(ptr) lastWin32mmap = (void*)((size_t) ptr + mparams.granularity);
+#endif
+  }
+#if DEBUG
+#ifdef ENABLE_LARGE_PAGES
+  printf("VirtualAlloc returns %p size %u. LargePagesAvailable=%d\n", ptr, size, largepagesavailable);
+#else
+  printf("VirtualAlloc returns %p size %u\n", ptr, size);
+#endif
+#endif
+  return (ptr != 0)? ptr: MFAIL;
+}
+
+/* For direct MMAP, use MEM_TOP_DOWN to minimize interference */
+static FORCEINLINE void* win32direct_mmap(size_t size) {
+  void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN,
+                           PAGE_READWRITE);
+  return (ptr != 0)? ptr: MFAIL;
+}
+
+/* This function supports releasing coalesed segments */
+static FORCEINLINE int win32munmap(void* ptr, size_t size) {
+  MEMORY_BASIC_INFORMATION minfo;
+  char* cptr = (char*)ptr;
+  while (size) {
+    if (VirtualQuery(cptr, &minfo, sizeof(minfo)) == 0)
+      return -1;
+    if (minfo.BaseAddress != cptr || minfo.AllocationBase != cptr ||
+        minfo.State != MEM_COMMIT || minfo.RegionSize > size)
+      return -1;
+    if (VirtualFree(cptr, 0, MEM_RELEASE) == 0)
+      return -1;
+    cptr += minfo.RegionSize;
+    size -= minfo.RegionSize;
+  }
+  return 0;
+}
+
+#define MMAP_DEFAULT(s)             win32mmap(s)
+#define MUNMAP_DEFAULT(a, s)        win32munmap((a), (s))
+#define DIRECT_MMAP_DEFAULT(s)      win32direct_mmap(s)
+#endif /* WIN32 */
+#endif /* HAVE_MMAP */
+
+#if HAVE_MREMAP
+#ifndef WIN32
+#define MREMAP_DEFAULT(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv))
+#endif /* WIN32 */
+#endif /* HAVE_MREMAP */
+
+
+/**
+ * Define CALL_MORECORE
+ */
+#if HAVE_MORECORE
+    #ifdef MORECORE
+        #define CALL_MORECORE(S)    MORECORE(S)
+    #else  /* MORECORE */
+        #define CALL_MORECORE(S)    MORECORE_DEFAULT(S)
+    #endif /* MORECORE */
+#else  /* HAVE_MORECORE */
+    #define CALL_MORECORE(S)        MFAIL
+#endif /* HAVE_MORECORE */
+
+/**
+ * Define CALL_MMAP/CALL_MUNMAP/CALL_DIRECT_MMAP
+ */
+#if HAVE_MMAP
+    #define USE_MMAP_BIT            (SIZE_T_ONE)
+
+    #ifdef MMAP
+        #define CALL_MMAP(s)        MMAP(s)
+    #else /* MMAP */
+        #define CALL_MMAP(s)        MMAP_DEFAULT(s)
+    #endif /* MMAP */
+    #ifdef MUNMAP
+        #define CALL_MUNMAP(a, s)   MUNMAP((a), (s))
+    #else /* MUNMAP */
+        #define CALL_MUNMAP(a, s)   MUNMAP_DEFAULT((a), (s))
+    #endif /* MUNMAP */
+    #ifdef DIRECT_MMAP
+        #define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s)
+    #else /* DIRECT_MMAP */
+        #define CALL_DIRECT_MMAP(s) DIRECT_MMAP_DEFAULT(s)
+    #endif /* DIRECT_MMAP */
+#else  /* HAVE_MMAP */
+    #define USE_MMAP_BIT            (SIZE_T_ZERO)
+
+    #define MMAP(s)                 MFAIL
+    #define MUNMAP(a, s)            (-1)
+    #define DIRECT_MMAP(s)          MFAIL
+    #define CALL_DIRECT_MMAP(s)     DIRECT_MMAP(s)
+    #define CALL_MMAP(s)            MMAP(s)
+    #define CALL_MUNMAP(a, s)       MUNMAP((a), (s))
+#endif /* HAVE_MMAP */
+
+/**
+ * Define CALL_MREMAP
+ */
+#if HAVE_MMAP && HAVE_MREMAP
+    #ifdef MREMAP
+        #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP((addr), (osz), (nsz), (mv))
+    #else /* MREMAP */
+        #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP_DEFAULT((addr), (osz), (nsz), (mv))
+    #endif /* MREMAP */
+#else  /* HAVE_MMAP && HAVE_MREMAP */
+    #define CALL_MREMAP(addr, osz, nsz, mv)     MFAIL
+#endif /* HAVE_MMAP && HAVE_MREMAP */
+
+/* mstate bit set if continguous morecore disabled or failed */
+#define USE_NONCONTIGUOUS_BIT (4U)
+
+/* segment bit set in create_mspace_with_base */
+#define EXTERN_BIT            (8U)
+
+
+/* --------------------------- Lock preliminaries ------------------------ */
+
+/*
+  When locks are defined, there is one global lock, plus
+  one per-mspace lock.
+
+  The global lock_ensures that mparams.magic and other unique
+  mparams values are initialized only once. It also protects
+  sequences of calls to MORECORE.  In many cases sys_alloc requires
+  two calls, that should not be interleaved with calls by other
+  threads.  This does not protect against direct calls to MORECORE
+  by other threads not using this lock, so there is still code to
+  cope the best we can on interference.
+
+  Per-mspace locks surround calls to malloc, free, etc.  To enable use
+  in layered extensions, per-mspace locks are reentrant.
+
+  Because lock-protected regions generally have bounded times, it is
+  OK to use the supplied simple spinlocks in the custom versions for
+  x86. Spinlocks are likely to improve performance for lightly
+  contended applications, but worsen performance under heavy
+  contention.
+
+  If USE_LOCKS is > 1, the definitions of lock routines here are
+  bypassed, in which case you will need to define the type MLOCK_T,
+  and at least INITIAL_LOCK, ACQUIRE_LOCK, RELEASE_LOCK and possibly
+  TRY_LOCK (which is not used in this malloc, but commonly needed in
+  extensions.)  You must also declare a
+    static MLOCK_T malloc_global_mutex = { initialization values };.
+
+*/
+
+#if USE_LOCKS == 1
+
+#if USE_SPIN_LOCKS && SPIN_LOCKS_AVAILABLE
+#ifndef WIN32
+
+/* Custom pthread-style spin locks on x86 and x64 for gcc */
+struct pthread_mlock_t {
+  volatile unsigned int l;
+  char cachelinepadding[64];
+  unsigned int c;
+  pthread_t threadid;
+};
+#define MLOCK_T               struct pthread_mlock_t
+#define CURRENT_THREAD        pthread_self()
+#define INITIAL_LOCK(sl)      ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0)
+#define ACQUIRE_LOCK(sl)      pthread_acquire_lock(sl)
+#define RELEASE_LOCK(sl)      pthread_release_lock(sl)
+#define TRY_LOCK(sl)          pthread_try_lock(sl)
+#define SPINS_PER_YIELD       63
+
+static MLOCK_T malloc_global_mutex = { 0, "", 0, 0};
+
+static FORCEINLINE int pthread_acquire_lock (MLOCK_T *sl) {
+  int spins = 0;
+  volatile unsigned int* lp = &sl->l;
+  for (;;) {
+    if (*lp != 0) {
+      if (sl->threadid == CURRENT_THREAD) {
+        ++sl->c;
+        return 0;
+      }
+    }
+    else {
+      /* place args to cmpxchgl in locals to evade oddities in some gccs */
+      int cmp = 0;
+      int val = 1;
+      int ret;
+      __asm__ __volatile__  ("lock; cmpxchgl %1, %2"
+                             : "=a" (ret)
+                             : "r" (val), "m" (*(lp)), "0"(cmp)
+                             : "memory", "cc");
+      if (!ret) {
+        assert(!sl->threadid);
+        sl->threadid = CURRENT_THREAD;
+        sl->c = 1;
+        return 0;
+      }
+    }
+    if ((++spins & SPINS_PER_YIELD) == 0) {
+#if defined (__SVR4) && defined (__sun) /* solaris */
+      thr_yield();
+#else
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)
+      sched_yield();
+#else  /* no-op yield on unknown systems */
+      ;
+#endif /* __linux__ || __FreeBSD__ || __APPLE__ */
+#endif /* solaris */
+    }
+  }
+}
+
+static FORCEINLINE void pthread_release_lock (MLOCK_T *sl) {
+  volatile unsigned int* lp = &sl->l;
+  assert(*lp != 0);
+  assert(sl->threadid == CURRENT_THREAD);
+  if (--sl->c == 0) {
+    sl->threadid = 0;
+    int prev = 0;
+    int ret;
+    __asm__ __volatile__ ("lock; xchgl %0, %1"
+                          : "=r" (ret)
+                          : "m" (*(lp)), "0"(prev)
+                          : "memory");
+  }
+}
+
+static FORCEINLINE int pthread_try_lock (MLOCK_T *sl) {
+  volatile unsigned int* lp = &sl->l;
+  if (*lp != 0) {
+    if (sl->threadid == CURRENT_THREAD) {
+      ++sl->c;
+      return 1;
+    }
+  }
+  else {
+    int cmp = 0;
+    int val = 1;
+    int ret;
+    __asm__ __volatile__  ("lock; cmpxchgl %1, %2"
+                           : "=a" (ret)
+                           : "r" (val), "m" (*(lp)), "0"(cmp)
+                           : "memory", "cc");
+    if (!ret) {
+      assert(!sl->threadid);
+      sl->threadid = CURRENT_THREAD;
+      sl->c = 1;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+
+#else /* WIN32 */
+/* Custom win32-style spin locks on x86 and x64 for MSC */
+struct win32_mlock_t {
+  volatile long l;
+  char cachelinepadding[64];
+  unsigned int c;
+  long threadid;
+};
+
+#define MLOCK_T               struct win32_mlock_t
+#define CURRENT_THREAD        ((long)GetCurrentThreadId())
+#define INITIAL_LOCK(sl)      ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0)
+#define ACQUIRE_LOCK(sl)      win32_acquire_lock(sl)
+#define RELEASE_LOCK(sl)      win32_release_lock(sl)
+#define TRY_LOCK(sl)          win32_try_lock(sl)
+#define SPINS_PER_YIELD       63
+
+static MLOCK_T malloc_global_mutex = { 0, 0, 0};
+
+static FORCEINLINE int win32_acquire_lock (MLOCK_T *sl) {
+  int spins = 0;
+  for (;;) {
+    if (sl->l != 0) {
+      if (sl->threadid == CURRENT_THREAD) {
+        ++sl->c;
+        return 0;
+      }
+    }
+    else {
+      if (!interlockedexchange(&sl->l, 1)) {
+        assert(!sl->threadid);
+        sl->threadid = CURRENT_THREAD;
+        sl->c = 1;
+        return 0;
+      }
+    }
+    if ((++spins & SPINS_PER_YIELD) == 0)
+      SleepEx(0, FALSE);
+  }
+}
+
+static FORCEINLINE void win32_release_lock (MLOCK_T *sl) {
+  assert(sl->threadid == CURRENT_THREAD);
+  assert(sl->l != 0);
+  if (--sl->c == 0) {
+    sl->threadid = 0;
+    interlockedexchange (&sl->l, 0);
+  }
+}
+
+static FORCEINLINE int win32_try_lock (MLOCK_T *sl) {
+  if (sl->l != 0) {
+    if (sl->threadid == CURRENT_THREAD) {
+      ++sl->c;
+      return 1;
+    }
+  }
+  else {
+    if (!interlockedexchange(&sl->l, 1)){
+      assert(!sl->threadid);
+      sl->threadid = CURRENT_THREAD;
+      sl->c = 1;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+#endif /* WIN32 */
+#else /* USE_SPIN_LOCKS */
+
+#ifndef WIN32
+/* pthreads-based locks */
+
+#define MLOCK_T               pthread_mutex_t
+#define CURRENT_THREAD        pthread_self()
+#define INITIAL_LOCK(sl)      pthread_init_lock(sl)
+#define ACQUIRE_LOCK(sl)      pthread_mutex_lock(sl)
+#define RELEASE_LOCK(sl)      pthread_mutex_unlock(sl)
+#define TRY_LOCK(sl)          (!pthread_mutex_trylock(sl))
+
+static MLOCK_T malloc_global_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Cope with old-style linux recursive lock initialization by adding */
+/* skipped internal declaration from pthread.h */
+#ifdef linux
+#ifndef PTHREAD_MUTEX_RECURSIVE
+extern int pthread_mutexattr_setkind_np __P ((pthread_mutexattr_t *__attr,
+					   int __kind));
+#define PTHREAD_MUTEX_RECURSIVE PTHREAD_MUTEX_RECURSIVE_NP
+#define pthread_mutexattr_settype(x,y) pthread_mutexattr_setkind_np(x,y)
+#endif
+#endif
+
+static int pthread_init_lock (MLOCK_T *sl) {
+  pthread_mutexattr_t attr;
+  if (pthread_mutexattr_init(&attr)) return 1;
+  if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) return 1;
+  if (pthread_mutex_init(sl, &attr)) return 1;
+  if (pthread_mutexattr_destroy(&attr)) return 1;
+  return 0;
+}
+
+#else /* WIN32 */
+/* Win32 critical sections */
+#define MLOCK_T               CRITICAL_SECTION
+#define CURRENT_THREAD        GetCurrentThreadId()
+#define INITIAL_LOCK(s)       (!InitializeCriticalSectionAndSpinCount((s), 0x80000000|4000))
+#define ACQUIRE_LOCK(s)       (EnterCriticalSection(sl), 0)
+#define RELEASE_LOCK(s)       LeaveCriticalSection(sl)
+#define TRY_LOCK(s)           TryEnterCriticalSection(sl)
+#define NEED_GLOBAL_LOCK_INIT
+
+static MLOCK_T malloc_global_mutex;
+static volatile long malloc_global_mutex_status;
+
+/* Use spin loop to initialize global lock */
+static void init_malloc_global_mutex() {
+  for (;;) {
+    long stat = malloc_global_mutex_status;
+    if (stat > 0)
+      return;
+    /* transition to < 0 while initializing, then to > 0) */
+    if (stat == 0 &&
+        interlockedcompareexchange(&malloc_global_mutex_status, -1, 0) == 0) {
+      InitializeCriticalSection(&malloc_global_mutex);
+      interlockedexchange(&malloc_global_mutex_status,1);
+      return;
+    }
+    SleepEx(0, FALSE);
+  }
+}
+
+#endif /* WIN32 */
+#endif /* USE_SPIN_LOCKS */
+#endif /* USE_LOCKS == 1 */
+
+/* -----------------------  User-defined locks ------------------------ */
+
+#if USE_LOCKS > 1
+/* Define your own lock implementation here */
+/* #define INITIAL_LOCK(sl)  ... */
+/* #define ACQUIRE_LOCK(sl)  ... */
+/* #define RELEASE_LOCK(sl)  ... */
+/* #define TRY_LOCK(sl) ... */
+/* static MLOCK_T malloc_global_mutex = ... */
+#endif /* USE_LOCKS > 1 */
+
+/* -----------------------  Lock-based state ------------------------ */
+
+#if USE_LOCKS
+#define USE_LOCK_BIT               (2U)
+#else  /* USE_LOCKS */
+#define USE_LOCK_BIT               (0U)
+#define INITIAL_LOCK(l)
+#endif /* USE_LOCKS */
+
+#if USE_LOCKS
+#ifndef ACQUIRE_MALLOC_GLOBAL_LOCK
+#define ACQUIRE_MALLOC_GLOBAL_LOCK()  ACQUIRE_LOCK(&malloc_global_mutex);
+#endif
+#ifndef RELEASE_MALLOC_GLOBAL_LOCK
+#define RELEASE_MALLOC_GLOBAL_LOCK()  RELEASE_LOCK(&malloc_global_mutex);
+#endif
+#else  /* USE_LOCKS */
+#define ACQUIRE_MALLOC_GLOBAL_LOCK()
+#define RELEASE_MALLOC_GLOBAL_LOCK()
+#endif /* USE_LOCKS */
+
+
+/* -----------------------  Chunk representations ------------------------ */
+
+/*
+  (The following includes lightly edited explanations by Colin Plumb.)
+
+  The malloc_chunk declaration below is misleading (but accurate and
+  necessary).  It declares a "view" into memory allowing access to
+  necessary fields at known offsets from a given base.
+
+  Chunks of memory are maintained using a `boundary tag' method as
+  originally described by Knuth.  (See the paper by Paul Wilson
+  ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a survey of such
+  techniques.)  Sizes of free chunks are stored both in the front of
+  each chunk and at the end.  This makes consolidating fragmented
+  chunks into bigger chunks fast.  The head fields also hold bits
+  representing whether chunks are free or in use.
+
+  Here are some pictures to make it clearer.  They are "exploded" to
+  show that the state of a chunk can be thought of as extending from
+  the high 31 bits of the head field of its header through the
+  prev_foot and PINUSE_BIT bit of the following chunk header.
+
+  A chunk that's in use looks like:
+
+   chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+           | Size of previous chunk (if P = 0)                             |
+           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P|
+         | Size of this chunk                                         1| +-+
+   mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         |                                                               |
+         +-                                                             -+
+         |                                                               |
+         +-                                                             -+
+         |                                                               :
+         +-      size - sizeof(size_t) available payload bytes          -+
+         :                                                               |
+ chunk-> +-                                                             -+
+         |                                                               |
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |1|
+       | Size of next chunk (may or may not be in use)               | +-+
+ mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+    And if it's free, it looks like this:
+
+   chunk-> +-                                                             -+
+           | User payload (must be in use, or we would have merged!)       |
+           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P|
+         | Size of this chunk                                         0| +-+
+   mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         | Next pointer                                                  |
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         | Prev pointer                                                  |
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         |                                                               :
+         +-      size - sizeof(struct chunk) unused bytes               -+
+         :                                                               |
+ chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         | Size of this chunk                                            |
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |0|
+       | Size of next chunk (must be in use, or we would have merged)| +-+
+ mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       |                                                               :
+       +- User payload                                                -+
+       :                                                               |
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+                                                                     |0|
+                                                                     +-+
+  Note that since we always merge adjacent free chunks, the chunks
+  adjacent to a free chunk must be in use.
+
+  Given a pointer to a chunk (which can be derived trivially from the
+  payload pointer) we can, in O(1) time, find out whether the adjacent
+  chunks are free, and if so, unlink them from the lists that they
+  are on and merge them with the current chunk.
+
+  Chunks always begin on even word boundaries, so the mem portion
+  (which is returned to the user) is also on an even word boundary, and
+  thus at least double-word aligned.
+
+  The P (PINUSE_BIT) bit, stored in the unused low-order bit of the
+  chunk size (which is always a multiple of two words), is an in-use
+  bit for the *previous* chunk.  If that bit is *clear*, then the
+  word before the current chunk size contains the previous chunk
+  size, and can be used to find the front of the previous chunk.
+  The very first chunk allocated always has this bit set, preventing
+  access to non-existent (or non-owned) memory. If pinuse is set for
+  any given chunk, then you CANNOT determine the size of the
+  previous chunk, and might even get a memory addressing fault when
+  trying to do so.
+
+  The C (CINUSE_BIT) bit, stored in the unused second-lowest bit of
+  the chunk size redundantly records whether the current chunk is
+  inuse (unless the chunk is mmapped). This redundancy enables usage
+  checks within free and realloc, and reduces indirection when freeing
+  and consolidating chunks.
+
+  Each freshly allocated chunk must have both cinuse and pinuse set.
+  That is, each allocated chunk borders either a previously allocated
+  and still in-use chunk, or the base of its memory arena. This is
+  ensured by making all allocations from the the `lowest' part of any
+  found chunk.  Further, no free chunk physically borders another one,
+  so each free chunk is known to be preceded and followed by either
+  inuse chunks or the ends of memory.
+
+  Note that the `foot' of the current chunk is actually represented
+  as the prev_foot of the NEXT chunk. This makes it easier to
+  deal with alignments etc but can be very confusing when trying
+  to extend or adapt this code.
+
+  The exceptions to all this are
+
+     1. The special chunk `top' is the top-most available chunk (i.e.,
+        the one bordering the end of available memory). It is treated
+        specially.  Top is never included in any bin, is used only if
+        no other chunk is available, and is released back to the
+        system if it is very large (see M_TRIM_THRESHOLD).  In effect,
+        the top chunk is treated as larger (and thus less well
+        fitting) than any other available chunk.  The top chunk
+        doesn't update its trailing size field since there is no next
+        contiguous chunk that would have to index off it. However,
+        space is still allocated for it (TOP_FOOT_SIZE) to enable
+        separation or merging when space is extended.
+
+     3. Chunks allocated via mmap, have both cinuse and pinuse bits
+        cleared in their head fields.  Because they are allocated
+        one-by-one, each must carry its own prev_foot field, which is
+        also used to hold the offset this chunk has within its mmapped
+        region, which is needed to preserve alignment. Each mmapped
+        chunk is trailed by the first two fields of a fake next-chunk
+        for sake of usage checks.
+
+*/
+
+struct malloc_chunk {
+  size_t               prev_foot;  /* Size of previous chunk (if free).  */
+  size_t               head;       /* Size and inuse bits. */
+  struct malloc_chunk* fd;         /* double links -- used only if free. */
+  struct malloc_chunk* bk;
+};
+
+typedef struct malloc_chunk  mchunk;
+typedef struct malloc_chunk* mchunkptr;
+typedef struct malloc_chunk* sbinptr;  /* The type of bins of chunks */
+typedef unsigned int bindex_t;         /* Described below */
+typedef unsigned int binmap_t;         /* Described below */
+
+/* ------------------- Chunks sizes and alignments ----------------------- */
+
+#define MCHUNK_SIZE         (sizeof(mchunk))
+
+#if FOOTERS
+#define CHUNK_OVERHEAD      (TWO_SIZE_T_SIZES)
+#else /* FOOTERS */
+#define CHUNK_OVERHEAD      (SIZE_T_SIZE)
+#endif /* FOOTERS */
+
+/* MMapped chunks need a second word of overhead ... */
+#define MMAP_CHUNK_OVERHEAD (TWO_SIZE_T_SIZES)
+/* ... and additional padding for fake next-chunk at foot */
+#define MMAP_FOOT_PAD       (FOUR_SIZE_T_SIZES)
+
+/* The smallest size we can malloc is an aligned minimal chunk */
+#define MIN_CHUNK_SIZE\
+  ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
+
+/* conversion from malloc headers to user pointers, and back */
+#define chunk2mem(p)        ((void*)((char*)(p)       + TWO_SIZE_T_SIZES))
+#define mem2chunk(mem)      ((mchunkptr)((char*)(mem) - TWO_SIZE_T_SIZES))
+/* chunk associated with aligned address A */
+#define align_as_chunk(A)   (mchunkptr)((A) + align_offset(chunk2mem(A)))
+
+/* Bounds on request (not chunk) sizes. */
+#define MAX_REQUEST         ((-MIN_CHUNK_SIZE) << 2)
+#define MIN_REQUEST         (MIN_CHUNK_SIZE - CHUNK_OVERHEAD - SIZE_T_ONE)
+
+/* pad request bytes into a usable size */
+#define pad_request(req) \
+   (((req) + CHUNK_OVERHEAD + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
+
+/* pad request, checking for minimum (but not maximum) */
+#define request2size(req) \
+  (((req) < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(req))
+
+
+/* ------------------ Operations on head and foot fields ----------------- */
+
+/*
+  The head field of a chunk is or'ed with PINUSE_BIT when previous
+  adjacent chunk in use, and or'ed with CINUSE_BIT if this chunk is in
+  use, unless mmapped, in which case both bits are cleared.
+
+  FLAG4_BIT is not used by this malloc, but might be useful in extensions.
+*/
+
+#define PINUSE_BIT          (SIZE_T_ONE)
+#define CINUSE_BIT          (SIZE_T_TWO)
+#define FLAG4_BIT           (SIZE_T_FOUR)
+#define INUSE_BITS          (PINUSE_BIT|CINUSE_BIT)
+#define FLAG_BITS           (PINUSE_BIT|CINUSE_BIT|FLAG4_BIT)
+
+/* Head value for fenceposts */
+#define FENCEPOST_HEAD      (INUSE_BITS|SIZE_T_SIZE)
+
+/* extraction of fields from head words */
+#define cinuse(p)           ((p)->head & CINUSE_BIT)
+#define pinuse(p)           ((p)->head & PINUSE_BIT)
+#define is_inuse(p)         (((p)->head & INUSE_BITS) != PINUSE_BIT)
+#define is_mmapped(p)       (((p)->head & INUSE_BITS) == 0)
+
+#define chunksize(p)        ((p)->head & ~(FLAG_BITS))
+
+#define clear_pinuse(p)     ((p)->head &= ~PINUSE_BIT)
+
+/* Treat space at ptr +/- offset as a chunk */
+#define chunk_plus_offset(p, s)  ((mchunkptr)(((char*)(p)) + (s)))
+#define chunk_minus_offset(p, s) ((mchunkptr)(((char*)(p)) - (s)))
+
+/* Ptr to next or previous physical malloc_chunk. */
+#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->head & ~FLAG_BITS)))
+#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_foot) ))
+
+/* extract next chunk's pinuse bit */
+#define next_pinuse(p)  ((next_chunk(p)->head) & PINUSE_BIT)
+
+/* Get/set size at footer */
+#define get_foot(p, s)  (((mchunkptr)((char*)(p) + (s)))->prev_foot)
+#define set_foot(p, s)  (((mchunkptr)((char*)(p) + (s)))->prev_foot = (s))
+
+/* Set size, pinuse bit, and foot */
+#define set_size_and_pinuse_of_free_chunk(p, s)\
+  ((p)->head = (s|PINUSE_BIT), set_foot(p, s))
+
+/* Set size, pinuse bit, foot, and clear next pinuse */
+#define set_free_with_pinuse(p, s, n)\
+  (clear_pinuse(n), set_size_and_pinuse_of_free_chunk(p, s))
+
+/* Get the internal overhead associated with chunk p */
+#define overhead_for(p)\
+ (is_mmapped(p)? MMAP_CHUNK_OVERHEAD : CHUNK_OVERHEAD)
+
+/* Return true if malloced space is not necessarily cleared */
+#if MMAP_CLEARS
+#define calloc_must_clear(p) (!is_mmapped(p))
+#else /* MMAP_CLEARS */
+#define calloc_must_clear(p) (1)
+#endif /* MMAP_CLEARS */
+
+/* ---------------------- Overlaid data structures ----------------------- */
+
+/*
+  When chunks are not in use, they are treated as nodes of either
+  lists or trees.
+
+  "Small"  chunks are stored in circular doubly-linked lists, and look
+  like this:
+
+    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Size of previous chunk                            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `head:' |             Size of chunk, in bytes                         |P|
+      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Forward pointer to next chunk in list             |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Back pointer to previous chunk in list            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Unused space (may be 0 bytes long)                .
+            .                                                               .
+            .                                                               |
+nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `foot:' |             Size of chunk, in bytes                           |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+  Larger chunks are kept in a form of bitwise digital trees (aka
+  tries) keyed on chunksizes.  Because malloc_tree_chunks are only for
+  free chunks greater than 256 bytes, their size doesn't impose any
+  constraints on user chunk sizes.  Each node looks like:
+
+    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Size of previous chunk                            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `head:' |             Size of chunk, in bytes                         |P|
+      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Forward pointer to next chunk of same size        |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Back pointer to previous chunk of same size       |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Pointer to left child (child[0])                  |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Pointer to right child (child[1])                 |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Pointer to parent                                 |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             bin index of this chunk                           |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Unused space                                      .
+            .                                                               |
+nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `foot:' |             Size of chunk, in bytes                           |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+  Each tree holding treenodes is a tree of unique chunk sizes.  Chunks
+  of the same size are arranged in a circularly-linked list, with only
+  the oldest chunk (the next to be used, in our FIFO ordering)
+  actually in the tree.  (Tree members are distinguished by a non-null
+  parent pointer.)  If a chunk with the same size an an existing node
+  is inserted, it is linked off the existing node using pointers that
+  work in the same way as fd/bk pointers of small chunks.
+
+  Each tree contains a power of 2 sized range of chunk sizes (the
+  smallest is 0x100 <= x < 0x180), which is is divided in half at each
+  tree level, with the chunks in the smaller half of the range (0x100
+  <= x < 0x140 for the top nose) in the left subtree and the larger
+  half (0x140 <= x < 0x180) in the right subtree.  This is, of course,
+  done by inspecting individual bits.
+
+  Using these rules, each node's left subtree contains all smaller
+  sizes than its right subtree.  However, the node at the root of each
+  subtree has no particular ordering relationship to either.  (The
+  dividing line between the subtree sizes is based on trie relation.)
+  If we remove the last chunk of a given size from the interior of the
+  tree, we need to replace it with a leaf node.  The tree ordering
+  rules permit a node to be replaced by any leaf below it.
+
+  The smallest chunk in a tree (a common operation in a best-fit
+  allocator) can be found by walking a path to the leftmost leaf in
+  the tree.  Unlike a usual binary tree, where we follow left child
+  pointers until we reach a null, here we follow the right child
+  pointer any time the left one is null, until we reach a leaf with
+  both child pointers null. The smallest chunk in the tree will be
+  somewhere along that path.
+
+  The worst case number of steps to add, find, or remove a node is
+  bounded by the number of bits differentiating chunks within
+  bins. Under current bin calculations, this ranges from 6 up to 21
+  (for 32 bit sizes) or up to 53 (for 64 bit sizes). The typical case
+  is of course much better.
+*/
+
+struct malloc_tree_chunk {
+  /* The first four fields must be compatible with malloc_chunk */
+  size_t                    prev_foot;
+  size_t                    head;
+  struct malloc_tree_chunk* fd;
+  struct malloc_tree_chunk* bk;
+
+  struct malloc_tree_chunk* child[2];
+  struct malloc_tree_chunk* parent;
+  bindex_t                  index;
+};
+
+typedef struct malloc_tree_chunk  tchunk;
+typedef struct malloc_tree_chunk* tchunkptr;
+typedef struct malloc_tree_chunk* tbinptr; /* The type of bins of trees */
+
+/* A little helper macro for trees */
+#define leftmost_child(t) ((t)->child[0] != 0? (t)->child[0] : (t)->child[1])
+
+/* ----------------------------- Segments -------------------------------- */
+
+/*
+  Each malloc space may include non-contiguous segments, held in a
+  list headed by an embedded malloc_segment record representing the
+  top-most space. Segments also include flags holding properties of
+  the space. Large chunks that are directly allocated by mmap are not
+  included in this list. They are instead independently created and
+  destroyed without otherwise keeping track of them.
+
+  Segment management mainly comes into play for spaces allocated by
+  MMAP.  Any call to MMAP might or might not return memory that is
+  adjacent to an existing segment.  MORECORE normally contiguously
+  extends the current space, so this space is almost always adjacent,
+  which is simpler and faster to deal with. (This is why MORECORE is
+  used preferentially to MMAP when both are available -- see
+  sys_alloc.)  When allocating using MMAP, we don't use any of the
+  hinting mechanisms (inconsistently) supported in various
+  implementations of unix mmap, or distinguish reserving from
+  committing memory. Instead, we just ask for space, and exploit
+  contiguity when we get it.  It is probably possible to do
+  better than this on some systems, but no general scheme seems
+  to be significantly better.
+
+  Management entails a simpler variant of the consolidation scheme
+  used for chunks to reduce fragmentation -- new adjacent memory is
+  normally prepended or appended to an existing segment. However,
+  there are limitations compared to chunk consolidation that mostly
+  reflect the fact that segment processing is relatively infrequent
+  (occurring only when getting memory from system) and that we
+  don't expect to have huge numbers of segments:
+
+  * Segments are not indexed, so traversal requires linear scans.  (It
+    would be possible to index these, but is not worth the extra
+    overhead and complexity for most programs on most platforms.)
+  * New segments are only appended to old ones when holding top-most
+    memory; if they cannot be prepended to others, they are held in
+    different segments.
+
+  Except for the top-most segment of an mstate, each segment record
+  is kept at the tail of its segment. Segments are added by pushing
+  segment records onto the list headed by &mstate.seg for the
+  containing mstate.
+
+  Segment flags control allocation/merge/deallocation policies:
+  * If EXTERN_BIT set, then we did not allocate this segment,
+    and so should not try to deallocate or merge with others.
+    (This currently holds only for the initial segment passed
+    into create_mspace_with_base.)
+  * If USE_MMAP_BIT set, the segment may be merged with
+    other surrounding mmapped segments and trimmed/de-allocated
+    using munmap.
+  * If neither bit is set, then the segment was obtained using
+    MORECORE so can be merged with surrounding MORECORE'd segments
+    and deallocated/trimmed using MORECORE with negative arguments.
+*/
+
+struct malloc_segment {
+  char*        base;             /* base address */
+  size_t       size;             /* allocated size */
+  struct malloc_segment* next;   /* ptr to next segment */
+  flag_t       sflags;           /* mmap and extern flag */
+};
+
+#define is_mmapped_segment(S)  ((S)->sflags & USE_MMAP_BIT)
+#define is_extern_segment(S)   ((S)->sflags & EXTERN_BIT)
+
+typedef struct malloc_segment  msegment;
+typedef struct malloc_segment* msegmentptr;
+
+/* ---------------------------- malloc_state ----------------------------- */
+
+/*
+   A malloc_state holds all of the bookkeeping for a space.
+   The main fields are:
+
+  Top
+    The topmost chunk of the currently active segment. Its size is
+    cached in topsize.  The actual size of topmost space is
+    topsize+TOP_FOOT_SIZE, which includes space reserved for adding
+    fenceposts and segment records if necessary when getting more
+    space from the system.  The size at which to autotrim top is
+    cached from mparams in trim_check, except that it is disabled if
+    an autotrim fails.
+
+  Designated victim (dv)
+    This is the preferred chunk for servicing small requests that
+    don't have exact fits.  It is normally the chunk split off most
+    recently to service another small request.  Its size is cached in
+    dvsize. The link fields of this chunk are not maintained since it
+    is not kept in a bin.
+
+  SmallBins
+    An array of bin headers for free chunks.  These bins hold chunks
+    with sizes less than MIN_LARGE_SIZE bytes. Each bin contains
+    chunks of all the same size, spaced 8 bytes apart.  To simplify
+    use in double-linked lists, each bin header acts as a malloc_chunk
+    pointing to the real first node, if it exists (else pointing to
+    itself).  This avoids special-casing for headers.  But to avoid
+    waste, we allocate only the fd/bk pointers of bins, and then use
+    repositioning tricks to treat these as the fields of a chunk.
+
+  TreeBins
+    Treebins are pointers to the roots of trees holding a range of
+    sizes. There are 2 equally spaced treebins for each power of two
+    from TREE_SHIFT to TREE_SHIFT+16. The last bin holds anything
+    larger.
+
+  Bin maps
+    There is one bit map for small bins ("smallmap") and one for
+    treebins ("treemap).  Each bin sets its bit when non-empty, and
+    clears the bit when empty.  Bit operations are then used to avoid
+    bin-by-bin searching -- nearly all "search" is done without ever
+    looking at bins that won't be selected.  The bit maps
+    conservatively use 32 bits per map word, even if on 64bit system.
+    For a good description of some of the bit-based techniques used
+    here, see Henry S. Warren Jr's book "Hacker's Delight" (and
+    supplement at http://hackersdelight.org/). Many of these are
+    intended to reduce the branchiness of paths through malloc etc, as
+    well as to reduce the number of memory locations read or written.
+
+  Segments
+    A list of segments headed by an embedded malloc_segment record
+    representing the initial space.
+
+  Address check support
+    The least_addr field is the least address ever obtained from
+    MORECORE or MMAP. Attempted frees and reallocs of any address less
+    than this are trapped (unless INSECURE is defined).
+
+  Magic tag
+    A cross-check field that should always hold same value as mparams.magic.
+
+  Flags
+    Bits recording whether to use MMAP, locks, or contiguous MORECORE
+
+  Statistics
+    Each space keeps track of current and maximum system memory
+    obtained via MORECORE or MMAP.
+
+  Trim support
+    Fields holding the amount of unused topmost memory that should trigger
+    timming, and a counter to force periodic scanning to release unused
+    non-topmost segments.
+
+  Locking
+    If USE_LOCKS is defined, the "mutex" lock is acquired and released
+    around every public call using this mspace.
+
+  Extension support
+    A void* pointer and a size_t field that can be used to help implement
+    extensions to this malloc.
+*/
+
+/* Bin types, widths and sizes */
+#define NSMALLBINS        (32U)
+#define NTREEBINS         (32U)
+#define SMALLBIN_SHIFT    (3U)
+#define SMALLBIN_WIDTH    (SIZE_T_ONE << SMALLBIN_SHIFT)
+#define TREEBIN_SHIFT     (8U)
+#define MIN_LARGE_SIZE    (SIZE_T_ONE << TREEBIN_SHIFT)
+#define MAX_SMALL_SIZE    (MIN_LARGE_SIZE - SIZE_T_ONE)
+#define MAX_SMALL_REQUEST (MAX_SMALL_SIZE - CHUNK_ALIGN_MASK - CHUNK_OVERHEAD)
+
+struct malloc_state {
+  binmap_t   smallmap;
+  binmap_t   treemap;
+  size_t     dvsize;
+  size_t     topsize;
+  char*      least_addr;
+  mchunkptr  dv;
+  mchunkptr  top;
+  size_t     trim_check;
+  size_t     release_checks;
+  size_t     magic;
+  mchunkptr  smallbins[(NSMALLBINS+1)*2];
+  tbinptr    treebins[NTREEBINS];
+  size_t     footprint;
+  size_t     max_footprint;
+  flag_t     mflags;
+  msegment   seg;
+#if USE_LOCKS
+  MLOCK_T    mutex;     /* locate lock among fields that rarely change */
+#endif /* USE_LOCKS */
+  void*      extp;      /* Unused but available for extensions */
+  size_t     exts;
+};
+
+typedef struct malloc_state*    mstate;
+
+/* ------------- Global malloc_state and malloc_params ------------------- */
+
+#if !ONLY_MSPACES
+
+/* The global malloc_state used for all non-"mspace" calls */
+static struct malloc_state _gm_;
+#define gm                 (&_gm_)
+#define is_global(M)       ((M) == &_gm_)
+
+#endif /* !ONLY_MSPACES */
+
+#define is_initialized(M)  ((M)->top != 0)
+
+/* -------------------------- system alloc setup ------------------------- */
+
+/* Operations on mflags */
+
+#define use_lock(M)           ((M)->mflags &   USE_LOCK_BIT)
+#define enable_lock(M)        ((M)->mflags |=  USE_LOCK_BIT)
+#define disable_lock(M)       ((M)->mflags &= ~USE_LOCK_BIT)
+
+#define use_mmap(M)           ((M)->mflags &   USE_MMAP_BIT)
+#define enable_mmap(M)        ((M)->mflags |=  USE_MMAP_BIT)
+#define disable_mmap(M)       ((M)->mflags &= ~USE_MMAP_BIT)
+
+#define use_noncontiguous(M)  ((M)->mflags &   USE_NONCONTIGUOUS_BIT)
+#define disable_contiguous(M) ((M)->mflags |=  USE_NONCONTIGUOUS_BIT)
+
+#define set_lock(M,L)\
+ ((M)->mflags = (L)?\
+  ((M)->mflags | USE_LOCK_BIT) :\
+  ((M)->mflags & ~USE_LOCK_BIT))
+
+/* page-align a size */
+#define page_align(S)\
+ (((S) + (mparams.page_size - SIZE_T_ONE)) & ~(mparams.page_size - SIZE_T_ONE))
+
+/* granularity-align a size */
+#define granularity_align(S)\
+  (((S) + (mparams.granularity - SIZE_T_ONE))\
+   & ~(mparams.granularity - SIZE_T_ONE))
+
+
+/* For mmap, use granularity alignment on windows, else page-align */
+#ifdef WIN32
+#define mmap_align(S) granularity_align(S)
+#else
+#define mmap_align(S) page_align(S)
+#endif
+
+/* For sys_alloc, enough padding to ensure can malloc request on success */
+#define SYS_ALLOC_PADDING (TOP_FOOT_SIZE + MALLOC_ALIGNMENT)
+
+#define is_page_aligned(S)\
+   (((size_t)(S) & (mparams.page_size - SIZE_T_ONE)) == 0)
+#define is_granularity_aligned(S)\
+   (((size_t)(S) & (mparams.granularity - SIZE_T_ONE)) == 0)
+
+/*  True if segment S holds address A */
+#define segment_holds(S, A)\
+  ((char*)(A) >= S->base && (char*)(A) < S->base + S->size)
+
+/* Return segment holding given address */
+static msegmentptr segment_holding(mstate m, char* addr) {
+  msegmentptr sp = &m->seg;
+  for (;;) {
+    if (addr >= sp->base && addr < sp->base + sp->size)
+      return sp;
+    if ((sp = sp->next) == 0)
+      return 0;
+  }
+}
+
+/* Return true if segment contains a segment link */
+static int has_segment_link(mstate m, msegmentptr ss) {
+  msegmentptr sp = &m->seg;
+  for (;;) {
+    if ((char*)sp >= ss->base && (char*)sp < ss->base + ss->size)
+      return 1;
+    if ((sp = sp->next) == 0)
+      return 0;
+  }
+}
+
+#ifndef MORECORE_CANNOT_TRIM
+#define should_trim(M,s)  ((s) > (M)->trim_check)
+#else  /* MORECORE_CANNOT_TRIM */
+#define should_trim(M,s)  (0)
+#endif /* MORECORE_CANNOT_TRIM */
+
+/*
+  TOP_FOOT_SIZE is padding at the end of a segment, including space
+  that may be needed to place segment records and fenceposts when new
+  noncontiguous segments are added.
+*/
+#define TOP_FOOT_SIZE\
+  (align_offset(chunk2mem(0))+pad_request(sizeof(struct malloc_segment))+MIN_CHUNK_SIZE)
+
+
+/* -------------------------------  Hooks -------------------------------- */
+
+/*
+  PREACTION should be defined to return 0 on success, and nonzero on
+  failure. If you are not using locking, you can redefine these to do
+  anything you like.
+*/
+
+#if USE_LOCKS
+
+#define PREACTION(M)  ((use_lock(M))? ACQUIRE_LOCK(&(M)->mutex) : 0)
+#define POSTACTION(M) { if (use_lock(M)) RELEASE_LOCK(&(M)->mutex); }
+#else /* USE_LOCKS */
+
+#ifndef PREACTION
+#define PREACTION(M) (0)
+#endif  /* PREACTION */
+
+#ifndef POSTACTION
+#define POSTACTION(M)
+#endif  /* POSTACTION */
+
+#endif /* USE_LOCKS */
+
+/*
+  CORRUPTION_ERROR_ACTION is triggered upon detected bad addresses.
+  USAGE_ERROR_ACTION is triggered on detected bad frees and
+  reallocs. The argument p is an address that might have triggered the
+  fault. It is ignored by the two predefined actions, but might be
+  useful in custom actions that try to help diagnose errors.
+*/
+
+#if PROCEED_ON_ERROR
+
+/* A count of the number of corruption errors causing resets */
+int malloc_corruption_error_count;
+
+/* default corruption action */
+static void reset_on_error(mstate m);
+
+#define CORRUPTION_ERROR_ACTION(m)  reset_on_error(m)
+#define USAGE_ERROR_ACTION(m, p)
+
+#else /* PROCEED_ON_ERROR */
+
+#ifndef CORRUPTION_ERROR_ACTION
+#define CORRUPTION_ERROR_ACTION(m) ABORT
+#endif /* CORRUPTION_ERROR_ACTION */
+
+#ifndef USAGE_ERROR_ACTION
+#define USAGE_ERROR_ACTION(m,p) ABORT
+#endif /* USAGE_ERROR_ACTION */
+
+#endif /* PROCEED_ON_ERROR */
+
+/* -------------------------- Debugging setup ---------------------------- */
+
+#if ! DEBUG
+
+#define check_free_chunk(M,P)
+#define check_inuse_chunk(M,P)
+#define check_malloced_chunk(M,P,N)
+#define check_mmapped_chunk(M,P)
+#define check_malloc_state(M)
+#define check_top_chunk(M,P)
+
+#else /* DEBUG */
+#define check_free_chunk(M,P)       do_check_free_chunk(M,P)
+#define check_inuse_chunk(M,P)      do_check_inuse_chunk(M,P)
+#define check_top_chunk(M,P)        do_check_top_chunk(M,P)
+#define check_malloced_chunk(M,P,N) do_check_malloced_chunk(M,P,N)
+#define check_mmapped_chunk(M,P)    do_check_mmapped_chunk(M,P)
+#define check_malloc_state(M)       do_check_malloc_state(M)
+
+static void   do_check_any_chunk(mstate m, mchunkptr p);
+static void   do_check_top_chunk(mstate m, mchunkptr p);
+static void   do_check_mmapped_chunk(mstate m, mchunkptr p);
+static void   do_check_inuse_chunk(mstate m, mchunkptr p);
+static void   do_check_free_chunk(mstate m, mchunkptr p);
+static void   do_check_malloced_chunk(mstate m, void* mem, size_t s);
+static void   do_check_tree(mstate m, tchunkptr t);
+static void   do_check_treebin(mstate m, bindex_t i);
+static void   do_check_smallbin(mstate m, bindex_t i);
+static void   do_check_malloc_state(mstate m);
+static int    bin_find(mstate m, mchunkptr x);
+static size_t traverse_and_check(mstate m);
+#endif /* DEBUG */
+
+/* ---------------------------- Indexing Bins ---------------------------- */
+
+#define is_small(s)         (((s) >> SMALLBIN_SHIFT) < NSMALLBINS)
+#define small_index(s)      (bindex_t)((s)  >> SMALLBIN_SHIFT)
+#define small_index2size(i) ((i)  << SMALLBIN_SHIFT)
+#define MIN_SMALL_INDEX     (small_index(MIN_CHUNK_SIZE))
+
+/* addressing by index. See above about smallbin repositioning */
+#define smallbin_at(M, i)   ((sbinptr)((char*)&((M)->smallbins[(i)<<1])))
+#define treebin_at(M,i)     (&((M)->treebins[i]))
+
+/* assign tree index for size S to variable I. Use x86 asm if possible  */
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define compute_tree_index(S, I)\
+{\
+  unsigned int X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int K;\
+    __asm__("bsrl\t%1, %0\n\t" : "=r" (K) : "g"  (X));\
+    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
+  }\
+}
+
+#elif defined (__INTEL_COMPILER)
+#define compute_tree_index(S, I)\
+{\
+  size_t X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int K = _bit_scan_reverse (X); \
+    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
+  }\
+}
+
+#elif defined(_MSC_VER) && _MSC_VER>=1300
+#define compute_tree_index(S, I)\
+{\
+  size_t X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int K;\
+    _BitScanReverse((DWORD *) &K, (DWORD) X);\
+    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
+  }\
+}
+
+#else /* GNUC */
+#define compute_tree_index(S, I)\
+{\
+  size_t X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int Y = (unsigned int)X;\
+    unsigned int N = ((Y - 0x100) >> 16) & 8;\
+    unsigned int K = (((Y <<= N) - 0x1000) >> 16) & 4;\
+    N += K;\
+    N += K = (((Y <<= K) - 0x4000) >> 16) & 2;\
+    K = 14 - N + ((Y <<= K) >> 15);\
+    I = (K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1));\
+  }\
+}
+#endif /* GNUC */
+
+/* Bit representing maximum resolved size in a treebin at i */
+#define bit_for_tree_index(i) \
+   (i == NTREEBINS-1)? (SIZE_T_BITSIZE-1) : (((i) >> 1) + TREEBIN_SHIFT - 2)
+
+/* Shift placing maximum resolved bit in a treebin at i as sign bit */
+#define leftshift_for_tree_index(i) \
+   ((i == NTREEBINS-1)? 0 : \
+    ((SIZE_T_BITSIZE-SIZE_T_ONE) - (((i) >> 1) + TREEBIN_SHIFT - 2)))
+
+/* The size of the smallest chunk held in bin with index i */
+#define minsize_for_tree_index(i) \
+   ((SIZE_T_ONE << (((i) >> 1) + TREEBIN_SHIFT)) |  \
+   (((size_t)((i) & SIZE_T_ONE)) << (((i) >> 1) + TREEBIN_SHIFT - 1)))
+
+
+/* ------------------------ Operations on bin maps ----------------------- */
+
+/* bit corresponding to given index */
+#define idx2bit(i)              ((binmap_t)(1) << (i))
+
+/* Mark/Clear bits with given index */
+#define mark_smallmap(M,i)      ((M)->smallmap |=  idx2bit(i))
+#define clear_smallmap(M,i)     ((M)->smallmap &= ~idx2bit(i))
+#define smallmap_is_marked(M,i) ((M)->smallmap &   idx2bit(i))
+
+#define mark_treemap(M,i)       ((M)->treemap  |=  idx2bit(i))
+#define clear_treemap(M,i)      ((M)->treemap  &= ~idx2bit(i))
+#define treemap_is_marked(M,i)  ((M)->treemap  &   idx2bit(i))
+
+/* isolate the least set bit of a bitmap */
+#define least_bit(x)         ((x) & -(x))
+
+/* mask with all bits to left of least bit of x on */
+#define left_bits(x)         ((x<<1) | -(x<<1))
+
+/* mask with all bits to left of or equal to least bit of x on */
+#define same_or_left_bits(x) ((x) | -(x))
+
+/* index corresponding to given bit. Use x86 asm if possible */
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int J;\
+  __asm__("bsfl\t%1, %0\n\t" : "=r" (J) : "g" (X));\
+  I = (bindex_t)J;\
+}
+
+#elif defined (__INTEL_COMPILER)
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int J;\
+  J = _bit_scan_forward (X); \
+  I = (bindex_t)J;\
+}
+
+#elif defined(_MSC_VER) && _MSC_VER>=1300
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int J;\
+  _BitScanForward((DWORD *) &J, X);\
+  I = (bindex_t)J;\
+}
+
+#elif USE_BUILTIN_FFS
+#define compute_bit2idx(X, I) I = ffs(X)-1
+
+#else
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int Y = X - 1;\
+  unsigned int K = Y >> (16-4) & 16;\
+  unsigned int N = K;        Y >>= K;\
+  N += K = Y >> (8-3) &  8;  Y >>= K;\
+  N += K = Y >> (4-2) &  4;  Y >>= K;\
+  N += K = Y >> (2-1) &  2;  Y >>= K;\
+  N += K = Y >> (1-0) &  1;  Y >>= K;\
+  I = (bindex_t)(N + Y);\
+}
+#endif /* GNUC */
+
+
+/* ----------------------- Runtime Check Support ------------------------- */
+
+/*
+  For security, the main invariant is that malloc/free/etc never
+  writes to a static address other than malloc_state, unless static
+  malloc_state itself has been corrupted, which cannot occur via
+  malloc (because of these checks). In essence this means that we
+  believe all pointers, sizes, maps etc held in malloc_state, but
+  check all of those linked or offsetted from other embedded data
+  structures.  These checks are interspersed with main code in a way
+  that tends to minimize their run-time cost.
+
+  When FOOTERS is defined, in addition to range checking, we also
+  verify footer fields of inuse chunks, which can be used guarantee
+  that the mstate controlling malloc/free is intact.  This is a
+  streamlined version of the approach described by William Robertson
+  et al in "Run-time Detection of Heap-based Overflows" LISA'03
+  http://www.usenix.org/events/lisa03/tech/robertson.html The footer
+  of an inuse chunk holds the xor of its mstate and a random seed,
+  that is checked upon calls to free() and realloc().  This is
+  (probablistically) unguessable from outside the program, but can be
+  computed by any code successfully malloc'ing any chunk, so does not
+  itself provide protection against code that has already broken
+  security through some other means.  Unlike Robertson et al, we
+  always dynamically check addresses of all offset chunks (previous,
+  next, etc). This turns out to be cheaper than relying on hashes.
+*/
+
+#if !INSECURE
+/* Check if address a is at least as high as any from MORECORE or MMAP */
+#define ok_address(M, a) ((char*)(a) >= (M)->least_addr)
+/* Check if address of next chunk n is higher than base chunk p */
+#define ok_next(p, n)    ((char*)(p) < (char*)(n))
+/* Check if p has inuse status */
+#define ok_inuse(p)     is_inuse(p)
+/* Check if p has its pinuse bit on */
+#define ok_pinuse(p)     pinuse(p)
+
+#else /* !INSECURE */
+#define ok_address(M, a) (1)
+#define ok_next(b, n)    (1)
+#define ok_inuse(p)      (1)
+#define ok_pinuse(p)     (1)
+#endif /* !INSECURE */
+
+#if (FOOTERS && !INSECURE)
+/* Check if (alleged) mstate m has expected magic field */
+#define ok_magic(M)      ((M)->magic == mparams.magic)
+#else  /* (FOOTERS && !INSECURE) */
+#define ok_magic(M)      (1)
+#endif /* (FOOTERS && !INSECURE) */
+
+
+/* In gcc, use __builtin_expect to minimize impact of checks */
+#if !INSECURE
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define RTCHECK(e)  __builtin_expect(e, 1)
+#else /* GNUC */
+#define RTCHECK(e)  (e)
+#endif /* GNUC */
+#else /* !INSECURE */
+#define RTCHECK(e)  (1)
+#endif /* !INSECURE */
+
+/* macros to set up inuse chunks with or without footers */
+
+#if !FOOTERS
+
+#define mark_inuse_foot(M,p,s)
+
+/* Macros for setting head/foot of non-mmapped chunks */
+
+/* Set cinuse bit and pinuse bit of next chunk */
+#define set_inuse(M,p,s)\
+  ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\
+  ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT)
+
+/* Set cinuse and pinuse of this chunk and pinuse of next chunk */
+#define set_inuse_and_pinuse(M,p,s)\
+  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
+  ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT)
+
+/* Set size, cinuse and pinuse bit of this chunk */
+#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
+  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT))
+
+#else /* FOOTERS */
+
+/* Set foot of inuse chunk to be xor of mstate and seed */
+#define mark_inuse_foot(M,p,s)\
+  (((mchunkptr)((char*)(p) + (s)))->prev_foot = ((size_t)(M) ^ mparams.magic))
+
+#define get_mstate_for(p)\
+  ((mstate)(((mchunkptr)((char*)(p) +\
+    (chunksize(p))))->prev_foot ^ mparams.magic))
+
+#define set_inuse(M,p,s)\
+  ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\
+  (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT), \
+  mark_inuse_foot(M,p,s))
+
+#define set_inuse_and_pinuse(M,p,s)\
+  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
+  (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT),\
+ mark_inuse_foot(M,p,s))
+
+#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
+  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
+  mark_inuse_foot(M, p, s))
+
+#endif /* !FOOTERS */
+
+/* ---------------------------- setting mparams -------------------------- */
+
+#ifdef ENABLE_LARGE_PAGES
+typedef size_t (WINAPI *GetLargePageMinimum_t)(void);
+#endif
+
+/* Initialize mparams */
+static int init_mparams(void) {
+#ifdef NEED_GLOBAL_LOCK_INIT
+  if (malloc_global_mutex_status <= 0)
+    init_malloc_global_mutex();
+#endif
+
+  ACQUIRE_MALLOC_GLOBAL_LOCK();
+  if (mparams.magic == 0) {
+    size_t magic;
+    size_t psize;
+    size_t gsize;
+
+#ifndef WIN32
+    psize = malloc_getpagesize;
+    gsize = ((DEFAULT_GRANULARITY != 0)? DEFAULT_GRANULARITY : psize);
+#else /* WIN32 */
+    {
+      SYSTEM_INFO system_info;
+      GetSystemInfo(&system_info);
+      psize = system_info.dwPageSize;
+      gsize = ((DEFAULT_GRANULARITY != 0)?
+               DEFAULT_GRANULARITY : system_info.dwAllocationGranularity);
+#ifdef ENABLE_LARGE_PAGES
+      { 
+          GetLargePageMinimum_t GetLargePageMinimum_ = (GetLargePageMinimum_t) GetProcAddress(GetModuleHandle(__T("kernel32.dll")), "GetLargePageMinimum");
+          if(GetLargePageMinimum_) {
+              size_t largepagesize = GetLargePageMinimum_();
+              if(largepagesize) {
+                  psize = largepagesize;
+                  gsize = ((DEFAULT_GRANULARITY != 0)?
+                           DEFAULT_GRANULARITY : largepagesize);
+                  if(gsize < largepagesize) gsize = largepagesize;
+              }
+          }
+      }
+#endif
+    }
+#endif /* WIN32 */
+
+    /* Sanity-check configuration:
+       size_t must be unsigned and as wide as pointer type.
+       ints must be at least 4 bytes.
+       alignment must be at least 8.
+       Alignment, min chunk size, and page size must all be powers of 2.
+    */
+    if ((sizeof(size_t) != sizeof(char*)) ||
+        (MAX_SIZE_T < MIN_CHUNK_SIZE)  ||
+        (sizeof(int) < 4)  ||
+        (MALLOC_ALIGNMENT < (size_t)8U) ||
+        ((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-SIZE_T_ONE)) != 0) ||
+        ((MCHUNK_SIZE      & (MCHUNK_SIZE-SIZE_T_ONE))      != 0) ||
+        ((gsize            & (gsize-SIZE_T_ONE))            != 0) ||
+        ((psize            & (psize-SIZE_T_ONE))            != 0))
+      ABORT;
+
+    mparams.granularity = gsize;
+    mparams.page_size = psize;
+    mparams.mmap_threshold = DEFAULT_MMAP_THRESHOLD;
+    mparams.trim_threshold = DEFAULT_TRIM_THRESHOLD;
+#if MORECORE_CONTIGUOUS
+    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT;
+#else  /* MORECORE_CONTIGUOUS */
+    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT|USE_NONCONTIGUOUS_BIT;
+#endif /* MORECORE_CONTIGUOUS */
+
+#if !ONLY_MSPACES
+    /* Set up lock for main malloc area */
+    gm->mflags = mparams.default_mflags;
+    INITIAL_LOCK(&gm->mutex);
+#endif
+
+    {
+#if USE_DEV_RANDOM
+      int fd;
+      unsigned char buf[sizeof(size_t)];
+      /* Try to use /dev/urandom, else fall back on using time */
+      if ((fd = open("/dev/urandom", O_RDONLY)) >= 0 &&
+          read(fd, buf, sizeof(buf)) == sizeof(buf)) {
+        magic = *((size_t *) buf);
+        close(fd);
+      }
+      else
+#endif /* USE_DEV_RANDOM */
+#ifdef WIN32
+        magic = (size_t)(GetTickCount() ^ (size_t)0x55555555U);
+#else
+        magic = (size_t)(time(0) ^ (size_t)0x55555555U);
+#endif
+      magic |= (size_t)8U;    /* ensure nonzero */
+      magic &= ~(size_t)7U;   /* improve chances of fault for bad values */
+      mparams.magic = magic;
+    }
+  }
+
+  RELEASE_MALLOC_GLOBAL_LOCK();
+  return 1;
+}
+
+/* support for mallopt */
+static int change_mparam(int param_number, int value) {
+  size_t val;
+  ensure_initialization();
+  val = (value == -1)? MAX_SIZE_T : (size_t)value;
+  switch(param_number) {
+  case M_TRIM_THRESHOLD:
+    mparams.trim_threshold = val;
+    return 1;
+  case M_GRANULARITY:
+    if (val >= mparams.page_size && ((val & (val-1)) == 0)) {
+      mparams.granularity = val;
+      return 1;
+    }
+    else
+      return 0;
+  case M_MMAP_THRESHOLD:
+    mparams.mmap_threshold = val;
+    return 1;
+  default:
+    return 0;
+  }
+}
+
+#if DEBUG
+/* ------------------------- Debugging Support --------------------------- */
+
+/* Check properties of any chunk, whether free, inuse, mmapped etc  */
+static void do_check_any_chunk(mstate m, mchunkptr p) {
+  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
+  assert(ok_address(m, p));
+}
+
+/* Check properties of top chunk */
+static void do_check_top_chunk(mstate m, mchunkptr p) {
+  msegmentptr sp = segment_holding(m, (char*)p);
+  size_t  sz = p->head & ~INUSE_BITS; /* third-lowest bit can be set! */
+  assert(sp != 0);
+  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
+  assert(ok_address(m, p));
+  assert(sz == m->topsize);
+  assert(sz > 0);
+  assert(sz == ((sp->base + sp->size) - (char*)p) - TOP_FOOT_SIZE);
+  assert(pinuse(p));
+  assert(!pinuse(chunk_plus_offset(p, sz)));
+}
+
+/* Check properties of (inuse) mmapped chunks */
+static void do_check_mmapped_chunk(mstate m, mchunkptr p) {
+  size_t  sz = chunksize(p);
+  size_t len = (sz + (p->prev_foot) + MMAP_FOOT_PAD);
+  assert(is_mmapped(p));
+  assert(use_mmap(m));
+  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
+  assert(ok_address(m, p));
+  assert(!is_small(sz));
+  assert((len & (mparams.page_size-SIZE_T_ONE)) == 0);
+  assert(chunk_plus_offset(p, sz)->head == FENCEPOST_HEAD);
+  assert(chunk_plus_offset(p, sz+SIZE_T_SIZE)->head == 0);
+}
+
+/* Check properties of inuse chunks */
+static void do_check_inuse_chunk(mstate m, mchunkptr p) {
+  do_check_any_chunk(m, p);
+  assert(is_inuse(p));
+  assert(next_pinuse(p));
+  /* If not pinuse and not mmapped, previous chunk has OK offset */
+  assert(is_mmapped(p) || pinuse(p) || next_chunk(prev_chunk(p)) == p);
+  if (is_mmapped(p))
+    do_check_mmapped_chunk(m, p);
+}
+
+/* Check properties of free chunks */
+static void do_check_free_chunk(mstate m, mchunkptr p) {
+  size_t sz = chunksize(p);
+  mchunkptr next = chunk_plus_offset(p, sz);
+  do_check_any_chunk(m, p);
+  assert(!is_inuse(p));
+  assert(!next_pinuse(p));
+  assert (!is_mmapped(p));
+  if (p != m->dv && p != m->top) {
+    if (sz >= MIN_CHUNK_SIZE) {
+      assert((sz & CHUNK_ALIGN_MASK) == 0);
+      assert(is_aligned(chunk2mem(p)));
+      assert(next->prev_foot == sz);
+      assert(pinuse(p));
+      assert (next == m->top || is_inuse(next));
+      assert(p->fd->bk == p);
+      assert(p->bk->fd == p);
+    }
+    else  /* markers are always of size SIZE_T_SIZE */
+      assert(sz == SIZE_T_SIZE);
+  }
+}
+
+/* Check properties of malloced chunks at the point they are malloced */
+static void do_check_malloced_chunk(mstate m, void* mem, size_t s) {
+  if (mem != 0) {
+    mchunkptr p = mem2chunk(mem);
+    size_t sz = p->head & ~INUSE_BITS;
+    do_check_inuse_chunk(m, p);
+    assert((sz & CHUNK_ALIGN_MASK) == 0);
+    assert(sz >= MIN_CHUNK_SIZE);
+    assert(sz >= s);
+    /* unless mmapped, size is less than MIN_CHUNK_SIZE more than request */
+    assert(is_mmapped(p) || sz < (s + MIN_CHUNK_SIZE));
+  }
+}
+
+/* Check a tree and its subtrees.  */
+static void do_check_tree(mstate m, tchunkptr t) {
+  tchunkptr head = 0;
+  tchunkptr u = t;
+  bindex_t tindex = t->index;
+  size_t tsize = chunksize(t);
+  bindex_t idx;
+  compute_tree_index(tsize, idx);
+  assert(tindex == idx);
+  assert(tsize >= MIN_LARGE_SIZE);
+  assert(tsize >= minsize_for_tree_index(idx));
+  assert((idx == NTREEBINS-1) || (tsize < minsize_for_tree_index((idx+1))));
+
+  do { /* traverse through chain of same-sized nodes */
+    do_check_any_chunk(m, ((mchunkptr)u));
+    assert(u->index == tindex);
+    assert(chunksize(u) == tsize);
+    assert(!is_inuse(u));
+    assert(!next_pinuse(u));
+    assert(u->fd->bk == u);
+    assert(u->bk->fd == u);
+    if (u->parent == 0) {
+      assert(u->child[0] == 0);
+      assert(u->child[1] == 0);
+    }
+    else {
+      assert(head == 0); /* only one node on chain has parent */
+      head = u;
+      assert(u->parent != u);
+      assert (u->parent->child[0] == u ||
+              u->parent->child[1] == u ||
+              *((tbinptr*)(u->parent)) == u);
+      if (u->child[0] != 0) {
+        assert(u->child[0]->parent == u);
+        assert(u->child[0] != u);
+        do_check_tree(m, u->child[0]);
+      }
+      if (u->child[1] != 0) {
+        assert(u->child[1]->parent == u);
+        assert(u->child[1] != u);
+        do_check_tree(m, u->child[1]);
+      }
+      if (u->child[0] != 0 && u->child[1] != 0) {
+        assert(chunksize(u->child[0]) < chunksize(u->child[1]));
+      }
+    }
+    u = u->fd;
+  } while (u != t);
+  assert(head != 0);
+}
+
+/*  Check all the chunks in a treebin.  */
+static void do_check_treebin(mstate m, bindex_t i) {
+  tbinptr* tb = treebin_at(m, i);
+  tchunkptr t = *tb;
+  int empty = (m->treemap & (1U << i)) == 0;
+  if (t == 0)
+    assert(empty);
+  if (!empty)
+    do_check_tree(m, t);
+}
+
+/*  Check all the chunks in a smallbin.  */
+static void do_check_smallbin(mstate m, bindex_t i) {
+  sbinptr b = smallbin_at(m, i);
+  mchunkptr p = b->bk;
+  unsigned int empty = (m->smallmap & (1U << i)) == 0;
+  if (p == b)
+    assert(empty);
+  if (!empty) {
+    for (; p != b; p = p->bk) {
+      size_t size = chunksize(p);
+      mchunkptr q;
+      /* each chunk claims to be free */
+      do_check_free_chunk(m, p);
+      /* chunk belongs in bin */
+      assert(small_index(size) == i);
+      assert(p->bk == b || chunksize(p->bk) == chunksize(p));
+      /* chunk is followed by an inuse chunk */
+      q = next_chunk(p);
+      if (q->head != FENCEPOST_HEAD)
+        do_check_inuse_chunk(m, q);
+    }
+  }
+}
+
+/* Find x in a bin. Used in other check functions. */
+static int bin_find(mstate m, mchunkptr x) {
+  size_t size = chunksize(x);
+  if (is_small(size)) {
+    bindex_t sidx = small_index(size);
+    sbinptr b = smallbin_at(m, sidx);
+    if (smallmap_is_marked(m, sidx)) {
+      mchunkptr p = b;
+      do {
+        if (p == x)
+          return 1;
+      } while ((p = p->fd) != b);
+    }
+  }
+  else {
+    bindex_t tidx;
+    compute_tree_index(size, tidx);
+    if (treemap_is_marked(m, tidx)) {
+      tchunkptr t = *treebin_at(m, tidx);
+      size_t sizebits = size << leftshift_for_tree_index(tidx);
+      while (t != 0 && chunksize(t) != size) {
+        t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
+        sizebits <<= 1;
+      }
+      if (t != 0) {
+        tchunkptr u = t;
+        do {
+          if (u == (tchunkptr)x)
+            return 1;
+        } while ((u = u->fd) != t);
+      }
+    }
+  }
+  return 0;
+}
+
+/* Traverse each chunk and check it; return total */
+static size_t traverse_and_check(mstate m) {
+  size_t sum = 0;
+  if (is_initialized(m)) {
+    msegmentptr s = &m->seg;
+    sum += m->topsize + TOP_FOOT_SIZE;
+    while (s != 0) {
+      mchunkptr q = align_as_chunk(s->base);
+      mchunkptr lastq = 0;
+      assert(pinuse(q));
+      while (segment_holds(s, q) &&
+             q != m->top && q->head != FENCEPOST_HEAD) {
+        sum += chunksize(q);
+        if (is_inuse(q)) {
+          assert(!bin_find(m, q));
+          do_check_inuse_chunk(m, q);
+        }
+        else {
+          assert(q == m->dv || bin_find(m, q));
+          assert(lastq == 0 || is_inuse(lastq)); /* Not 2 consecutive free */
+          do_check_free_chunk(m, q);
+        }
+        lastq = q;
+        q = next_chunk(q);
+      }
+      s = s->next;
+    }
+  }
+  return sum;
+}
+
+/* Check all properties of malloc_state. */
+static void do_check_malloc_state(mstate m) {
+  bindex_t i;
+  size_t total;
+  /* check bins */
+  for (i = 0; i < NSMALLBINS; ++i)
+    do_check_smallbin(m, i);
+  for (i = 0; i < NTREEBINS; ++i)
+    do_check_treebin(m, i);
+
+  if (m->dvsize != 0) { /* check dv chunk */
+    do_check_any_chunk(m, m->dv);
+    assert(m->dvsize == chunksize(m->dv));
+    assert(m->dvsize >= MIN_CHUNK_SIZE);
+    assert(bin_find(m, m->dv) == 0);
+  }
+
+  if (m->top != 0) {   /* check top chunk */
+    do_check_top_chunk(m, m->top);
+    /*assert(m->topsize == chunksize(m->top)); redundant */
+    assert(m->topsize > 0);
+    assert(bin_find(m, m->top) == 0);
+  }
+
+  total = traverse_and_check(m);
+  assert(total <= m->footprint);
+  assert(m->footprint <= m->max_footprint);
+}
+#endif /* DEBUG */
+
+/* ----------------------------- statistics ------------------------------ */
+
+#if !NO_MALLINFO
+static struct mallinfo internal_mallinfo(mstate m) {
+  struct mallinfo nm = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  ensure_initialization();
+  if (!PREACTION(m)) {
+    check_malloc_state(m);
+    if (is_initialized(m)) {
+      size_t nfree = SIZE_T_ONE; /* top always free */
+      size_t mfree = m->topsize + TOP_FOOT_SIZE;
+      size_t sum = mfree;
+      msegmentptr s = &m->seg;
+      while (s != 0) {
+        mchunkptr q = align_as_chunk(s->base);
+        while (segment_holds(s, q) &&
+               q != m->top && q->head != FENCEPOST_HEAD) {
+          size_t sz = chunksize(q);
+          sum += sz;
+          if (!is_inuse(q)) {
+            mfree += sz;
+            ++nfree;
+          }
+          q = next_chunk(q);
+        }
+        s = s->next;
+      }
+
+      nm.arena    = sum;
+      nm.ordblks  = nfree;
+      nm.hblkhd   = m->footprint - sum;
+      nm.usmblks  = m->max_footprint;
+      nm.uordblks = m->footprint - mfree;
+      nm.fordblks = mfree;
+      nm.keepcost = m->topsize;
+    }
+
+    POSTACTION(m);
+  }
+  return nm;
+}
+#endif /* !NO_MALLINFO */
+
+static void internal_malloc_stats(mstate m) {
+  ensure_initialization();
+  if (!PREACTION(m)) {
+    size_t maxfp = 0;
+    size_t fp = 0;
+    size_t used = 0;
+    check_malloc_state(m);
+    if (is_initialized(m)) {
+      msegmentptr s = &m->seg;
+      maxfp = m->max_footprint;
+      fp = m->footprint;
+      used = fp - (m->topsize + TOP_FOOT_SIZE);
+
+      while (s != 0) {
+        mchunkptr q = align_as_chunk(s->base);
+        while (segment_holds(s, q) &&
+               q != m->top && q->head != FENCEPOST_HEAD) {
+          if (!is_inuse(q))
+            used -= chunksize(q);
+          q = next_chunk(q);
+        }
+        s = s->next;
+      }
+    }
+
+    fprintf(stderr, "max system bytes = %10lu\n", (unsigned long)(maxfp));
+    fprintf(stderr, "system bytes     = %10lu\n", (unsigned long)(fp));
+    fprintf(stderr, "in use bytes     = %10lu\n", (unsigned long)(used));
+
+    POSTACTION(m);
+  }
+}
+
+/* ----------------------- Operations on smallbins ----------------------- */
+
+/*
+  Various forms of linking and unlinking are defined as macros.  Even
+  the ones for trees, which are very long but have very short typical
+  paths.  This is ugly but reduces reliance on inlining support of
+  compilers.
+*/
+
+/* Link a free chunk into a smallbin  */
+#define insert_small_chunk(M, P, S) {\
+  bindex_t I  = small_index(S);\
+  mchunkptr B = smallbin_at(M, I);\
+  mchunkptr F = B;\
+  assert(S >= MIN_CHUNK_SIZE);\
+  if (!smallmap_is_marked(M, I))\
+    mark_smallmap(M, I);\
+  else if (RTCHECK(ok_address(M, B->fd)))\
+    F = B->fd;\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+  B->fd = P;\
+  F->bk = P;\
+  P->fd = F;\
+  P->bk = B;\
+}
+
+/* Unlink a chunk from a smallbin  */
+#define unlink_small_chunk(M, P, S) {\
+  mchunkptr F = P->fd;\
+  mchunkptr B = P->bk;\
+  bindex_t I = small_index(S);\
+  assert(P != B);\
+  assert(P != F);\
+  assert(chunksize(P) == small_index2size(I));\
+  if (F == B)\
+    clear_smallmap(M, I);\
+  else if (RTCHECK((F == smallbin_at(M,I) || ok_address(M, F)) &&\
+                   (B == smallbin_at(M,I) || ok_address(M, B)))) {\
+    F->bk = B;\
+    B->fd = F;\
+  }\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+}
+
+/* Unlink the first chunk from a smallbin */
+#define unlink_first_small_chunk(M, B, P, I) {\
+  mchunkptr F = P->fd;\
+  assert(P != B);\
+  assert(P != F);\
+  assert(chunksize(P) == small_index2size(I));\
+  if (B == F)\
+    clear_smallmap(M, I);\
+  else if (RTCHECK(ok_address(M, F))) {\
+    B->fd = F;\
+    F->bk = B;\
+  }\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+}
+
+
+
+/* Replace dv node, binning the old one */
+/* Used only when dvsize known to be small */
+#define replace_dv(M, P, S) {\
+  size_t DVS = M->dvsize;\
+  if (DVS != 0) {\
+    mchunkptr DV = M->dv;\
+    assert(is_small(DVS));\
+    insert_small_chunk(M, DV, DVS);\
+  }\
+  M->dvsize = S;\
+  M->dv = P;\
+}
+
+/* ------------------------- Operations on trees ------------------------- */
+
+/* Insert chunk into tree */
+#define insert_large_chunk(M, X, S) {\
+  tbinptr* H;\
+  bindex_t I;\
+  compute_tree_index(S, I);\
+  H = treebin_at(M, I);\
+  X->index = I;\
+  X->child[0] = X->child[1] = 0;\
+  if (!treemap_is_marked(M, I)) {\
+    mark_treemap(M, I);\
+    *H = X;\
+    X->parent = (tchunkptr)H;\
+    X->fd = X->bk = X;\
+  }\
+  else {\
+    tchunkptr T = *H;\
+    size_t K = S << leftshift_for_tree_index(I);\
+    for (;;) {\
+      if (chunksize(T) != S) {\
+        tchunkptr* C = &(T->child[(K >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]);\
+        K <<= 1;\
+        if (*C != 0)\
+          T = *C;\
+        else if (RTCHECK(ok_address(M, C))) {\
+          *C = X;\
+          X->parent = T;\
+          X->fd = X->bk = X;\
+          break;\
+        }\
+        else {\
+          CORRUPTION_ERROR_ACTION(M);\
+          break;\
+        }\
+      }\
+      else {\
+        tchunkptr F = T->fd;\
+        if (RTCHECK(ok_address(M, T) && ok_address(M, F))) {\
+          T->fd = F->bk = X;\
+          X->fd = F;\
+          X->bk = T;\
+          X->parent = 0;\
+          break;\
+        }\
+        else {\
+          CORRUPTION_ERROR_ACTION(M);\
+          break;\
+        }\
+      }\
+    }\
+  }\
+}
+
+/*
+  Unlink steps:
+
+  1. If x is a chained node, unlink it from its same-sized fd/bk links
+     and choose its bk node as its replacement.
+  2. If x was the last node of its size, but not a leaf node, it must
+     be replaced with a leaf node (not merely one with an open left or
+     right), to make sure that lefts and rights of descendents
+     correspond properly to bit masks.  We use the rightmost descendent
+     of x.  We could use any other leaf, but this is easy to locate and
+     tends to counteract removal of leftmosts elsewhere, and so keeps
+     paths shorter than minimally guaranteed.  This doesn't loop much
+     because on average a node in a tree is near the bottom.
+  3. If x is the base of a chain (i.e., has parent links) relink
+     x's parent and children to x's replacement (or null if none).
+*/
+
+#define unlink_large_chunk(M, X) {\
+  tchunkptr XP = X->parent;\
+  tchunkptr R;\
+  if (X->bk != X) {\
+    tchunkptr F = X->fd;\
+    R = X->bk;\
+    if (RTCHECK(ok_address(M, F))) {\
+      F->bk = R;\
+      R->fd = F;\
+    }\
+    else {\
+      CORRUPTION_ERROR_ACTION(M);\
+    }\
+  }\
+  else {\
+    tchunkptr* RP;\
+    if (((R = *(RP = &(X->child[1]))) != 0) ||\
+        ((R = *(RP = &(X->child[0]))) != 0)) {\
+      tchunkptr* CP;\
+      while ((*(CP = &(R->child[1])) != 0) ||\
+             (*(CP = &(R->child[0])) != 0)) {\
+        R = *(RP = CP);\
+      }\
+      if (RTCHECK(ok_address(M, RP)))\
+        *RP = 0;\
+      else {\
+        CORRUPTION_ERROR_ACTION(M);\
+      }\
+    }\
+  }\
+  if (XP != 0) {\
+    tbinptr* H = treebin_at(M, X->index);\
+    if (X == *H) {\
+      if ((*H = R) == 0) \
+        clear_treemap(M, X->index);\
+    }\
+    else if (RTCHECK(ok_address(M, XP))) {\
+      if (XP->child[0] == X) \
+        XP->child[0] = R;\
+      else \
+        XP->child[1] = R;\
+    }\
+    else\
+      CORRUPTION_ERROR_ACTION(M);\
+    if (R != 0) {\
+      if (RTCHECK(ok_address(M, R))) {\
+        tchunkptr C0, C1;\
+        R->parent = XP;\
+        if ((C0 = X->child[0]) != 0) {\
+          if (RTCHECK(ok_address(M, C0))) {\
+            R->child[0] = C0;\
+            C0->parent = R;\
+          }\
+          else\
+            CORRUPTION_ERROR_ACTION(M);\
+        }\
+        if ((C1 = X->child[1]) != 0) {\
+          if (RTCHECK(ok_address(M, C1))) {\
+            R->child[1] = C1;\
+            C1->parent = R;\
+          }\
+          else\
+            CORRUPTION_ERROR_ACTION(M);\
+        }\
+      }\
+      else\
+        CORRUPTION_ERROR_ACTION(M);\
+    }\
+  }\
+}
+
+/* Relays to large vs small bin operations */
+
+#define insert_chunk(M, P, S)\
+  if (is_small(S)) insert_small_chunk(M, P, S)\
+  else { tchunkptr TP = (tchunkptr)(P); insert_large_chunk(M, TP, S); }
+
+#define unlink_chunk(M, P, S)\
+  if (is_small(S)) unlink_small_chunk(M, P, S)\
+  else { tchunkptr TP = (tchunkptr)(P); unlink_large_chunk(M, TP); }
+
+
+/* Relays to internal calls to malloc/free from realloc, memalign etc */
+
+#if ONLY_MSPACES
+#define internal_malloc(m, b) mspace_malloc(m, b)
+#define internal_free(m, mem) mspace_free(m,mem);
+#else /* ONLY_MSPACES */
+#if MSPACES
+#define internal_malloc(m, b)\
+   (m == gm)? dlmalloc(b) : mspace_malloc(m, b)
+#define internal_free(m, mem)\
+   if (m == gm) dlfree(mem); else mspace_free(m,mem);
+#else /* MSPACES */
+#define internal_malloc(m, b) dlmalloc(b)
+#define internal_free(m, mem) dlfree(mem)
+#endif /* MSPACES */
+#endif /* ONLY_MSPACES */
+
+/* -----------------------  Direct-mmapping chunks ----------------------- */
+
+/*
+  Directly mmapped chunks are set up with an offset to the start of
+  the mmapped region stored in the prev_foot field of the chunk. This
+  allows reconstruction of the required argument to MUNMAP when freed,
+  and also allows adjustment of the returned chunk to meet alignment
+  requirements (especially in memalign).
+*/
+
+/* Malloc using mmap */
+static void* mmap_alloc(mstate m, size_t nb) {
+  size_t mmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+  if (mmsize > nb) {     /* Check for wrap around 0 */
+    char* mm = (char*)(CALL_DIRECT_MMAP(mmsize));
+    if (mm != CMFAIL) {
+      size_t offset = align_offset(chunk2mem(mm));
+      size_t psize = mmsize - offset - MMAP_FOOT_PAD;
+      mchunkptr p = (mchunkptr)(mm + offset);
+      p->prev_foot = offset;
+      p->head = psize;
+      mark_inuse_foot(m, p, psize);
+      chunk_plus_offset(p, psize)->head = FENCEPOST_HEAD;
+      chunk_plus_offset(p, psize+SIZE_T_SIZE)->head = 0;
+
+      if (m->least_addr == 0 || mm < m->least_addr)
+        m->least_addr = mm;
+      if ((m->footprint += mmsize) > m->max_footprint)
+        m->max_footprint = m->footprint;
+      assert(is_aligned(chunk2mem(p)));
+      check_mmapped_chunk(m, p);
+      return chunk2mem(p);
+    }
+  }
+  return 0;
+}
+
+/* Realloc using mmap */
+static mchunkptr mmap_resize(mstate m, mchunkptr oldp, size_t nb) {
+  size_t oldsize = chunksize(oldp);
+  if (is_small(nb)) /* Can't shrink mmap regions below small size */
+    return 0;
+  /* Keep old chunk if big enough but not too big */
+  if (oldsize >= nb + SIZE_T_SIZE &&
+      (oldsize - nb) <= (mparams.granularity << 1))
+    return oldp;
+  else {
+    size_t offset = oldp->prev_foot;
+    size_t oldmmsize = oldsize + offset + MMAP_FOOT_PAD;
+    size_t newmmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+    char* cp = (char*)CALL_MREMAP((char*)oldp - offset,
+                                  oldmmsize, newmmsize, 1);
+    if (cp != CMFAIL) {
+      mchunkptr newp = (mchunkptr)(cp + offset);
+      size_t psize = newmmsize - offset - MMAP_FOOT_PAD;
+      newp->head = psize;
+      mark_inuse_foot(m, newp, psize);
+      chunk_plus_offset(newp, psize)->head = FENCEPOST_HEAD;
+      chunk_plus_offset(newp, psize+SIZE_T_SIZE)->head = 0;
+
+      if (cp < m->least_addr)
+        m->least_addr = cp;
+      if ((m->footprint += newmmsize - oldmmsize) > m->max_footprint)
+        m->max_footprint = m->footprint;
+      check_mmapped_chunk(m, newp);
+      return newp;
+    }
+  }
+  return 0;
+}
+
+/* -------------------------- mspace management -------------------------- */
+
+/* Initialize top chunk and its size */
+static void init_top(mstate m, mchunkptr p, size_t psize) {
+  /* Ensure alignment */
+  size_t offset = align_offset(chunk2mem(p));
+  p = (mchunkptr)((char*)p + offset);
+  psize -= offset;
+
+  m->top = p;
+  m->topsize = psize;
+  p->head = psize | PINUSE_BIT;
+  /* set size of fake trailing chunk holding overhead space only once */
+  chunk_plus_offset(p, psize)->head = TOP_FOOT_SIZE;
+  m->trim_check = mparams.trim_threshold; /* reset on each update */
+}
+
+/* Initialize bins for a new mstate that is otherwise zeroed out */
+static void init_bins(mstate m) {
+  /* Establish circular links for smallbins */
+  bindex_t i;
+  for (i = 0; i < NSMALLBINS; ++i) {
+    sbinptr bin = smallbin_at(m,i);
+    bin->fd = bin->bk = bin;
+  }
+}
+
+#if PROCEED_ON_ERROR
+
+/* default corruption action */
+static void reset_on_error(mstate m) {
+  int i;
+  ++malloc_corruption_error_count;
+  /* Reinitialize fields to forget about all memory */
+  m->smallbins = m->treebins = 0;
+  m->dvsize = m->topsize = 0;
+  m->seg.base = 0;
+  m->seg.size = 0;
+  m->seg.next = 0;
+  m->top = m->dv = 0;
+  for (i = 0; i < NTREEBINS; ++i)
+    *treebin_at(m, i) = 0;
+  init_bins(m);
+}
+#endif /* PROCEED_ON_ERROR */
+
+/* Allocate chunk and prepend remainder with chunk in successor base. */
+static void* prepend_alloc(mstate m, char* newbase, char* oldbase,
+                           size_t nb) {
+  mchunkptr p = align_as_chunk(newbase);
+  mchunkptr oldfirst = align_as_chunk(oldbase);
+  size_t psize = (char*)oldfirst - (char*)p;
+  mchunkptr q = chunk_plus_offset(p, nb);
+  size_t qsize = psize - nb;
+  set_size_and_pinuse_of_inuse_chunk(m, p, nb);
+
+  assert((char*)oldfirst > (char*)q);
+  assert(pinuse(oldfirst));
+  assert(qsize >= MIN_CHUNK_SIZE);
+
+  /* consolidate remainder with first chunk of old base */
+  if (oldfirst == m->top) {
+    size_t tsize = m->topsize += qsize;
+    m->top = q;
+    q->head = tsize | PINUSE_BIT;
+    check_top_chunk(m, q);
+  }
+  else if (oldfirst == m->dv) {
+    size_t dsize = m->dvsize += qsize;
+    m->dv = q;
+    set_size_and_pinuse_of_free_chunk(q, dsize);
+  }
+  else {
+    if (!is_inuse(oldfirst)) {
+      size_t nsize = chunksize(oldfirst);
+      unlink_chunk(m, oldfirst, nsize);
+      oldfirst = chunk_plus_offset(oldfirst, nsize);
+      qsize += nsize;
+    }
+    set_free_with_pinuse(q, qsize, oldfirst);
+    insert_chunk(m, q, qsize);
+    check_free_chunk(m, q);
+  }
+
+  check_malloced_chunk(m, chunk2mem(p), nb);
+  return chunk2mem(p);
+}
+
+/* Add a segment to hold a new noncontiguous region */
+static void add_segment(mstate m, char* tbase, size_t tsize, flag_t mmapped) {
+  /* Determine locations and sizes of segment, fenceposts, old top */
+  char* old_top = (char*)m->top;
+  msegmentptr oldsp = segment_holding(m, old_top);
+  char* old_end = oldsp->base + oldsp->size;
+  size_t ssize = pad_request(sizeof(struct malloc_segment));
+  char* rawsp = old_end - (ssize + FOUR_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+  size_t offset = align_offset(chunk2mem(rawsp));
+  char* asp = rawsp + offset;
+  char* csp = (asp < (old_top + MIN_CHUNK_SIZE))? old_top : asp;
+  mchunkptr sp = (mchunkptr)csp;
+  msegmentptr ss = (msegmentptr)(chunk2mem(sp));
+  mchunkptr tnext = chunk_plus_offset(sp, ssize);
+  mchunkptr p = tnext;
+  int nfences = 0;
+
+  /* reset top to new space */
+  init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
+
+  /* Set up segment record */
+  assert(is_aligned(ss));
+  set_size_and_pinuse_of_inuse_chunk(m, sp, ssize);
+  *ss = m->seg; /* Push current record */
+  m->seg.base = tbase;
+  m->seg.size = tsize;
+  m->seg.sflags = mmapped;
+  m->seg.next = ss;
+
+  /* Insert trailing fenceposts */
+  for (;;) {
+    mchunkptr nextp = chunk_plus_offset(p, SIZE_T_SIZE);
+    p->head = FENCEPOST_HEAD;
+    ++nfences;
+    if ((char*)(&(nextp->head)) < old_end)
+      p = nextp;
+    else
+      break;
+  }
+  assert(nfences >= 2);
+
+  /* Insert the rest of old top into a bin as an ordinary free chunk */
+  if (csp != old_top) {
+    mchunkptr q = (mchunkptr)old_top;
+    size_t psize = csp - old_top;
+    mchunkptr tn = chunk_plus_offset(q, psize);
+    set_free_with_pinuse(q, psize, tn);
+    insert_chunk(m, q, psize);
+  }
+
+  check_top_chunk(m, m->top);
+}
+
+/* -------------------------- System allocation -------------------------- */
+
+/* Get memory from system using MORECORE or MMAP */
+static void* sys_alloc(mstate m, size_t nb) {
+  char* tbase = CMFAIL;
+  size_t tsize = 0;
+  flag_t mmap_flag = 0;
+
+  ensure_initialization();
+
+  /* Directly map large chunks, but only if already initialized */
+  if (use_mmap(m) && nb >= mparams.mmap_threshold && m->topsize != 0) {
+    void* mem = mmap_alloc(m, nb);
+    if (mem != 0)
+      return mem;
+  }
+
+  /*
+    Try getting memory in any of three ways (in most-preferred to
+    least-preferred order):
+    1. A call to MORECORE that can normally contiguously extend memory.
+       (disabled if not MORECORE_CONTIGUOUS or not HAVE_MORECORE or
+       or main space is mmapped or a previous contiguous call failed)
+    2. A call to MMAP new space (disabled if not HAVE_MMAP).
+       Note that under the default settings, if MORECORE is unable to
+       fulfill a request, and HAVE_MMAP is true, then mmap is
+       used as a noncontiguous system allocator. This is a useful backup
+       strategy for systems with holes in address spaces -- in this case
+       sbrk cannot contiguously expand the heap, but mmap may be able to
+       find space.
+    3. A call to MORECORE that cannot usually contiguously extend memory.
+       (disabled if not HAVE_MORECORE)
+
+   In all cases, we need to request enough bytes from system to ensure
+   we can malloc nb bytes upon success, so pad with enough space for
+   top_foot, plus alignment-pad to make sure we don't lose bytes if
+   not on boundary, and round this up to a granularity unit.
+  */
+
+  if (MORECORE_CONTIGUOUS && !use_noncontiguous(m)) {
+    char* br = CMFAIL;
+    msegmentptr ss = (m->top == 0)? 0 : segment_holding(m, (char*)m->top);
+    size_t asize = 0;
+    ACQUIRE_MALLOC_GLOBAL_LOCK();
+
+    if (ss == 0) {  /* First time through or recovery */
+      char* base = (char*)CALL_MORECORE(0);
+      if (base != CMFAIL) {
+        asize = granularity_align(nb + SYS_ALLOC_PADDING);
+        /* Adjust to end on a page boundary */
+        if (!is_page_aligned(base))
+          asize += (page_align((size_t)base) - (size_t)base);
+        /* Can't call MORECORE if size is negative when treated as signed */
+        if (asize < HALF_MAX_SIZE_T &&
+            (br = (char*)(CALL_MORECORE(asize))) == base) {
+          tbase = base;
+          tsize = asize;
+        }
+      }
+    }
+    else {
+      /* Subtract out existing available top space from MORECORE request. */
+      asize = granularity_align(nb - m->topsize + SYS_ALLOC_PADDING);
+      /* Use mem here only if it did continuously extend old space */
+      if (asize < HALF_MAX_SIZE_T &&
+          (br = (char*)(CALL_MORECORE(asize))) == ss->base+ss->size) {
+        tbase = br;
+        tsize = asize;
+      }
+    }
+
+    if (tbase == CMFAIL) {    /* Cope with partial failure */
+      if (br != CMFAIL) {    /* Try to use/extend the space we did get */
+        if (asize < HALF_MAX_SIZE_T &&
+            asize < nb + SYS_ALLOC_PADDING) {
+          size_t esize = granularity_align(nb + SYS_ALLOC_PADDING - asize);
+          if (esize < HALF_MAX_SIZE_T) {
+            char* end = (char*)CALL_MORECORE(esize);
+            if (end != CMFAIL)
+              asize += esize;
+            else {            /* Can't use; try to release */
+              (void) CALL_MORECORE(-asize);
+              br = CMFAIL;
+            }
+          }
+        }
+      }
+      if (br != CMFAIL) {    /* Use the space we did get */
+        tbase = br;
+        tsize = asize;
+      }
+      else
+        disable_contiguous(m); /* Don't try contiguous path in the future */
+    }
+
+    RELEASE_MALLOC_GLOBAL_LOCK();
+  }
+
+  if (HAVE_MMAP && tbase == CMFAIL) {  /* Try MMAP */
+    size_t rsize = granularity_align(nb + SYS_ALLOC_PADDING);
+    if (rsize > nb) { /* Fail if wraps around zero */
+      char* mp = (char*)(CALL_MMAP(rsize));
+      if (mp != CMFAIL) {
+        tbase = mp;
+        tsize = rsize;
+        mmap_flag = USE_MMAP_BIT;
+      }
+    }
+  }
+
+  if (HAVE_MORECORE && tbase == CMFAIL) { /* Try noncontiguous MORECORE */
+    size_t asize = granularity_align(nb + SYS_ALLOC_PADDING);
+    if (asize < HALF_MAX_SIZE_T) {
+      char* br = CMFAIL;
+      char* end = CMFAIL;
+      ACQUIRE_MALLOC_GLOBAL_LOCK();
+      br = (char*)(CALL_MORECORE(asize));
+      end = (char*)(CALL_MORECORE(0));
+      RELEASE_MALLOC_GLOBAL_LOCK();
+      if (br != CMFAIL && end != CMFAIL && br < end) {
+        size_t ssize = end - br;
+        if (ssize > nb + TOP_FOOT_SIZE) {
+          tbase = br;
+          tsize = ssize;
+        }
+      }
+    }
+  }
+
+  if (tbase != CMFAIL) {
+
+    if ((m->footprint += tsize) > m->max_footprint)
+      m->max_footprint = m->footprint;
+
+    if (!is_initialized(m)) { /* first-time initialization */
+      if (m->least_addr == 0 || tbase < m->least_addr)
+        m->least_addr = tbase;
+      m->seg.base = tbase;
+      m->seg.size = tsize;
+      m->seg.sflags = mmap_flag;
+      m->magic = mparams.magic;
+      m->release_checks = MAX_RELEASE_CHECK_RATE;
+      init_bins(m);
+#if !ONLY_MSPACES
+      if (is_global(m))
+        init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
+      else
+#endif
+      {
+        /* Offset top by embedded malloc_state */
+        mchunkptr mn = next_chunk(mem2chunk(m));
+        init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) -TOP_FOOT_SIZE);
+      }
+    }
+
+    else {
+      /* Try to merge with an existing segment */
+      msegmentptr sp = &m->seg;
+      /* Only consider most recent segment if traversal suppressed */
+      while (sp != 0 && tbase != sp->base + sp->size)
+        sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
+      if (sp != 0 &&
+          !is_extern_segment(sp) &&
+          (sp->sflags & USE_MMAP_BIT) == mmap_flag &&
+          segment_holds(sp, m->top)) { /* append */
+        sp->size += tsize;
+        init_top(m, m->top, m->topsize + tsize);
+      }
+      else {
+        if (tbase < m->least_addr)
+          m->least_addr = tbase;
+        sp = &m->seg;
+        while (sp != 0 && sp->base != tbase + tsize)
+          sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
+        if (sp != 0 &&
+            !is_extern_segment(sp) &&
+            (sp->sflags & USE_MMAP_BIT) == mmap_flag) {
+          char* oldbase = sp->base;
+          sp->base = tbase;
+          sp->size += tsize;
+          return prepend_alloc(m, tbase, oldbase, nb);
+        }
+        else
+          add_segment(m, tbase, tsize, mmap_flag);
+      }
+    }
+
+    if (nb < m->topsize) { /* Allocate from new or extended top space */
+      size_t rsize = m->topsize -= nb;
+      mchunkptr p = m->top;
+      mchunkptr r = m->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(m, p, nb);
+      check_top_chunk(m, m->top);
+      check_malloced_chunk(m, chunk2mem(p), nb);
+      return chunk2mem(p);
+    }
+  }
+
+  MALLOC_FAILURE_ACTION;
+  return 0;
+}
+
+/* -----------------------  system deallocation -------------------------- */
+
+/* Unmap and unlink any mmapped segments that don't contain used chunks */
+static size_t release_unused_segments(mstate m) {
+  size_t released = 0;
+  int nsegs = 0;
+  msegmentptr pred = &m->seg;
+  msegmentptr sp = pred->next;
+  while (sp != 0) {
+    char* base = sp->base;
+    size_t size = sp->size;
+    msegmentptr next = sp->next;
+    ++nsegs;
+    if (is_mmapped_segment(sp) && !is_extern_segment(sp)) {
+      mchunkptr p = align_as_chunk(base);
+      size_t psize = chunksize(p);
+      /* Can unmap if first chunk holds entire segment and not pinned */
+      if (!is_inuse(p) && (char*)p + psize >= base + size - TOP_FOOT_SIZE) {
+        tchunkptr tp = (tchunkptr)p;
+        assert(segment_holds(sp, (char*)sp));
+        if (p == m->dv) {
+          m->dv = 0;
+          m->dvsize = 0;
+        }
+        else {
+          unlink_large_chunk(m, tp);
+        }
+        if (CALL_MUNMAP(base, size) == 0) {
+          released += size;
+          m->footprint -= size;
+          /* unlink obsoleted record */
+          sp = pred;
+          sp->next = next;
+        }
+        else { /* back out if cannot unmap */
+          insert_large_chunk(m, tp, psize);
+        }
+      }
+    }
+    if (NO_SEGMENT_TRAVERSAL) /* scan only first segment */
+      break;
+    pred = sp;
+    sp = next;
+  }
+  /* Reset check counter */
+  m->release_checks = ((nsegs > MAX_RELEASE_CHECK_RATE)?
+                       nsegs : MAX_RELEASE_CHECK_RATE);
+  return released;
+}
+
+static int sys_trim(mstate m, size_t pad) {
+  size_t released = 0;
+  ensure_initialization();
+  if (pad < MAX_REQUEST && is_initialized(m)) {
+    pad += TOP_FOOT_SIZE; /* ensure enough room for segment overhead */
+
+    if (m->topsize > pad) {
+      /* Shrink top space in granularity-size units, keeping at least one */
+      size_t unit = mparams.granularity;
+      size_t extra = ((m->topsize - pad + (unit - SIZE_T_ONE)) / unit -
+                      SIZE_T_ONE) * unit;
+      msegmentptr sp = segment_holding(m, (char*)m->top);
+
+      if (!is_extern_segment(sp)) {
+        if (is_mmapped_segment(sp)) {
+          if (HAVE_MMAP &&
+              sp->size >= extra &&
+              !has_segment_link(m, sp)) { /* can't shrink if pinned */
+            size_t newsize = sp->size - extra;
+            /* Prefer mremap, fall back to munmap */
+            if ((CALL_MREMAP(sp->base, sp->size, newsize, 0) != MFAIL) ||
+                (CALL_MUNMAP(sp->base + newsize, extra) == 0)) {
+              released = extra;
+            }
+          }
+        }
+        else if (HAVE_MORECORE) {
+          if (extra >= HALF_MAX_SIZE_T) /* Avoid wrapping negative */
+            extra = (HALF_MAX_SIZE_T) + SIZE_T_ONE - unit;
+          ACQUIRE_MALLOC_GLOBAL_LOCK();
+          {
+            /* Make sure end of memory is where we last set it. */
+            char* old_br = (char*)(CALL_MORECORE(0));
+            if (old_br == sp->base + sp->size) {
+              char* rel_br = (char*)(CALL_MORECORE(-extra));
+              char* new_br = (char*)(CALL_MORECORE(0));
+              if (rel_br != CMFAIL && new_br < old_br)
+                released = old_br - new_br;
+            }
+          }
+          RELEASE_MALLOC_GLOBAL_LOCK();
+        }
+      }
+
+      if (released != 0) {
+        sp->size -= released;
+        m->footprint -= released;
+        init_top(m, m->top, m->topsize - released);
+        check_top_chunk(m, m->top);
+      }
+    }
+
+    /* Unmap any unused mmapped segments */
+    if (HAVE_MMAP)
+      released += release_unused_segments(m);
+
+    /* On failure, disable autotrim to avoid repeated failed future calls */
+    if (released == 0 && m->topsize > m->trim_check)
+      m->trim_check = MAX_SIZE_T;
+  }
+
+  return (released != 0)? 1 : 0;
+}
+
+
+/* ---------------------------- malloc support --------------------------- */
+
+/* allocate a large request from the best fitting chunk in a treebin */
+static void* tmalloc_large(mstate m, size_t nb) {
+  tchunkptr v = 0;
+  size_t rsize = -nb; /* Unsigned negation */
+  tchunkptr t;
+  bindex_t idx;
+  compute_tree_index(nb, idx);
+  if ((t = *treebin_at(m, idx)) != 0) {
+    /* Traverse tree for this bin looking for node with size == nb */
+    size_t sizebits = nb << leftshift_for_tree_index(idx);
+    tchunkptr rst = 0;  /* The deepest untaken right subtree */
+    for (;;) {
+      tchunkptr rt;
+      size_t trem = chunksize(t) - nb;
+      if (trem < rsize) {
+        v = t;
+        if ((rsize = trem) == 0)
+          break;
+      }
+      rt = t->child[1];
+      t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
+      if (rt != 0 && rt != t)
+        rst = rt;
+      if (t == 0) {
+        t = rst; /* set t to least subtree holding sizes > nb */
+        break;
+      }
+      sizebits <<= 1;
+    }
+  }
+  if (t == 0 && v == 0) { /* set t to root of next non-empty treebin */
+    binmap_t leftbits = left_bits(idx2bit(idx)) & m->treemap;
+    if (leftbits != 0) {
+      bindex_t i;
+      binmap_t leastbit = least_bit(leftbits);
+      compute_bit2idx(leastbit, i);
+      t = *treebin_at(m, i);
+    }
+  }
+
+  while (t != 0) { /* find smallest of tree or subtree */
+    size_t trem = chunksize(t) - nb;
+    if (trem < rsize) {
+      rsize = trem;
+      v = t;
+    }
+    t = leftmost_child(t);
+  }
+
+  /*  If dv is a better fit, return 0 so malloc will use it */
+  if (v != 0 && rsize < (size_t)(m->dvsize - nb)) {
+    if (RTCHECK(ok_address(m, v))) { /* split */
+      mchunkptr r = chunk_plus_offset(v, nb);
+      assert(chunksize(v) == rsize + nb);
+      if (RTCHECK(ok_next(v, r))) {
+        unlink_large_chunk(m, v);
+        if (rsize < MIN_CHUNK_SIZE)
+          set_inuse_and_pinuse(m, v, (rsize + nb));
+        else {
+          set_size_and_pinuse_of_inuse_chunk(m, v, nb);
+          set_size_and_pinuse_of_free_chunk(r, rsize);
+          insert_chunk(m, r, rsize);
+        }
+        return chunk2mem(v);
+      }
+    }
+    CORRUPTION_ERROR_ACTION(m);
+  }
+  return 0;
+}
+
+/* allocate a small request from the best fitting chunk in a treebin */
+static void* tmalloc_small(mstate m, size_t nb) {
+  tchunkptr t, v;
+  size_t rsize;
+  bindex_t i;
+  binmap_t leastbit = least_bit(m->treemap);
+  compute_bit2idx(leastbit, i);
+  v = t = *treebin_at(m, i);
+  rsize = chunksize(t) - nb;
+
+  while ((t = leftmost_child(t)) != 0) {
+    size_t trem = chunksize(t) - nb;
+    if (trem < rsize) {
+      rsize = trem;
+      v = t;
+    }
+  }
+
+  if (RTCHECK(ok_address(m, v))) {
+    mchunkptr r = chunk_plus_offset(v, nb);
+    assert(chunksize(v) == rsize + nb);
+    if (RTCHECK(ok_next(v, r))) {
+      unlink_large_chunk(m, v);
+      if (rsize < MIN_CHUNK_SIZE)
+        set_inuse_and_pinuse(m, v, (rsize + nb));
+      else {
+        set_size_and_pinuse_of_inuse_chunk(m, v, nb);
+        set_size_and_pinuse_of_free_chunk(r, rsize);
+        replace_dv(m, r, rsize);
+      }
+      return chunk2mem(v);
+    }
+  }
+
+  CORRUPTION_ERROR_ACTION(m);
+  return 0;
+}
+
+/* --------------------------- realloc support --------------------------- */
+
+static void* internal_realloc(mstate m, void* oldmem, size_t bytes) {
+  if (bytes >= MAX_REQUEST) {
+    MALLOC_FAILURE_ACTION;
+    return 0;
+  }
+  if (!PREACTION(m)) {
+    mchunkptr oldp = mem2chunk(oldmem);
+    size_t oldsize = chunksize(oldp);
+    mchunkptr next = chunk_plus_offset(oldp, oldsize);
+    mchunkptr newp = 0;
+    void* extra = 0;
+
+    /* Try to either shrink or extend into top. Else malloc-copy-free */
+
+    if (RTCHECK(ok_address(m, oldp) && ok_inuse(oldp) &&
+                ok_next(oldp, next) && ok_pinuse(next))) {
+      size_t nb = request2size(bytes);
+      if (is_mmapped(oldp))
+        newp = mmap_resize(m, oldp, nb);
+      else if (oldsize >= nb) { /* already big enough */
+        size_t rsize = oldsize - nb;
+        newp = oldp;
+        if (rsize >= MIN_CHUNK_SIZE) {
+          mchunkptr remainder = chunk_plus_offset(newp, nb);
+          set_inuse(m, newp, nb);
+          set_inuse_and_pinuse(m, remainder, rsize);
+          extra = chunk2mem(remainder);
+        }
+      }
+      else if (next == m->top && oldsize + m->topsize > nb) {
+        /* Expand into top */
+        size_t newsize = oldsize + m->topsize;
+        size_t newtopsize = newsize - nb;
+        mchunkptr newtop = chunk_plus_offset(oldp, nb);
+        set_inuse(m, oldp, nb);
+        newtop->head = newtopsize |PINUSE_BIT;
+        m->top = newtop;
+        m->topsize = newtopsize;
+        newp = oldp;
+      }
+    }
+    else {
+      USAGE_ERROR_ACTION(m, oldmem);
+      POSTACTION(m);
+      return 0;
+    }
+#if DEBUG
+    if (newp != 0) {
+      check_inuse_chunk(m, newp); /* Check requires lock */
+    }
+#endif
+
+    POSTACTION(m);
+
+    if (newp != 0) {
+      if (extra != 0) {
+        internal_free(m, extra);
+      }
+      return chunk2mem(newp);
+    }
+    else {
+      void* newmem = internal_malloc(m, bytes);
+      if (newmem != 0) {
+        size_t oc = oldsize - overhead_for(oldp);
+        memcpy(newmem, oldmem, (oc < bytes)? oc : bytes);
+        internal_free(m, oldmem);
+      }
+      return newmem;
+    }
+  }
+  return 0;
+}
+
+/* --------------------------- memalign support -------------------------- */
+
+static void* internal_memalign(mstate m, size_t alignment, size_t bytes) {
+  if (alignment <= MALLOC_ALIGNMENT)    /* Can just use malloc */
+    return internal_malloc(m, bytes);
+  if (alignment <  MIN_CHUNK_SIZE) /* must be at least a minimum chunk size */
+    alignment = MIN_CHUNK_SIZE;
+  if ((alignment & (alignment-SIZE_T_ONE)) != 0) {/* Ensure a power of 2 */
+    size_t a = MALLOC_ALIGNMENT << 1;
+    while (a < alignment) a <<= 1;
+    alignment = a;
+  }
+
+  if (bytes >= MAX_REQUEST - alignment) {
+    if (m != 0)  { /* Test isn't needed but avoids compiler warning */
+      MALLOC_FAILURE_ACTION;
+    }
+  }
+  else {
+    size_t nb = request2size(bytes);
+    size_t req = nb + alignment + MIN_CHUNK_SIZE - CHUNK_OVERHEAD;
+    char* mem = (char*)internal_malloc(m, req);
+    if (mem != 0) {
+      void* leader = 0;
+      void* trailer = 0;
+      mchunkptr p = mem2chunk(mem);
+
+      if (PREACTION(m)) return 0;
+      if ((((size_t)(mem)) % alignment) != 0) { /* misaligned */
+        /*
+          Find an aligned spot inside chunk.  Since we need to give
+          back leading space in a chunk of at least MIN_CHUNK_SIZE, if
+          the first calculation places us at a spot with less than
+          MIN_CHUNK_SIZE leader, we can move to the next aligned spot.
+          We've allocated enough total room so that this is always
+          possible.
+        */
+        char* br = (char*)mem2chunk((size_t)(((size_t)(mem +
+                                                       alignment -
+                                                       SIZE_T_ONE)) &
+                                             -alignment));
+        char* pos = ((size_t)(br - (char*)(p)) >= MIN_CHUNK_SIZE)?
+          br : br+alignment;
+        mchunkptr newp = (mchunkptr)pos;
+        size_t leadsize = pos - (char*)(p);
+        size_t newsize = chunksize(p) - leadsize;
+
+        if (is_mmapped(p)) { /* For mmapped chunks, just adjust offset */
+          newp->prev_foot = p->prev_foot + leadsize;
+          newp->head = newsize;
+        }
+        else { /* Otherwise, give back leader, use the rest */
+          set_inuse(m, newp, newsize);
+          set_inuse(m, p, leadsize);
+          leader = chunk2mem(p);
+        }
+        p = newp;
+      }
+
+      /* Give back spare room at the end */
+      if (!is_mmapped(p)) {
+        size_t size = chunksize(p);
+        if (size > nb + MIN_CHUNK_SIZE) {
+          size_t remainder_size = size - nb;
+          mchunkptr remainder = chunk_plus_offset(p, nb);
+          set_inuse(m, p, nb);
+          set_inuse(m, remainder, remainder_size);
+          trailer = chunk2mem(remainder);
+        }
+      }
+
+      assert (chunksize(p) >= nb);
+      assert((((size_t)(chunk2mem(p))) % alignment) == 0);
+      check_inuse_chunk(m, p);
+      POSTACTION(m);
+      if (leader != 0) {
+        internal_free(m, leader);
+      }
+      if (trailer != 0) {
+        internal_free(m, trailer);
+      }
+      return chunk2mem(p);
+    }
+  }
+  return 0;
+}
+
+/* ------------------------ comalloc/coalloc support --------------------- */
+
+static void** ialloc(mstate m,
+                     size_t n_elements,
+                     size_t* sizes,
+                     int opts,
+                     void* chunks[]) {
+  /*
+    This provides common support for independent_X routines, handling
+    all of the combinations that can result.
+
+    The opts arg has:
+    bit 0 set if all elements are same size (using sizes[0])
+    bit 1 set if elements should be zeroed
+  */
+
+  size_t    element_size;   /* chunksize of each element, if all same */
+  size_t    contents_size;  /* total size of elements */
+  size_t    array_size;     /* request size of pointer array */
+  void*     mem;            /* malloced aggregate space */
+  mchunkptr p;              /* corresponding chunk */
+  size_t    remainder_size; /* remaining bytes while splitting */
+  void**    marray;         /* either "chunks" or malloced ptr array */
+  mchunkptr array_chunk;    /* chunk for malloced ptr array */
+  flag_t    was_enabled;    /* to disable mmap */
+  size_t    size;
+  size_t    i;
+
+  ensure_initialization();
+  /* compute array length, if needed */
+  if (chunks != 0) {
+    if (n_elements == 0)
+      return chunks; /* nothing to do */
+    marray = chunks;
+    array_size = 0;
+  }
+  else {
+    /* if empty req, must still return chunk representing empty array */
+    if (n_elements == 0)
+      return (void**)internal_malloc(m, 0);
+    marray = 0;
+    array_size = request2size(n_elements * (sizeof(void*)));
+  }
+
+  /* compute total element size */
+  if (opts & 0x1) { /* all-same-size */
+    element_size = request2size(*sizes);
+    contents_size = n_elements * element_size;
+  }
+  else { /* add up all the sizes */
+    element_size = 0;
+    contents_size = 0;
+    for (i = 0; i != n_elements; ++i)
+      contents_size += request2size(sizes[i]);
+  }
+
+  size = contents_size + array_size;
+
+  /*
+     Allocate the aggregate chunk.  First disable direct-mmapping so
+     malloc won't use it, since we would not be able to later
+     free/realloc space internal to a segregated mmap region.
+  */
+  was_enabled = use_mmap(m);
+  disable_mmap(m);
+  mem = internal_malloc(m, size - CHUNK_OVERHEAD);
+  if (was_enabled)
+    enable_mmap(m);
+  if (mem == 0)
+    return 0;
+
+  if (PREACTION(m)) return 0;
+  p = mem2chunk(mem);
+  remainder_size = chunksize(p);
+
+  assert(!is_mmapped(p));
+
+  if (opts & 0x2) {       /* optionally clear the elements */
+    memset((size_t*)mem, 0, remainder_size - SIZE_T_SIZE - array_size);
+  }
+
+  /* If not provided, allocate the pointer array as final part of chunk */
+  if (marray == 0) {
+    size_t  array_chunk_size;
+    array_chunk = chunk_plus_offset(p, contents_size);
+    array_chunk_size = remainder_size - contents_size;
+    marray = (void**) (chunk2mem(array_chunk));
+    set_size_and_pinuse_of_inuse_chunk(m, array_chunk, array_chunk_size);
+    remainder_size = contents_size;
+  }
+
+  /* split out elements */
+  for (i = 0; ; ++i) {
+    marray[i] = chunk2mem(p);
+    if (i != n_elements-1) {
+      if (element_size != 0)
+        size = element_size;
+      else
+        size = request2size(sizes[i]);
+      remainder_size -= size;
+      set_size_and_pinuse_of_inuse_chunk(m, p, size);
+      p = chunk_plus_offset(p, size);
+    }
+    else { /* the final element absorbs any overallocation slop */
+      set_size_and_pinuse_of_inuse_chunk(m, p, remainder_size);
+      break;
+    }
+  }
+
+#if DEBUG
+  if (marray != chunks) {
+    /* final element must have exactly exhausted chunk */
+    if (element_size != 0) {
+      assert(remainder_size == element_size);
+    }
+    else {
+      assert(remainder_size == request2size(sizes[i]));
+    }
+    check_inuse_chunk(m, mem2chunk(marray));
+  }
+  for (i = 0; i != n_elements; ++i)
+    check_inuse_chunk(m, mem2chunk(marray[i]));
+
+#endif /* DEBUG */
+
+  POSTACTION(m);
+  return marray;
+}
+
+
+/* -------------------------- public routines ---------------------------- */
+
+#if !ONLY_MSPACES
+
+void* dlmalloc(size_t bytes) {
+  /*
+     Basic algorithm:
+     If a small request (< 256 bytes minus per-chunk overhead):
+       1. If one exists, use a remainderless chunk in associated smallbin.
+          (Remainderless means that there are too few excess bytes to
+          represent as a chunk.)
+       2. If it is big enough, use the dv chunk, which is normally the
+          chunk adjacent to the one used for the most recent small request.
+       3. If one exists, split the smallest available chunk in a bin,
+          saving remainder in dv.
+       4. If it is big enough, use the top chunk.
+       5. If available, get memory from system and use it
+     Otherwise, for a large request:
+       1. Find the smallest available binned chunk that fits, and use it
+          if it is better fitting than dv chunk, splitting if necessary.
+       2. If better fitting than any binned chunk, use the dv chunk.
+       3. If it is big enough, use the top chunk.
+       4. If request size >= mmap threshold, try to directly mmap this chunk.
+       5. If available, get memory from system and use it
+
+     The ugly goto's here ensure that postaction occurs along all paths.
+  */
+
+#if USE_LOCKS
+  ensure_initialization(); /* initialize in sys_alloc if not using locks */
+#endif
+
+  if (!PREACTION(gm)) {
+    void* mem;
+    size_t nb;
+    if (bytes <= MAX_SMALL_REQUEST) {
+      bindex_t idx;
+      binmap_t smallbits;
+      nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
+      idx = small_index(nb);
+      smallbits = gm->smallmap >> idx;
+
+      if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
+        mchunkptr b, p;
+        idx += ~smallbits & 1;       /* Uses next bin if idx empty */
+        b = smallbin_at(gm, idx);
+        p = b->fd;
+        assert(chunksize(p) == small_index2size(idx));
+        unlink_first_small_chunk(gm, b, p, idx);
+        set_inuse_and_pinuse(gm, p, small_index2size(idx));
+        mem = chunk2mem(p);
+        check_malloced_chunk(gm, mem, nb);
+        goto postaction;
+      }
+
+      else if (nb > gm->dvsize) {
+        if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
+          mchunkptr b, p, r;
+          size_t rsize;
+          bindex_t i;
+          binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
+          binmap_t leastbit = least_bit(leftbits);
+          compute_bit2idx(leastbit, i);
+          b = smallbin_at(gm, i);
+          p = b->fd;
+          assert(chunksize(p) == small_index2size(i));
+          unlink_first_small_chunk(gm, b, p, i);
+          rsize = small_index2size(i) - nb;
+          /* Fit here cannot be remainderless if 4byte sizes */
+          if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
+            set_inuse_and_pinuse(gm, p, small_index2size(i));
+          else {
+            set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
+            r = chunk_plus_offset(p, nb);
+            set_size_and_pinuse_of_free_chunk(r, rsize);
+            replace_dv(gm, r, rsize);
+          }
+          mem = chunk2mem(p);
+          check_malloced_chunk(gm, mem, nb);
+          goto postaction;
+        }
+
+        else if (gm->treemap != 0 && (mem = tmalloc_small(gm, nb)) != 0) {
+          check_malloced_chunk(gm, mem, nb);
+          goto postaction;
+        }
+      }
+    }
+    else if (bytes >= MAX_REQUEST)
+      nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
+    else {
+      nb = pad_request(bytes);
+      if (gm->treemap != 0 && (mem = tmalloc_large(gm, nb)) != 0) {
+        check_malloced_chunk(gm, mem, nb);
+        goto postaction;
+      }
+    }
+
+    if (nb <= gm->dvsize) {
+      size_t rsize = gm->dvsize - nb;
+      mchunkptr p = gm->dv;
+      if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
+        mchunkptr r = gm->dv = chunk_plus_offset(p, nb);
+        gm->dvsize = rsize;
+        set_size_and_pinuse_of_free_chunk(r, rsize);
+        set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
+      }
+      else { /* exhaust dv */
+        size_t dvs = gm->dvsize;
+        gm->dvsize = 0;
+        gm->dv = 0;
+        set_inuse_and_pinuse(gm, p, dvs);
+      }
+      mem = chunk2mem(p);
+      check_malloced_chunk(gm, mem, nb);
+      goto postaction;
+    }
+
+    else if (nb < gm->topsize) { /* Split top */
+      size_t rsize = gm->topsize -= nb;
+      mchunkptr p = gm->top;
+      mchunkptr r = gm->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
+      mem = chunk2mem(p);
+      check_top_chunk(gm, gm->top);
+      check_malloced_chunk(gm, mem, nb);
+      goto postaction;
+    }
+
+    mem = sys_alloc(gm, nb);
+
+  postaction:
+    POSTACTION(gm);
+    return mem;
+  }
+
+  return 0;
+}
+
+void dlfree(void* mem) {
+  /*
+     Consolidate freed chunks with preceeding or succeeding bordering
+     free chunks, if they exist, and then place in a bin.  Intermixed
+     with special cases for top, dv, mmapped chunks, and usage errors.
+  */
+
+  if (mem != 0) {
+    mchunkptr p  = mem2chunk(mem);
+#if FOOTERS
+    mstate fm = get_mstate_for(p);
+    if (!ok_magic(fm)) {
+      USAGE_ERROR_ACTION(fm, p);
+      return;
+    }
+#else /* FOOTERS */
+#define fm gm
+#endif /* FOOTERS */
+    if (!PREACTION(fm)) {
+      check_inuse_chunk(fm, p);
+      if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {
+        size_t psize = chunksize(p);
+        mchunkptr next = chunk_plus_offset(p, psize);
+        if (!pinuse(p)) {
+          size_t prevsize = p->prev_foot;
+          if (is_mmapped(p)) {
+            psize += prevsize + MMAP_FOOT_PAD;
+            if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
+              fm->footprint -= psize;
+            goto postaction;
+          }
+          else {
+            mchunkptr prev = chunk_minus_offset(p, prevsize);
+            psize += prevsize;
+            p = prev;
+            if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
+              if (p != fm->dv) {
+                unlink_chunk(fm, p, prevsize);
+              }
+              else if ((next->head & INUSE_BITS) == INUSE_BITS) {
+                fm->dvsize = psize;
+                set_free_with_pinuse(p, psize, next);
+                goto postaction;
+              }
+            }
+            else
+              goto erroraction;
+          }
+        }
+
+        if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
+          if (!cinuse(next)) {  /* consolidate forward */
+            if (next == fm->top) {
+              size_t tsize = fm->topsize += psize;
+              fm->top = p;
+              p->head = tsize | PINUSE_BIT;
+              if (p == fm->dv) {
+                fm->dv = 0;
+                fm->dvsize = 0;
+              }
+              if (should_trim(fm, tsize))
+                sys_trim(fm, 0);
+              goto postaction;
+            }
+            else if (next == fm->dv) {
+              size_t dsize = fm->dvsize += psize;
+              fm->dv = p;
+              set_size_and_pinuse_of_free_chunk(p, dsize);
+              goto postaction;
+            }
+            else {
+              size_t nsize = chunksize(next);
+              psize += nsize;
+              unlink_chunk(fm, next, nsize);
+              set_size_and_pinuse_of_free_chunk(p, psize);
+              if (p == fm->dv) {
+                fm->dvsize = psize;
+                goto postaction;
+              }
+            }
+          }
+          else
+            set_free_with_pinuse(p, psize, next);
+
+          if (is_small(psize)) {
+            insert_small_chunk(fm, p, psize);
+            check_free_chunk(fm, p);
+          }
+          else {
+            tchunkptr tp = (tchunkptr)p;
+            insert_large_chunk(fm, tp, psize);
+            check_free_chunk(fm, p);
+            if (--fm->release_checks == 0)
+              release_unused_segments(fm);
+          }
+          goto postaction;
+        }
+      }
+    erroraction:
+      USAGE_ERROR_ACTION(fm, p);
+    postaction:
+      POSTACTION(fm);
+    }
+  }
+#if !FOOTERS
+#undef fm
+#endif /* FOOTERS */
+}
+
+void* dlcalloc(size_t n_elements, size_t elem_size) {
+  void* mem;
+  size_t req = 0;
+  if (n_elements != 0) {
+    req = n_elements * elem_size;
+    if (((n_elements | elem_size) & ~(size_t)0xffff) &&
+        (req / n_elements != elem_size))
+      req = MAX_SIZE_T; /* force downstream failure on overflow */
+  }
+  mem = dlmalloc(req);
+  if (mem != 0 && calloc_must_clear(mem2chunk(mem)))
+    memset(mem, 0, req);
+  return mem;
+}
+
+void* dlrealloc(void* oldmem, size_t bytes) {
+  if (oldmem == 0)
+    return dlmalloc(bytes);
+#ifdef REALLOC_ZERO_BYTES_FREES
+  if (bytes == 0) {
+    dlfree(oldmem);
+    return 0;
+  }
+#endif /* REALLOC_ZERO_BYTES_FREES */
+  else {
+#if ! FOOTERS
+    mstate m = gm;
+#else /* FOOTERS */
+    mstate m = get_mstate_for(mem2chunk(oldmem));
+    if (!ok_magic(m)) {
+      USAGE_ERROR_ACTION(m, oldmem);
+      return 0;
+    }
+#endif /* FOOTERS */
+    return internal_realloc(m, oldmem, bytes);
+  }
+}
+
+void* dlmemalign(size_t alignment, size_t bytes) {
+  return internal_memalign(gm, alignment, bytes);
+}
+
+void** dlindependent_calloc(size_t n_elements, size_t elem_size,
+                                 void* chunks[]) {
+  size_t sz = elem_size; /* serves as 1-element array */
+  return ialloc(gm, n_elements, &sz, 3, chunks);
+}
+
+void** dlindependent_comalloc(size_t n_elements, size_t sizes[],
+                                   void* chunks[]) {
+  return ialloc(gm, n_elements, sizes, 0, chunks);
+}
+
+void* dlvalloc(size_t bytes) {
+  size_t pagesz;
+  ensure_initialization();
+  pagesz = mparams.page_size;
+  return dlmemalign(pagesz, bytes);
+}
+
+void* dlpvalloc(size_t bytes) {
+  size_t pagesz;
+  ensure_initialization();
+  pagesz = mparams.page_size;
+  return dlmemalign(pagesz, (bytes + pagesz - SIZE_T_ONE) & ~(pagesz - SIZE_T_ONE));
+}
+
+int dlmalloc_trim(size_t pad) {
+  int result = 0;
+  ensure_initialization();
+  if (!PREACTION(gm)) {
+    result = sys_trim(gm, pad);
+    POSTACTION(gm);
+  }
+  return result;
+}
+
+size_t dlmalloc_footprint(void) {
+  return gm->footprint;
+}
+
+size_t dlmalloc_max_footprint(void) {
+  return gm->max_footprint;
+}
+
+#if !NO_MALLINFO
+struct mallinfo dlmallinfo(void) {
+  return internal_mallinfo(gm);
+}
+#endif /* NO_MALLINFO */
+
+void dlmalloc_stats() {
+  internal_malloc_stats(gm);
+}
+
+int dlmallopt(int param_number, int value) {
+  return change_mparam(param_number, value);
+}
+
+#endif /* !ONLY_MSPACES */
+
+size_t dlmalloc_usable_size(void* mem) {
+  if (mem != 0) {
+    mchunkptr p = mem2chunk(mem);
+    if (is_inuse(p))
+      return chunksize(p) - overhead_for(p);
+  }
+  return 0;
+}
+
+/* ----------------------------- user mspaces ---------------------------- */
+
+#if MSPACES
+
+static mstate init_user_mstate(char* tbase, size_t tsize) {
+  size_t msize = pad_request(sizeof(struct malloc_state));
+  mchunkptr mn;
+  mchunkptr msp = align_as_chunk(tbase);
+  mstate m = (mstate)(chunk2mem(msp));
+  memset(m, 0, msize);
+  INITIAL_LOCK(&m->mutex);
+  msp->head = (msize|INUSE_BITS);
+  m->seg.base = m->least_addr = tbase;
+  m->seg.size = m->footprint = m->max_footprint = tsize;
+  m->magic = mparams.magic;
+  m->release_checks = MAX_RELEASE_CHECK_RATE;
+  m->mflags = mparams.default_mflags;
+  m->extp = 0;
+  m->exts = 0;
+  disable_contiguous(m);
+  init_bins(m);
+  mn = next_chunk(mem2chunk(m));
+  init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) - TOP_FOOT_SIZE);
+  check_top_chunk(m, m->top);
+  return m;
+}
+
+mspace create_mspace(size_t capacity, int locked) {
+  mstate m = 0;
+  size_t msize;
+  ensure_initialization();
+  msize = pad_request(sizeof(struct malloc_state));
+  if (capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
+    size_t rs = ((capacity == 0)? mparams.granularity :
+                 (capacity + TOP_FOOT_SIZE + msize));
+    size_t tsize = granularity_align(rs);
+    char* tbase = (char*)(CALL_MMAP(tsize));
+    if (tbase != CMFAIL) {
+      m = init_user_mstate(tbase, tsize);
+      m->seg.sflags = USE_MMAP_BIT;
+      set_lock(m, locked);
+    }
+  }
+  return (mspace)m;
+}
+
+mspace create_mspace_with_base(void* base, size_t capacity, int locked) {
+  mstate m = 0;
+  size_t msize;
+  ensure_initialization();
+  msize = pad_request(sizeof(struct malloc_state));
+  if (capacity > msize + TOP_FOOT_SIZE &&
+      capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
+    m = init_user_mstate((char*)base, capacity);
+    m->seg.sflags = EXTERN_BIT;
+    set_lock(m, locked);
+  }
+  return (mspace)m;
+}
+
+int mspace_track_large_chunks(mspace msp, int enable) {
+  int ret = 0;
+  mstate ms = (mstate)msp;
+  if (!PREACTION(ms)) {
+    if (!use_mmap(ms))
+      ret = 1;
+    if (!enable)
+      enable_mmap(ms);
+    else
+      disable_mmap(ms);
+    POSTACTION(ms);
+  }
+  return ret;
+}
+
+size_t destroy_mspace(mspace msp) {
+  size_t freed = 0;
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms)) {
+    msegmentptr sp = &ms->seg;
+    while (sp != 0) {
+      char* base = sp->base;
+      size_t size = sp->size;
+      flag_t flag = sp->sflags;
+      sp = sp->next;
+      if ((flag & USE_MMAP_BIT) && !(flag & EXTERN_BIT) &&
+          CALL_MUNMAP(base, size) == 0)
+        freed += size;
+    }
+  }
+  else {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+  return freed;
+}
+
+/*
+  mspace versions of routines are near-clones of the global
+  versions. This is not so nice but better than the alternatives.
+*/
+
+
+void* mspace_malloc(mspace msp, size_t bytes) {
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  if (!PREACTION(ms)) {
+    void* mem;
+    size_t nb;
+    if (bytes <= MAX_SMALL_REQUEST) {
+      bindex_t idx;
+      binmap_t smallbits;
+      nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
+      idx = small_index(nb);
+      smallbits = ms->smallmap >> idx;
+
+      if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
+        mchunkptr b, p;
+        idx += ~smallbits & 1;       /* Uses next bin if idx empty */
+        b = smallbin_at(ms, idx);
+        p = b->fd;
+        assert(chunksize(p) == small_index2size(idx));
+        unlink_first_small_chunk(ms, b, p, idx);
+        set_inuse_and_pinuse(ms, p, small_index2size(idx));
+        mem = chunk2mem(p);
+        check_malloced_chunk(ms, mem, nb);
+        goto postaction;
+      }
+
+      else if (nb > ms->dvsize) {
+        if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
+          mchunkptr b, p, r;
+          size_t rsize;
+          bindex_t i;
+          binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
+          binmap_t leastbit = least_bit(leftbits);
+          compute_bit2idx(leastbit, i);
+          b = smallbin_at(ms, i);
+          p = b->fd;
+          assert(chunksize(p) == small_index2size(i));
+          unlink_first_small_chunk(ms, b, p, i);
+          rsize = small_index2size(i) - nb;
+          /* Fit here cannot be remainderless if 4byte sizes */
+          if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
+            set_inuse_and_pinuse(ms, p, small_index2size(i));
+          else {
+            set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+            r = chunk_plus_offset(p, nb);
+            set_size_and_pinuse_of_free_chunk(r, rsize);
+            replace_dv(ms, r, rsize);
+          }
+          mem = chunk2mem(p);
+          check_malloced_chunk(ms, mem, nb);
+          goto postaction;
+        }
+
+        else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) {
+          check_malloced_chunk(ms, mem, nb);
+          goto postaction;
+        }
+      }
+    }
+    else if (bytes >= MAX_REQUEST)
+      nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
+    else {
+      nb = pad_request(bytes);
+      if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) {
+        check_malloced_chunk(ms, mem, nb);
+        goto postaction;
+      }
+    }
+
+    if (nb <= ms->dvsize) {
+      size_t rsize = ms->dvsize - nb;
+      mchunkptr p = ms->dv;
+      if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
+        mchunkptr r = ms->dv = chunk_plus_offset(p, nb);
+        ms->dvsize = rsize;
+        set_size_and_pinuse_of_free_chunk(r, rsize);
+        set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+      }
+      else { /* exhaust dv */
+        size_t dvs = ms->dvsize;
+        ms->dvsize = 0;
+        ms->dv = 0;
+        set_inuse_and_pinuse(ms, p, dvs);
+      }
+      mem = chunk2mem(p);
+      check_malloced_chunk(ms, mem, nb);
+      goto postaction;
+    }
+
+    else if (nb < ms->topsize) { /* Split top */
+      size_t rsize = ms->topsize -= nb;
+      mchunkptr p = ms->top;
+      mchunkptr r = ms->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+      mem = chunk2mem(p);
+      check_top_chunk(ms, ms->top);
+      check_malloced_chunk(ms, mem, nb);
+      goto postaction;
+    }
+
+    mem = sys_alloc(ms, nb);
+
+  postaction:
+    POSTACTION(ms);
+    return mem;
+  }
+
+  return 0;
+}
+
+void mspace_free(mspace msp, void* mem) {
+  if (mem != 0) {
+    mchunkptr p  = mem2chunk(mem);
+#if FOOTERS
+    mstate fm = get_mstate_for(p);
+    msp = msp; /* placate people compiling -Wunused */
+#else /* FOOTERS */
+    mstate fm = (mstate)msp;
+#endif /* FOOTERS */
+    if (!ok_magic(fm)) {
+      USAGE_ERROR_ACTION(fm, p);
+      return;
+    }
+    if (!PREACTION(fm)) {
+      check_inuse_chunk(fm, p);
+      if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {
+        size_t psize = chunksize(p);
+        mchunkptr next = chunk_plus_offset(p, psize);
+        if (!pinuse(p)) {
+          size_t prevsize = p->prev_foot;
+          if (is_mmapped(p)) {
+            psize += prevsize + MMAP_FOOT_PAD;
+            if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
+              fm->footprint -= psize;
+            goto postaction;
+          }
+          else {
+            mchunkptr prev = chunk_minus_offset(p, prevsize);
+            psize += prevsize;
+            p = prev;
+            if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
+              if (p != fm->dv) {
+                unlink_chunk(fm, p, prevsize);
+              }
+              else if ((next->head & INUSE_BITS) == INUSE_BITS) {
+                fm->dvsize = psize;
+                set_free_with_pinuse(p, psize, next);
+                goto postaction;
+              }
+            }
+            else
+              goto erroraction;
+          }
+        }
+
+        if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
+          if (!cinuse(next)) {  /* consolidate forward */
+            if (next == fm->top) {
+              size_t tsize = fm->topsize += psize;
+              fm->top = p;
+              p->head = tsize | PINUSE_BIT;
+              if (p == fm->dv) {
+                fm->dv = 0;
+                fm->dvsize = 0;
+              }
+              if (should_trim(fm, tsize))
+                sys_trim(fm, 0);
+              goto postaction;
+            }
+            else if (next == fm->dv) {
+              size_t dsize = fm->dvsize += psize;
+              fm->dv = p;
+              set_size_and_pinuse_of_free_chunk(p, dsize);
+              goto postaction;
+            }
+            else {
+              size_t nsize = chunksize(next);
+              psize += nsize;
+              unlink_chunk(fm, next, nsize);
+              set_size_and_pinuse_of_free_chunk(p, psize);
+              if (p == fm->dv) {
+                fm->dvsize = psize;
+                goto postaction;
+              }
+            }
+          }
+          else
+            set_free_with_pinuse(p, psize, next);
+
+          if (is_small(psize)) {
+            insert_small_chunk(fm, p, psize);
+            check_free_chunk(fm, p);
+          }
+          else {
+            tchunkptr tp = (tchunkptr)p;
+            insert_large_chunk(fm, tp, psize);
+            check_free_chunk(fm, p);
+            if (--fm->release_checks == 0)
+              release_unused_segments(fm);
+          }
+          goto postaction;
+        }
+      }
+    erroraction:
+      USAGE_ERROR_ACTION(fm, p);
+    postaction:
+      POSTACTION(fm);
+    }
+  }
+}
+
+void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size) {
+  void* mem;
+  size_t req = 0;
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  if (n_elements != 0) {
+    req = n_elements * elem_size;
+    if (((n_elements | elem_size) & ~(size_t)0xffff) &&
+        (req / n_elements != elem_size))
+      req = MAX_SIZE_T; /* force downstream failure on overflow */
+  }
+  mem = internal_malloc(ms, req);
+  if (mem != 0 && calloc_must_clear(mem2chunk(mem)))
+    memset(mem, 0, req);
+  return mem;
+}
+
+void* mspace_realloc(mspace msp, void* oldmem, size_t bytes) {
+  if (oldmem == 0)
+    return mspace_malloc(msp, bytes);
+#ifdef REALLOC_ZERO_BYTES_FREES
+  if (bytes == 0) {
+    mspace_free(msp, oldmem);
+    return 0;
+  }
+#endif /* REALLOC_ZERO_BYTES_FREES */
+  else {
+#if FOOTERS
+    mchunkptr p  = mem2chunk(oldmem);
+    mstate ms = get_mstate_for(p);
+#else /* FOOTERS */
+    mstate ms = (mstate)msp;
+#endif /* FOOTERS */
+    if (!ok_magic(ms)) {
+      USAGE_ERROR_ACTION(ms,ms);
+      return 0;
+    }
+    return internal_realloc(ms, oldmem, bytes);
+  }
+}
+
+void* mspace_memalign(mspace msp, size_t alignment, size_t bytes) {
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  return internal_memalign(ms, alignment, bytes);
+}
+
+void** mspace_independent_calloc(mspace msp, size_t n_elements,
+                                 size_t elem_size, void* chunks[]) {
+  size_t sz = elem_size; /* serves as 1-element array */
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  return ialloc(ms, n_elements, &sz, 3, chunks);
+}
+
+void** mspace_independent_comalloc(mspace msp, size_t n_elements,
+                                   size_t sizes[], void* chunks[]) {
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  return ialloc(ms, n_elements, sizes, 0, chunks);
+}
+
+int mspace_trim(mspace msp, size_t pad) {
+  int result = 0;
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms)) {
+    if (!PREACTION(ms)) {
+      result = sys_trim(ms, pad);
+      POSTACTION(ms);
+    }
+  }
+  else {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+  return result;
+}
+
+void mspace_malloc_stats(mspace msp) {
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms)) {
+    internal_malloc_stats(ms);
+  }
+  else {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+}
+
+size_t mspace_footprint(mspace msp) {
+  size_t result = 0;
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms)) {
+    result = ms->footprint;
+  }
+  else {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+  return result;
+}
+
+
+size_t mspace_max_footprint(mspace msp) {
+  size_t result = 0;
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms)) {
+    result = ms->max_footprint;
+  }
+  else {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+  return result;
+}
+
+
+#if !NO_MALLINFO
+struct mallinfo mspace_mallinfo(mspace msp) {
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+  return internal_mallinfo(ms);
+}
+#endif /* NO_MALLINFO */
+
+size_t mspace_usable_size(void* mem) {
+  if (mem != 0) {
+    mchunkptr p = mem2chunk(mem);
+    if (is_inuse(p))
+      return chunksize(p) - overhead_for(p);
+  }
+  return 0;
+}
+
+int mspace_mallopt(int param_number, int value) {
+  return change_mparam(param_number, value);
+}
+
+#endif /* MSPACES */
+
+
+/* -------------------- Alternative MORECORE functions ------------------- */
+
+/*
+  Guidelines for creating a custom version of MORECORE:
+
+  * For best performance, MORECORE should allocate in multiples of pagesize.
+  * MORECORE may allocate more memory than requested. (Or even less,
+      but this will usually result in a malloc failure.)
+  * MORECORE must not allocate memory when given argument zero, but
+      instead return one past the end address of memory from previous
+      nonzero call.
+  * For best performance, consecutive calls to MORECORE with positive
+      arguments should return increasing addresses, indicating that
+      space has been contiguously extended.
+  * Even though consecutive calls to MORECORE need not return contiguous
+      addresses, it must be OK for malloc'ed chunks to span multiple
+      regions in those cases where they do happen to be contiguous.
+  * MORECORE need not handle negative arguments -- it may instead
+      just return MFAIL when given negative arguments.
+      Negative arguments are always multiples of pagesize. MORECORE
+      must not misinterpret negative args as large positive unsigned
+      args. You can suppress all such calls from even occurring by defining
+      MORECORE_CANNOT_TRIM,
+
+  As an example alternative MORECORE, here is a custom allocator
+  kindly contributed for pre-OSX macOS.  It uses virtually but not
+  necessarily physically contiguous non-paged memory (locked in,
+  present and won't get swapped out).  You can use it by uncommenting
+  this section, adding some #includes, and setting up the appropriate
+  defines above:
+
+      #define MORECORE osMoreCore
+
+  There is also a shutdown routine that should somehow be called for
+  cleanup upon program exit.
+
+  #define MAX_POOL_ENTRIES 100
+  #define MINIMUM_MORECORE_SIZE  (64 * 1024U)
+  static int next_os_pool;
+  void *our_os_pools[MAX_POOL_ENTRIES];
+
+  void *osMoreCore(int size)
+  {
+    void *ptr = 0;
+    static void *sbrk_top = 0;
+
+    if (size > 0)
+    {
+      if (size < MINIMUM_MORECORE_SIZE)
+         size = MINIMUM_MORECORE_SIZE;
+      if (CurrentExecutionLevel() == kTaskLevel)
+         ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0);
+      if (ptr == 0)
+      {
+        return (void *) MFAIL;
+      }
+      // save ptrs so they can be freed during cleanup
+      our_os_pools[next_os_pool] = ptr;
+      next_os_pool++;
+      ptr = (void *) ((((size_t) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK);
+      sbrk_top = (char *) ptr + size;
+      return ptr;
+    }
+    else if (size < 0)
+    {
+      // we don't currently support shrink behavior
+      return (void *) MFAIL;
+    }
+    else
+    {
+      return sbrk_top;
+    }
+  }
+
+  // cleanup any allocated memory pools
+  // called as last thing before shutting down driver
+
+  void osCleanupMem(void)
+  {
+    void **ptr;
+
+    for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++)
+      if (*ptr)
+      {
+         PoolDeallocate(*ptr);
+         *ptr = 0;
+      }
+  }
+
+*/
+
+
+/* -----------------------------------------------------------------------
+History:
+    V2.8.4 Wed May 27 09:56:23 2009  Doug Lea  (dl at gee)
+      * Use zeros instead of prev foot for is_mmapped
+      * Add mspace_track_large_chunks; thanks to Jean Brouwers
+      * Fix set_inuse in internal_realloc; thanks to Jean Brouwers
+      * Fix insufficient sys_alloc padding when using 16byte alignment
+      * Fix bad error check in mspace_footprint
+      * Adaptations for ptmalloc; thanks to Wolfram Gloger.
+      * Reentrant spin locks; thanks to Earl Chew and others
+      * Win32 improvements; thanks to Niall Douglas and Earl Chew
+      * Add NO_SEGMENT_TRAVERSAL and MAX_RELEASE_CHECK_RATE options
+      * Extension hook in malloc_state
+      * Various small adjustments to reduce warnings on some compilers
+      * Various configuration extensions/changes for more platforms. Thanks
+         to all who contributed these.
+
+    V2.8.3 Thu Sep 22 11:16:32 2005  Doug Lea  (dl at gee)
+      * Add max_footprint functions
+      * Ensure all appropriate literals are size_t
+      * Fix conditional compilation problem for some #define settings
+      * Avoid concatenating segments with the one provided
+        in create_mspace_with_base
+      * Rename some variables to avoid compiler shadowing warnings
+      * Use explicit lock initialization.
+      * Better handling of sbrk interference.
+      * Simplify and fix segment insertion, trimming and mspace_destroy
+      * Reinstate REALLOC_ZERO_BYTES_FREES option from 2.7.x
+      * Thanks especially to Dennis Flanagan for help on these.
+
+    V2.8.2 Sun Jun 12 16:01:10 2005  Doug Lea  (dl at gee)
+      * Fix memalign brace error.
+
+    V2.8.1 Wed Jun  8 16:11:46 2005  Doug Lea  (dl at gee)
+      * Fix improper #endif nesting in C++
+      * Add explicit casts needed for C++
+
+    V2.8.0 Mon May 30 14:09:02 2005  Doug Lea  (dl at gee)
+      * Use trees for large bins
+      * Support mspaces
+      * Use segments to unify sbrk-based and mmap-based system allocation,
+        removing need for emulation on most platforms without sbrk.
+      * Default safety checks
+      * Optional footer checks. Thanks to William Robertson for the idea.
+      * Internal code refactoring
+      * Incorporate suggestions and platform-specific changes.
+        Thanks to Dennis Flanagan, Colin Plumb, Niall Douglas,
+        Aaron Bachmann,  Emery Berger, and others.
+      * Speed up non-fastbin processing enough to remove fastbins.
+      * Remove useless cfree() to avoid conflicts with other apps.
+      * Remove internal memcpy, memset. Compilers handle builtins better.
+      * Remove some options that no one ever used and rename others.
+
+    V2.7.2 Sat Aug 17 09:07:30 2002  Doug Lea  (dl at gee)
+      * Fix malloc_state bitmap array misdeclaration
+
+    V2.7.1 Thu Jul 25 10:58:03 2002  Doug Lea  (dl at gee)
+      * Allow tuning of FIRST_SORTED_BIN_SIZE
+      * Use PTR_UINT as type for all ptr->int casts. Thanks to John Belmonte.
+      * Better detection and support for non-contiguousness of MORECORE.
+        Thanks to Andreas Mueller, Conal Walsh, and Wolfram Gloger
+      * Bypass most of malloc if no frees. Thanks To Emery Berger.
+      * Fix freeing of old top non-contiguous chunk im sysmalloc.
+      * Raised default trim and map thresholds to 256K.
+      * Fix mmap-related #defines. Thanks to Lubos Lunak.
+      * Fix copy macros; added LACKS_FCNTL_H. Thanks to Neal Walfield.
+      * Branch-free bin calculation
+      * Default trim and mmap thresholds now 256K.
+
+    V2.7.0 Sun Mar 11 14:14:06 2001  Doug Lea  (dl at gee)
+      * Introduce independent_comalloc and independent_calloc.
+        Thanks to Michael Pachos for motivation and help.
+      * Make optional .h file available
+      * Allow > 2GB requests on 32bit systems.
+      * new WIN32 sbrk, mmap, munmap, lock code from <[email protected]>.
+        Thanks also to Andreas Mueller <a.mueller at paradatec.de>,
+        and Anonymous.
+      * Allow override of MALLOC_ALIGNMENT (Thanks to Ruud Waij for
+        helping test this.)
+      * memalign: check alignment arg
+      * realloc: don't try to shift chunks backwards, since this
+        leads to  more fragmentation in some programs and doesn't
+        seem to help in any others.
+      * Collect all cases in malloc requiring system memory into sysmalloc
+      * Use mmap as backup to sbrk
+      * Place all internal state in malloc_state
+      * Introduce fastbins (although similar to 2.5.1)
+      * Many minor tunings and cosmetic improvements
+      * Introduce USE_PUBLIC_MALLOC_WRAPPERS, USE_MALLOC_LOCK
+      * Introduce MALLOC_FAILURE_ACTION, MORECORE_CONTIGUOUS
+        Thanks to Tony E. Bennett <[email protected]> and others.
+      * Include errno.h to support default failure action.
+
+    V2.6.6 Sun Dec  5 07:42:19 1999  Doug Lea  (dl at gee)
+      * return null for negative arguments
+      * Added Several WIN32 cleanups from Martin C. Fong <mcfong at yahoo.com>
+         * Add 'LACKS_SYS_PARAM_H' for those systems without 'sys/param.h'
+          (e.g. WIN32 platforms)
+         * Cleanup header file inclusion for WIN32 platforms
+         * Cleanup code to avoid Microsoft Visual C++ compiler complaints
+         * Add 'USE_DL_PREFIX' to quickly allow co-existence with existing
+           memory allocation routines
+         * Set 'malloc_getpagesize' for WIN32 platforms (needs more work)
+         * Use 'assert' rather than 'ASSERT' in WIN32 code to conform to
+           usage of 'assert' in non-WIN32 code
+         * Improve WIN32 'sbrk()' emulation's 'findRegion()' routine to
+           avoid infinite loop
+      * Always call 'fREe()' rather than 'free()'
+
+    V2.6.5 Wed Jun 17 15:57:31 1998  Doug Lea  (dl at gee)
+      * Fixed ordering problem with boundary-stamping
+
+    V2.6.3 Sun May 19 08:17:58 1996  Doug Lea  (dl at gee)
+      * Added pvalloc, as recommended by H.J. Liu
+      * Added 64bit pointer support mainly from Wolfram Gloger
+      * Added anonymously donated WIN32 sbrk emulation
+      * Malloc, calloc, getpagesize: add optimizations from Raymond Nijssen
+      * malloc_extend_top: fix mask error that caused wastage after
+        foreign sbrks
+      * Add linux mremap support code from HJ Liu
+
+    V2.6.2 Tue Dec  5 06:52:55 1995  Doug Lea  (dl at gee)
+      * Integrated most documentation with the code.
+      * Add support for mmap, with help from
+        Wolfram Gloger ([email protected]).
+      * Use last_remainder in more cases.
+      * Pack bins using idea from  [email protected]
+      * Use ordered bins instead of best-fit threshhold
+      * Eliminate block-local decls to simplify tracing and debugging.
+      * Support another case of realloc via move into top
+      * Fix error occuring when initial sbrk_base not word-aligned.
+      * Rely on page size for units instead of SBRK_UNIT to
+        avoid surprises about sbrk alignment conventions.
+      * Add mallinfo, mallopt. Thanks to Raymond Nijssen
+        ([email protected]) for the suggestion.
+      * Add `pad' argument to malloc_trim and top_pad mallopt parameter.
+      * More precautions for cases where other routines call sbrk,
+        courtesy of Wolfram Gloger ([email protected]).
+      * Added macros etc., allowing use in linux libc from
+        H.J. Lu ([email protected])
+      * Inverted this history list
+
+    V2.6.1 Sat Dec  2 14:10:57 1995  Doug Lea  (dl at gee)
+      * Re-tuned and fixed to behave more nicely with V2.6.0 changes.
+      * Removed all preallocation code since under current scheme
+        the work required to undo bad preallocations exceeds
+        the work saved in good cases for most test programs.
+      * No longer use return list or unconsolidated bins since
+        no scheme using them consistently outperforms those that don't
+        given above changes.
+      * Use best fit for very large chunks to prevent some worst-cases.
+      * Added some support for debugging
+
+    V2.6.0 Sat Nov  4 07:05:23 1995  Doug Lea  (dl at gee)
+      * Removed footers when chunks are in use. Thanks to
+        Paul Wilson ([email protected]) for the suggestion.
+
+    V2.5.4 Wed Nov  1 07:54:51 1995  Doug Lea  (dl at gee)
+      * Added malloc_trim, with help from Wolfram Gloger
+        ([email protected]).
+
+    V2.5.3 Tue Apr 26 10:16:01 1994  Doug Lea  (dl at g)
+
+    V2.5.2 Tue Apr  5 16:20:40 1994  Doug Lea  (dl at g)
+      * realloc: try to expand in both directions
+      * malloc: swap order of clean-bin strategy;
+      * realloc: only conditionally expand backwards
+      * Try not to scavenge used bins
+      * Use bin counts as a guide to preallocation
+      * Occasionally bin return list chunks in first scan
+      * Add a few optimizations from [email protected]
+
+    V2.5.1 Sat Aug 14 15:40:43 1993  Doug Lea  (dl at g)
+      * faster bin computation & slightly different binning
+      * merged all consolidations to one part of malloc proper
+         (eliminating old malloc_find_space & malloc_clean_bin)
+      * Scan 2 returns chunks (not just 1)
+      * Propagate failure in realloc if malloc returns 0
+      * Add stuff to allow compilation on non-ANSI compilers
+          from [email protected]
+
+    V2.5 Sat Aug  7 07:41:59 1993  Doug Lea  (dl at g.oswego.edu)
+      * removed potential for odd address access in prev_chunk
+      * removed dependency on getpagesize.h
+      * misc cosmetics and a bit more internal documentation
+      * anticosmetics: mangled names in macros to evade debugger strangeness
+      * tested on sparc, hp-700, dec-mips, rs6000
+          with gcc & native cc (hp, dec only) allowing
+          Detlefs & Zorn comparison study (in SIGPLAN Notices.)
+
+    Trial version Fri Aug 28 13:14:29 1992  Doug Lea  (dl at g.oswego.edu)
+      * Based loosely on libg++-1.2X malloc. (It retains some of the overall
+         structure of old version,  but most details differ.)
+
+*/
+
+#endif

+ 1467 - 1467
drivers/nedmalloc/nedmalloc.cpp

@@ -1,1467 +1,1467 @@
-#ifdef NEDMALLOC_ENABLED
-/* Alternative malloc implementation for multiple threads without
-lock contention based on dlmalloc. (C) 2005-2009 Niall Douglas
-
-Boost Software License - Version 1.0 - August 17th, 2003
-
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-#ifdef _MSC_VER
-/* Enable full aliasing on MSVC */
-/*#pragma optimize("a", on)*/
-#pragma warning(push)
-#pragma warning(disable:4100)	/* unreferenced formal parameter */
-#pragma warning(disable:4127)	/* conditional expression is constant */
-#pragma warning(disable:4706)	/* assignment within conditional expression */
-#endif
-
-/*#define ENABLE_TOLERANT_NEDMALLOC 1*/
-/*#define ENABLE_FAST_HEAP_DETECTION 1*/
-/*#define NEDMALLOC_DEBUG 1*/
-
-/*#define FULLSANITYCHECKS*/
-/* If link time code generation is on, don't force or prevent inlining */
-#if defined(_MSC_VER) && defined(NEDMALLOC_DLL_EXPORTS)
-#define FORCEINLINE
-#define NOINLINE
-#endif
-
-
-#include "nedmalloc.h"
-#ifdef WIN32
- #include <malloc.h>
- #include <stddef.h>
-#endif
-#if USE_ALLOCATOR==1
- #define MSPACES 1
- #define ONLY_MSPACES 1
-#endif
-#define USE_DL_PREFIX 1
-#ifndef USE_LOCKS
- #define USE_LOCKS 1
-#endif
-#define FOOTERS 1           /* Need to enable footers so frees lock the right mspace */
-#ifndef NEDMALLOC_DEBUG
- #if defined(DEBUG) || defined(_DEBUG)
-  #define NEDMALLOC_DEBUG 1
- #else
-  #define NEDMALLOC_DEBUG 0
- #endif
-#endif
-/* We need to consistently define DEBUG=0|1, _DEBUG and NDEBUG for dlmalloc */
-#undef DEBUG
-#undef _DEBUG
-#if NEDMALLOC_DEBUG
- #define _DEBUG
- #define DEBUG 1
-#else
- #define DEBUG 0
-#endif
-#ifdef NDEBUG               /* Disable assert checking on release builds */
- #undef DEBUG
- #undef _DEBUG
-#endif
-/* The default of 64Kb means we spend too much time kernel-side */
-#ifndef DEFAULT_GRANULARITY
-#define DEFAULT_GRANULARITY (1*1024*1024)
-#if DEBUG
-#define DEFAULT_GRANULARITY_ALIGNED
-#endif
-#endif
-/*#define USE_SPIN_LOCKS 0*/
-
-
-#include "malloc.c.h"
-#ifdef NDEBUG               /* Disable assert checking on release builds */
- #undef DEBUG
-#elif !NEDMALLOC_DEBUG
- #ifdef __GNUC__
-  #warning DEBUG is defined so allocator will run with assert checking! Define NDEBUG to run at full speed.
- #elif defined(_MSC_VER)
-  #pragma message(__FILE__ ": WARNING: DEBUG is defined so allocator will run with assert checking! Define NDEBUG to run at full speed.")
- #endif
-#endif
-
-/* The maximum concurrent threads in a pool possible */
-#ifndef MAXTHREADSINPOOL
-#define MAXTHREADSINPOOL 16
-#endif
-/* The maximum number of threadcaches which can be allocated */
-#ifndef THREADCACHEMAXCACHES
-#define THREADCACHEMAXCACHES 256
-#endif
-/* The maximum size to be allocated from the thread cache */
-#ifndef THREADCACHEMAX
-#define THREADCACHEMAX 8192
-#endif
-#if 0
-/* The number of cache entries for finer grained bins. This is (topbitpos(THREADCACHEMAX)-4)*2 */
-#define THREADCACHEMAXBINS ((13-4)*2)
-#else
-/* The number of cache entries. This is (topbitpos(THREADCACHEMAX)-4) */
-#define THREADCACHEMAXBINS (13-4)
-#endif
-/* Point at which the free space in a thread cache is garbage collected */
-#ifndef THREADCACHEMAXFREESPACE
-#define THREADCACHEMAXFREESPACE (512*1024)
-#endif
-
-
-#ifdef WIN32
- #define TLSVAR			DWORD
- #define TLSALLOC(k)	(*(k)=TlsAlloc(), TLS_OUT_OF_INDEXES==*(k))
- #define TLSFREE(k)		(!TlsFree(k))
- #define TLSGET(k)		TlsGetValue(k)
- #define TLSSET(k, a)	(!TlsSetValue(k, a))
- #ifdef DEBUG
-static LPVOID ChkedTlsGetValue(DWORD idx)
-{
-	LPVOID ret=TlsGetValue(idx);
-	assert(S_OK==GetLastError());
-	return ret;
-}
-  #undef TLSGET
-  #define TLSGET(k) ChkedTlsGetValue(k)
- #endif
-#else
- #define TLSVAR			pthread_key_t
- #define TLSALLOC(k)	pthread_key_create(k, 0)
- #define TLSFREE(k)		pthread_key_delete(k)
- #define TLSGET(k)		pthread_getspecific(k)
- #define TLSSET(k, a)	pthread_setspecific(k, a)
-#endif
-
-#if defined(__cplusplus)
-#if !defined(NO_NED_NAMESPACE)
-namespace nedalloc {
-#else
-extern "C" {
-#endif
-#endif
-
-#if USE_ALLOCATOR==0
-static void *unsupported_operation(const char *opname) THROWSPEC
-{
-	fprintf(stderr, "nedmalloc: The operation %s is not supported under this build configuration\n", opname);
-	abort();
-	return 0;
-}
-static size_t mspacecounter=(size_t) 0xdeadbeef;
-#endif
-#ifndef ENABLE_FAST_HEAP_DETECTION
-static void *RESTRICT leastusedaddress;
-static size_t largestusedblock;
-#endif
-
-static FORCEINLINE void *CallMalloc(void *RESTRICT mspace, size_t size, size_t alignment) THROWSPEC
-{
-	void *RESTRICT ret=0;
-	size_t _alignment=alignment;
-#if USE_MAGIC_HEADERS
-	size_t *_ret=0;
-	size+=alignment+3*sizeof(size_t);
-	_alignment=0;
-#endif
-#if USE_ALLOCATOR==0
-	ret=_alignment ? 
-#ifdef _MSC_VER
-	/* This is the MSVCRT equivalent */
-		_aligned_malloc(size, _alignment)
-#elif defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)
-	/* This is the glibc/ptmalloc2/dlmalloc/BSD libc equivalent.  */
-		memalign(_alignment, size)
-#else
-#error Cannot aligned allocate with the memory allocator of an unknown system!
-#endif
-		: malloc(size);
-#elif USE_ALLOCATOR==1
-	ret=_alignment ? mspace_memalign((mstate) mspace, _alignment, size) : mspace_malloc((mstate) mspace, size);
-#ifndef ENABLE_FAST_HEAP_DETECTION
-	if(ret)
-	{
-		size_t truesize=chunksize(mem2chunk(ret));
-		if(!leastusedaddress || (void *)((mstate) mspace)->least_addr<leastusedaddress) leastusedaddress=(void *)((mstate) mspace)->least_addr;
-		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
-	}
-#endif
-#endif
-	if(!ret) return 0;
-#if USE_MAGIC_HEADERS
-	_ret=(size_t *) ret;
-	ret=(void *)(_ret+3);
-	if(alignment) ret=(void *)(((size_t) ret+alignment-1)&~(alignment-1));
-	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *)"NEDMALOC";
-	_ret[0]=(size_t) mspace;
-	_ret[1]=size-3*sizeof(size_t);
-#endif
-	return ret;
-}
-
-static FORCEINLINE void *CallCalloc(void *RESTRICT mspace, size_t size, size_t alignment) THROWSPEC
-{
-	void *RESTRICT ret=0;
-#if USE_MAGIC_HEADERS
-	size_t *_ret=0;
-	size+=alignment+3*sizeof(size_t);
-#endif
-#if USE_ALLOCATOR==0
-	ret=calloc(1, size);
-#elif USE_ALLOCATOR==1
-	ret=mspace_calloc((mstate) mspace, 1, size);
-#ifndef ENABLE_FAST_HEAP_DETECTION
-	if(ret)
-	{
-		size_t truesize=chunksize(mem2chunk(ret));
-		if(!leastusedaddress || (void *)((mstate) mspace)->least_addr<leastusedaddress) leastusedaddress=(void *)((mstate) mspace)->least_addr;
-		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
-	}
-#endif
-#endif
-	if(!ret) return 0;
-#if USE_MAGIC_HEADERS
-	_ret=(size_t *) ret;
-	ret=(void *)(_ret+3);
-	if(alignment) ret=(void *)(((size_t) ret+alignment-1)&~(alignment-1));
-	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *) "NEDMALOC";
-	_ret[0]=(size_t) mspace;
-	_ret[1]=size-3*sizeof(size_t);
-#endif
-	return ret;
-}
-
-static FORCEINLINE void *CallRealloc(void *RESTRICT mspace, void *RESTRICT mem, int isforeign, size_t oldsize, size_t newsize) THROWSPEC
-{
-	void *RESTRICT ret=0;
-#if USE_MAGIC_HEADERS
-	mstate oldmspace=0;
-	size_t *_ret=0, *_mem=(size_t *) mem-3;
-#endif
-	if(isforeign)
-	{	/* Transfer */
-#if USE_MAGIC_HEADERS
-		assert(_mem[0]!=*(size_t *) "NEDMALOC");
-#endif
-		if((ret=CallMalloc(mspace, newsize, 0)))
-		{
-#if defined(DEBUG)
-			printf("*** nedmalloc frees system allocated block %p\n", mem);
-#endif
-			memcpy(ret, mem, oldsize<newsize ? oldsize : newsize);
-			free(mem);
-		}
-		return ret;
-	}
-#if USE_MAGIC_HEADERS
-	assert(_mem[0]==*(size_t *) "NEDMALOC");
-	newsize+=3*sizeof(size_t);
-	oldmspace=(mstate) _mem[1];
-	assert(oldsize>=_mem[2]);
-	for(; *_mem==*(size_t *) "NEDMALOC"; *_mem--=*(size_t *) "nedmaloc");
-	mem=(void *)(++_mem);
-#endif
-#if USE_ALLOCATOR==0
-	ret=realloc(mem, newsize);
-#elif USE_ALLOCATOR==1
-	ret=mspace_realloc((mstate) mspace, mem, newsize);
-#ifndef ENABLE_FAST_HEAP_DETECTION
-	if(ret)
-	{
-		size_t truesize=chunksize(mem2chunk(ret));
-		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
-	}
-#endif
-#endif
-	if(!ret)
-	{	/* Put it back the way it was */
-#if USE_MAGIC_HEADERS
-		for(; *_mem==0; *_mem++=*(size_t *) "NEDMALOC");
-#endif
-		return 0;
-	}
-#if USE_MAGIC_HEADERS
-	_ret=(size_t *) ret;
-	ret=(void *)(_ret+3);
-	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *) "NEDMALOC";
-	_ret[0]=(size_t) mspace;
-	_ret[1]=newsize-3*sizeof(size_t);
-#endif
-	return ret;
-}
-
-static FORCEINLINE void CallFree(void *RESTRICT mspace, void *RESTRICT mem, int isforeign) THROWSPEC
-{
-#if USE_MAGIC_HEADERS
-	mstate oldmspace=0;
-	size_t *_mem=(size_t *) mem-3, oldsize=0;
-#endif
-	if(isforeign)
-	{
-#if USE_MAGIC_HEADERS
-		assert(_mem[0]!=*(size_t *) "NEDMALOC");
-#endif
-#if defined(DEBUG)
-		printf("*** nedmalloc frees system allocated block %p\n", mem);
-#endif
-		free(mem);
-		return;
-	}
-#if USE_MAGIC_HEADERS
-	assert(_mem[0]==*(size_t *) "NEDMALOC");
-	oldmspace=(mstate) _mem[1];
-	oldsize=_mem[2];
-	for(; *_mem==*(size_t *) "NEDMALOC"; *_mem--=*(size_t *) "nedmaloc");
-	mem=(void *)(++_mem);
-#endif
-#if USE_ALLOCATOR==0
-	free(mem);
-#elif USE_ALLOCATOR==1
-	mspace_free((mstate) mspace, mem);
-#endif
-}
-
-static NEDMALLOCNOALIASATTR mstate nedblkmstate(void *RESTRICT mem) THROWSPEC
-{
-	if(mem)
-	{
-#if USE_MAGIC_HEADERS
-		size_t *_mem=(size_t *) mem-3;
-		if(_mem[0]==*(size_t *) "NEDMALOC")
-		{
-			return (mstate) _mem[1];
-		}
-		else return 0;
-#else
-#if USE_ALLOCATOR==0
-		/* Fail everything */
-		return 0;
-#elif USE_ALLOCATOR==1
-#ifdef ENABLE_FAST_HEAP_DETECTION
-#ifdef WIN32
-		/*  On Windows for RELEASE both x86 and x64 the NT heap precedes each block with an eight byte header
-			which looks like:
-				normal: 4 bytes of size, 4 bytes of [char < 64, char < 64, char < 64 bit 0 always set, char random ]
-				mmaped: 4 bytes of size  4 bytes of [zero,      zero,      0xb,                        zero        ]
-
-			On Windows for DEBUG both x86 and x64 the preceding four bytes is always 0xfdfdfdfd (no man's land).
-		*/
-#pragma pack(push, 1)
-		struct _HEAP_ENTRY
-		{
-			USHORT Size;
-			USHORT PreviousSize;
-			UCHAR Cookie;			/* SegmentIndex */
-			UCHAR Flags;			/* always bit 0 (HEAP_ENTRY_BUSY). bit 1=(HEAP_ENTRY_EXTRA_PRESENT), bit 2=normal block (HEAP_ENTRY_FILL_PATTERN), bit 3=mmap block (HEAP_ENTRY_VIRTUAL_ALLOC). Bit 4 (HEAP_ENTRY_LAST_ENTRY) could be set */
-			UCHAR UnusedBytes;
-			UCHAR SmallTagIndex;	/* fastbin index. Always one of 0x02, 0x03, 0x04 < 0x80 */
-		} *RESTRICT he=((struct _HEAP_ENTRY *) mem)-1;
-#pragma pack(pop)
-		unsigned int header=((unsigned int *)mem)[-1], mask1=0x8080E100, result1, mask2=0xFFFFFF06, result2;
-		result1=header & mask1;	/* Positive testing for NT heap */
-		result2=header & mask2;	/* Positive testing for dlmalloc */
-		if(result1==0x00000100 && result2!=0x00000102)
-		{	/* This is likely a NT heap block */
-			return 0;
-		}
-#endif
-#ifdef __linux__
-		/* On Linux glibc uses ptmalloc2 (really dlmalloc) just as we do, but prev_foot contains rubbish
-		when the preceding block is allocated because ptmalloc2 finds the local mstate by rounding the ptr
-		down to the nearest megabyte. It's like dlmalloc with FOOTERS disabled. */
-		mchunkptr p=mem2chunk(mem);
-		mstate fm=get_mstate_for(p);
-		/* If it's a ptmalloc2 block, fm is likely to be some crazy value */
-		if(!is_aligned(fm)) return 0;
-		if((size_t)mem-(size_t)fm>=(size_t)1<<(SIZE_T_BITSIZE-1)) return 0;
-		if(ok_magic(fm))
-			return fm;
-		else
-			return 0;
-		if(1) { }
-#endif
-		else
-		{
-			mchunkptr p=mem2chunk(mem);
-			mstate fm=get_mstate_for(p);
-			assert(ok_magic(fm));	/* If this fails, someone tried to free a block twice */
-			if(ok_magic(fm))
-				return fm;
-		}
-#else
-//#ifdef WIN32
-//		__try
-//#endif
-		{
-			/* We try to return zero here if it isn't one of our own blocks, however
-			the current block annotation scheme used by dlmalloc makes it impossible
-			to be absolutely sure of avoiding a segfault.
-
-			mchunkptr->prev_foot = mem-(2*size_t) = mstate ^ mparams.magic for PRECEDING block;
-			mchunkptr->head      = mem-(1*size_t) = 8 multiple size of this block with bottom three bits = FLAG_BITS
-			    FLAG_BITS = bit 0 is CINUSE (currently in use unless is mmap), bit 1 is PINUSE (previous block currently
-				            in use unless mmap), bit 2 is UNUSED and currently is always zero.
-			*/
-			register void *RESTRICT leastusedaddress_=leastusedaddress;		/* Cache these to avoid register reloading */
-			register size_t largestusedblock_=largestusedblock;
-			if(!is_aligned(mem)) return 0;		/* Would fail very rarely as all allocators return aligned blocks */
-			if(mem<leastusedaddress_) return 0;	/* Simple but effective */
-			{
-				mchunkptr p=mem2chunk(mem);
-				mstate fm=0;
-				int ismmapped=is_mmapped(p);
-				if((!ismmapped && !is_inuse(p)) || (p->head & FLAG4_BIT)) return 0;
-				/* Reduced uncertainty by 0.5^2 = 25.0% */
-				/* size should never exceed largestusedblock */
-				if(chunksize(p)>largestusedblock_) return 0;
-				/* Reduced uncertainty by a minimum of 0.5^3 = 12.5%, maximum 0.5^16 = 0.0015% */
-				/* Having sanity checked prev_foot and head, check next block */
-				if(!ismmapped && (!next_pinuse(p) || (next_chunk(p)->head & FLAG4_BIT))) return 0;
-				/* Reduced uncertainty by 0.5^5 = 3.13% or 0.5^18 = 0.00038% */
-	#if 0
-				/* If previous block is free, check that its next block pointer equals us */
-				if(!ismmapped && !pinuse(p))
-					if(next_chunk(prev_chunk(p))!=p) return 0;
-				/* We could start comparing prev_foot's for similarity but it starts getting slow. */
-	#endif
-				fm = get_mstate_for(p);
-				if(!is_aligned(fm) || (void *)fm<leastusedaddress_) return 0;
-				if((size_t)mem-(size_t)fm>=(size_t)1<<(SIZE_T_BITSIZE-1)) return 0;
-				assert(ok_magic(fm));	/* If this fails, someone tried to free a block twice */
-				if(ok_magic(fm))
-					return fm;
-			}
-		}
-//#ifdef WIN32
-//		__except(1) { }
-//#endif
-#endif
-#endif
-#endif
-	}
-	return 0;
-}
-NEDMALLOCNOALIASATTR size_t nedblksize(int *RESTRICT isforeign, void *RESTRICT mem) THROWSPEC
-{
-	if(mem)
-	{
-		if(isforeign) *isforeign=1;
-#if USE_MAGIC_HEADERS
-		{
-			size_t *_mem=(size_t *) mem-3;
-			if(_mem[0]==*(size_t *) "NEDMALOC")
-			{
-				mstate mspace=(mstate) _mem[1];
-				size_t size=_mem[2];
-				if(isforeign) *isforeign=0;
-				return size;
-			}
-		}
-#elif USE_ALLOCATOR==1
-		if(nedblkmstate(mem))
-		{
-			mchunkptr p=mem2chunk(mem);
-			if(isforeign) *isforeign=0;
-			return chunksize(p)-overhead_for(p);
-		}
-#ifdef DEBUG
-		else
-		{
-			int a=1; /* Set breakpoints here if needed */
-		}
-#endif
-#endif
-#if defined(ENABLE_TOLERANT_NEDMALLOC) || USE_ALLOCATOR==0
-#ifdef _MSC_VER
-		/* This is the MSVCRT equivalent */
-		return _msize(mem);
-#elif defined(__linux__)
-		/* This is the glibc/ptmalloc2/dlmalloc equivalent.  */
-		return malloc_usable_size(mem);
-#elif defined(__FreeBSD__) || defined(__APPLE__)
-		/* This is the BSD libc equivalent.  */
-		return malloc_size(mem);
-#else
-#error Cannot tolerate the memory allocator of an unknown system!
-#endif
-#endif
-	}
-	return 0;
-}
-
-NEDMALLOCNOALIASATTR void nedsetvalue(void *v) THROWSPEC											{ nedpsetvalue((nedpool *) 0, v); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmalloc(size_t size) THROWSPEC						{ return nedpmalloc((nedpool *) 0, size); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedcalloc(size_t no, size_t size) THROWSPEC			{ return nedpcalloc((nedpool *) 0, no, size); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedrealloc(void *mem, size_t size) THROWSPEC			{ return nedprealloc((nedpool *) 0, mem, size); }
-NEDMALLOCNOALIASATTR void   nedfree(void *mem) THROWSPEC											{ nedpfree((nedpool *) 0, mem); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmemalign(size_t alignment, size_t bytes) THROWSPEC	{ return nedpmemalign((nedpool *) 0, alignment, bytes); }
-NEDMALLOCNOALIASATTR struct nedmallinfo nedmallinfo(void) THROWSPEC									{ return nedpmallinfo((nedpool *) 0); }
-NEDMALLOCNOALIASATTR int    nedmallopt(int parno, int value) THROWSPEC								{ return nedpmallopt((nedpool *) 0, parno, value); }
-NEDMALLOCNOALIASATTR int    nedmalloc_trim(size_t pad) THROWSPEC									{ return nedpmalloc_trim((nedpool *) 0, pad); }
-void   nedmalloc_stats() THROWSPEC																	{ nedpmalloc_stats((nedpool *) 0); }
-NEDMALLOCNOALIASATTR size_t nedmalloc_footprint() THROWSPEC											{ return nedpmalloc_footprint((nedpool *) 0); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_calloc(size_t elemsno, size_t elemsize, void **chunks) THROWSPEC	{ return nedpindependent_calloc((nedpool *) 0, elemsno, elemsize, chunks); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_comalloc(size_t elems, size_t *sizes, void **chunks) THROWSPEC		{ return nedpindependent_comalloc((nedpool *) 0, elems, sizes, chunks); }
-
-struct threadcacheblk_t;
-typedef struct threadcacheblk_t threadcacheblk;
-struct threadcacheblk_t
-{	/* Keep less than 16 bytes on 32 bit systems and 32 bytes on 64 bit systems */
-#ifdef FULLSANITYCHECKS
-	unsigned int magic;
-#endif
-	unsigned int lastUsed, size;
-	threadcacheblk *next, *prev;
-};
-typedef struct threadcache_t
-{
-#ifdef FULLSANITYCHECKS
-	unsigned int magic1;
-#endif
-	int mymspace;						/* Last mspace entry this thread used */
-	long threadid;
-	unsigned int mallocs, frees, successes;
-	size_t freeInCache;					/* How much free space is stored in this cache */
-	threadcacheblk *bins[(THREADCACHEMAXBINS+1)*2];
-#ifdef FULLSANITYCHECKS
-	unsigned int magic2;
-#endif
-} threadcache;
-struct nedpool_t
-{
-	MLOCK_T mutex;
-	void *uservalue;
-	int threads;						/* Max entries in m to use */
-	threadcache *caches[THREADCACHEMAXCACHES];
-	TLSVAR mycache;						/* Thread cache for this thread. 0 for unset, negative for use mspace-1 directly, otherwise is cache-1 */
-	mstate m[MAXTHREADSINPOOL+1];		/* mspace entries for this pool */
-};
-static nedpool syspool;
-
-static FORCEINLINE NEDMALLOCNOALIASATTR unsigned int size2binidx(size_t _size) THROWSPEC
-{	/* 8=1000	16=10000	20=10100	24=11000	32=100000	48=110000	4096=1000000000000 */
-	unsigned int topbit, size=(unsigned int)(_size>>4);
-	/* 16=1		20=1	24=1	32=10	48=11	64=100	96=110	128=1000	4096=100000000 */
-
-#if defined(__GNUC__)
-        topbit = sizeof(size)*__CHAR_BIT__ - 1 - __builtin_clz(size);
-#elif defined(_MSC_VER) && _MSC_VER>=1300
-	{
-            unsigned long bsrTopBit;
-
-            _BitScanReverse(&bsrTopBit, size);
-
-            topbit = bsrTopBit;
-        }
-#else
-#if 0
-	union {
-		unsigned asInt[2];
-		double asDouble;
-	};
-	int n;
-
-	asDouble = (double)size + 0.5;
-	topbit = (asInt[!FOX_BIGENDIAN] >> 20) - 1023;
-#else
-	{
-		unsigned int x=size;
-		x = x | (x >> 1);
-		x = x | (x >> 2);
-		x = x | (x >> 4);
-		x = x | (x >> 8);
-		x = x | (x >>16);
-		x = ~x;
-		x = x - ((x >> 1) & 0x55555555);
-		x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
-		x = (x + (x >> 4)) & 0x0F0F0F0F;
-		x = x + (x << 8);
-		x = x + (x << 16);
-		topbit=31 - (x >> 24);
-	}
-#endif
-#endif
-	return topbit;
-}
-
-
-#ifdef FULLSANITYCHECKS
-static void tcsanitycheck(threadcacheblk **ptr) THROWSPEC
-{
-	assert((ptr[0] && ptr[1]) || (!ptr[0] && !ptr[1]));
-	if(ptr[0] && ptr[1])
-	{
-		assert(nedblksize(ptr[0])>=sizeof(threadcacheblk));
-		assert(nedblksize(ptr[1])>=sizeof(threadcacheblk));
-		assert(*(unsigned int *) "NEDN"==ptr[0]->magic);
-		assert(*(unsigned int *) "NEDN"==ptr[1]->magic);
-		assert(!ptr[0]->prev);
-		assert(!ptr[1]->next);
-		if(ptr[0]==ptr[1])
-		{
-			assert(!ptr[0]->next);
-			assert(!ptr[1]->prev);
-		}
-	}
-}
-static void tcfullsanitycheck(threadcache *tc) THROWSPEC
-{
-	threadcacheblk **tcbptr=tc->bins;
-	int n;
-	for(n=0; n<=THREADCACHEMAXBINS; n++, tcbptr+=2)
-	{
-		threadcacheblk *b, *ob=0;
-		tcsanitycheck(tcbptr);
-		for(b=tcbptr[0]; b; ob=b, b=b->next)
-		{
-			assert(*(unsigned int *) "NEDN"==b->magic);
-			assert(!ob || ob->next==b);
-			assert(!ob || b->prev==ob);
-		}
-	}
-}
-#endif
-
-static NOINLINE void RemoveCacheEntries(nedpool *RESTRICT p, threadcache *RESTRICT tc, unsigned int age) THROWSPEC
-{
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-	if(tc->freeInCache)
-	{
-		threadcacheblk **tcbptr=tc->bins;
-		int n;
-		for(n=0; n<=THREADCACHEMAXBINS; n++, tcbptr+=2)
-		{
-			threadcacheblk **tcb=tcbptr+1;		/* come from oldest end of list */
-			/*tcsanitycheck(tcbptr);*/
-			for(; *tcb && tc->frees-(*tcb)->lastUsed>=age; )
-			{
-				threadcacheblk *f=*tcb;
-				size_t blksize=f->size; /*nedblksize(f);*/
-				assert(blksize<=nedblksize(0, f));
-				assert(blksize);
-#ifdef FULLSANITYCHECKS
-				assert(*(unsigned int *) "NEDN"==(*tcb)->magic);
-#endif
-				*tcb=(*tcb)->prev;
-				if(*tcb)
-					(*tcb)->next=0;
-				else
-					*tcbptr=0;
-				tc->freeInCache-=blksize;
-				assert((long) tc->freeInCache>=0);
-				CallFree(0, f, 0);
-				/*tcsanitycheck(tcbptr);*/
-			}
-		}
-	}
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-}
-static void DestroyCaches(nedpool *RESTRICT p) THROWSPEC
-{
-	if(p->caches)
-	{
-		threadcache *tc;
-		int n;
-		for(n=0; n<THREADCACHEMAXCACHES; n++)
-		{
-			if((tc=p->caches[n]))
-			{
-				tc->frees++;
-				RemoveCacheEntries(p, tc, 0);
-				assert(!tc->freeInCache);
-				tc->mymspace=-1;
-				tc->threadid=0;
-				CallFree(0, tc, 0);
-				p->caches[n]=0;
-			}
-		}
-	}
-}
-
-static NOINLINE threadcache *AllocCache(nedpool *RESTRICT p) THROWSPEC
-{
-	threadcache *tc=0;
-	int n, end;
-	ACQUIRE_LOCK(&p->mutex);
-	for(n=0; n<THREADCACHEMAXCACHES && p->caches[n]; n++);
-	if(THREADCACHEMAXCACHES==n)
-	{	/* List exhausted, so disable for this thread */
-		RELEASE_LOCK(&p->mutex);
-		return 0;
-	}
-	tc=p->caches[n]=(threadcache *) CallCalloc(p->m[0], sizeof(threadcache), 0);
-	if(!tc)
-	{
-		RELEASE_LOCK(&p->mutex);
-		return 0;
-	}
-#ifdef FULLSANITYCHECKS
-	tc->magic1=*(unsigned int *)"NEDMALC1";
-	tc->magic2=*(unsigned int *)"NEDMALC2";
-#endif
-	tc->threadid=(long)(size_t)CURRENT_THREAD;
-	for(end=0; p->m[end]; end++);
-	tc->mymspace=abs(tc->threadid) % end;
-	RELEASE_LOCK(&p->mutex);
-	if(TLSSET(p->mycache, (void *)(size_t)(n+1))) abort();
-	return tc;
-}
-
-static void *threadcache_malloc(nedpool *RESTRICT p, threadcache *RESTRICT tc, size_t *RESTRICT _size) THROWSPEC
-{
-	void *RESTRICT ret=0;
-	size_t size=*_size, blksize=0;
-	unsigned int bestsize;
-	unsigned int idx=size2binidx(size);
-	threadcacheblk *RESTRICT blk, **RESTRICT binsptr;
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-	/* Calculate best fit bin size */
-	bestsize=1<<(idx+4);
-#if 0
-	/* Finer grained bin fit */
-	idx<<=1;
-	if(size>bestsize)
-	{
-		idx++;
-		bestsize+=bestsize>>1;
-	}
-	if(size>bestsize)
-	{
-		idx++;
-		bestsize=1<<(4+(idx>>1));
-	}
-#else
-	if(size>bestsize)
-	{
-		idx++;
-		bestsize<<=1;
-	}
-#endif
-	assert(bestsize>=size);
-	if(size<bestsize) size=bestsize;
-	assert(size<=THREADCACHEMAX);
-	assert(idx<=THREADCACHEMAXBINS);
-	binsptr=&tc->bins[idx*2];
-	/* Try to match close, but move up a bin if necessary */
-	blk=*binsptr;
-	if(!blk || blk->size<size)
-	{	/* Bump it up a bin */
-		if(idx<THREADCACHEMAXBINS)
-		{
-			idx++;
-			binsptr+=2;
-			blk=*binsptr;
-		}
-	}
-	if(blk)
-	{
-		blksize=blk->size; /*nedblksize(blk);*/
-		assert(nedblksize(0, blk)>=blksize);
-		assert(blksize>=size);
-		if(blk->next)
-			blk->next->prev=0;
-		*binsptr=blk->next;
-		if(!*binsptr)
-			binsptr[1]=0;
-#ifdef FULLSANITYCHECKS
-		blk->magic=0;
-#endif
-		assert(binsptr[0]!=blk && binsptr[1]!=blk);
-		assert(nedblksize(0, blk)>=sizeof(threadcacheblk) && nedblksize(0, blk)<=THREADCACHEMAX+CHUNK_OVERHEAD);
-		/*printf("malloc: %p, %p, %p, %lu\n", p, tc, blk, (long) _size);*/
-		ret=(void *) blk;
-	}
-	++tc->mallocs;
-	if(ret)
-	{
-		assert(blksize>=size);
-		++tc->successes;
-		tc->freeInCache-=blksize;
-		assert((long) tc->freeInCache>=0);
-	}
-#if defined(DEBUG) && 0
-	if(!(tc->mallocs & 0xfff))
-	{
-		printf("*** threadcache=%u, mallocs=%u (%f), free=%u (%f), freeInCache=%u\n", (unsigned int) tc->threadid, tc->mallocs,
-			(float) tc->successes/tc->mallocs, tc->frees, (float) tc->successes/tc->frees, (unsigned int) tc->freeInCache);
-	}
-#endif
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-	*_size=size;
-	return ret;
-}
-static NOINLINE void ReleaseFreeInCache(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace) THROWSPEC
-{
-	unsigned int age=THREADCACHEMAXFREESPACE/8192;
-	/*ACQUIRE_LOCK(&p->m[mymspace]->mutex);*/
-	while(age && tc->freeInCache>=THREADCACHEMAXFREESPACE)
-	{
-		RemoveCacheEntries(p, tc, age);
-		/*printf("*** Removing cache entries older than %u (%u)\n", age, (unsigned int) tc->freeInCache);*/
-		age>>=1;
-	}
-	/*RELEASE_LOCK(&p->m[mymspace]->mutex);*/
-}
-static void threadcache_free(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace, void *RESTRICT mem, size_t size) THROWSPEC
-{
-	unsigned int bestsize;
-	unsigned int idx=size2binidx(size);
-	threadcacheblk **RESTRICT binsptr, *RESTRICT tck=(threadcacheblk *) mem;
-	assert(size>=sizeof(threadcacheblk) && size<=THREADCACHEMAX+CHUNK_OVERHEAD);
-#ifdef DEBUG
-	/* Make sure this is a valid memory block */
-	assert(nedblksize(0, mem));
-#endif
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-	/* Calculate best fit bin size */
-	bestsize=1<<(idx+4);
-#if 0
-	/* Finer grained bin fit */
-	idx<<=1;
-	if(size>bestsize)
-	{
-		unsigned int biggerbestsize=bestsize+bestsize<<1;
-		if(size>=biggerbestsize)
-		{
-			idx++;
-			bestsize=biggerbestsize;
-		}
-	}
-#endif
-	if(bestsize!=size)	/* dlmalloc can round up, so we round down to preserve indexing */
-		size=bestsize;
-	binsptr=&tc->bins[idx*2];
-	assert(idx<=THREADCACHEMAXBINS);
-	if(tck==*binsptr)
-	{
-		fprintf(stderr, "nedmalloc: Attempt to free already freed memory block %p - aborting!\n", tck);
-		abort();
-	}
-#ifdef FULLSANITYCHECKS
-	tck->magic=*(unsigned int *) "NEDN";
-#endif
-	tck->lastUsed=++tc->frees;
-	tck->size=(unsigned int) size;
-	tck->next=*binsptr;
-	tck->prev=0;
-	if(tck->next)
-		tck->next->prev=tck;
-	else
-		binsptr[1]=tck;
-	assert(!*binsptr || (*binsptr)->size==tck->size);
-	*binsptr=tck;
-	assert(tck==tc->bins[idx*2]);
-	assert(tc->bins[idx*2+1]==tck || binsptr[0]->next->prev==tck);
-	/*printf("free: %p, %p, %p, %lu\n", p, tc, mem, (long) size);*/
-	tc->freeInCache+=size;
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-#if 1
-	if(tc->freeInCache>=THREADCACHEMAXFREESPACE)
-		ReleaseFreeInCache(p, tc, mymspace);
-#endif
-}
-
-
-
-
-static NOINLINE int InitPool(nedpool *RESTRICT p, size_t capacity, int threads) THROWSPEC
-{	/* threads is -1 for system pool */
-	ensure_initialization();
-	ACQUIRE_MALLOC_GLOBAL_LOCK();
-	if(p->threads) goto done;
-	if(INITIAL_LOCK(&p->mutex)) goto err;
-	if(TLSALLOC(&p->mycache)) goto err;
-#if USE_ALLOCATOR==0
-	p->m[0]=(mstate) mspacecounter++;
-#elif USE_ALLOCATOR==1
-	if(!(p->m[0]=(mstate) create_mspace(capacity, 1))) goto err;
-	p->m[0]->extp=p;
-#endif
-	p->threads=(threads<1 || threads>MAXTHREADSINPOOL) ? MAXTHREADSINPOOL : threads;
-done:
-	RELEASE_MALLOC_GLOBAL_LOCK();
-	return 1;
-err:
-	if(threads<0)
-		abort();			/* If you can't allocate for system pool, we're screwed */
-	DestroyCaches(p);
-	if(p->m[0])
-	{
-#if USE_ALLOCATOR==1
-		destroy_mspace(p->m[0]);
-#endif
-		p->m[0]=0;
-	}
-	if(p->mycache)
-	{
-		if(TLSFREE(p->mycache)) abort();
-		p->mycache=0;
-	}
-	RELEASE_MALLOC_GLOBAL_LOCK();
-	return 0;
-}
-static NOINLINE mstate FindMSpace(nedpool *RESTRICT p, threadcache *RESTRICT tc, int *RESTRICT lastUsed, size_t size) THROWSPEC
-{	/* Gets called when thread's last used mspace is in use. The strategy
-	is to run through the list of all available mspaces looking for an
-	unlocked one and if we fail, we create a new one so long as we don't
-	exceed p->threads */
-	int n, end;
-	for(n=end=*lastUsed+1; p->m[n]; end=++n)
-	{
-		if(TRY_LOCK(&p->m[n]->mutex)) goto found;
-	}
-	for(n=0; n<*lastUsed && p->m[n]; n++)
-	{
-		if(TRY_LOCK(&p->m[n]->mutex)) goto found;
-	}
-	if(end<p->threads)
-	{
-		mstate temp;
-#if USE_ALLOCATOR==0
-		temp=(mstate) mspacecounter++;
-#elif USE_ALLOCATOR==1
-		if(!(temp=(mstate) create_mspace(size, 1)))
-			goto badexit;
-#endif
-		/* Now we're ready to modify the lists, we lock */
-		ACQUIRE_LOCK(&p->mutex);
-		while(p->m[end] && end<p->threads)
-			end++;
-		if(end>=p->threads)
-		{	/* Drat, must destroy it now */
-			RELEASE_LOCK(&p->mutex);
-#if USE_ALLOCATOR==1
-			destroy_mspace((mstate) temp);
-#endif
-			goto badexit;
-		}
-		/* We really want to make sure this goes into memory now but we
-		have to be careful of breaking aliasing rules, so write it twice */
-		*((volatile struct malloc_state **) &p->m[end])=p->m[end]=temp;
-		ACQUIRE_LOCK(&p->m[end]->mutex);
-		/*printf("Created mspace idx %d\n", end);*/
-		RELEASE_LOCK(&p->mutex);
-		n=end;
-		goto found;
-	}
-	/* Let it lock on the last one it used */
-badexit:
-	ACQUIRE_LOCK(&p->m[*lastUsed]->mutex);
-	return p->m[*lastUsed];
-found:
-	*lastUsed=n;
-	if(tc)
-		tc->mymspace=n;
-	else
-	{
-		if(TLSSET(p->mycache, (void *)(size_t)(-(n+1)))) abort();
-	}
-	return p->m[n];
-}
-
-typedef struct PoolList_t
-{
-	size_t size;			/* Size of list */
-	size_t length;			/* Actual entries in list */
-#ifdef DEBUG
-	nedpool *list[1];		/* Force testing of list expansion */
-#else
-	nedpool *list[16];
-#endif
-} PoolList;
-static MLOCK_T poollistlock;
-static PoolList *poollist;
-NEDMALLOCPTRATTR nedpool *nedcreatepool(size_t capacity, int threads) THROWSPEC
-{
-	nedpool *ret=0;
-	if(!poollist)
-	{
-		PoolList *newpoollist=0;
-		if(!(newpoollist=(PoolList *) nedpcalloc(0, 1, sizeof(PoolList)+sizeof(nedpool *)))) return 0;
-		INITIAL_LOCK(&poollistlock);
-		ACQUIRE_LOCK(&poollistlock);
-		poollist=newpoollist;
-		poollist->size=sizeof(poollist->list)/sizeof(nedpool *);
-	}
-	else
-		ACQUIRE_LOCK(&poollistlock);
-	if(poollist->length==poollist->size)
-	{
-		PoolList *newpoollist=0;
-		size_t newsize=0;
-		newsize=sizeof(PoolList)+(poollist->size+1)*sizeof(nedpool *);
-		if(!(newpoollist=(PoolList *) nedprealloc(0, poollist, newsize))) goto badexit;
-		poollist=newpoollist;
-		memset(&poollist->list[poollist->size], 0, newsize-((size_t)&poollist->list[poollist->size]-(size_t)&poollist->list[0]));
-		poollist->size=((newsize-((char *)&poollist->list[0]-(char *)poollist))/sizeof(nedpool *))-1;
-		assert(poollist->size>poollist->length);
-	}
-	if(!(ret=(nedpool *) nedpcalloc(0, 1, sizeof(nedpool)))) goto badexit;
-	if(!InitPool(ret, capacity, threads))
-	{
-		nedpfree(0, ret);
-		goto badexit;
-	}
-	poollist->list[poollist->length++]=ret;
-badexit:
-	RELEASE_LOCK(&poollistlock);
-	return ret;
-}
-void neddestroypool(nedpool *p) THROWSPEC
-{
-	unsigned int n;
-	ACQUIRE_LOCK(&p->mutex);
-	DestroyCaches(p);
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1
-		destroy_mspace(p->m[n]);
-#endif
-		p->m[n]=0;
-	}
-	RELEASE_LOCK(&p->mutex);
-	if(TLSFREE(p->mycache)) abort();
-	nedpfree(0, p);
-	ACQUIRE_LOCK(&poollistlock);
-	assert(poollist);
-	for(n=0; n<poollist->length && poollist->list[n]!=p; n++);
-	assert(n!=poollist->length);
-	memmove(&poollist->list[n], &poollist->list[n+1], (size_t)&poollist->list[poollist->length]-(size_t)&poollist->list[n]);
-	if(!--poollist->length)
-	{
-		assert(!poollist->list[0]);
-		nedpfree(0, poollist);
-		poollist=0;
-	}
-	RELEASE_LOCK(&poollistlock);
-}
-void neddestroysyspool() THROWSPEC
-{
-	nedpool *p=&syspool;
-	int n;
-	ACQUIRE_LOCK(&p->mutex);
-	DestroyCaches(p);
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1
-		destroy_mspace(p->m[n]);
-#endif
-		p->m[n]=0;
-	}
-	/* Render syspool unusable */
-	for(n=0; n<THREADCACHEMAXCACHES; n++)
-		p->caches[n]=(threadcache *)(size_t)(sizeof(size_t)>4 ? 0xdeadbeefdeadbeefULL : 0xdeadbeefUL);
-	for(n=0; n<MAXTHREADSINPOOL+1; n++)
-		p->m[n]=(mstate)(size_t)(sizeof(size_t)>4 ? 0xdeadbeefdeadbeefULL : 0xdeadbeefUL);
-	if(TLSFREE(p->mycache)) abort();
-	RELEASE_LOCK(&p->mutex);
-}
-nedpool **nedpoollist() THROWSPEC
-{
-	nedpool **ret=0;
-	if(poollist)
-	{
-		ACQUIRE_LOCK(&poollistlock);
-		if(!(ret=(nedpool **) nedmalloc((poollist->length+1)*sizeof(nedpool *)))) goto badexit;
-		memcpy(ret, poollist->list, (poollist->length+1)*sizeof(nedpool *));
-badexit:
-		RELEASE_LOCK(&poollistlock);
-	}
-	return ret;
-}
-
-void nedpsetvalue(nedpool *p, void *v) THROWSPEC
-{
-	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
-	p->uservalue=v;
-}
-void *nedgetvalue(nedpool **p, void *mem) THROWSPEC
-{
-	nedpool *np=0;
-	mstate fm=nedblkmstate(mem);
-	if(!fm || !fm->extp) return 0;
-	np=(nedpool *) fm->extp;
-	if(p) *p=np;
-	return np->uservalue;
-}
-
-void nedtrimthreadcache(nedpool *p, int disable) THROWSPEC
-{
-	int mycache;
-	if(!p)
-	{
-		p=&syspool;
-		if(!syspool.threads) InitPool(&syspool, 0, -1);
-	}
-	mycache=(int)(size_t) TLSGET(p->mycache);
-	if(!mycache)
-	{	/* Set to mspace 0 */
-		if(disable && TLSSET(p->mycache, (void *)(size_t)-1)) abort();
-	}
-	else if(mycache>0)
-	{	/* Set to last used mspace */
-		threadcache *tc=p->caches[mycache-1];
-#if defined(DEBUG)
-		printf("Threadcache utilisation: %lf%% in cache with %lf%% lost to other threads\n",
-			100.0*tc->successes/tc->mallocs, 100.0*((double) tc->mallocs-tc->frees)/tc->mallocs);
-#endif
-		if(disable && TLSSET(p->mycache, (void *)(size_t)(-tc->mymspace))) abort();
-		tc->frees++;
-		RemoveCacheEntries(p, tc, 0);
-		assert(!tc->freeInCache);
-		if(disable)
-		{
-			tc->mymspace=-1;
-			tc->threadid=0;
-			CallFree(0, p->caches[mycache-1], 0);
-			p->caches[mycache-1]=0;
-		}
-	}
-}
-void neddisablethreadcache(nedpool *p) THROWSPEC
-{
-	nedtrimthreadcache(p, 1);
-}
-
-#define GETMSPACE(m,p,tc,ms,s,action)                 \
-  do                                                  \
-  {                                                   \
-    mstate m = GetMSpace((p),(tc),(ms),(s));          \
-    action;                                           \
-	if(USE_ALLOCATOR==1) { RELEASE_LOCK(&m->mutex); } \
-  } while (0)
-
-static FORCEINLINE mstate GetMSpace(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace, size_t size) THROWSPEC
-{	/* Returns a locked and ready for use mspace */
-	mstate m=p->m[mymspace];
-	assert(m);
-#if USE_ALLOCATOR==1
-	if(!TRY_LOCK(&p->m[mymspace]->mutex)) m=FindMSpace(p, tc, &mymspace, size);
-	/*assert(IS_LOCKED(&p->m[mymspace]->mutex));*/
-#endif
-	return m;
-}
-static NOINLINE void GetThreadCache_cold1(nedpool *RESTRICT *RESTRICT p) THROWSPEC
-{
-	*p=&syspool;
-	if(!syspool.threads) InitPool(&syspool, 0, -1);
-}
-static NOINLINE void GetThreadCache_cold2(nedpool *RESTRICT *RESTRICT p, threadcache *RESTRICT *RESTRICT tc, int *RESTRICT mymspace, int mycache) THROWSPEC
-{
-	if(!mycache)
-	{	/* Need to allocate a new cache */
-		*tc=AllocCache(*p);
-		if(!*tc)
-		{	/* Disable */
-			if(TLSSET((*p)->mycache, (void *)(size_t)-1)) abort();
-			*mymspace=0;
-		}
-		else
-			*mymspace=(*tc)->mymspace;
-	}
-	else
-	{	/* Cache disabled, but we do have an assigned thread pool */
-		*tc=0;
-		*mymspace=-mycache-1;
-	}
-}
-static FORCEINLINE void GetThreadCache(nedpool *RESTRICT *RESTRICT p, threadcache *RESTRICT *RESTRICT tc, int *RESTRICT mymspace, size_t *RESTRICT size) THROWSPEC
-{
-	int mycache;
-	if(size && *size<sizeof(threadcacheblk)) *size=sizeof(threadcacheblk);
-	if(!*p)
-		GetThreadCache_cold1(p);
-	mycache=(int)(size_t) TLSGET((*p)->mycache);
-	if(mycache>0)
-	{	/* Already have a cache */
-		*tc=(*p)->caches[mycache-1];
-		*mymspace=(*tc)->mymspace;
-	}
-	else GetThreadCache_cold2(p, tc, mymspace, mycache);
-	assert(*mymspace>=0);
-	assert(!(*tc) || (long)(size_t)CURRENT_THREAD==(*tc)->threadid);
-#ifdef FULLSANITYCHECKS
-	if(*tc)
-	{
-		if(*(unsigned int *)"NEDMALC1"!=(*tc)->magic1 || *(unsigned int *)"NEDMALC2"!=(*tc)->magic2)
-		{
-			abort();
-		}
-	}
-#endif
-}
-
-NEDMALLOCPTRATTR void * nedpmalloc(nedpool *p, size_t size) THROWSPEC
-{
-	void *ret=0;
-	threadcache *tc;
-	int mymspace;
-	GetThreadCache(&p, &tc, &mymspace, &size);
-#if THREADCACHEMAX
-	if(tc && size<=THREADCACHEMAX)
-	{	/* Use the thread cache */
-		ret=threadcache_malloc(p, tc, &size);
-	}
-#endif
-	if(!ret)
-	{	/* Use this thread's mspace */
-        GETMSPACE(m, p, tc, mymspace, size,
-                  ret=CallMalloc(m, size, 0));
-	}
-	return ret;
-}
-NEDMALLOCPTRATTR void * nedpcalloc(nedpool *p, size_t no, size_t size) THROWSPEC
-{
-	size_t rsize=size*no;
-	void *ret=0;
-	threadcache *tc;
-	int mymspace;
-	GetThreadCache(&p, &tc, &mymspace, &rsize);
-#if THREADCACHEMAX
-	if(tc && rsize<=THREADCACHEMAX)
-	{	/* Use the thread cache */
-		if((ret=threadcache_malloc(p, tc, &rsize)))
-			memset(ret, 0, rsize);
-	}
-#endif
-	if(!ret)
-	{	/* Use this thread's mspace */
-        GETMSPACE(m, p, tc, mymspace, rsize,
-                  ret=CallCalloc(m, rsize, 0));
-	}
-	return ret;
-}
-NEDMALLOCPTRATTR void * nedprealloc(nedpool *p, void *mem, size_t size) THROWSPEC
-{
-	void *ret=0;
-	threadcache *tc;
-	int mymspace, isforeign=1;
-	size_t memsize;
-	if(!mem) return nedpmalloc(p, size);
-	memsize=nedblksize(&isforeign, mem);
-	assert(memsize);
-	if(!memsize)
-	{
-		fprintf(stderr, "nedmalloc: nedprealloc() called with a block not created by nedmalloc!\n");
-		abort();
-	}
-	else if(size<=memsize && memsize-size<
-#ifdef DEBUG
-		32
-#else
-		1024
-#endif
-		)		/* If realloc size is within 1Kb smaller than existing, noop it */
-		return mem;
-	GetThreadCache(&p, &tc, &mymspace, &size);
-#if THREADCACHEMAX
-	if(tc && size && size<=THREADCACHEMAX)
-	{	/* Use the thread cache */
-		if((ret=threadcache_malloc(p, tc, &size)))
-		{
-			memcpy(ret, mem, memsize<size ? memsize : size);
-			if(memsize>=sizeof(threadcacheblk) && memsize<=(THREADCACHEMAX+CHUNK_OVERHEAD))
-				threadcache_free(p, tc, mymspace, mem, memsize);
-			else
-				CallFree(0, mem, isforeign);
-		}
-	}
-#endif
-	if(!ret)
-	{	/* Reallocs always happen in the mspace they happened in, so skip
-		locking the preferred mspace for this thread */
-		ret=CallRealloc(p->m[mymspace], mem, isforeign, memsize, size);
-	}
-	return ret;
-}
-void   nedpfree(nedpool *p, void *mem) THROWSPEC
-{	/* Frees always happen in the mspace they happened in, so skip
-	locking the preferred mspace for this thread */
-	threadcache *tc;
-	int mymspace, isforeign=1;
-	size_t memsize;
-	if(!mem)
-	{	/* If you tried this on FreeBSD you'd be sorry! */
-#ifdef DEBUG
-		fprintf(stderr, "nedmalloc: WARNING nedpfree() called with zero. This is not portable behaviour!\n");
-#endif
-		return;
-	}
-	memsize=nedblksize(&isforeign, mem);
-	assert(memsize);
-	if(!memsize)
-	{
-		fprintf(stderr, "nedmalloc: nedpfree() called with a block not created by nedmalloc!\n");
-		abort();
-	}
-	GetThreadCache(&p, &tc, &mymspace, 0);
-#if THREADCACHEMAX
-	if(mem && tc && memsize>=sizeof(threadcacheblk) && memsize<=(THREADCACHEMAX+CHUNK_OVERHEAD))
-		threadcache_free(p, tc, mymspace, mem, memsize);
-	else
-#endif
-		CallFree(0, mem, isforeign);
-}
-NEDMALLOCPTRATTR void * nedpmemalign(nedpool *p, size_t alignment, size_t bytes) THROWSPEC
-{
-	void *ret;
-	threadcache *tc;
-	int mymspace;
-	GetThreadCache(&p, &tc, &mymspace, &bytes);
-	{	/* Use this thread's mspace */
-        GETMSPACE(m, p, tc, mymspace, bytes,
-                  ret=CallMalloc(m, bytes, alignment));
-	}
-	return ret;
-}
-struct nedmallinfo nedpmallinfo(nedpool *p) THROWSPEC
-{
-	int n;
-	struct nedmallinfo ret={0};
-	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1 && !NO_MALLINFO
-		struct mallinfo t=mspace_mallinfo(p->m[n]);
-		ret.arena+=t.arena;
-		ret.ordblks+=t.ordblks;
-		ret.hblkhd+=t.hblkhd;
-		ret.usmblks+=t.usmblks;
-		ret.uordblks+=t.uordblks;
-		ret.fordblks+=t.fordblks;
-		ret.keepcost+=t.keepcost;
-#endif
-	}
-	return ret;
-}
-int    nedpmallopt(nedpool *p, int parno, int value) THROWSPEC
-{
-#if USE_ALLOCATOR==1
-	return mspace_mallopt(parno, value);
-#else
-	return 0;
-#endif
-}
-NEDMALLOCNOALIASATTR void*  nedmalloc_internals(size_t *granularity, size_t *magic) THROWSPEC
-{
-#if USE_ALLOCATOR==1
-	if(granularity) *granularity=mparams.granularity;
-	if(magic) *magic=mparams.magic;
-	return (void *) &syspool;
-#else
-	if(granularity) *granularity=0;
-	if(magic) *magic=0;
-	return 0;
-#endif
-}
-int    nedpmalloc_trim(nedpool *p, size_t pad) THROWSPEC
-{
-	int n, ret=0;
-	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1
-		ret+=mspace_trim(p->m[n], pad);
-#endif
-	}
-	return ret;
-}
-void   nedpmalloc_stats(nedpool *p) THROWSPEC
-{
-	int n;
-	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1
-		mspace_malloc_stats(p->m[n]);
-#endif
-	}
-}
-size_t nedpmalloc_footprint(nedpool *p) THROWSPEC
-{
-	size_t ret=0;
-	int n;
-	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1
-		ret+=mspace_footprint(p->m[n]);
-#endif
-	}
-	return ret;
-}
-NEDMALLOCPTRATTR void **nedpindependent_calloc(nedpool *p, size_t elemsno, size_t elemsize, void **chunks) THROWSPEC
-{
-	void **ret;
-	threadcache *tc;
-	int mymspace;
-	GetThreadCache(&p, &tc, &mymspace, &elemsize);
-#if USE_ALLOCATOR==0
-    GETMSPACE(m, p, tc, mymspace, elemsno*elemsize,
-              ret=unsupported_operation("independent_calloc"));
-#elif USE_ALLOCATOR==1
-    GETMSPACE(m, p, tc, mymspace, elemsno*elemsize,
-              ret=mspace_independent_calloc(m, elemsno, elemsize, chunks));
-#endif
-	return ret;
-}
-NEDMALLOCPTRATTR void **nedpindependent_comalloc(nedpool *p, size_t elems, size_t *sizes, void **chunks) THROWSPEC
-{
-	void **ret;
-	threadcache *tc;
-	int mymspace;
-    size_t i, *adjustedsizes=(size_t *) alloca(elems*sizeof(size_t));
-    if(!adjustedsizes) return 0;
-    for(i=0; i<elems; i++)
-        adjustedsizes[i]=sizes[i]<sizeof(threadcacheblk) ? sizeof(threadcacheblk) : sizes[i];
-	GetThreadCache(&p, &tc, &mymspace, 0);
-#if USE_ALLOCATOR==0
-	GETMSPACE(m, p, tc, mymspace, 0,
-              ret=unsupported_operation("independent_comalloc"));
-#elif USE_ALLOCATOR==1
-	GETMSPACE(m, p, tc, mymspace, 0,
-              ret=mspace_independent_comalloc(m, elems, adjustedsizes, chunks));
-#endif
-	return ret;
-}
-
-#if defined(__cplusplus)
-}
-#endif
-
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-#endif
+#ifdef NEDMALLOC_ENABLED
+/* Alternative malloc implementation for multiple threads without
+lock contention based on dlmalloc. (C) 2005-2009 Niall Douglas
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+#ifdef _MSC_VER
+/* Enable full aliasing on MSVC */
+/*#pragma optimize("a", on)*/
+#pragma warning(push)
+#pragma warning(disable:4100)	/* unreferenced formal parameter */
+#pragma warning(disable:4127)	/* conditional expression is constant */
+#pragma warning(disable:4706)	/* assignment within conditional expression */
+#endif
+
+/*#define ENABLE_TOLERANT_NEDMALLOC 1*/
+/*#define ENABLE_FAST_HEAP_DETECTION 1*/
+/*#define NEDMALLOC_DEBUG 1*/
+
+/*#define FULLSANITYCHECKS*/
+/* If link time code generation is on, don't force or prevent inlining */
+#if defined(_MSC_VER) && defined(NEDMALLOC_DLL_EXPORTS)
+#define FORCEINLINE
+#define NOINLINE
+#endif
+
+
+#include "nedmalloc.h"
+#ifdef WIN32
+ #include <malloc.h>
+ #include <stddef.h>
+#endif
+#if USE_ALLOCATOR==1
+ #define MSPACES 1
+ #define ONLY_MSPACES 1
+#endif
+#define USE_DL_PREFIX 1
+#ifndef USE_LOCKS
+ #define USE_LOCKS 1
+#endif
+#define FOOTERS 1           /* Need to enable footers so frees lock the right mspace */
+#ifndef NEDMALLOC_DEBUG
+ #if defined(DEBUG) || defined(_DEBUG)
+  #define NEDMALLOC_DEBUG 1
+ #else
+  #define NEDMALLOC_DEBUG 0
+ #endif
+#endif
+/* We need to consistently define DEBUG=0|1, _DEBUG and NDEBUG for dlmalloc */
+#undef DEBUG
+#undef _DEBUG
+#if NEDMALLOC_DEBUG
+ #define _DEBUG
+ #define DEBUG 1
+#else
+ #define DEBUG 0
+#endif
+#ifdef NDEBUG               /* Disable assert checking on release builds */
+ #undef DEBUG
+ #undef _DEBUG
+#endif
+/* The default of 64Kb means we spend too much time kernel-side */
+#ifndef DEFAULT_GRANULARITY
+#define DEFAULT_GRANULARITY (1*1024*1024)
+#if DEBUG
+#define DEFAULT_GRANULARITY_ALIGNED
+#endif
+#endif
+/*#define USE_SPIN_LOCKS 0*/
+
+
+#include "malloc.c.h"
+#ifdef NDEBUG               /* Disable assert checking on release builds */
+ #undef DEBUG
+#elif !NEDMALLOC_DEBUG
+ #ifdef __GNUC__
+  #warning DEBUG is defined so allocator will run with assert checking! Define NDEBUG to run at full speed.
+ #elif defined(_MSC_VER)
+  #pragma message(__FILE__ ": WARNING: DEBUG is defined so allocator will run with assert checking! Define NDEBUG to run at full speed.")
+ #endif
+#endif
+
+/* The maximum concurrent threads in a pool possible */
+#ifndef MAXTHREADSINPOOL
+#define MAXTHREADSINPOOL 16
+#endif
+/* The maximum number of threadcaches which can be allocated */
+#ifndef THREADCACHEMAXCACHES
+#define THREADCACHEMAXCACHES 256
+#endif
+/* The maximum size to be allocated from the thread cache */
+#ifndef THREADCACHEMAX
+#define THREADCACHEMAX 8192
+#endif
+#if 0
+/* The number of cache entries for finer grained bins. This is (topbitpos(THREADCACHEMAX)-4)*2 */
+#define THREADCACHEMAXBINS ((13-4)*2)
+#else
+/* The number of cache entries. This is (topbitpos(THREADCACHEMAX)-4) */
+#define THREADCACHEMAXBINS (13-4)
+#endif
+/* Point at which the free space in a thread cache is garbage collected */
+#ifndef THREADCACHEMAXFREESPACE
+#define THREADCACHEMAXFREESPACE (512*1024)
+#endif
+
+
+#ifdef WIN32
+ #define TLSVAR			DWORD
+ #define TLSALLOC(k)	(*(k)=TlsAlloc(), TLS_OUT_OF_INDEXES==*(k))
+ #define TLSFREE(k)		(!TlsFree(k))
+ #define TLSGET(k)		TlsGetValue(k)
+ #define TLSSET(k, a)	(!TlsSetValue(k, a))
+ #ifdef DEBUG
+static LPVOID ChkedTlsGetValue(DWORD idx)
+{
+	LPVOID ret=TlsGetValue(idx);
+	assert(S_OK==GetLastError());
+	return ret;
+}
+  #undef TLSGET
+  #define TLSGET(k) ChkedTlsGetValue(k)
+ #endif
+#else
+ #define TLSVAR			pthread_key_t
+ #define TLSALLOC(k)	pthread_key_create(k, 0)
+ #define TLSFREE(k)		pthread_key_delete(k)
+ #define TLSGET(k)		pthread_getspecific(k)
+ #define TLSSET(k, a)	pthread_setspecific(k, a)
+#endif
+
+#if defined(__cplusplus)
+#if !defined(NO_NED_NAMESPACE)
+namespace nedalloc {
+#else
+extern "C" {
+#endif
+#endif
+
+#if USE_ALLOCATOR==0
+static void *unsupported_operation(const char *opname) THROWSPEC
+{
+	fprintf(stderr, "nedmalloc: The operation %s is not supported under this build configuration\n", opname);
+	abort();
+	return 0;
+}
+static size_t mspacecounter=(size_t) 0xdeadbeef;
+#endif
+#ifndef ENABLE_FAST_HEAP_DETECTION
+static void *RESTRICT leastusedaddress;
+static size_t largestusedblock;
+#endif
+
+static FORCEINLINE void *CallMalloc(void *RESTRICT mspace, size_t size, size_t alignment) THROWSPEC
+{
+	void *RESTRICT ret=0;
+	size_t _alignment=alignment;
+#if USE_MAGIC_HEADERS
+	size_t *_ret=0;
+	size+=alignment+3*sizeof(size_t);
+	_alignment=0;
+#endif
+#if USE_ALLOCATOR==0
+	ret=_alignment ? 
+#ifdef _MSC_VER
+	/* This is the MSVCRT equivalent */
+		_aligned_malloc(size, _alignment)
+#elif defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)
+	/* This is the glibc/ptmalloc2/dlmalloc/BSD libc equivalent.  */
+		memalign(_alignment, size)
+#else
+#error Cannot aligned allocate with the memory allocator of an unknown system!
+#endif
+		: malloc(size);
+#elif USE_ALLOCATOR==1
+	ret=_alignment ? mspace_memalign((mstate) mspace, _alignment, size) : mspace_malloc((mstate) mspace, size);
+#ifndef ENABLE_FAST_HEAP_DETECTION
+	if(ret)
+	{
+		size_t truesize=chunksize(mem2chunk(ret));
+		if(!leastusedaddress || (void *)((mstate) mspace)->least_addr<leastusedaddress) leastusedaddress=(void *)((mstate) mspace)->least_addr;
+		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
+	}
+#endif
+#endif
+	if(!ret) return 0;
+#if USE_MAGIC_HEADERS
+	_ret=(size_t *) ret;
+	ret=(void *)(_ret+3);
+	if(alignment) ret=(void *)(((size_t) ret+alignment-1)&~(alignment-1));
+	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *)"NEDMALOC";
+	_ret[0]=(size_t) mspace;
+	_ret[1]=size-3*sizeof(size_t);
+#endif
+	return ret;
+}
+
+static FORCEINLINE void *CallCalloc(void *RESTRICT mspace, size_t size, size_t alignment) THROWSPEC
+{
+	void *RESTRICT ret=0;
+#if USE_MAGIC_HEADERS
+	size_t *_ret=0;
+	size+=alignment+3*sizeof(size_t);
+#endif
+#if USE_ALLOCATOR==0
+	ret=calloc(1, size);
+#elif USE_ALLOCATOR==1
+	ret=mspace_calloc((mstate) mspace, 1, size);
+#ifndef ENABLE_FAST_HEAP_DETECTION
+	if(ret)
+	{
+		size_t truesize=chunksize(mem2chunk(ret));
+		if(!leastusedaddress || (void *)((mstate) mspace)->least_addr<leastusedaddress) leastusedaddress=(void *)((mstate) mspace)->least_addr;
+		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
+	}
+#endif
+#endif
+	if(!ret) return 0;
+#if USE_MAGIC_HEADERS
+	_ret=(size_t *) ret;
+	ret=(void *)(_ret+3);
+	if(alignment) ret=(void *)(((size_t) ret+alignment-1)&~(alignment-1));
+	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *) "NEDMALOC";
+	_ret[0]=(size_t) mspace;
+	_ret[1]=size-3*sizeof(size_t);
+#endif
+	return ret;
+}
+
+static FORCEINLINE void *CallRealloc(void *RESTRICT mspace, void *RESTRICT mem, int isforeign, size_t oldsize, size_t newsize) THROWSPEC
+{
+	void *RESTRICT ret=0;
+#if USE_MAGIC_HEADERS
+	mstate oldmspace=0;
+	size_t *_ret=0, *_mem=(size_t *) mem-3;
+#endif
+	if(isforeign)
+	{	/* Transfer */
+#if USE_MAGIC_HEADERS
+		assert(_mem[0]!=*(size_t *) "NEDMALOC");
+#endif
+		if((ret=CallMalloc(mspace, newsize, 0)))
+		{
+#if defined(DEBUG)
+			printf("*** nedmalloc frees system allocated block %p\n", mem);
+#endif
+			memcpy(ret, mem, oldsize<newsize ? oldsize : newsize);
+			free(mem);
+		}
+		return ret;
+	}
+#if USE_MAGIC_HEADERS
+	assert(_mem[0]==*(size_t *) "NEDMALOC");
+	newsize+=3*sizeof(size_t);
+	oldmspace=(mstate) _mem[1];
+	assert(oldsize>=_mem[2]);
+	for(; *_mem==*(size_t *) "NEDMALOC"; *_mem--=*(size_t *) "nedmaloc");
+	mem=(void *)(++_mem);
+#endif
+#if USE_ALLOCATOR==0
+	ret=realloc(mem, newsize);
+#elif USE_ALLOCATOR==1
+	ret=mspace_realloc((mstate) mspace, mem, newsize);
+#ifndef ENABLE_FAST_HEAP_DETECTION
+	if(ret)
+	{
+		size_t truesize=chunksize(mem2chunk(ret));
+		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
+	}
+#endif
+#endif
+	if(!ret)
+	{	/* Put it back the way it was */
+#if USE_MAGIC_HEADERS
+		for(; *_mem==0; *_mem++=*(size_t *) "NEDMALOC");
+#endif
+		return 0;
+	}
+#if USE_MAGIC_HEADERS
+	_ret=(size_t *) ret;
+	ret=(void *)(_ret+3);
+	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *) "NEDMALOC";
+	_ret[0]=(size_t) mspace;
+	_ret[1]=newsize-3*sizeof(size_t);
+#endif
+	return ret;
+}
+
+static FORCEINLINE void CallFree(void *RESTRICT mspace, void *RESTRICT mem, int isforeign) THROWSPEC
+{
+#if USE_MAGIC_HEADERS
+	mstate oldmspace=0;
+	size_t *_mem=(size_t *) mem-3, oldsize=0;
+#endif
+	if(isforeign)
+	{
+#if USE_MAGIC_HEADERS
+		assert(_mem[0]!=*(size_t *) "NEDMALOC");
+#endif
+#if defined(DEBUG)
+		printf("*** nedmalloc frees system allocated block %p\n", mem);
+#endif
+		free(mem);
+		return;
+	}
+#if USE_MAGIC_HEADERS
+	assert(_mem[0]==*(size_t *) "NEDMALOC");
+	oldmspace=(mstate) _mem[1];
+	oldsize=_mem[2];
+	for(; *_mem==*(size_t *) "NEDMALOC"; *_mem--=*(size_t *) "nedmaloc");
+	mem=(void *)(++_mem);
+#endif
+#if USE_ALLOCATOR==0
+	free(mem);
+#elif USE_ALLOCATOR==1
+	mspace_free((mstate) mspace, mem);
+#endif
+}
+
+static NEDMALLOCNOALIASATTR mstate nedblkmstate(void *RESTRICT mem) THROWSPEC
+{
+	if(mem)
+	{
+#if USE_MAGIC_HEADERS
+		size_t *_mem=(size_t *) mem-3;
+		if(_mem[0]==*(size_t *) "NEDMALOC")
+		{
+			return (mstate) _mem[1];
+		}
+		else return 0;
+#else
+#if USE_ALLOCATOR==0
+		/* Fail everything */
+		return 0;
+#elif USE_ALLOCATOR==1
+#ifdef ENABLE_FAST_HEAP_DETECTION
+#ifdef WIN32
+		/*  On Windows for RELEASE both x86 and x64 the NT heap precedes each block with an eight byte header
+			which looks like:
+				normal: 4 bytes of size, 4 bytes of [char < 64, char < 64, char < 64 bit 0 always set, char random ]
+				mmaped: 4 bytes of size  4 bytes of [zero,      zero,      0xb,                        zero        ]
+
+			On Windows for DEBUG both x86 and x64 the preceding four bytes is always 0xfdfdfdfd (no man's land).
+		*/
+#pragma pack(push, 1)
+		struct _HEAP_ENTRY
+		{
+			USHORT Size;
+			USHORT PreviousSize;
+			UCHAR Cookie;			/* SegmentIndex */
+			UCHAR Flags;			/* always bit 0 (HEAP_ENTRY_BUSY). bit 1=(HEAP_ENTRY_EXTRA_PRESENT), bit 2=normal block (HEAP_ENTRY_FILL_PATTERN), bit 3=mmap block (HEAP_ENTRY_VIRTUAL_ALLOC). Bit 4 (HEAP_ENTRY_LAST_ENTRY) could be set */
+			UCHAR UnusedBytes;
+			UCHAR SmallTagIndex;	/* fastbin index. Always one of 0x02, 0x03, 0x04 < 0x80 */
+		} *RESTRICT he=((struct _HEAP_ENTRY *) mem)-1;
+#pragma pack(pop)
+		unsigned int header=((unsigned int *)mem)[-1], mask1=0x8080E100, result1, mask2=0xFFFFFF06, result2;
+		result1=header & mask1;	/* Positive testing for NT heap */
+		result2=header & mask2;	/* Positive testing for dlmalloc */
+		if(result1==0x00000100 && result2!=0x00000102)
+		{	/* This is likely a NT heap block */
+			return 0;
+		}
+#endif
+#ifdef __linux__
+		/* On Linux glibc uses ptmalloc2 (really dlmalloc) just as we do, but prev_foot contains rubbish
+		when the preceding block is allocated because ptmalloc2 finds the local mstate by rounding the ptr
+		down to the nearest megabyte. It's like dlmalloc with FOOTERS disabled. */
+		mchunkptr p=mem2chunk(mem);
+		mstate fm=get_mstate_for(p);
+		/* If it's a ptmalloc2 block, fm is likely to be some crazy value */
+		if(!is_aligned(fm)) return 0;
+		if((size_t)mem-(size_t)fm>=(size_t)1<<(SIZE_T_BITSIZE-1)) return 0;
+		if(ok_magic(fm))
+			return fm;
+		else
+			return 0;
+		if(1) { }
+#endif
+		else
+		{
+			mchunkptr p=mem2chunk(mem);
+			mstate fm=get_mstate_for(p);
+			assert(ok_magic(fm));	/* If this fails, someone tried to free a block twice */
+			if(ok_magic(fm))
+				return fm;
+		}
+#else
+//#ifdef WIN32
+//		__try
+//#endif
+		{
+			/* We try to return zero here if it isn't one of our own blocks, however
+			the current block annotation scheme used by dlmalloc makes it impossible
+			to be absolutely sure of avoiding a segfault.
+
+			mchunkptr->prev_foot = mem-(2*size_t) = mstate ^ mparams.magic for PRECEDING block;
+			mchunkptr->head      = mem-(1*size_t) = 8 multiple size of this block with bottom three bits = FLAG_BITS
+			    FLAG_BITS = bit 0 is CINUSE (currently in use unless is mmap), bit 1 is PINUSE (previous block currently
+				            in use unless mmap), bit 2 is UNUSED and currently is always zero.
+			*/
+			register void *RESTRICT leastusedaddress_=leastusedaddress;		/* Cache these to avoid register reloading */
+			register size_t largestusedblock_=largestusedblock;
+			if(!is_aligned(mem)) return 0;		/* Would fail very rarely as all allocators return aligned blocks */
+			if(mem<leastusedaddress_) return 0;	/* Simple but effective */
+			{
+				mchunkptr p=mem2chunk(mem);
+				mstate fm=0;
+				int ismmapped=is_mmapped(p);
+				if((!ismmapped && !is_inuse(p)) || (p->head & FLAG4_BIT)) return 0;
+				/* Reduced uncertainty by 0.5^2 = 25.0% */
+				/* size should never exceed largestusedblock */
+				if(chunksize(p)>largestusedblock_) return 0;
+				/* Reduced uncertainty by a minimum of 0.5^3 = 12.5%, maximum 0.5^16 = 0.0015% */
+				/* Having sanity checked prev_foot and head, check next block */
+				if(!ismmapped && (!next_pinuse(p) || (next_chunk(p)->head & FLAG4_BIT))) return 0;
+				/* Reduced uncertainty by 0.5^5 = 3.13% or 0.5^18 = 0.00038% */
+	#if 0
+				/* If previous block is free, check that its next block pointer equals us */
+				if(!ismmapped && !pinuse(p))
+					if(next_chunk(prev_chunk(p))!=p) return 0;
+				/* We could start comparing prev_foot's for similarity but it starts getting slow. */
+	#endif
+				fm = get_mstate_for(p);
+				if(!is_aligned(fm) || (void *)fm<leastusedaddress_) return 0;
+				if((size_t)mem-(size_t)fm>=(size_t)1<<(SIZE_T_BITSIZE-1)) return 0;
+				assert(ok_magic(fm));	/* If this fails, someone tried to free a block twice */
+				if(ok_magic(fm))
+					return fm;
+			}
+		}
+//#ifdef WIN32
+//		__except(1) { }
+//#endif
+#endif
+#endif
+#endif
+	}
+	return 0;
+}
+NEDMALLOCNOALIASATTR size_t nedblksize(int *RESTRICT isforeign, void *RESTRICT mem) THROWSPEC
+{
+	if(mem)
+	{
+		if(isforeign) *isforeign=1;
+#if USE_MAGIC_HEADERS
+		{
+			size_t *_mem=(size_t *) mem-3;
+			if(_mem[0]==*(size_t *) "NEDMALOC")
+			{
+				mstate mspace=(mstate) _mem[1];
+				size_t size=_mem[2];
+				if(isforeign) *isforeign=0;
+				return size;
+			}
+		}
+#elif USE_ALLOCATOR==1
+		if(nedblkmstate(mem))
+		{
+			mchunkptr p=mem2chunk(mem);
+			if(isforeign) *isforeign=0;
+			return chunksize(p)-overhead_for(p);
+		}
+#ifdef DEBUG
+		else
+		{
+			int a=1; /* Set breakpoints here if needed */
+		}
+#endif
+#endif
+#if defined(ENABLE_TOLERANT_NEDMALLOC) || USE_ALLOCATOR==0
+#ifdef _MSC_VER
+		/* This is the MSVCRT equivalent */
+		return _msize(mem);
+#elif defined(__linux__)
+		/* This is the glibc/ptmalloc2/dlmalloc equivalent.  */
+		return malloc_usable_size(mem);
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+		/* This is the BSD libc equivalent.  */
+		return malloc_size(mem);
+#else
+#error Cannot tolerate the memory allocator of an unknown system!
+#endif
+#endif
+	}
+	return 0;
+}
+
+NEDMALLOCNOALIASATTR void nedsetvalue(void *v) THROWSPEC											{ nedpsetvalue((nedpool *) 0, v); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmalloc(size_t size) THROWSPEC						{ return nedpmalloc((nedpool *) 0, size); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedcalloc(size_t no, size_t size) THROWSPEC			{ return nedpcalloc((nedpool *) 0, no, size); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedrealloc(void *mem, size_t size) THROWSPEC			{ return nedprealloc((nedpool *) 0, mem, size); }
+NEDMALLOCNOALIASATTR void   nedfree(void *mem) THROWSPEC											{ nedpfree((nedpool *) 0, mem); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmemalign(size_t alignment, size_t bytes) THROWSPEC	{ return nedpmemalign((nedpool *) 0, alignment, bytes); }
+NEDMALLOCNOALIASATTR struct nedmallinfo nedmallinfo(void) THROWSPEC									{ return nedpmallinfo((nedpool *) 0); }
+NEDMALLOCNOALIASATTR int    nedmallopt(int parno, int value) THROWSPEC								{ return nedpmallopt((nedpool *) 0, parno, value); }
+NEDMALLOCNOALIASATTR int    nedmalloc_trim(size_t pad) THROWSPEC									{ return nedpmalloc_trim((nedpool *) 0, pad); }
+void   nedmalloc_stats() THROWSPEC																	{ nedpmalloc_stats((nedpool *) 0); }
+NEDMALLOCNOALIASATTR size_t nedmalloc_footprint() THROWSPEC											{ return nedpmalloc_footprint((nedpool *) 0); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_calloc(size_t elemsno, size_t elemsize, void **chunks) THROWSPEC	{ return nedpindependent_calloc((nedpool *) 0, elemsno, elemsize, chunks); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_comalloc(size_t elems, size_t *sizes, void **chunks) THROWSPEC		{ return nedpindependent_comalloc((nedpool *) 0, elems, sizes, chunks); }
+
+struct threadcacheblk_t;
+typedef struct threadcacheblk_t threadcacheblk;
+struct threadcacheblk_t
+{	/* Keep less than 16 bytes on 32 bit systems and 32 bytes on 64 bit systems */
+#ifdef FULLSANITYCHECKS
+	unsigned int magic;
+#endif
+	unsigned int lastUsed, size;
+	threadcacheblk *next, *prev;
+};
+typedef struct threadcache_t
+{
+#ifdef FULLSANITYCHECKS
+	unsigned int magic1;
+#endif
+	int mymspace;						/* Last mspace entry this thread used */
+	long threadid;
+	unsigned int mallocs, frees, successes;
+	size_t freeInCache;					/* How much free space is stored in this cache */
+	threadcacheblk *bins[(THREADCACHEMAXBINS+1)*2];
+#ifdef FULLSANITYCHECKS
+	unsigned int magic2;
+#endif
+} threadcache;
+struct nedpool_t
+{
+	MLOCK_T mutex;
+	void *uservalue;
+	int threads;						/* Max entries in m to use */
+	threadcache *caches[THREADCACHEMAXCACHES];
+	TLSVAR mycache;						/* Thread cache for this thread. 0 for unset, negative for use mspace-1 directly, otherwise is cache-1 */
+	mstate m[MAXTHREADSINPOOL+1];		/* mspace entries for this pool */
+};
+static nedpool syspool;
+
+static FORCEINLINE NEDMALLOCNOALIASATTR unsigned int size2binidx(size_t _size) THROWSPEC
+{	/* 8=1000	16=10000	20=10100	24=11000	32=100000	48=110000	4096=1000000000000 */
+	unsigned int topbit, size=(unsigned int)(_size>>4);
+	/* 16=1		20=1	24=1	32=10	48=11	64=100	96=110	128=1000	4096=100000000 */
+
+#if defined(__GNUC__)
+        topbit = sizeof(size)*__CHAR_BIT__ - 1 - __builtin_clz(size);
+#elif defined(_MSC_VER) && _MSC_VER>=1300
+	{
+            unsigned long bsrTopBit;
+
+            _BitScanReverse(&bsrTopBit, size);
+
+            topbit = bsrTopBit;
+        }
+#else
+#if 0
+	union {
+		unsigned asInt[2];
+		double asDouble;
+	};
+	int n;
+
+	asDouble = (double)size + 0.5;
+	topbit = (asInt[!FOX_BIGENDIAN] >> 20) - 1023;
+#else
+	{
+		unsigned int x=size;
+		x = x | (x >> 1);
+		x = x | (x >> 2);
+		x = x | (x >> 4);
+		x = x | (x >> 8);
+		x = x | (x >>16);
+		x = ~x;
+		x = x - ((x >> 1) & 0x55555555);
+		x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+		x = (x + (x >> 4)) & 0x0F0F0F0F;
+		x = x + (x << 8);
+		x = x + (x << 16);
+		topbit=31 - (x >> 24);
+	}
+#endif
+#endif
+	return topbit;
+}
+
+
+#ifdef FULLSANITYCHECKS
+static void tcsanitycheck(threadcacheblk **ptr) THROWSPEC
+{
+	assert((ptr[0] && ptr[1]) || (!ptr[0] && !ptr[1]));
+	if(ptr[0] && ptr[1])
+	{
+		assert(nedblksize(ptr[0])>=sizeof(threadcacheblk));
+		assert(nedblksize(ptr[1])>=sizeof(threadcacheblk));
+		assert(*(unsigned int *) "NEDN"==ptr[0]->magic);
+		assert(*(unsigned int *) "NEDN"==ptr[1]->magic);
+		assert(!ptr[0]->prev);
+		assert(!ptr[1]->next);
+		if(ptr[0]==ptr[1])
+		{
+			assert(!ptr[0]->next);
+			assert(!ptr[1]->prev);
+		}
+	}
+}
+static void tcfullsanitycheck(threadcache *tc) THROWSPEC
+{
+	threadcacheblk **tcbptr=tc->bins;
+	int n;
+	for(n=0; n<=THREADCACHEMAXBINS; n++, tcbptr+=2)
+	{
+		threadcacheblk *b, *ob=0;
+		tcsanitycheck(tcbptr);
+		for(b=tcbptr[0]; b; ob=b, b=b->next)
+		{
+			assert(*(unsigned int *) "NEDN"==b->magic);
+			assert(!ob || ob->next==b);
+			assert(!ob || b->prev==ob);
+		}
+	}
+}
+#endif
+
+static NOINLINE void RemoveCacheEntries(nedpool *RESTRICT p, threadcache *RESTRICT tc, unsigned int age) THROWSPEC
+{
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	if(tc->freeInCache)
+	{
+		threadcacheblk **tcbptr=tc->bins;
+		int n;
+		for(n=0; n<=THREADCACHEMAXBINS; n++, tcbptr+=2)
+		{
+			threadcacheblk **tcb=tcbptr+1;		/* come from oldest end of list */
+			/*tcsanitycheck(tcbptr);*/
+			for(; *tcb && tc->frees-(*tcb)->lastUsed>=age; )
+			{
+				threadcacheblk *f=*tcb;
+				size_t blksize=f->size; /*nedblksize(f);*/
+				assert(blksize<=nedblksize(0, f));
+				assert(blksize);
+#ifdef FULLSANITYCHECKS
+				assert(*(unsigned int *) "NEDN"==(*tcb)->magic);
+#endif
+				*tcb=(*tcb)->prev;
+				if(*tcb)
+					(*tcb)->next=0;
+				else
+					*tcbptr=0;
+				tc->freeInCache-=blksize;
+				assert((long) tc->freeInCache>=0);
+				CallFree(0, f, 0);
+				/*tcsanitycheck(tcbptr);*/
+			}
+		}
+	}
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+}
+static void DestroyCaches(nedpool *RESTRICT p) THROWSPEC
+{
+	if(p->caches)
+	{
+		threadcache *tc;
+		int n;
+		for(n=0; n<THREADCACHEMAXCACHES; n++)
+		{
+			if((tc=p->caches[n]))
+			{
+				tc->frees++;
+				RemoveCacheEntries(p, tc, 0);
+				assert(!tc->freeInCache);
+				tc->mymspace=-1;
+				tc->threadid=0;
+				CallFree(0, tc, 0);
+				p->caches[n]=0;
+			}
+		}
+	}
+}
+
+static NOINLINE threadcache *AllocCache(nedpool *RESTRICT p) THROWSPEC
+{
+	threadcache *tc=0;
+	int n, end;
+	ACQUIRE_LOCK(&p->mutex);
+	for(n=0; n<THREADCACHEMAXCACHES && p->caches[n]; n++);
+	if(THREADCACHEMAXCACHES==n)
+	{	/* List exhausted, so disable for this thread */
+		RELEASE_LOCK(&p->mutex);
+		return 0;
+	}
+	tc=p->caches[n]=(threadcache *) CallCalloc(p->m[0], sizeof(threadcache), 0);
+	if(!tc)
+	{
+		RELEASE_LOCK(&p->mutex);
+		return 0;
+	}
+#ifdef FULLSANITYCHECKS
+	tc->magic1=*(unsigned int *)"NEDMALC1";
+	tc->magic2=*(unsigned int *)"NEDMALC2";
+#endif
+	tc->threadid=(long)(size_t)CURRENT_THREAD;
+	for(end=0; p->m[end]; end++);
+	tc->mymspace=abs(tc->threadid) % end;
+	RELEASE_LOCK(&p->mutex);
+	if(TLSSET(p->mycache, (void *)(size_t)(n+1))) abort();
+	return tc;
+}
+
+static void *threadcache_malloc(nedpool *RESTRICT p, threadcache *RESTRICT tc, size_t *RESTRICT _size) THROWSPEC
+{
+	void *RESTRICT ret=0;
+	size_t size=*_size, blksize=0;
+	unsigned int bestsize;
+	unsigned int idx=size2binidx(size);
+	threadcacheblk *RESTRICT blk, **RESTRICT binsptr;
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	/* Calculate best fit bin size */
+	bestsize=1<<(idx+4);
+#if 0
+	/* Finer grained bin fit */
+	idx<<=1;
+	if(size>bestsize)
+	{
+		idx++;
+		bestsize+=bestsize>>1;
+	}
+	if(size>bestsize)
+	{
+		idx++;
+		bestsize=1<<(4+(idx>>1));
+	}
+#else
+	if(size>bestsize)
+	{
+		idx++;
+		bestsize<<=1;
+	}
+#endif
+	assert(bestsize>=size);
+	if(size<bestsize) size=bestsize;
+	assert(size<=THREADCACHEMAX);
+	assert(idx<=THREADCACHEMAXBINS);
+	binsptr=&tc->bins[idx*2];
+	/* Try to match close, but move up a bin if necessary */
+	blk=*binsptr;
+	if(!blk || blk->size<size)
+	{	/* Bump it up a bin */
+		if(idx<THREADCACHEMAXBINS)
+		{
+			idx++;
+			binsptr+=2;
+			blk=*binsptr;
+		}
+	}
+	if(blk)
+	{
+		blksize=blk->size; /*nedblksize(blk);*/
+		assert(nedblksize(0, blk)>=blksize);
+		assert(blksize>=size);
+		if(blk->next)
+			blk->next->prev=0;
+		*binsptr=blk->next;
+		if(!*binsptr)
+			binsptr[1]=0;
+#ifdef FULLSANITYCHECKS
+		blk->magic=0;
+#endif
+		assert(binsptr[0]!=blk && binsptr[1]!=blk);
+		assert(nedblksize(0, blk)>=sizeof(threadcacheblk) && nedblksize(0, blk)<=THREADCACHEMAX+CHUNK_OVERHEAD);
+		/*printf("malloc: %p, %p, %p, %lu\n", p, tc, blk, (long) _size);*/
+		ret=(void *) blk;
+	}
+	++tc->mallocs;
+	if(ret)
+	{
+		assert(blksize>=size);
+		++tc->successes;
+		tc->freeInCache-=blksize;
+		assert((long) tc->freeInCache>=0);
+	}
+#if defined(DEBUG) && 0
+	if(!(tc->mallocs & 0xfff))
+	{
+		printf("*** threadcache=%u, mallocs=%u (%f), free=%u (%f), freeInCache=%u\n", (unsigned int) tc->threadid, tc->mallocs,
+			(float) tc->successes/tc->mallocs, tc->frees, (float) tc->successes/tc->frees, (unsigned int) tc->freeInCache);
+	}
+#endif
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	*_size=size;
+	return ret;
+}
+static NOINLINE void ReleaseFreeInCache(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace) THROWSPEC
+{
+	unsigned int age=THREADCACHEMAXFREESPACE/8192;
+	/*ACQUIRE_LOCK(&p->m[mymspace]->mutex);*/
+	while(age && tc->freeInCache>=THREADCACHEMAXFREESPACE)
+	{
+		RemoveCacheEntries(p, tc, age);
+		/*printf("*** Removing cache entries older than %u (%u)\n", age, (unsigned int) tc->freeInCache);*/
+		age>>=1;
+	}
+	/*RELEASE_LOCK(&p->m[mymspace]->mutex);*/
+}
+static void threadcache_free(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace, void *RESTRICT mem, size_t size) THROWSPEC
+{
+	unsigned int bestsize;
+	unsigned int idx=size2binidx(size);
+	threadcacheblk **RESTRICT binsptr, *RESTRICT tck=(threadcacheblk *) mem;
+	assert(size>=sizeof(threadcacheblk) && size<=THREADCACHEMAX+CHUNK_OVERHEAD);
+#ifdef DEBUG
+	/* Make sure this is a valid memory block */
+	assert(nedblksize(0, mem));
+#endif
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	/* Calculate best fit bin size */
+	bestsize=1<<(idx+4);
+#if 0
+	/* Finer grained bin fit */
+	idx<<=1;
+	if(size>bestsize)
+	{
+		unsigned int biggerbestsize=bestsize+bestsize<<1;
+		if(size>=biggerbestsize)
+		{
+			idx++;
+			bestsize=biggerbestsize;
+		}
+	}
+#endif
+	if(bestsize!=size)	/* dlmalloc can round up, so we round down to preserve indexing */
+		size=bestsize;
+	binsptr=&tc->bins[idx*2];
+	assert(idx<=THREADCACHEMAXBINS);
+	if(tck==*binsptr)
+	{
+		fprintf(stderr, "nedmalloc: Attempt to free already freed memory block %p - aborting!\n", tck);
+		abort();
+	}
+#ifdef FULLSANITYCHECKS
+	tck->magic=*(unsigned int *) "NEDN";
+#endif
+	tck->lastUsed=++tc->frees;
+	tck->size=(unsigned int) size;
+	tck->next=*binsptr;
+	tck->prev=0;
+	if(tck->next)
+		tck->next->prev=tck;
+	else
+		binsptr[1]=tck;
+	assert(!*binsptr || (*binsptr)->size==tck->size);
+	*binsptr=tck;
+	assert(tck==tc->bins[idx*2]);
+	assert(tc->bins[idx*2+1]==tck || binsptr[0]->next->prev==tck);
+	/*printf("free: %p, %p, %p, %lu\n", p, tc, mem, (long) size);*/
+	tc->freeInCache+=size;
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+#if 1
+	if(tc->freeInCache>=THREADCACHEMAXFREESPACE)
+		ReleaseFreeInCache(p, tc, mymspace);
+#endif
+}
+
+
+
+
+static NOINLINE int InitPool(nedpool *RESTRICT p, size_t capacity, int threads) THROWSPEC
+{	/* threads is -1 for system pool */
+	ensure_initialization();
+	ACQUIRE_MALLOC_GLOBAL_LOCK();
+	if(p->threads) goto done;
+	if(INITIAL_LOCK(&p->mutex)) goto err;
+	if(TLSALLOC(&p->mycache)) goto err;
+#if USE_ALLOCATOR==0
+	p->m[0]=(mstate) mspacecounter++;
+#elif USE_ALLOCATOR==1
+	if(!(p->m[0]=(mstate) create_mspace(capacity, 1))) goto err;
+	p->m[0]->extp=p;
+#endif
+	p->threads=(threads<1 || threads>MAXTHREADSINPOOL) ? MAXTHREADSINPOOL : threads;
+done:
+	RELEASE_MALLOC_GLOBAL_LOCK();
+	return 1;
+err:
+	if(threads<0)
+		abort();			/* If you can't allocate for system pool, we're screwed */
+	DestroyCaches(p);
+	if(p->m[0])
+	{
+#if USE_ALLOCATOR==1
+		destroy_mspace(p->m[0]);
+#endif
+		p->m[0]=0;
+	}
+	if(p->mycache)
+	{
+		if(TLSFREE(p->mycache)) abort();
+		p->mycache=0;
+	}
+	RELEASE_MALLOC_GLOBAL_LOCK();
+	return 0;
+}
+static NOINLINE mstate FindMSpace(nedpool *RESTRICT p, threadcache *RESTRICT tc, int *RESTRICT lastUsed, size_t size) THROWSPEC
+{	/* Gets called when thread's last used mspace is in use. The strategy
+	is to run through the list of all available mspaces looking for an
+	unlocked one and if we fail, we create a new one so long as we don't
+	exceed p->threads */
+	int n, end;
+	for(n=end=*lastUsed+1; p->m[n]; end=++n)
+	{
+		if(TRY_LOCK(&p->m[n]->mutex)) goto found;
+	}
+	for(n=0; n<*lastUsed && p->m[n]; n++)
+	{
+		if(TRY_LOCK(&p->m[n]->mutex)) goto found;
+	}
+	if(end<p->threads)
+	{
+		mstate temp;
+#if USE_ALLOCATOR==0
+		temp=(mstate) mspacecounter++;
+#elif USE_ALLOCATOR==1
+		if(!(temp=(mstate) create_mspace(size, 1)))
+			goto badexit;
+#endif
+		/* Now we're ready to modify the lists, we lock */
+		ACQUIRE_LOCK(&p->mutex);
+		while(p->m[end] && end<p->threads)
+			end++;
+		if(end>=p->threads)
+		{	/* Drat, must destroy it now */
+			RELEASE_LOCK(&p->mutex);
+#if USE_ALLOCATOR==1
+			destroy_mspace((mstate) temp);
+#endif
+			goto badexit;
+		}
+		/* We really want to make sure this goes into memory now but we
+		have to be careful of breaking aliasing rules, so write it twice */
+		*((volatile struct malloc_state **) &p->m[end])=p->m[end]=temp;
+		ACQUIRE_LOCK(&p->m[end]->mutex);
+		/*printf("Created mspace idx %d\n", end);*/
+		RELEASE_LOCK(&p->mutex);
+		n=end;
+		goto found;
+	}
+	/* Let it lock on the last one it used */
+badexit:
+	ACQUIRE_LOCK(&p->m[*lastUsed]->mutex);
+	return p->m[*lastUsed];
+found:
+	*lastUsed=n;
+	if(tc)
+		tc->mymspace=n;
+	else
+	{
+		if(TLSSET(p->mycache, (void *)(size_t)(-(n+1)))) abort();
+	}
+	return p->m[n];
+}
+
+typedef struct PoolList_t
+{
+	size_t size;			/* Size of list */
+	size_t length;			/* Actual entries in list */
+#ifdef DEBUG
+	nedpool *list[1];		/* Force testing of list expansion */
+#else
+	nedpool *list[16];
+#endif
+} PoolList;
+static MLOCK_T poollistlock;
+static PoolList *poollist;
+NEDMALLOCPTRATTR nedpool *nedcreatepool(size_t capacity, int threads) THROWSPEC
+{
+	nedpool *ret=0;
+	if(!poollist)
+	{
+		PoolList *newpoollist=0;
+		if(!(newpoollist=(PoolList *) nedpcalloc(0, 1, sizeof(PoolList)+sizeof(nedpool *)))) return 0;
+		INITIAL_LOCK(&poollistlock);
+		ACQUIRE_LOCK(&poollistlock);
+		poollist=newpoollist;
+		poollist->size=sizeof(poollist->list)/sizeof(nedpool *);
+	}
+	else
+		ACQUIRE_LOCK(&poollistlock);
+	if(poollist->length==poollist->size)
+	{
+		PoolList *newpoollist=0;
+		size_t newsize=0;
+		newsize=sizeof(PoolList)+(poollist->size+1)*sizeof(nedpool *);
+		if(!(newpoollist=(PoolList *) nedprealloc(0, poollist, newsize))) goto badexit;
+		poollist=newpoollist;
+		memset(&poollist->list[poollist->size], 0, newsize-((size_t)&poollist->list[poollist->size]-(size_t)&poollist->list[0]));
+		poollist->size=((newsize-((char *)&poollist->list[0]-(char *)poollist))/sizeof(nedpool *))-1;
+		assert(poollist->size>poollist->length);
+	}
+	if(!(ret=(nedpool *) nedpcalloc(0, 1, sizeof(nedpool)))) goto badexit;
+	if(!InitPool(ret, capacity, threads))
+	{
+		nedpfree(0, ret);
+		goto badexit;
+	}
+	poollist->list[poollist->length++]=ret;
+badexit:
+	RELEASE_LOCK(&poollistlock);
+	return ret;
+}
+void neddestroypool(nedpool *p) THROWSPEC
+{
+	unsigned int n;
+	ACQUIRE_LOCK(&p->mutex);
+	DestroyCaches(p);
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1
+		destroy_mspace(p->m[n]);
+#endif
+		p->m[n]=0;
+	}
+	RELEASE_LOCK(&p->mutex);
+	if(TLSFREE(p->mycache)) abort();
+	nedpfree(0, p);
+	ACQUIRE_LOCK(&poollistlock);
+	assert(poollist);
+	for(n=0; n<poollist->length && poollist->list[n]!=p; n++);
+	assert(n!=poollist->length);
+	memmove(&poollist->list[n], &poollist->list[n+1], (size_t)&poollist->list[poollist->length]-(size_t)&poollist->list[n]);
+	if(!--poollist->length)
+	{
+		assert(!poollist->list[0]);
+		nedpfree(0, poollist);
+		poollist=0;
+	}
+	RELEASE_LOCK(&poollistlock);
+}
+void neddestroysyspool() THROWSPEC
+{
+	nedpool *p=&syspool;
+	int n;
+	ACQUIRE_LOCK(&p->mutex);
+	DestroyCaches(p);
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1
+		destroy_mspace(p->m[n]);
+#endif
+		p->m[n]=0;
+	}
+	/* Render syspool unusable */
+	for(n=0; n<THREADCACHEMAXCACHES; n++)
+		p->caches[n]=(threadcache *)(size_t)(sizeof(size_t)>4 ? 0xdeadbeefdeadbeefULL : 0xdeadbeefUL);
+	for(n=0; n<MAXTHREADSINPOOL+1; n++)
+		p->m[n]=(mstate)(size_t)(sizeof(size_t)>4 ? 0xdeadbeefdeadbeefULL : 0xdeadbeefUL);
+	if(TLSFREE(p->mycache)) abort();
+	RELEASE_LOCK(&p->mutex);
+}
+nedpool **nedpoollist() THROWSPEC
+{
+	nedpool **ret=0;
+	if(poollist)
+	{
+		ACQUIRE_LOCK(&poollistlock);
+		if(!(ret=(nedpool **) nedmalloc((poollist->length+1)*sizeof(nedpool *)))) goto badexit;
+		memcpy(ret, poollist->list, (poollist->length+1)*sizeof(nedpool *));
+badexit:
+		RELEASE_LOCK(&poollistlock);
+	}
+	return ret;
+}
+
+void nedpsetvalue(nedpool *p, void *v) THROWSPEC
+{
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	p->uservalue=v;
+}
+void *nedgetvalue(nedpool **p, void *mem) THROWSPEC
+{
+	nedpool *np=0;
+	mstate fm=nedblkmstate(mem);
+	if(!fm || !fm->extp) return 0;
+	np=(nedpool *) fm->extp;
+	if(p) *p=np;
+	return np->uservalue;
+}
+
+void nedtrimthreadcache(nedpool *p, int disable) THROWSPEC
+{
+	int mycache;
+	if(!p)
+	{
+		p=&syspool;
+		if(!syspool.threads) InitPool(&syspool, 0, -1);
+	}
+	mycache=(int)(size_t) TLSGET(p->mycache);
+	if(!mycache)
+	{	/* Set to mspace 0 */
+		if(disable && TLSSET(p->mycache, (void *)(size_t)-1)) abort();
+	}
+	else if(mycache>0)
+	{	/* Set to last used mspace */
+		threadcache *tc=p->caches[mycache-1];
+#if defined(DEBUG)
+		printf("Threadcache utilisation: %lf%% in cache with %lf%% lost to other threads\n",
+			100.0*tc->successes/tc->mallocs, 100.0*((double) tc->mallocs-tc->frees)/tc->mallocs);
+#endif
+		if(disable && TLSSET(p->mycache, (void *)(size_t)(-tc->mymspace))) abort();
+		tc->frees++;
+		RemoveCacheEntries(p, tc, 0);
+		assert(!tc->freeInCache);
+		if(disable)
+		{
+			tc->mymspace=-1;
+			tc->threadid=0;
+			CallFree(0, p->caches[mycache-1], 0);
+			p->caches[mycache-1]=0;
+		}
+	}
+}
+void neddisablethreadcache(nedpool *p) THROWSPEC
+{
+	nedtrimthreadcache(p, 1);
+}
+
+#define GETMSPACE(m,p,tc,ms,s,action)                 \
+  do                                                  \
+  {                                                   \
+    mstate m = GetMSpace((p),(tc),(ms),(s));          \
+    action;                                           \
+	if(USE_ALLOCATOR==1) { RELEASE_LOCK(&m->mutex); } \
+  } while (0)
+
+static FORCEINLINE mstate GetMSpace(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace, size_t size) THROWSPEC
+{	/* Returns a locked and ready for use mspace */
+	mstate m=p->m[mymspace];
+	assert(m);
+#if USE_ALLOCATOR==1
+	if(!TRY_LOCK(&p->m[mymspace]->mutex)) m=FindMSpace(p, tc, &mymspace, size);
+	/*assert(IS_LOCKED(&p->m[mymspace]->mutex));*/
+#endif
+	return m;
+}
+static NOINLINE void GetThreadCache_cold1(nedpool *RESTRICT *RESTRICT p) THROWSPEC
+{
+	*p=&syspool;
+	if(!syspool.threads) InitPool(&syspool, 0, -1);
+}
+static NOINLINE void GetThreadCache_cold2(nedpool *RESTRICT *RESTRICT p, threadcache *RESTRICT *RESTRICT tc, int *RESTRICT mymspace, int mycache) THROWSPEC
+{
+	if(!mycache)
+	{	/* Need to allocate a new cache */
+		*tc=AllocCache(*p);
+		if(!*tc)
+		{	/* Disable */
+			if(TLSSET((*p)->mycache, (void *)(size_t)-1)) abort();
+			*mymspace=0;
+		}
+		else
+			*mymspace=(*tc)->mymspace;
+	}
+	else
+	{	/* Cache disabled, but we do have an assigned thread pool */
+		*tc=0;
+		*mymspace=-mycache-1;
+	}
+}
+static FORCEINLINE void GetThreadCache(nedpool *RESTRICT *RESTRICT p, threadcache *RESTRICT *RESTRICT tc, int *RESTRICT mymspace, size_t *RESTRICT size) THROWSPEC
+{
+	int mycache;
+	if(size && *size<sizeof(threadcacheblk)) *size=sizeof(threadcacheblk);
+	if(!*p)
+		GetThreadCache_cold1(p);
+	mycache=(int)(size_t) TLSGET((*p)->mycache);
+	if(mycache>0)
+	{	/* Already have a cache */
+		*tc=(*p)->caches[mycache-1];
+		*mymspace=(*tc)->mymspace;
+	}
+	else GetThreadCache_cold2(p, tc, mymspace, mycache);
+	assert(*mymspace>=0);
+	assert(!(*tc) || (long)(size_t)CURRENT_THREAD==(*tc)->threadid);
+#ifdef FULLSANITYCHECKS
+	if(*tc)
+	{
+		if(*(unsigned int *)"NEDMALC1"!=(*tc)->magic1 || *(unsigned int *)"NEDMALC2"!=(*tc)->magic2)
+		{
+			abort();
+		}
+	}
+#endif
+}
+
+NEDMALLOCPTRATTR void * nedpmalloc(nedpool *p, size_t size) THROWSPEC
+{
+	void *ret=0;
+	threadcache *tc;
+	int mymspace;
+	GetThreadCache(&p, &tc, &mymspace, &size);
+#if THREADCACHEMAX
+	if(tc && size<=THREADCACHEMAX)
+	{	/* Use the thread cache */
+		ret=threadcache_malloc(p, tc, &size);
+	}
+#endif
+	if(!ret)
+	{	/* Use this thread's mspace */
+        GETMSPACE(m, p, tc, mymspace, size,
+                  ret=CallMalloc(m, size, 0));
+	}
+	return ret;
+}
+NEDMALLOCPTRATTR void * nedpcalloc(nedpool *p, size_t no, size_t size) THROWSPEC
+{
+	size_t rsize=size*no;
+	void *ret=0;
+	threadcache *tc;
+	int mymspace;
+	GetThreadCache(&p, &tc, &mymspace, &rsize);
+#if THREADCACHEMAX
+	if(tc && rsize<=THREADCACHEMAX)
+	{	/* Use the thread cache */
+		if((ret=threadcache_malloc(p, tc, &rsize)))
+			memset(ret, 0, rsize);
+	}
+#endif
+	if(!ret)
+	{	/* Use this thread's mspace */
+        GETMSPACE(m, p, tc, mymspace, rsize,
+                  ret=CallCalloc(m, rsize, 0));
+	}
+	return ret;
+}
+NEDMALLOCPTRATTR void * nedprealloc(nedpool *p, void *mem, size_t size) THROWSPEC
+{
+	void *ret=0;
+	threadcache *tc;
+	int mymspace, isforeign=1;
+	size_t memsize;
+	if(!mem) return nedpmalloc(p, size);
+	memsize=nedblksize(&isforeign, mem);
+	assert(memsize);
+	if(!memsize)
+	{
+		fprintf(stderr, "nedmalloc: nedprealloc() called with a block not created by nedmalloc!\n");
+		abort();
+	}
+	else if(size<=memsize && memsize-size<
+#ifdef DEBUG
+		32
+#else
+		1024
+#endif
+		)		/* If realloc size is within 1Kb smaller than existing, noop it */
+		return mem;
+	GetThreadCache(&p, &tc, &mymspace, &size);
+#if THREADCACHEMAX
+	if(tc && size && size<=THREADCACHEMAX)
+	{	/* Use the thread cache */
+		if((ret=threadcache_malloc(p, tc, &size)))
+		{
+			memcpy(ret, mem, memsize<size ? memsize : size);
+			if(memsize>=sizeof(threadcacheblk) && memsize<=(THREADCACHEMAX+CHUNK_OVERHEAD))
+				threadcache_free(p, tc, mymspace, mem, memsize);
+			else
+				CallFree(0, mem, isforeign);
+		}
+	}
+#endif
+	if(!ret)
+	{	/* Reallocs always happen in the mspace they happened in, so skip
+		locking the preferred mspace for this thread */
+		ret=CallRealloc(p->m[mymspace], mem, isforeign, memsize, size);
+	}
+	return ret;
+}
+void   nedpfree(nedpool *p, void *mem) THROWSPEC
+{	/* Frees always happen in the mspace they happened in, so skip
+	locking the preferred mspace for this thread */
+	threadcache *tc;
+	int mymspace, isforeign=1;
+	size_t memsize;
+	if(!mem)
+	{	/* If you tried this on FreeBSD you'd be sorry! */
+#ifdef DEBUG
+		fprintf(stderr, "nedmalloc: WARNING nedpfree() called with zero. This is not portable behaviour!\n");
+#endif
+		return;
+	}
+	memsize=nedblksize(&isforeign, mem);
+	assert(memsize);
+	if(!memsize)
+	{
+		fprintf(stderr, "nedmalloc: nedpfree() called with a block not created by nedmalloc!\n");
+		abort();
+	}
+	GetThreadCache(&p, &tc, &mymspace, 0);
+#if THREADCACHEMAX
+	if(mem && tc && memsize>=sizeof(threadcacheblk) && memsize<=(THREADCACHEMAX+CHUNK_OVERHEAD))
+		threadcache_free(p, tc, mymspace, mem, memsize);
+	else
+#endif
+		CallFree(0, mem, isforeign);
+}
+NEDMALLOCPTRATTR void * nedpmemalign(nedpool *p, size_t alignment, size_t bytes) THROWSPEC
+{
+	void *ret;
+	threadcache *tc;
+	int mymspace;
+	GetThreadCache(&p, &tc, &mymspace, &bytes);
+	{	/* Use this thread's mspace */
+        GETMSPACE(m, p, tc, mymspace, bytes,
+                  ret=CallMalloc(m, bytes, alignment));
+	}
+	return ret;
+}
+struct nedmallinfo nedpmallinfo(nedpool *p) THROWSPEC
+{
+	int n;
+	struct nedmallinfo ret={0};
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1 && !NO_MALLINFO
+		struct mallinfo t=mspace_mallinfo(p->m[n]);
+		ret.arena+=t.arena;
+		ret.ordblks+=t.ordblks;
+		ret.hblkhd+=t.hblkhd;
+		ret.usmblks+=t.usmblks;
+		ret.uordblks+=t.uordblks;
+		ret.fordblks+=t.fordblks;
+		ret.keepcost+=t.keepcost;
+#endif
+	}
+	return ret;
+}
+int    nedpmallopt(nedpool *p, int parno, int value) THROWSPEC
+{
+#if USE_ALLOCATOR==1
+	return mspace_mallopt(parno, value);
+#else
+	return 0;
+#endif
+}
+NEDMALLOCNOALIASATTR void*  nedmalloc_internals(size_t *granularity, size_t *magic) THROWSPEC
+{
+#if USE_ALLOCATOR==1
+	if(granularity) *granularity=mparams.granularity;
+	if(magic) *magic=mparams.magic;
+	return (void *) &syspool;
+#else
+	if(granularity) *granularity=0;
+	if(magic) *magic=0;
+	return 0;
+#endif
+}
+int    nedpmalloc_trim(nedpool *p, size_t pad) THROWSPEC
+{
+	int n, ret=0;
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1
+		ret+=mspace_trim(p->m[n], pad);
+#endif
+	}
+	return ret;
+}
+void   nedpmalloc_stats(nedpool *p) THROWSPEC
+{
+	int n;
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1
+		mspace_malloc_stats(p->m[n]);
+#endif
+	}
+}
+size_t nedpmalloc_footprint(nedpool *p) THROWSPEC
+{
+	size_t ret=0;
+	int n;
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1
+		ret+=mspace_footprint(p->m[n]);
+#endif
+	}
+	return ret;
+}
+NEDMALLOCPTRATTR void **nedpindependent_calloc(nedpool *p, size_t elemsno, size_t elemsize, void **chunks) THROWSPEC
+{
+	void **ret;
+	threadcache *tc;
+	int mymspace;
+	GetThreadCache(&p, &tc, &mymspace, &elemsize);
+#if USE_ALLOCATOR==0
+    GETMSPACE(m, p, tc, mymspace, elemsno*elemsize,
+              ret=unsupported_operation("independent_calloc"));
+#elif USE_ALLOCATOR==1
+    GETMSPACE(m, p, tc, mymspace, elemsno*elemsize,
+              ret=mspace_independent_calloc(m, elemsno, elemsize, chunks));
+#endif
+	return ret;
+}
+NEDMALLOCPTRATTR void **nedpindependent_comalloc(nedpool *p, size_t elems, size_t *sizes, void **chunks) THROWSPEC
+{
+	void **ret;
+	threadcache *tc;
+	int mymspace;
+    size_t i, *adjustedsizes=(size_t *) alloca(elems*sizeof(size_t));
+    if(!adjustedsizes) return 0;
+    for(i=0; i<elems; i++)
+        adjustedsizes[i]=sizes[i]<sizeof(threadcacheblk) ? sizeof(threadcacheblk) : sizes[i];
+	GetThreadCache(&p, &tc, &mymspace, 0);
+#if USE_ALLOCATOR==0
+	GETMSPACE(m, p, tc, mymspace, 0,
+              ret=unsupported_operation("independent_comalloc"));
+#elif USE_ALLOCATOR==1
+	GETMSPACE(m, p, tc, mymspace, 0,
+              ret=mspace_independent_comalloc(m, elems, adjustedsizes, chunks));
+#endif
+	return ret;
+}
+
+#if defined(__cplusplus)
+}
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif

+ 302 - 302
drivers/nedmalloc/nedmalloc.h

@@ -1,302 +1,302 @@
-#ifdef NEDMALLOC_ENABLED
-
-/* nedalloc, an alternative malloc implementation for multiple threads without
-lock contention based on dlmalloc v2.8.3. (C) 2005-2009 Niall Douglas
-
-Boost Software License - Version 1.0 - August 17th, 2003
-
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-#ifndef NEDMALLOC_H
-#define NEDMALLOC_H
-
-#include "typedefs.h"
-#define MALLOC_ALIGNMENT DEFAULT_ALIGNMENT
-
-#ifdef PSP_ENABLED
-#define USE_LOCKS 0
-#define HAVE_MMAP 0
-#endif
-
-/* See malloc.c.h for what each function does.
-
-REPLACE_SYSTEM_ALLOCATOR on POSIX causes nedalloc's functions to be called
-malloc, free etc. instead of nedmalloc, nedfree etc. You may or may not want
-this. On Windows it causes nedmalloc to patch all loaded DLLs and binaries
-to replace usage of the system allocator.
-
-NO_NED_NAMESPACE prevents the functions from being defined in the nedalloc
-namespace when in C++ (uses the global namespace instead).
-
-NEDMALLOCEXTSPEC can be defined to be __declspec(dllexport) or
-__attribute__ ((visibility("default"))) or whatever you like. It defaults
-to extern unless NEDMALLOC_DLL_EXPORTS is set as it would be when building
-nedmalloc.dll.
-
-USE_LOCKS can be 2 if you want to define your own MLOCK_T, INITIAL_LOCK,
-ACQUIRE_LOCK, RELEASE_LOCK, TRY_LOCK, IS_LOCKED and NULL_LOCK_INITIALIZER.
-
-NEDMALLOC_DEBUG can be defined to cause DEBUG to be set differently for nedmalloc
-than for the rest of the build. Remember to set NDEBUG to disable all assertion
-checking too.
-
-USE_MAGIC_HEADERS causes nedalloc to allocate an extra three sizeof(size_t)
-to each block. nedpfree() and nedprealloc() can then automagically know when
-to free a system allocated block. Enabling this typically adds 20-50% to
-application memory usage.
-
-ENABLE_TOLERANT_NEDMALLOC is automatically turned on if REPLACE_SYSTEM_ALLOCATOR
-is set or the Windows DLL is being built. This causes nedmalloc to detect when a
-system allocator block is passed to it and to handle it appropriately. Note that
-without USE_MAGIC_HEADERS there is a very tiny chance that nedmalloc will segfault
-on non-Windows builds (it uses Win32 SEH to trap segfaults on Windows and there
-is no comparable system on POSIX).
-
-USE_ALLOCATOR can be one of these settings (it defaults to 1):
-  0: System allocator (nedmalloc now simply acts as a threadcache).
-     WARNING: Intended for DEBUG USE ONLY - not all functions work correctly.
-  1: dlmalloc
-
-ENABLE_LARGE_PAGES enables support for requesting memory from the system in large
-(typically >=2Mb) pages if the host OS supports this. These occupy just a single
-TLB entry and can significantly improve performance in large working set applications.
-
-ENABLE_FAST_HEAP_DETECTION enables special logic to detect blocks allocated
-by the system heap. This avoids 1.5%-2% overhead when checking for non-nedmalloc
-blocks, but it assumes that the NT and glibc heaps function in a very specific
-fashion which may not hold true across OS upgrades.
-*/
-
-#include <stddef.h>   /* for size_t */
-
-#ifndef NEDMALLOCEXTSPEC
- #ifdef NEDMALLOC_DLL_EXPORTS
-  #ifdef WIN32
-   #define NEDMALLOCEXTSPEC extern __declspec(dllexport)
-  #elif defined(__GNUC__)
-   #define NEDMALLOCEXTSPEC extern __attribute__ ((visibility("default")))
-  #endif
-  #ifndef ENABLE_TOLERANT_NEDMALLOC
-   #define ENABLE_TOLERANT_NEDMALLOC 1
-  #endif
- #else
-  #define NEDMALLOCEXTSPEC extern
- #endif
-#endif
-
-#if __STDC_VERSION__ >= 199901L		/* C99 or better */
- #define RESTRICT restrict
-#else
- #if defined(_MSC_VER) && _MSC_VER>=1400
-  #define RESTRICT __restrict
- #endif
- #ifdef __GNUC__
-  #define RESTRICT __restrict
- #endif
-#endif
-#ifndef RESTRICT
- #define RESTRICT
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER>=1400
- #define NEDMALLOCPTRATTR __declspec(restrict)
- #define NEDMALLOCNOALIASATTR __declspec(noalias)
-#endif
-#ifdef __GNUC__
- #define NEDMALLOCPTRATTR __attribute__ ((malloc))
-#endif
-#ifndef NEDMALLOCPTRATTR
- #define NEDMALLOCPTRATTR
-#endif
-#ifndef NEDMALLOCNOALIASATTR
- #define NEDMALLOCNOALIASATTR
-#endif
-
-#ifndef USE_MAGIC_HEADERS
- #define USE_MAGIC_HEADERS 0
-#endif
-
-#ifndef USE_ALLOCATOR
- #define USE_ALLOCATOR 1 /* dlmalloc */
-#endif
-
-#if !USE_ALLOCATOR && !USE_MAGIC_HEADERS
-#error If you are using the system allocator then you MUST use magic headers
-#endif
-
-#ifdef REPLACE_SYSTEM_ALLOCATOR
- #if USE_ALLOCATOR==0
-  #error Cannot combine using the system allocator with replacing the system allocator
- #endif
- #ifndef ENABLE_TOLERANT_NEDMALLOC
-  #define ENABLE_TOLERANT_NEDMALLOC 1
- #endif
- #ifndef WIN32	/* We have a dedicated patcher for Windows */
-  #define nedmalloc               malloc
-  #define nedcalloc               calloc
-  #define nedrealloc              realloc
-  #define nedfree                 free
-  #define nedmemalign             memalign
-  #define nedmallinfo             mallinfo
-  #define nedmallopt              mallopt
-  #define nedmalloc_trim          malloc_trim
-  #define nedmalloc_stats         malloc_stats
-  #define nedmalloc_footprint     malloc_footprint
-  #define nedindependent_calloc   independent_calloc
-  #define nedindependent_comalloc independent_comalloc
-  #ifdef _MSC_VER
-   #define nedblksize              _msize
-  #endif
- #endif
-#endif
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-struct nedmallinfo {
-  size_t arena;    /* non-mmapped space allocated from system */
-  size_t ordblks;  /* number of free chunks */
-  size_t smblks;   /* always 0 */
-  size_t hblks;    /* always 0 */
-  size_t hblkhd;   /* space in mmapped regions */
-  size_t usmblks;  /* maximum total allocated space */
-  size_t fsmblks;  /* always 0 */
-  size_t uordblks; /* total allocated space */
-  size_t fordblks; /* total free space */
-  size_t keepcost; /* releasable (via malloc_trim) space */
-};
-#if defined(__cplusplus)
-}
-#endif
-
-#if defined(__cplusplus)
- #if !defined(NO_NED_NAMESPACE)
-namespace nedalloc {
- #else
-extern "C" {
- #endif
- #define THROWSPEC throw()
-#else
- #define THROWSPEC
-#endif
-
-/* These are the global functions */
-
-/* Gets the usable size of an allocated block. Note this will always be bigger than what was
-asked for due to rounding etc. Optionally returns 1 in isforeign if the block came from the
-system allocator - note that there is a small (>0.01%) but real chance of segfault on non-Windows
-systems when passing non-nedmalloc blocks if you don't use USE_MAGIC_HEADERS.
-*/
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR size_t nedblksize(int *RESTRICT isforeign, void *RESTRICT mem) THROWSPEC;
-
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR void nedsetvalue(void *v) THROWSPEC;
-
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmalloc(size_t size) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedcalloc(size_t no, size_t size) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedrealloc(void *mem, size_t size) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR void   nedfree(void *mem) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmemalign(size_t alignment, size_t bytes) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR struct nedmallinfo nedmallinfo(void) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR int    nedmallopt(int parno, int value) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR void*  nedmalloc_internals(size_t *granularity, size_t *magic) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR int    nedmalloc_trim(size_t pad) THROWSPEC;
-NEDMALLOCEXTSPEC void   nedmalloc_stats(void) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR size_t nedmalloc_footprint(void) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_calloc(size_t elemsno, size_t elemsize, void **chunks) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_comalloc(size_t elems, size_t *sizes, void **chunks) THROWSPEC;
-
-/* Destroys the system memory pool used by the functions above.
-Useful for when you have nedmalloc in a DLL you're about to unload.
-If you call ANY nedmalloc functions after calling this you will
-get a fatal exception!
-*/
-NEDMALLOCEXTSPEC void neddestroysyspool() THROWSPEC;
-
-/* These are the pool functions */
-struct nedpool_t;
-typedef struct nedpool_t nedpool;
-
-/* Creates a memory pool for use with the nedp* functions below.
-Capacity is how much to allocate immediately (if you know you'll be allocating a lot
-of memory very soon) which you can leave at zero. Threads specifies how many threads
-will *normally* be accessing the pool concurrently. Setting this to zero means it
-extends on demand, but be careful of this as it can rapidly consume system resources
-where bursts of concurrent threads use a pool at once.
-*/
-NEDMALLOCEXTSPEC NEDMALLOCPTRATTR nedpool *nedcreatepool(size_t capacity, int threads) THROWSPEC;
-
-/* Destroys a memory pool previously created by nedcreatepool().
-*/
-NEDMALLOCEXTSPEC void neddestroypool(nedpool *p) THROWSPEC;
-
-/* Returns a zero terminated snapshot of threadpools existing at the time of call. Call
-nedfree() on the returned list when you are done. Returns zero if there is only the
-system pool in existence.
-*/
-NEDMALLOCEXTSPEC nedpool **nedpoollist() THROWSPEC;
-
-/* Sets a value to be associated with a pool. You can retrieve this value by passing
-any memory block allocated from that pool.
-*/
-NEDMALLOCEXTSPEC void nedpsetvalue(nedpool *p, void *v) THROWSPEC;
-
-/* Gets a previously set value using nedpsetvalue() or zero if memory is unknown.
-Optionally can also retrieve pool. You can detect an unknown block by the return
-being zero and *p being unmodifed.
-*/
-NEDMALLOCEXTSPEC void *nedgetvalue(nedpool **p, void *mem) THROWSPEC;
-
-/* Trims the thread cache for the calling thread, returning any existing cache
-data to the central pool. Remember to ALWAYS call with zero if you used the
-system pool. Setting disable to non-zero replicates neddisablethreadcache().
-*/
-NEDMALLOCEXTSPEC void nedtrimthreadcache(nedpool *p, int disable) THROWSPEC;
-
-/* Disables the thread cache for the calling thread, returning any existing cache
-data to the central pool. Remember to ALWAYS call with zero if you used the
-system pool.
-*/
-NEDMALLOCEXTSPEC void neddisablethreadcache(nedpool *p) THROWSPEC;
-
-
-NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void * nedpmalloc(nedpool *p, size_t size) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void * nedpcalloc(nedpool *p, size_t no, size_t size) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void * nedprealloc(nedpool *p, void *mem, size_t size) THROWSPEC;
-NEDMALLOCEXTSPEC void   nedpfree(nedpool *p, void *mem) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void * nedpmemalign(nedpool *p, size_t alignment, size_t bytes) THROWSPEC;
-NEDMALLOCEXTSPEC struct nedmallinfo nedpmallinfo(nedpool *p) THROWSPEC;
-NEDMALLOCEXTSPEC int    nedpmallopt(nedpool *p, int parno, int value) THROWSPEC;
-NEDMALLOCEXTSPEC int    nedpmalloc_trim(nedpool *p, size_t pad) THROWSPEC;
-NEDMALLOCEXTSPEC void   nedpmalloc_stats(nedpool *p) THROWSPEC;
-NEDMALLOCEXTSPEC size_t nedpmalloc_footprint(nedpool *p) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void **nedpindependent_calloc(nedpool *p, size_t elemsno, size_t elemsize, void **chunks) THROWSPEC;
-NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void **nedpindependent_comalloc(nedpool *p, size_t elems, size_t *sizes, void **chunks) THROWSPEC;
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
-
-#endif
+#ifdef NEDMALLOC_ENABLED
+
+/* nedalloc, an alternative malloc implementation for multiple threads without
+lock contention based on dlmalloc v2.8.3. (C) 2005-2009 Niall Douglas
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef NEDMALLOC_H
+#define NEDMALLOC_H
+
+#include "typedefs.h"
+#define MALLOC_ALIGNMENT DEFAULT_ALIGNMENT
+
+#ifdef PSP_ENABLED
+#define USE_LOCKS 0
+#define HAVE_MMAP 0
+#endif
+
+/* See malloc.c.h for what each function does.
+
+REPLACE_SYSTEM_ALLOCATOR on POSIX causes nedalloc's functions to be called
+malloc, free etc. instead of nedmalloc, nedfree etc. You may or may not want
+this. On Windows it causes nedmalloc to patch all loaded DLLs and binaries
+to replace usage of the system allocator.
+
+NO_NED_NAMESPACE prevents the functions from being defined in the nedalloc
+namespace when in C++ (uses the global namespace instead).
+
+NEDMALLOCEXTSPEC can be defined to be __declspec(dllexport) or
+__attribute__ ((visibility("default"))) or whatever you like. It defaults
+to extern unless NEDMALLOC_DLL_EXPORTS is set as it would be when building
+nedmalloc.dll.
+
+USE_LOCKS can be 2 if you want to define your own MLOCK_T, INITIAL_LOCK,
+ACQUIRE_LOCK, RELEASE_LOCK, TRY_LOCK, IS_LOCKED and NULL_LOCK_INITIALIZER.
+
+NEDMALLOC_DEBUG can be defined to cause DEBUG to be set differently for nedmalloc
+than for the rest of the build. Remember to set NDEBUG to disable all assertion
+checking too.
+
+USE_MAGIC_HEADERS causes nedalloc to allocate an extra three sizeof(size_t)
+to each block. nedpfree() and nedprealloc() can then automagically know when
+to free a system allocated block. Enabling this typically adds 20-50% to
+application memory usage.
+
+ENABLE_TOLERANT_NEDMALLOC is automatically turned on if REPLACE_SYSTEM_ALLOCATOR
+is set or the Windows DLL is being built. This causes nedmalloc to detect when a
+system allocator block is passed to it and to handle it appropriately. Note that
+without USE_MAGIC_HEADERS there is a very tiny chance that nedmalloc will segfault
+on non-Windows builds (it uses Win32 SEH to trap segfaults on Windows and there
+is no comparable system on POSIX).
+
+USE_ALLOCATOR can be one of these settings (it defaults to 1):
+  0: System allocator (nedmalloc now simply acts as a threadcache).
+     WARNING: Intended for DEBUG USE ONLY - not all functions work correctly.
+  1: dlmalloc
+
+ENABLE_LARGE_PAGES enables support for requesting memory from the system in large
+(typically >=2Mb) pages if the host OS supports this. These occupy just a single
+TLB entry and can significantly improve performance in large working set applications.
+
+ENABLE_FAST_HEAP_DETECTION enables special logic to detect blocks allocated
+by the system heap. This avoids 1.5%-2% overhead when checking for non-nedmalloc
+blocks, but it assumes that the NT and glibc heaps function in a very specific
+fashion which may not hold true across OS upgrades.
+*/
+
+#include <stddef.h>   /* for size_t */
+
+#ifndef NEDMALLOCEXTSPEC
+ #ifdef NEDMALLOC_DLL_EXPORTS
+  #ifdef WIN32
+   #define NEDMALLOCEXTSPEC extern __declspec(dllexport)
+  #elif defined(__GNUC__)
+   #define NEDMALLOCEXTSPEC extern __attribute__ ((visibility("default")))
+  #endif
+  #ifndef ENABLE_TOLERANT_NEDMALLOC
+   #define ENABLE_TOLERANT_NEDMALLOC 1
+  #endif
+ #else
+  #define NEDMALLOCEXTSPEC extern
+ #endif
+#endif
+
+#if __STDC_VERSION__ >= 199901L		/* C99 or better */
+ #define RESTRICT restrict
+#else
+ #if defined(_MSC_VER) && _MSC_VER>=1400
+  #define RESTRICT __restrict
+ #endif
+ #ifdef __GNUC__
+  #define RESTRICT __restrict
+ #endif
+#endif
+#ifndef RESTRICT
+ #define RESTRICT
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER>=1400
+ #define NEDMALLOCPTRATTR __declspec(restrict)
+ #define NEDMALLOCNOALIASATTR __declspec(noalias)
+#endif
+#ifdef __GNUC__
+ #define NEDMALLOCPTRATTR __attribute__ ((malloc))
+#endif
+#ifndef NEDMALLOCPTRATTR
+ #define NEDMALLOCPTRATTR
+#endif
+#ifndef NEDMALLOCNOALIASATTR
+ #define NEDMALLOCNOALIASATTR
+#endif
+
+#ifndef USE_MAGIC_HEADERS
+ #define USE_MAGIC_HEADERS 0
+#endif
+
+#ifndef USE_ALLOCATOR
+ #define USE_ALLOCATOR 1 /* dlmalloc */
+#endif
+
+#if !USE_ALLOCATOR && !USE_MAGIC_HEADERS
+#error If you are using the system allocator then you MUST use magic headers
+#endif
+
+#ifdef REPLACE_SYSTEM_ALLOCATOR
+ #if USE_ALLOCATOR==0
+  #error Cannot combine using the system allocator with replacing the system allocator
+ #endif
+ #ifndef ENABLE_TOLERANT_NEDMALLOC
+  #define ENABLE_TOLERANT_NEDMALLOC 1
+ #endif
+ #ifndef WIN32	/* We have a dedicated patcher for Windows */
+  #define nedmalloc               malloc
+  #define nedcalloc               calloc
+  #define nedrealloc              realloc
+  #define nedfree                 free
+  #define nedmemalign             memalign
+  #define nedmallinfo             mallinfo
+  #define nedmallopt              mallopt
+  #define nedmalloc_trim          malloc_trim
+  #define nedmalloc_stats         malloc_stats
+  #define nedmalloc_footprint     malloc_footprint
+  #define nedindependent_calloc   independent_calloc
+  #define nedindependent_comalloc independent_comalloc
+  #ifdef _MSC_VER
+   #define nedblksize              _msize
+  #endif
+ #endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+struct nedmallinfo {
+  size_t arena;    /* non-mmapped space allocated from system */
+  size_t ordblks;  /* number of free chunks */
+  size_t smblks;   /* always 0 */
+  size_t hblks;    /* always 0 */
+  size_t hblkhd;   /* space in mmapped regions */
+  size_t usmblks;  /* maximum total allocated space */
+  size_t fsmblks;  /* always 0 */
+  size_t uordblks; /* total allocated space */
+  size_t fordblks; /* total free space */
+  size_t keepcost; /* releasable (via malloc_trim) space */
+};
+#if defined(__cplusplus)
+}
+#endif
+
+#if defined(__cplusplus)
+ #if !defined(NO_NED_NAMESPACE)
+namespace nedalloc {
+ #else
+extern "C" {
+ #endif
+ #define THROWSPEC throw()
+#else
+ #define THROWSPEC
+#endif
+
+/* These are the global functions */
+
+/* Gets the usable size of an allocated block. Note this will always be bigger than what was
+asked for due to rounding etc. Optionally returns 1 in isforeign if the block came from the
+system allocator - note that there is a small (>0.01%) but real chance of segfault on non-Windows
+systems when passing non-nedmalloc blocks if you don't use USE_MAGIC_HEADERS.
+*/
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR size_t nedblksize(int *RESTRICT isforeign, void *RESTRICT mem) THROWSPEC;
+
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR void nedsetvalue(void *v) THROWSPEC;
+
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmalloc(size_t size) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedcalloc(size_t no, size_t size) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedrealloc(void *mem, size_t size) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR void   nedfree(void *mem) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmemalign(size_t alignment, size_t bytes) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR struct nedmallinfo nedmallinfo(void) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR int    nedmallopt(int parno, int value) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR void*  nedmalloc_internals(size_t *granularity, size_t *magic) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR int    nedmalloc_trim(size_t pad) THROWSPEC;
+NEDMALLOCEXTSPEC void   nedmalloc_stats(void) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR size_t nedmalloc_footprint(void) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_calloc(size_t elemsno, size_t elemsize, void **chunks) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_comalloc(size_t elems, size_t *sizes, void **chunks) THROWSPEC;
+
+/* Destroys the system memory pool used by the functions above.
+Useful for when you have nedmalloc in a DLL you're about to unload.
+If you call ANY nedmalloc functions after calling this you will
+get a fatal exception!
+*/
+NEDMALLOCEXTSPEC void neddestroysyspool() THROWSPEC;
+
+/* These are the pool functions */
+struct nedpool_t;
+typedef struct nedpool_t nedpool;
+
+/* Creates a memory pool for use with the nedp* functions below.
+Capacity is how much to allocate immediately (if you know you'll be allocating a lot
+of memory very soon) which you can leave at zero. Threads specifies how many threads
+will *normally* be accessing the pool concurrently. Setting this to zero means it
+extends on demand, but be careful of this as it can rapidly consume system resources
+where bursts of concurrent threads use a pool at once.
+*/
+NEDMALLOCEXTSPEC NEDMALLOCPTRATTR nedpool *nedcreatepool(size_t capacity, int threads) THROWSPEC;
+
+/* Destroys a memory pool previously created by nedcreatepool().
+*/
+NEDMALLOCEXTSPEC void neddestroypool(nedpool *p) THROWSPEC;
+
+/* Returns a zero terminated snapshot of threadpools existing at the time of call. Call
+nedfree() on the returned list when you are done. Returns zero if there is only the
+system pool in existence.
+*/
+NEDMALLOCEXTSPEC nedpool **nedpoollist() THROWSPEC;
+
+/* Sets a value to be associated with a pool. You can retrieve this value by passing
+any memory block allocated from that pool.
+*/
+NEDMALLOCEXTSPEC void nedpsetvalue(nedpool *p, void *v) THROWSPEC;
+
+/* Gets a previously set value using nedpsetvalue() or zero if memory is unknown.
+Optionally can also retrieve pool. You can detect an unknown block by the return
+being zero and *p being unmodifed.
+*/
+NEDMALLOCEXTSPEC void *nedgetvalue(nedpool **p, void *mem) THROWSPEC;
+
+/* Trims the thread cache for the calling thread, returning any existing cache
+data to the central pool. Remember to ALWAYS call with zero if you used the
+system pool. Setting disable to non-zero replicates neddisablethreadcache().
+*/
+NEDMALLOCEXTSPEC void nedtrimthreadcache(nedpool *p, int disable) THROWSPEC;
+
+/* Disables the thread cache for the calling thread, returning any existing cache
+data to the central pool. Remember to ALWAYS call with zero if you used the
+system pool.
+*/
+NEDMALLOCEXTSPEC void neddisablethreadcache(nedpool *p) THROWSPEC;
+
+
+NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void * nedpmalloc(nedpool *p, size_t size) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void * nedpcalloc(nedpool *p, size_t no, size_t size) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void * nedprealloc(nedpool *p, void *mem, size_t size) THROWSPEC;
+NEDMALLOCEXTSPEC void   nedpfree(nedpool *p, void *mem) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void * nedpmemalign(nedpool *p, size_t alignment, size_t bytes) THROWSPEC;
+NEDMALLOCEXTSPEC struct nedmallinfo nedpmallinfo(nedpool *p) THROWSPEC;
+NEDMALLOCEXTSPEC int    nedpmallopt(nedpool *p, int parno, int value) THROWSPEC;
+NEDMALLOCEXTSPEC int    nedpmalloc_trim(nedpool *p, size_t pad) THROWSPEC;
+NEDMALLOCEXTSPEC void   nedpmalloc_stats(nedpool *p) THROWSPEC;
+NEDMALLOCEXTSPEC size_t nedpmalloc_footprint(nedpool *p) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void **nedpindependent_calloc(nedpool *p, size_t elemsno, size_t elemsize, void **chunks) THROWSPEC;
+NEDMALLOCEXTSPEC NEDMALLOCPTRATTR void **nedpindependent_comalloc(nedpool *p, size_t elems, size_t *sizes, void **chunks) THROWSPEC;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
+
+#endif

+ 19 - 19
drivers/openssl/register_openssl.cpp

@@ -1,19 +1,19 @@
-#include "register_openssl.h"
-
-#include "stream_peer_openssl.h"
-#ifdef OPENSSL_ENABLED
-
-void register_openssl() {
-
-	ObjectTypeDB::register_type<StreamPeerOpenSSL>();
-	StreamPeerOpenSSL::initialize_ssl();
-
-}
-
-void unregister_openssl() {
-
-	StreamPeerOpenSSL::finalize_ssl();
-
-}
-#endif
-
+#include "register_openssl.h"
+
+#include "stream_peer_openssl.h"
+#ifdef OPENSSL_ENABLED
+
+void register_openssl() {
+
+	ObjectTypeDB::register_type<StreamPeerOpenSSL>();
+	StreamPeerOpenSSL::initialize_ssl();
+
+}
+
+void unregister_openssl() {
+
+	StreamPeerOpenSSL::finalize_ssl();
+
+}
+#endif
+

+ 11 - 11
drivers/openssl/register_openssl.h

@@ -1,11 +1,11 @@
-#ifndef REGISTER_OPENSSL_H
-#define REGISTER_OPENSSL_H
-
-#ifdef OPENSSL_ENABLED
-
-void register_openssl();
-void unregister_openssl();
-
-#endif
-
-#endif // REGISTER_OPENSSL_H
+#ifndef REGISTER_OPENSSL_H
+#define REGISTER_OPENSSL_H
+
+#ifdef OPENSSL_ENABLED
+
+void register_openssl();
+void unregister_openssl();
+
+#endif
+
+#endif // REGISTER_OPENSSL_H

+ 200 - 0
drivers/opus/SCsub

@@ -0,0 +1,200 @@
+Import('env')
+
+opus_sources = [
+	"opus/audio_stream_opus.cpp",
+]
+
+opus_sources_silk=[]
+
+opus_sources_lib = [
+	"opus/celt/bands.c",
+	"opus/celt/celt_lpc.c",
+	"opus/celt/entenc.c",
+	"opus/celt/mdct.c",
+	"opus/celt/quant_bands.c",
+	"opus/celt/celt.c",
+	"opus/celt/cwrs.c",
+	"opus/celt/kiss_fft.c",
+	"opus/celt/modes.c",
+	"opus/celt/rate.c",
+	"opus/celt/celt_decoder.c",
+	"opus/celt/entcode.c",
+	"opus/celt/laplace.c",
+	#opus/celt/opus_custom_demo.c",
+	"opus/celt/vq.c",
+	"opus/celt/celt_encoder.c",
+	"opus/celt/entdec.c",
+	"opus/celt/mathops.c",
+	"opus/celt/pitch.c",
+	"opus/silk/A2NLSF.c",
+	"opus/silk/decoder_set_fs.c",
+	"opus/silk/NLSF_stabilize.c",
+	"opus/silk/sigm_Q15.c",
+	"opus/silk/ana_filt_bank_1.c",
+	"opus/silk/enc_API.c",
+	"opus/silk/NLSF_unpack.c",
+	"opus/silk/sort.c",
+	"opus/silk/biquad_alt.c",
+	"opus/silk/encode_indices.c",
+	"opus/silk/NLSF_VQ.c",
+	"opus/silk/stereo_decode_pred.c",
+	"opus/silk/bwexpander_32.c",
+	"opus/silk/encode_pulses.c",
+	"opus/silk/NLSF_VQ_weights_laroia.c",
+	"opus/silk/stereo_encode_pred.c",
+	"opus/silk/bwexpander.c",
+	"opus/silk/gain_quant.c",
+	"opus/silk/NSQ.c",
+	"opus/silk/stereo_find_predictor.c",
+	"opus/silk/check_control_input.c",
+	"opus/silk/HP_variable_cutoff.c",
+	"opus/silk/NSQ_del_dec.c",
+	"opus/silk/stereo_LR_to_MS.c",
+	"opus/silk/CNG.c",
+	"opus/silk/init_decoder.c",
+	"opus/silk/pitch_est_tables.c",
+	"opus/silk/stereo_MS_to_LR.c",
+	"opus/silk/code_signs.c",
+	"opus/silk/init_encoder.c",
+	"opus/silk/PLC.c",
+	"opus/silk/stereo_quant_pred.c",
+	"opus/silk/control_audio_bandwidth.c",
+	"opus/silk/inner_prod_aligned.c",
+	"opus/silk/process_NLSFs.c",
+	"opus/silk/sum_sqr_shift.c",
+	"opus/silk/control_codec.c",
+	"opus/silk/interpolate.c",
+	"opus/silk/quant_LTP_gains.c",
+	"opus/silk/table_LSF_cos.c",
+	"opus/silk/control_SNR.c",
+	"opus/silk/lin2log.c",
+	"opus/silk/resampler.c",
+	"opus/silk/tables_gain.c",
+	"opus/silk/debug.c",
+	"opus/silk/log2lin.c",
+	"opus/silk/resampler_down2_3.c",
+	"opus/silk/tables_LTP.c",
+	"opus/silk/dec_API.c",
+	"opus/silk/LPC_analysis_filter.c",
+	"opus/silk/resampler_down2.c",
+	"opus/silk/tables_NLSF_CB_NB_MB.c",
+	"opus/silk/decode_core.c",
+	"opus/silk/LPC_inv_pred_gain.c",
+	"opus/silk/resampler_private_AR2.c",
+	"opus/silk/tables_NLSF_CB_WB.c",
+	"opus/silk/decode_frame.c",
+	"opus/silk/LP_variable_cutoff.c",
+	"opus/silk/resampler_private_down_FIR.c",
+	"opus/silk/tables_other.c",
+	"opus/silk/decode_indices.c",
+	"opus/silk/NLSF2A.c",
+	"opus/silk/resampler_private_IIR_FIR.c",
+	"opus/silk/tables_pitch_lag.c",
+	"opus/silk/decode_parameters.c",
+	"opus/silk/NLSF_decode.c",
+	"opus/silk/resampler_private_up2_HQ.c",
+	"opus/silk/tables_pulses_per_block.c",
+	"opus/silk/decode_pitch.c",
+	"opus/silk/NLSF_del_dec_quant.c",
+	"opus/silk/resampler_rom.c",
+	"opus/silk/VAD.c",
+	"opus/silk/decode_pulses.c",
+	"opus/silk/NLSF_encode.c",
+	"opus/silk/shell_coder.c",
+	"opus/silk/VQ_WMat_EC.c",
+	"opus/analysis.c",
+	"opus/internal.c",
+	"opus/opus.c",
+	#"opus/opus_demo.c",
+	"opus/opus_multistream.c",
+	"opus/repacketizer.c",
+	"opus/wincerts.c",
+	"opus/http.c",
+	"opus/mlp.c",
+	#"opus/opus_compare.c",
+	"opus/opus_encoder.c",
+	"opus/opus_multistream_decoder.c",
+	#"opus/repacketizer_demo.c",
+	"opus/info.c",
+	"opus/mlp_data.c",
+	"opus/opus_decoder.c",
+	"opus/opusfile.c",
+	"opus/opus_multistream_encoder.c",
+	"opus/stream.c"
+]
+
+if("opus_fixed_point" in env and env.opus_fixed_point=="yes"):
+	env.Append(CPPPATH=["#drivers/opus/silk/fixed"], CFLAGS=["-DOPUS_FIXED_POINT"])
+	opus_sources_silk = [
+		"opus/silk/fixed/apply_sine_window_FIX.c",
+		"opus/silk/fixed/k2a_FIX.c",
+		"opus/silk/fixed/residual_energy16_FIX.c",
+		"opus/silk/fixed/autocorr_FIX.c",
+		"opus/silk/fixed/k2a_Q16_FIX.c",
+		"opus/silk/fixed/residual_energy_FIX.c",
+		"opus/silk/fixed/burg_modified_FIX.c",
+		"opus/silk/fixed/LTP_analysis_filter_FIX.c",
+		"opus/silk/fixed/schur64_FIX.c",
+		"opus/silk/fixed/corrMatrix_FIX.c",
+		"opus/silk/fixed/LTP_scale_ctrl_FIX.c",
+		"opus/silk/fixed/schur_FIX.c",
+		"opus/silk/fixed/encode_frame_FIX.c",
+		"opus/silk/fixed/noise_shape_analysis_FIX.c",
+		"opus/silk/fixed/solve_LS_FIX.c",
+		"opus/silk/fixed/find_LPC_FIX.c",
+		"opus/silk/fixed/pitch_analysis_core_FIX.c",
+		"opus/silk/fixed/vector_ops_FIX.c",
+		"opus/silk/fixed/find_LTP_FIX.c",
+		"opus/silk/fixed/prefilter_FIX.c",
+		"opus/silk/fixed/warped_autocorrelation_FIX.c",
+		"opus/silk/fixed/find_pitch_lags_FIX.c",
+		"opus/silk/fixed/process_gains_FIX.c",
+		"opus/silk/fixed/find_pred_coefs_FIX.c",
+		"opus/silk/fixed/regularize_correlations_FIX.c"
+	]
+else:
+	env.Append(CPPPATH=["#drivers/opus/silk/float"])
+	opus_sources_silk = [
+		"opus/silk/float/apply_sine_window_FLP.c",
+		"opus/silk/float/inner_product_FLP.c",
+		"opus/silk/float/regularize_correlations_FLP.c",
+		"opus/silk/float/autocorrelation_FLP.c",
+		"opus/silk/float/k2a_FLP.c",
+		"opus/silk/float/residual_energy_FLP.c",
+		"opus/silk/float/burg_modified_FLP.c",
+		"opus/silk/float/levinsondurbin_FLP.c",
+		"opus/silk/float/scale_copy_vector_FLP.c",
+		"opus/silk/float/bwexpander_FLP.c",
+		"opus/silk/float/LPC_analysis_filter_FLP.c",
+		"opus/silk/float/scale_vector_FLP.c",
+		"opus/silk/float/corrMatrix_FLP.c",
+		"opus/silk/float/LPC_inv_pred_gain_FLP.c",
+		"opus/silk/float/schur_FLP.c",
+		"opus/silk/float/encode_frame_FLP.c",
+		"opus/silk/float/LTP_analysis_filter_FLP.c",
+		"opus/silk/float/solve_LS_FLP.c",
+		"opus/silk/float/energy_FLP.c",
+		"opus/silk/float/LTP_scale_ctrl_FLP.c",
+		"opus/silk/float/sort_FLP.c",
+		"opus/silk/float/find_LPC_FLP.c",
+		"opus/silk/float/noise_shape_analysis_FLP.c",
+		"opus/silk/float/warped_autocorrelation_FLP.c",
+		"opus/silk/float/find_LTP_FLP.c",
+		"opus/silk/float/pitch_analysis_core_FLP.c",
+		"opus/silk/float/wrappers_FLP.c",
+		"opus/silk/float/find_pitch_lags_FLP.c",
+		"opus/silk/float/prefilter_FLP.c",
+		"opus/silk/float/find_pred_coefs_FLP.c",
+		"opus/silk/float/process_gains_FLP.c"
+	]
+
+
+opus_sources_lib+=opus_sources_silk
+env.drivers_sources+=opus_sources_lib
+env.drivers_sources+=opus_sources
+
+env.Append(CPPPATH=["#drivers/opus"])
+env.Append(CPPPATH=["#drivers/opus/celt","#drivers/opus/silk","#drivers/opus/silk/float"])
+env.Append(CFLAGS=["-DOPUS_HAVE_CONFIG_H"])
+
+Export('env')

+ 645 - 0
drivers/opus/analysis.c

@@ -0,0 +1,645 @@
+/* Copyright (c) 2011 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "kiss_fft.h"
+#include "celt.h"
+#include "opus_modes.h"
+#include "arch.h"
+#include "quant_bands.h"
+#include <stdio.h>
+#include "analysis.h"
+#include "mlp.h"
+#include "stack_alloc.h"
+
+extern const MLP net;
+
+#ifndef M_PI
+#define M_PI 3.141592653
+#endif
+
+static const float dct_table[128] = {
+        0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
+        0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
+        0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
+       -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
+        0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
+       -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
+        0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
+        0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
+        0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
+        0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
+        0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
+       -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
+        0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
+       -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
+        0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
+        0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
+};
+
+static const float analysis_window[240] = {
+      0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
+      0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
+      0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
+      0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
+      0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
+      0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
+      0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
+      0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
+      0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
+      0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
+      0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
+      0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
+      0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
+      0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
+      0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
+      0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
+      0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
+      0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
+      0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
+      0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
+      0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
+      0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
+      0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
+      0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
+      0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
+      0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
+      0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
+      0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
+      0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
+      0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
+};
+
+static const int tbands[NB_TBANDS+1] = {
+       2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
+};
+
+static const int extra_bands[NB_TOT_BANDS+1] = {
+      1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
+};
+
+/*static const float tweight[NB_TBANDS+1] = {
+      .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
+};*/
+
+#define NB_TONAL_SKIP_BANDS 9
+
+#define cA 0.43157974f
+#define cB 0.67848403f
+#define cC 0.08595542f
+#define cE ((float)M_PI/2)
+static OPUS_INLINE float fast_atan2f(float y, float x) {
+   float x2, y2;
+   /* Should avoid underflow on the values we'll get */
+   if (ABS16(x)+ABS16(y)<1e-9f)
+   {
+      x*=1e12f;
+      y*=1e12f;
+   }
+   x2 = x*x;
+   y2 = y*y;
+   if(x2<y2){
+      float den = (y2 + cB*x2) * (y2 + cC*x2);
+      if (den!=0)
+         return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
+      else
+         return (y<0 ? -cE : cE);
+   }else{
+      float den = (x2 + cB*y2) * (x2 + cC*y2);
+      if (den!=0)
+         return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
+      else
+         return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
+   }
+}
+
+void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
+{
+   int pos;
+   int curr_lookahead;
+   float psum;
+   int i;
+
+   pos = tonal->read_pos;
+   curr_lookahead = tonal->write_pos-tonal->read_pos;
+   if (curr_lookahead<0)
+      curr_lookahead += DETECT_SIZE;
+
+   if (len > 480 && pos != tonal->write_pos)
+   {
+      pos++;
+      if (pos==DETECT_SIZE)
+         pos=0;
+   }
+   if (pos == tonal->write_pos)
+      pos--;
+   if (pos<0)
+      pos = DETECT_SIZE-1;
+   OPUS_COPY(info_out, &tonal->info[pos], 1);
+   tonal->read_subframe += len/120;
+   while (tonal->read_subframe>=4)
+   {
+      tonal->read_subframe -= 4;
+      tonal->read_pos++;
+   }
+   if (tonal->read_pos>=DETECT_SIZE)
+      tonal->read_pos-=DETECT_SIZE;
+
+   /* Compensate for the delay in the features themselves.
+      FIXME: Need a better estimate the 10 I just made up */
+   curr_lookahead = IMAX(curr_lookahead-10, 0);
+
+   psum=0;
+   /* Summing the probability of transition patterns that involve music at
+      time (DETECT_SIZE-curr_lookahead-1) */
+   for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
+      psum += tonal->pmusic[i];
+   for (;i<DETECT_SIZE;i++)
+      psum += tonal->pspeech[i];
+   psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
+   /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
+
+   info_out->music_prob = psum;
+}
+
+void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
+{
+    int i, b;
+    const kiss_fft_state *kfft;
+    VARDECL(kiss_fft_cpx, in);
+    VARDECL(kiss_fft_cpx, out);
+    int N = 480, N2=240;
+    float * OPUS_RESTRICT A = tonal->angle;
+    float * OPUS_RESTRICT dA = tonal->d_angle;
+    float * OPUS_RESTRICT d2A = tonal->d2_angle;
+    VARDECL(float, tonality);
+    VARDECL(float, noisiness);
+    float band_tonality[NB_TBANDS];
+    float logE[NB_TBANDS];
+    float BFCC[8];
+    float features[25];
+    float frame_tonality;
+    float max_frame_tonality;
+    /*float tw_sum=0;*/
+    float frame_noisiness;
+    const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
+    float slope=0;
+    float frame_stationarity;
+    float relativeE;
+    float frame_probs[2];
+    float alpha, alphaE, alphaE2;
+    float frame_loudness;
+    float bandwidth_mask;
+    int bandwidth=0;
+    float maxE = 0;
+    float noise_floor;
+    int remaining;
+    AnalysisInfo *info;
+    SAVE_STACK;
+
+    tonal->last_transition++;
+    alpha = 1.f/IMIN(20, 1+tonal->count);
+    alphaE = 1.f/IMIN(50, 1+tonal->count);
+    alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
+
+    if (tonal->count<4)
+       tonal->music_prob = .5;
+    kfft = celt_mode->mdct.kfft[0];
+    if (tonal->count==0)
+       tonal->mem_fill = 240;
+    downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
+    if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
+    {
+       tonal->mem_fill += len;
+       /* Don't have enough to update the analysis */
+       RESTORE_STACK;
+       return;
+    }
+    info = &tonal->info[tonal->write_pos++];
+    if (tonal->write_pos>=DETECT_SIZE)
+       tonal->write_pos-=DETECT_SIZE;
+
+    ALLOC(in, 480, kiss_fft_cpx);
+    ALLOC(out, 480, kiss_fft_cpx);
+    ALLOC(tonality, 240, float);
+    ALLOC(noisiness, 240, float);
+    for (i=0;i<N2;i++)
+    {
+       float w = analysis_window[i];
+       in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
+       in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
+       in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
+       in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
+    }
+    OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
+    remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
+    downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
+    tonal->mem_fill = 240 + remaining;
+    opus_fft(kfft, in, out);
+
+    for (i=1;i<N2;i++)
+    {
+       float X1r, X2r, X1i, X2i;
+       float angle, d_angle, d2_angle;
+       float angle2, d_angle2, d2_angle2;
+       float mod1, mod2, avg_mod;
+       X1r = (float)out[i].r+out[N-i].r;
+       X1i = (float)out[i].i-out[N-i].i;
+       X2r = (float)out[i].i+out[N-i].i;
+       X2i = (float)out[N-i].r-out[i].r;
+
+       angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
+       d_angle = angle - A[i];
+       d2_angle = d_angle - dA[i];
+
+       angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
+       d_angle2 = angle2 - angle;
+       d2_angle2 = d_angle2 - d_angle;
+
+       mod1 = d2_angle - (float)floor(.5+d2_angle);
+       noisiness[i] = ABS16(mod1);
+       mod1 *= mod1;
+       mod1 *= mod1;
+
+       mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
+       noisiness[i] += ABS16(mod2);
+       mod2 *= mod2;
+       mod2 *= mod2;
+
+       avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
+       tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
+
+       A[i] = angle2;
+       dA[i] = d_angle2;
+       d2A[i] = mod2;
+    }
+
+    frame_tonality = 0;
+    max_frame_tonality = 0;
+    /*tw_sum = 0;*/
+    info->activity = 0;
+    frame_noisiness = 0;
+    frame_stationarity = 0;
+    if (!tonal->count)
+    {
+       for (b=0;b<NB_TBANDS;b++)
+       {
+          tonal->lowE[b] = 1e10;
+          tonal->highE[b] = -1e10;
+       }
+    }
+    relativeE = 0;
+    frame_loudness = 0;
+    for (b=0;b<NB_TBANDS;b++)
+    {
+       float E=0, tE=0, nE=0;
+       float L1, L2;
+       float stationarity;
+       for (i=tbands[b];i<tbands[b+1];i++)
+       {
+          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
+                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
+#ifdef OPUS_FIXED_POINT
+          /* FIXME: It's probably best to change the BFCC filter initial state instead */
+          binE *= 5.55e-17f;
+#endif
+          E += binE;
+          tE += binE*tonality[i];
+          nE += binE*2.f*(.5f-noisiness[i]);
+       }
+       tonal->E[tonal->E_count][b] = E;
+       frame_noisiness += nE/(1e-15f+E);
+
+       frame_loudness += (float)sqrt(E+1e-10f);
+       logE[b] = (float)log(E+1e-10f);
+       tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
+       tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
+       if (tonal->highE[b] < tonal->lowE[b]+1.f)
+       {
+          tonal->highE[b]+=.5f;
+          tonal->lowE[b]-=.5f;
+       }
+       relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
+
+       L1=L2=0;
+       for (i=0;i<NB_FRAMES;i++)
+       {
+          L1 += (float)sqrt(tonal->E[i][b]);
+          L2 += tonal->E[i][b];
+       }
+
+       stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
+       stationarity *= stationarity;
+       stationarity *= stationarity;
+       frame_stationarity += stationarity;
+       /*band_tonality[b] = tE/(1e-15+E)*/;
+       band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
+#if 0
+       if (b>=NB_TONAL_SKIP_BANDS)
+       {
+          frame_tonality += tweight[b]*band_tonality[b];
+          tw_sum += tweight[b];
+       }
+#else
+       frame_tonality += band_tonality[b];
+       if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
+          frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
+#endif
+       max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
+       slope += band_tonality[b]*(b-8);
+       /*printf("%f %f ", band_tonality[b], stationarity);*/
+       tonal->prev_band_tonality[b] = band_tonality[b];
+    }
+
+    bandwidth_mask = 0;
+    bandwidth = 0;
+    maxE = 0;
+    noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
+#ifdef OPUS_FIXED_POINT
+    noise_floor *= 1<<(15+SIG_SHIFT);
+#endif
+    noise_floor *= noise_floor;
+    for (b=0;b<NB_TOT_BANDS;b++)
+    {
+       float E=0;
+       int band_start, band_end;
+       /* Keep a margin of 300 Hz for aliasing */
+       band_start = extra_bands[b];
+       band_end = extra_bands[b+1];
+       for (i=band_start;i<band_end;i++)
+       {
+          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
+                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
+          E += binE;
+       }
+       maxE = MAX32(maxE, E);
+       tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
+       E = MAX32(E, tonal->meanE[b]);
+       /* Use a simple follower with 13 dB/Bark slope for spreading function */
+       bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
+       /* Consider the band "active" only if all these conditions are met:
+          1) less than 10 dB below the simple follower
+          2) less than 90 dB below the peak band (maximal masking possible considering
+             both the ATH and the loudness-dependent slope of the spreading function)
+          3) above the PCM quantization noise floor
+       */
+       if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
+          bandwidth = b;
+    }
+    if (tonal->count<=2)
+       bandwidth = 20;
+    frame_loudness = 20*(float)log10(frame_loudness);
+    tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
+    tonal->lowECount *= (1-alphaE);
+    if (frame_loudness < tonal->Etracker-30)
+       tonal->lowECount += alphaE;
+
+    for (i=0;i<8;i++)
+    {
+       float sum=0;
+       for (b=0;b<16;b++)
+          sum += dct_table[i*16+b]*logE[b];
+       BFCC[i] = sum;
+    }
+
+    frame_stationarity /= NB_TBANDS;
+    relativeE /= NB_TBANDS;
+    if (tonal->count<10)
+       relativeE = .5;
+    frame_noisiness /= NB_TBANDS;
+#if 1
+    info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
+#else
+    info->activity = .5*(1+frame_noisiness-frame_stationarity);
+#endif
+    frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
+    frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
+    tonal->prev_tonality = frame_tonality;
+
+    slope /= 8*8;
+    info->tonality_slope = slope;
+
+    tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
+    tonal->count++;
+    info->tonality = frame_tonality;
+
+    for (i=0;i<4;i++)
+       features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
+
+    for (i=0;i<4;i++)
+       tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
+
+    for (i=0;i<4;i++)
+        features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
+    for (i=0;i<3;i++)
+        features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
+
+    if (tonal->count > 5)
+    {
+       for (i=0;i<9;i++)
+          tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
+    }
+
+    for (i=0;i<8;i++)
+    {
+       tonal->mem[i+24] = tonal->mem[i+16];
+       tonal->mem[i+16] = tonal->mem[i+8];
+       tonal->mem[i+8] = tonal->mem[i];
+       tonal->mem[i] = BFCC[i];
+    }
+    for (i=0;i<9;i++)
+       features[11+i] = (float)sqrt(tonal->std[i]);
+    features[20] = info->tonality;
+    features[21] = info->activity;
+    features[22] = frame_stationarity;
+    features[23] = info->tonality_slope;
+    features[24] = tonal->lowECount;
+
+#ifndef DISABLE_FLOAT_API
+    mlp_process(&net, features, frame_probs);
+    frame_probs[0] = .5f*(frame_probs[0]+1);
+    /* Curve fitting between the MLP probability and the actual probability */
+    frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
+    /* Probability of active audio (as opposed to silence) */
+    frame_probs[1] = .5f*frame_probs[1]+.5f;
+    /* Consider that silence has a 50-50 probability. */
+    frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
+
+    /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
+    {
+       /* Probability of state transition */
+       float tau;
+       /* Represents independence of the MLP probabilities, where
+          beta=1 means fully independent. */
+       float beta;
+       /* Denormalized probability of speech (p0) and music (p1) after update */
+       float p0, p1;
+       /* Probabilities for "all speech" and "all music" */
+       float s0, m0;
+       /* Probability sum for renormalisation */
+       float psum;
+       /* Instantaneous probability of speech and music, with beta pre-applied. */
+       float speech0;
+       float music0;
+
+       /* One transition every 3 minutes of active audio */
+       tau = .00005f*frame_probs[1];
+       beta = .05f;
+       if (1) {
+          /* Adapt beta based on how "unexpected" the new prob is */
+          float p, q;
+          p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
+          q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
+          beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
+       }
+       /* p0 and p1 are the probabilities of speech and music at this frame
+          using only information from previous frame and applying the
+          state transition model */
+       p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
+       p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
+       /* We apply the current probability with exponent beta to work around
+          the fact that the probability estimates aren't independent. */
+       p0 *= (float)pow(1-frame_probs[0], beta);
+       p1 *= (float)pow(frame_probs[0], beta);
+       /* Normalise the probabilities to get the Marokv probability of music. */
+       tonal->music_prob = p1/(p0+p1);
+       info->music_prob = tonal->music_prob;
+
+       /* This chunk of code deals with delayed decision. */
+       psum=1e-20f;
+       /* Instantaneous probability of speech and music, with beta pre-applied. */
+       speech0 = (float)pow(1-frame_probs[0], beta);
+       music0  = (float)pow(frame_probs[0], beta);
+       if (tonal->count==1)
+       {
+          tonal->pspeech[0]=.5;
+          tonal->pmusic [0]=.5;
+       }
+       /* Updated probability of having only speech (s0) or only music (m0),
+          before considering the new observation. */
+       s0 = tonal->pspeech[0] + tonal->pspeech[1];
+       m0 = tonal->pmusic [0] + tonal->pmusic [1];
+       /* Updates s0 and m0 with instantaneous probability. */
+       tonal->pspeech[0] = s0*(1-tau)*speech0;
+       tonal->pmusic [0] = m0*(1-tau)*music0;
+       /* Propagate the transition probabilities */
+       for (i=1;i<DETECT_SIZE-1;i++)
+       {
+          tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
+          tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
+       }
+       /* Probability that the latest frame is speech, when all the previous ones were music. */
+       tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
+       /* Probability that the latest frame is music, when all the previous ones were speech. */
+       tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
+
+       /* Renormalise probabilities to 1 */
+       for (i=0;i<DETECT_SIZE;i++)
+          psum += tonal->pspeech[i] + tonal->pmusic[i];
+       psum = 1.f/psum;
+       for (i=0;i<DETECT_SIZE;i++)
+       {
+          tonal->pspeech[i] *= psum;
+          tonal->pmusic [i] *= psum;
+       }
+       psum = tonal->pmusic[0];
+       for (i=1;i<DETECT_SIZE;i++)
+          psum += tonal->pspeech[i];
+
+       /* Estimate our confidence in the speech/music decisions */
+       if (frame_probs[1]>.75)
+       {
+          if (tonal->music_prob>.9)
+          {
+             float adapt;
+             adapt = 1.f/(++tonal->music_confidence_count);
+             tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
+             tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
+          }
+          if (tonal->music_prob<.1)
+          {
+             float adapt;
+             adapt = 1.f/(++tonal->speech_confidence_count);
+             tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
+             tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
+          }
+       } else {
+          if (tonal->music_confidence_count==0)
+             tonal->music_confidence = .9f;
+          if (tonal->speech_confidence_count==0)
+             tonal->speech_confidence = .1f;
+       }
+    }
+    if (tonal->last_music != (tonal->music_prob>.5f))
+       tonal->last_transition=0;
+    tonal->last_music = tonal->music_prob>.5f;
+#else
+    info->music_prob = 0;
+#endif
+    /*for (i=0;i<25;i++)
+       printf("%f ", features[i]);
+    printf("\n");*/
+
+    info->bandwidth = bandwidth;
+    /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
+    info->noisiness = frame_noisiness;
+    info->valid = 1;
+    if (info_out!=NULL)
+       OPUS_COPY(info_out, info, 1);
+    RESTORE_STACK;
+}
+
+void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
+                 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
+                 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
+{
+   int offset;
+   int pcm_len;
+
+   if (analysis_pcm != NULL)
+   {
+      /* Avoid overflow/wrap-around of the analysis buffer */
+      analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
+
+      pcm_len = analysis_frame_size - analysis->analysis_offset;
+      offset = analysis->analysis_offset;
+      do {
+         tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
+         offset += 480;
+         pcm_len -= 480;
+      } while (pcm_len>0);
+      analysis->analysis_offset = analysis_frame_size;
+
+      analysis->analysis_offset -= frame_size;
+   }
+
+   analysis_info->valid = 0;
+   tonality_get_info(analysis, analysis_info, frame_size);
+}

+ 90 - 0
drivers/opus/analysis.h

@@ -0,0 +1,90 @@
+/* Copyright (c) 2011 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef ANALYSIS_H
+#define ANALYSIS_H
+
+#include "celt.h"
+#include "opus_private.h"
+
+#define NB_FRAMES 8
+#define NB_TBANDS 18
+#define NB_TOT_BANDS 21
+#define ANALYSIS_BUF_SIZE 720 /* 15 ms at 48 kHz */
+
+#define DETECT_SIZE 200
+
+typedef struct {
+   float angle[240];
+   float d_angle[240];
+   float d2_angle[240];
+   opus_val32 inmem[ANALYSIS_BUF_SIZE];
+   int   mem_fill;                      /* number of usable samples in the buffer */
+   float prev_band_tonality[NB_TBANDS];
+   float prev_tonality;
+   float E[NB_FRAMES][NB_TBANDS];
+   float lowE[NB_TBANDS];
+   float highE[NB_TBANDS];
+   float meanE[NB_TOT_BANDS];
+   float mem[32];
+   float cmean[8];
+   float std[9];
+   float music_prob;
+   float Etracker;
+   float lowECount;
+   int E_count;
+   int last_music;
+   int last_transition;
+   int count;
+   float subframe_mem[3];
+   int analysis_offset;
+   /** Probability of having speech for time i to DETECT_SIZE-1 (and music before).
+       pspeech[0] is the probability that all frames in the window are speech. */
+   float pspeech[DETECT_SIZE];
+   /** Probability of having music for time i to DETECT_SIZE-1 (and speech before).
+       pmusic[0] is the probability that all frames in the window are music. */
+   float pmusic[DETECT_SIZE];
+   float speech_confidence;
+   float music_confidence;
+   int speech_confidence_count;
+   int music_confidence_count;
+   int write_pos;
+   int read_pos;
+   int read_subframe;
+   AnalysisInfo info[DETECT_SIZE];
+} TonalityAnalysisState;
+
+void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info,
+     const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix);
+
+void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len);
+
+void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
+                 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
+                 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info);
+
+#endif

+ 376 - 0
drivers/opus/audio_stream_opus.cpp

@@ -0,0 +1,376 @@
+/*************************************************************************/
+/*  audio_stream_opus.cpp                                                */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                    http://www.godotengine.org                         */
+/*************************************************************************/
+/* Copyright (c) 2007-2015 Juan Linietsky, Ariel Manzur.                 */
+/*                                                                       */
+/* Author: George Marques <[email protected]>                             */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+#include "audio_stream_opus.h"
+
+const float AudioStreamPlaybackOpus::osrate=48000.0f;
+
+int AudioStreamPlaybackOpus::_op_read_func(void *_stream, unsigned char *_ptr, int _nbytes) {
+	FileAccess *fa=(FileAccess*)_stream;
+
+	if(fa->eof_reached())
+		return 0;
+
+	uint8_t *dst = (uint8_t*)_ptr;
+
+	int read = fa->get_buffer(dst, _nbytes);
+
+	return read;
+}
+
+int AudioStreamPlaybackOpus::_op_seek_func(void *_stream, opus_int64 _offset, int _whence){
+
+#ifdef SEEK_SET
+	FileAccess *fa=(FileAccess*)_stream;
+
+	switch (_whence) {
+		case SEEK_SET: {
+			fa->seek(_offset);
+		} break;
+		case SEEK_CUR: {
+			fa->seek(fa->get_pos()+_offset);
+		} break;
+		case SEEK_END: {
+			fa->seek_end(_offset);
+		} break;
+		default: {
+			ERR_PRINT("BUG, wtf was whence set to?\n");
+		}
+	}
+	int ret=fa->eof_reached()?-1:0;
+	return ret;
+#else
+	return -1; // no seeking
+#endif
+}
+
+int AudioStreamPlaybackOpus::_op_close_func(void *_stream) {
+	if (!_stream)
+		return 0;
+	FileAccess *fa=(FileAccess*)_stream;
+	if (fa->is_open())
+		fa->close();
+	return 0;
+}
+
+opus_int64 AudioStreamPlaybackOpus::_op_tell_func(void *_stream) {
+	FileAccess *_fa = (FileAccess*)_stream;
+	return (opus_int64)_fa->get_pos();
+}
+
+void AudioStreamPlaybackOpus::_clear_stream() {
+	if(!stream_loaded)
+		return;
+
+	op_free(opus_file);
+	_close_file();
+
+	stream_loaded=false;
+	stream_channels=1;
+	playing=false;
+}
+
+void AudioStreamPlaybackOpus::_close_file() {
+	if (f) {
+		memdelete(f);
+		f=NULL;
+	}
+}
+
+Error AudioStreamPlaybackOpus::_load_stream() {
+
+	ERR_FAIL_COND_V(!stream_valid,ERR_UNCONFIGURED);
+
+	_clear_stream();
+	if (file=="")
+		return ERR_INVALID_DATA;
+
+	Error err;
+	f=FileAccess::open(file,FileAccess::READ,&err);
+
+	if (err) {
+		ERR_FAIL_COND_V( err, err );
+	}
+
+	int _err = 0;
+
+	opus_file = op_open_callbacks(f,&_op_callbacks,NULL,0,&_err);
+
+	switch (_err) {
+		case OP_EREAD: { // - Can't read the file.
+			memdelete(f); f=NULL;
+			ERR_FAIL_V( ERR_FILE_CANT_READ );
+		} break;
+		case OP_EVERSION: // - Unrecognized version number.
+		case OP_ENOTFORMAT: // - Stream is not Opus data.
+		case OP_EIMPL : { // - Stream used non-implemented feature.
+			memdelete(f); f=NULL;
+			ERR_FAIL_V( ERR_FILE_UNRECOGNIZED );
+		} break;
+		case OP_EBADLINK: // - Failed to find old data after seeking.
+		case OP_EBADTIMESTAMP: // - Timestamp failed the validity checks.
+		case OP_EBADHEADER: { // - Invalid or mising Opus bitstream header.
+			memdelete(f); f=NULL;
+			ERR_FAIL_V( ERR_FILE_CORRUPT );
+		} break;
+		case OP_EFAULT: { // - Internal logic fault; indicates a bug or heap/stack corruption.
+			memdelete(f); f=NULL;
+			ERR_FAIL_V( ERR_BUG );
+		} break;
+	}
+	repeats=0;
+	stream_loaded=true;
+
+
+	return OK;
+}
+
+AudioStreamPlaybackOpus::AudioStreamPlaybackOpus() {
+	loops=false;
+	playing=false;
+	f = NULL;
+	stream_loaded=false;
+	stream_valid=false;
+	repeats=0;
+	paused=true;
+	stream_channels=0;
+	current_section=0;
+	length=0;
+	loop_restart_time=0;
+	pre_skip=0;
+
+	_op_callbacks.read = _op_read_func;
+	_op_callbacks.seek = _op_seek_func;
+	_op_callbacks.tell = _op_tell_func;
+	_op_callbacks.close = _op_close_func;
+}
+
+Error AudioStreamPlaybackOpus::set_file(const String &p_file) {
+	file=p_file;
+	stream_valid=false;
+	Error err;
+	f=FileAccess::open(file,FileAccess::READ,&err);
+
+	if (err) {
+		ERR_FAIL_COND_V( err, err );
+	}
+
+	int _err;
+
+	opus_file = op_open_callbacks(f,&_op_callbacks,NULL,0,&_err);
+
+	switch (_err) {
+		case OP_EREAD: { // - Can't read the file.
+			memdelete(f); f=NULL;
+			ERR_FAIL_V( ERR_FILE_CANT_READ );
+		} break;
+		case OP_EVERSION: // - Unrecognized version number.
+		case OP_ENOTFORMAT: // - Stream is not Opus data.
+		case OP_EIMPL : { // - Stream used non-implemented feature.
+			memdelete(f); f=NULL;
+			ERR_FAIL_V( ERR_FILE_UNRECOGNIZED );
+		} break;
+		case OP_EBADLINK: // - Failed to find old data after seeking.
+		case OP_EBADTIMESTAMP: // - Timestamp failed the validity checks.
+		case OP_EBADHEADER: { // - Invalid or mising Opus bitstream header.
+			memdelete(f); f=NULL;
+			ERR_FAIL_V( ERR_FILE_CORRUPT );
+		} break;
+		case OP_EFAULT: { // - Internal logic fault; indicates a bug or heap/stack corruption.
+			memdelete(f); f=NULL;
+			ERR_FAIL_V( ERR_BUG );
+		} break;
+	}
+
+	const OpusHead *oinfo = op_head(opus_file,-1);
+
+	stream_channels=oinfo->channel_count;
+	pre_skip=oinfo->pre_skip;
+	frames_mixed=pre_skip;
+	ogg_int64_t len = op_pcm_total(opus_file,-1);
+	if(len < 0) {
+		length = 0;
+	} else {
+		length=(len/osrate);
+	}
+
+	op_free(opus_file);
+	memdelete(f);
+	f=NULL;
+	stream_valid=true;
+
+
+	return OK;
+}
+
+void AudioStreamPlaybackOpus::play(float p_from) {
+	if (playing)
+		stop();
+
+	if (_load_stream()!=OK)
+		return;
+
+	frames_mixed=pre_skip;
+	playing=true;
+	if (p_from>0) {
+		seek_pos(p_from);
+	}
+}
+
+void AudioStreamPlaybackOpus::stop() {
+	_clear_stream();
+	playing=false;
+}
+
+void AudioStreamPlaybackOpus::seek_pos(float p_time) {
+	if(!playing) return;
+	ogg_int64_t pcm_offset = (ogg_int64_t)(p_time * osrate);
+	bool ok = op_pcm_seek(opus_file,pcm_offset)==0;
+	if(!ok) {
+		ERR_PRINT("Seek time over stream size.");
+		return;
+	}
+	frames_mixed=osrate*p_time;
+}
+
+int AudioStreamPlaybackOpus::mix(int16_t* p_bufer,int p_frames) {
+	if (!playing)
+		return 0;
+
+	int total=p_frames;
+
+	while (true) {
+
+		int todo = p_frames;
+
+		if (todo==0 || todo<MIN_MIX) {
+			break;
+		}
+
+		int ret=op_read(opus_file,(opus_int16*)p_bufer,todo*stream_channels,&current_section);
+		if (ret<0) {
+			playing = false;
+			ERR_EXPLAIN("Error reading Opus File: "+file);
+			ERR_BREAK(ret<0);
+		} else if (ret==0) { // end of song, reload?
+			op_free(opus_file);
+
+			_close_file();
+
+			f=FileAccess::open(file,FileAccess::READ);
+
+			int errv = 0;
+			opus_file = op_open_callbacks(f,&_op_callbacks,NULL,0,&errv);
+			if (errv!=0) {
+				playing=false;
+				break; // :(
+			}
+
+			if (!has_loop()) {
+				playing=false;
+				repeats=1;
+				break;
+			}
+
+			if (loop_restart_time) {
+				bool ok = op_pcm_seek(opus_file, (loop_restart_time*osrate)+pre_skip)==0;
+				if (!ok) {
+					playing=false;
+					ERR_PRINT("loop restart time rejected")
+				}
+
+				frames_mixed=(loop_restart_time*osrate)+pre_skip;
+			} else {
+				frames_mixed=pre_skip;
+			}
+			repeats++;
+			continue;
+
+		}
+
+		stream_channels=op_head(opus_file,current_section)->channel_count;
+
+		frames_mixed+=ret;
+
+		p_bufer+=ret*stream_channels;
+		p_frames-=ret;
+
+	}
+
+	return total-p_frames;
+}
+
+float AudioStreamPlaybackOpus::get_length() const {
+	if(!stream_loaded) {
+		if(const_cast<AudioStreamPlaybackOpus*>(this)->_load_stream() != OK)
+			return 0;
+	}
+	return length;
+}
+
+float AudioStreamPlaybackOpus::get_pos() const {
+
+	int32_t frames = int32_t(frames_mixed);
+	if (frames < 0)
+		frames=0;
+	return double(frames) / osrate;
+}
+
+int AudioStreamPlaybackOpus::get_minimum_buffer_size() const {
+	return MIN_MIX;
+}
+
+AudioStreamPlaybackOpus::~AudioStreamPlaybackOpus() {
+	_clear_stream();
+}
+
+RES ResourceFormatLoaderAudioStreamOpus::load(const String &p_path, const String& p_original_path, Error *r_error) {
+	if (r_error)
+		*r_error=OK;
+
+	AudioStreamOpus *opus_stream = memnew(AudioStreamOpus);
+	opus_stream->set_file(p_path);
+	return Ref<AudioStreamOpus>(opus_stream);
+}
+
+void ResourceFormatLoaderAudioStreamOpus::get_recognized_extensions(List<String> *p_extensions) const {
+
+	p_extensions->push_back("opus");
+}
+String ResourceFormatLoaderAudioStreamOpus::get_resource_type(const String &p_path) const {
+
+	if (p_path.extension().to_lower()=="opus")
+		return "AudioStreamOpus";
+	return "";
+}
+
+bool ResourceFormatLoaderAudioStreamOpus::handles_type(const String& p_type) const {
+	return (p_type=="AudioStream" || p_type=="AudioStreamOpus");
+}

+ 141 - 0
drivers/opus/audio_stream_opus.h

@@ -0,0 +1,141 @@
+/*************************************************************************/
+/*  audio_stream_opus.h                                                  */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                    http://www.godotengine.org                         */
+/*************************************************************************/
+/* Copyright (c) 2007-2015 Juan Linietsky, Ariel Manzur.                 */
+/*                                                                       */
+/* Author: George Marques <[email protected]>                             */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef AUDIO_STREAM_OPUS_H
+#define AUDIO_STREAM_OPUS_H
+
+#include "scene/resources/audio_stream.h"
+#include "opus/opusfile.h"
+#include "opus/internal.h"
+#include "os/file_access.h"
+#include "io/resource_loader.h"
+
+class AudioStreamPlaybackOpus : public AudioStreamPlayback {
+
+	OBJ_TYPE(AudioStreamPlaybackOpus,AudioStreamPlayback)
+
+	enum {
+		MIN_MIX=1024
+	};
+
+	FileAccess *f;
+
+	OpusFileCallbacks _op_callbacks;
+	float length;
+	static int _op_read_func(void *_stream, unsigned char *_ptr, int _nbytes);
+	static int _op_seek_func(void *_stream, opus_int64 _offset, int _whence);
+	static int _op_close_func(void *_stream);
+	static opus_int64 _op_tell_func(void *_stream);
+	static const float osrate;
+
+	String file;
+	int64_t frames_mixed;
+
+	bool stream_loaded;
+	volatile bool playing;
+	OggOpusFile *opus_file;
+	int stream_channels;
+	int current_section;
+	int pre_skip;
+
+	bool paused;
+	bool loops;
+	int repeats;
+
+	Error _load_stream();
+	void _clear_stream();
+	void _close_file();
+
+	bool stream_valid;
+	float loop_restart_time;
+
+public:
+	Error set_file(const String& p_file);
+
+	virtual void play(float p_from=0);
+	virtual void stop();
+	virtual bool is_playing() const { return playing; }
+
+	virtual void set_loop_restart_time(float p_time) { loop_restart_time=p_time; }
+
+	virtual void set_paused(bool p_paused) { paused=p_paused; }
+	virtual bool is_paused() const { return paused; }
+
+	virtual void set_loop(bool p_enable) { loops=p_enable; }
+	virtual bool has_loop() const {return loops; }
+
+	virtual float get_length() const;
+
+	virtual String get_stream_name() const { return ""; }
+
+	virtual int get_loop_count() const { return repeats; }
+
+	virtual float get_pos() const;
+	virtual void seek_pos(float p_time);
+
+	virtual int get_channels() const { return stream_channels; }
+	virtual int get_mix_rate() const { return osrate; }
+
+	virtual int get_minimum_buffer_size() const;
+
+	virtual int mix(int16_t* p_bufer,int p_frames);
+
+	AudioStreamPlaybackOpus();
+	~AudioStreamPlaybackOpus();
+};
+
+
+class AudioStreamOpus: public AudioStream {
+
+	OBJ_TYPE(AudioStreamOpus,AudioStream)
+
+	String file;
+public:
+
+	Ref<AudioStreamPlayback> instance_playback() {
+		Ref<AudioStreamPlaybackOpus> pb = memnew( AudioStreamPlaybackOpus );
+		pb->set_file(file);
+		return pb;
+	}
+
+	void set_file(const String& p_file) { file=p_file; }
+
+};
+
+class ResourceFormatLoaderAudioStreamOpus: public ResourceFormatLoader {
+public:
+	virtual RES load(const String &p_path,const String& p_original_path="",Error *r_error=NULL);
+	virtual void get_recognized_extensions(List<String> *p_extensions) const;
+	virtual bool handles_type(const String& p_type) const;
+	virtual String get_resource_type(const String &p_path) const;
+};
+
+#endif // AUDIO_STREAM_OPUS_H

+ 183 - 0
drivers/opus/celt/_kiss_fft_guts.h

@@ -0,0 +1,183 @@
+/*Copyright (c) 2003-2004, Mark Borgerding
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifndef KISS_FFT_GUTS_H
+#define KISS_FFT_GUTS_H
+
+#define MIN(a,b) ((a)<(b) ? (a):(b))
+#define MAX(a,b) ((a)>(b) ? (a):(b))
+
+/* kiss_fft.h
+   defines kiss_fft_scalar as either short or a float type
+   and defines
+   typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */
+#include "kiss_fft.h"
+
+/*
+  Explanation of macros dealing with complex math:
+
+   C_MUL(m,a,b)         : m = a*b
+   C_FIXDIV( c , div )  : if a fixed point impl., c /= div. noop otherwise
+   C_SUB( res, a,b)     : res = a - b
+   C_SUBFROM( res , a)  : res -= a
+   C_ADDTO( res , a)    : res += a
+ * */
+#ifdef OPUS_FIXED_POINT
+#include "arch.h"
+
+
+#define SAMP_MAX 2147483647
+#define TWID_MAX 32767
+#define TRIG_UPSCALE 1
+
+#define SAMP_MIN -SAMP_MAX
+
+
+#   define S_MUL(a,b) MULT16_32_Q15(b, a)
+
+#   define C_MUL(m,a,b) \
+      do{ (m).r = SUB32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
+          (m).i = ADD32(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0)
+
+#   define C_MULC(m,a,b) \
+      do{ (m).r = ADD32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
+          (m).i = SUB32(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0)
+
+#   define C_MUL4(m,a,b) \
+      do{ (m).r = SHR32(SUB32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)),2); \
+          (m).i = SHR32(ADD32(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)),2); }while(0)
+
+#   define C_MULBYSCALAR( c, s ) \
+      do{ (c).r =  S_MUL( (c).r , s ) ;\
+          (c).i =  S_MUL( (c).i , s ) ; }while(0)
+
+#   define DIVSCALAR(x,k) \
+        (x) = S_MUL(  x, (TWID_MAX-((k)>>1))/(k)+1 )
+
+#   define C_FIXDIV(c,div) \
+        do {    DIVSCALAR( (c).r , div);  \
+                DIVSCALAR( (c).i  , div); }while (0)
+
+#define  C_ADD( res, a,b)\
+    do {(res).r=ADD32((a).r,(b).r);  (res).i=ADD32((a).i,(b).i); \
+    }while(0)
+#define  C_SUB( res, a,b)\
+    do {(res).r=SUB32((a).r,(b).r);  (res).i=SUB32((a).i,(b).i); \
+    }while(0)
+#define C_ADDTO( res , a)\
+    do {(res).r = ADD32((res).r, (a).r);  (res).i = ADD32((res).i,(a).i);\
+    }while(0)
+
+#define C_SUBFROM( res , a)\
+    do {(res).r = ADD32((res).r,(a).r);  (res).i = SUB32((res).i,(a).i); \
+    }while(0)
+
+#if defined(OPUS_ARM_INLINE_ASM)
+#include "arm/kiss_fft_armv4.h"
+#endif
+
+#if defined(OPUS_ARM_INLINE_EDSP)
+#include "arm/kiss_fft_armv5e.h"
+#endif
+
+#else  /* not OPUS_FIXED_POINT*/
+
+#   define S_MUL(a,b) ( (a)*(b) )
+#define C_MUL(m,a,b) \
+    do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
+        (m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
+#define C_MULC(m,a,b) \
+    do{ (m).r = (a).r*(b).r + (a).i*(b).i;\
+        (m).i = (a).i*(b).r - (a).r*(b).i; }while(0)
+
+#define C_MUL4(m,a,b) C_MUL(m,a,b)
+
+#   define C_FIXDIV(c,div) /* NOOP */
+#   define C_MULBYSCALAR( c, s ) \
+    do{ (c).r *= (s);\
+        (c).i *= (s); }while(0)
+#endif
+
+#ifndef CHECK_OVERFLOW_OP
+#  define CHECK_OVERFLOW_OP(a,op,b) /* noop */
+#endif
+
+#ifndef C_ADD
+#define  C_ADD( res, a,b)\
+    do { \
+            CHECK_OVERFLOW_OP((a).r,+,(b).r)\
+            CHECK_OVERFLOW_OP((a).i,+,(b).i)\
+            (res).r=(a).r+(b).r;  (res).i=(a).i+(b).i; \
+    }while(0)
+#define  C_SUB( res, a,b)\
+    do { \
+            CHECK_OVERFLOW_OP((a).r,-,(b).r)\
+            CHECK_OVERFLOW_OP((a).i,-,(b).i)\
+            (res).r=(a).r-(b).r;  (res).i=(a).i-(b).i; \
+    }while(0)
+#define C_ADDTO( res , a)\
+    do { \
+            CHECK_OVERFLOW_OP((res).r,+,(a).r)\
+            CHECK_OVERFLOW_OP((res).i,+,(a).i)\
+            (res).r += (a).r;  (res).i += (a).i;\
+    }while(0)
+
+#define C_SUBFROM( res , a)\
+    do {\
+            CHECK_OVERFLOW_OP((res).r,-,(a).r)\
+            CHECK_OVERFLOW_OP((res).i,-,(a).i)\
+            (res).r -= (a).r;  (res).i -= (a).i; \
+    }while(0)
+#endif /* C_ADD defined */
+
+#ifdef OPUS_FIXED_POINT
+#  define KISS_FFT_COS(phase)  TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * cos (phase))))
+#  define KISS_FFT_SIN(phase)  TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * sin (phase))))
+#  define KISS_FFT_COS(phase)  floor(.5+TWID_MAX*cos (phase))
+#  define KISS_FFT_SIN(phase)  floor(.5+TWID_MAX*sin (phase))
+#  define HALF_OF(x) ((x)>>1)
+#elif defined(USE_SIMD)
+#  define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) )
+#  define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
+#  define HALF_OF(x) ((x)*_mm_set1_ps(.5f))
+#else
+#  define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase)
+#  define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase)
+#  define HALF_OF(x) ((x)*.5f)
+#endif
+
+#define  kf_cexp(x,phase) \
+        do{ \
+                (x)->r = KISS_FFT_COS(phase);\
+                (x)->i = KISS_FFT_SIN(phase);\
+        }while(0)
+
+#define  kf_cexp2(x,phase) \
+   do{ \
+      (x)->r = TRIG_UPSCALE*celt_cos_norm((phase));\
+      (x)->i = TRIG_UPSCALE*celt_cos_norm((phase)-32768);\
+}while(0)
+
+#endif /* KISS_FFT_GUTS_H */

+ 214 - 0
drivers/opus/celt/arch.h

@@ -0,0 +1,214 @@
+/* Copyright (c) 2003-2008 Jean-Marc Valin
+   Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/**
+   @file arch.h
+   @brief Various architecture definitions for CELT
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef ARCH_H
+#define ARCH_H
+
+#include "opus_types.h"
+#include "opus_defines.h"
+
+# if !defined(__GNUC_PREREQ)
+#  if defined(__GNUC__)&&defined(__GNUC_MINOR__)
+#   define __GNUC_PREREQ(_maj,_min) \
+ ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
+#  else
+#   define __GNUC_PREREQ(_maj,_min) 0
+#  endif
+# endif
+
+#define CELT_SIG_SCALE 32768.f
+
+#define celt_fatal(str) _celt_fatal(str, __FILE__, __LINE__);
+#ifdef ENABLE_ASSERTIONS
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __GNUC__
+__attribute__((noreturn))
+#endif
+static OPUS_INLINE void _celt_fatal(const char *str, const char *file, int line)
+{
+   fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str);
+   abort();
+}
+#define celt_assert(cond) {if (!(cond)) {celt_fatal("assertion failed: " #cond);}}
+#define celt_assert2(cond, message) {if (!(cond)) {celt_fatal("assertion failed: " #cond "\n" message);}}
+#else
+#define celt_assert(cond)
+#define celt_assert2(cond, message)
+#endif
+
+#define IMUL32(a,b) ((a)*(b))
+
+#define ABS(x) ((x) < 0 ? (-(x)) : (x))      /**< Absolute integer value. */
+#define ABS16(x) ((x) < 0 ? (-(x)) : (x))    /**< Absolute 16-bit value.  */
+#define MIN16(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum 16-bit value.   */
+#define MAX16(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 16-bit value.   */
+#define ABS32(x) ((x) < 0 ? (-(x)) : (x))    /**< Absolute 32-bit value.  */
+#define MIN32(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum 32-bit value.   */
+#define MAX32(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 32-bit value.   */
+#define IMIN(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum int value.   */
+#define IMAX(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum int value.   */
+#define UADD32(a,b) ((a)+(b))
+#define USUB32(a,b) ((a)-(b))
+
+#define PRINT_MIPS(file)
+
+#ifdef OPUS_FIXED_POINT
+
+typedef opus_int16 opus_val16;
+typedef opus_int32 opus_val32;
+
+typedef opus_val32 celt_sig;
+typedef opus_val16 celt_norm;
+typedef opus_val32 celt_ener;
+
+#define Q15ONE 32767
+
+#define SIG_SHIFT 12
+
+#define NORM_SCALING 16384
+
+#define DB_SHIFT 10
+
+#define EPSILON 1
+#define VERY_SMALL 0
+#define VERY_LARGE16 ((opus_val16)32767)
+#define Q15_ONE ((opus_val16)32767)
+
+#define SCALEIN(a)      (a)
+#define SCALEOUT(a)     (a)
+
+#ifdef FIXED_DEBUG
+#include "fixed_debug.h"
+#else
+
+#include "fixed_generic.h"
+
+#ifdef OPUS_ARM_INLINE_EDSP
+#include "arm/fixed_armv5e.h"
+#elif defined (OPUS_ARM_INLINE_ASM)
+#include "arm/fixed_armv4.h"
+#elif defined (BFIN_ASM)
+#include "fixed_bfin.h"
+#elif defined (TI_C5X_ASM)
+#include "fixed_c5x.h"
+#elif defined (TI_C6X_ASM)
+#include "fixed_c6x.h"
+#endif
+
+#endif
+
+#else /* OPUS_FIXED_POINT */
+
+typedef float opus_val16;
+typedef float opus_val32;
+
+typedef float celt_sig;
+typedef float celt_norm;
+typedef float celt_ener;
+
+#define Q15ONE 1.0f
+
+#define NORM_SCALING 1.f
+
+#define EPSILON 1e-15f
+#define VERY_SMALL 1e-30f
+#define VERY_LARGE16 1e15f
+#define Q15_ONE ((opus_val16)1.f)
+
+#define QCONST16(x,bits) (x)
+#define QCONST32(x,bits) (x)
+
+#define NEG16(x) (-(x))
+#define NEG32(x) (-(x))
+#define EXTRACT16(x) (x)
+#define EXTEND32(x) (x)
+#define SHR16(a,shift) (a)
+#define SHL16(a,shift) (a)
+#define SHR32(a,shift) (a)
+#define SHL32(a,shift) (a)
+#define PSHR32(a,shift) (a)
+#define VSHR32(a,shift) (a)
+
+#define PSHR(a,shift)   (a)
+#define SHR(a,shift)    (a)
+#define SHL(a,shift)    (a)
+#define SATURATE(x,a)   (x)
+#define SATURATE16(x)   (x)
+
+#define ROUND16(a,shift)  (a)
+#define HALF16(x)       (.5f*(x))
+#define HALF32(x)       (.5f*(x))
+
+#define ADD16(a,b) ((a)+(b))
+#define SUB16(a,b) ((a)-(b))
+#define ADD32(a,b) ((a)+(b))
+#define SUB32(a,b) ((a)-(b))
+#define MULT16_16_16(a,b)     ((a)*(b))
+#define MULT16_16(a,b)     ((opus_val32)(a)*(opus_val32)(b))
+#define MAC16_16(c,a,b)     ((c)+(opus_val32)(a)*(opus_val32)(b))
+
+#define MULT16_32_Q15(a,b)     ((a)*(b))
+#define MULT16_32_Q16(a,b)     ((a)*(b))
+
+#define MULT32_32_Q31(a,b)     ((a)*(b))
+
+#define MAC16_32_Q15(c,a,b)     ((c)+(a)*(b))
+
+#define MULT16_16_Q11_32(a,b)     ((a)*(b))
+#define MULT16_16_Q11(a,b)     ((a)*(b))
+#define MULT16_16_Q13(a,b)     ((a)*(b))
+#define MULT16_16_Q14(a,b)     ((a)*(b))
+#define MULT16_16_Q15(a,b)     ((a)*(b))
+#define MULT16_16_P15(a,b)     ((a)*(b))
+#define MULT16_16_P13(a,b)     ((a)*(b))
+#define MULT16_16_P14(a,b)     ((a)*(b))
+#define MULT16_32_P16(a,b)     ((a)*(b))
+
+#define DIV32_16(a,b)     (((opus_val32)(a))/(opus_val16)(b))
+#define DIV32(a,b)     (((opus_val32)(a))/(opus_val32)(b))
+
+#define SCALEIN(a)      ((a)*CELT_SIG_SCALE)
+#define SCALEOUT(a)     ((a)*(1/CELT_SIG_SCALE))
+
+#endif /* !OPUS_FIXED_POINT */
+
+#ifndef GLOBAL_STACK_SIZE
+#ifdef OPUS_FIXED_POINT
+#define GLOBAL_STACK_SIZE 100000
+#else
+#define GLOBAL_STACK_SIZE 100000
+#endif
+#endif
+
+#endif /* ARCH_H */

+ 316 - 0
drivers/opus/celt/arm/arm2gnu.pl

@@ -0,0 +1,316 @@
+#!/usr/bin/perl
+
+my $bigend;  # little/big endian
+my $nxstack;
+
+$nxstack = 0;
+
+eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
+    if $running_under_some_shell;
+
+while ($ARGV[0] =~ /^-/) {
+    $_ = shift;
+  last if /^--/;
+    if (/^-n/) {
+    $nflag++;
+    next;
+    }
+    die "I don't recognize this switch: $_\\n";
+}
+$printit++ unless $nflag;
+
+$\ = "\n";      # automatically add newline on print
+$n=0;
+
+$thumb = 0;     # ARM mode by default, not Thumb.
+@proc_stack = ();
+
+LINE:
+while (<>) {
+
+    # For ADRLs we need to add a new line after the substituted one.
+    $addPadding = 0;
+
+    # First, we do not dare to touch *anything* inside double quotes, do we?
+    # Second, if you want a dollar character in the string,
+    # insert two of them -- that's how ARM C and assembler treat strings.
+    s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1:   .ascii \"/   && do { s/\$\$/\$/g; next };
+    s/\bDCB\b[ \t]*\"/.ascii \"/                          && do { s/\$\$/\$/g; next };
+    s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/                    && do { s/\$\$/\$/g; next };
+    # If there's nothing on a line but a comment, don't try to apply any further
+    #  substitutions (this is a cheap hack to avoid mucking up the license header)
+    s/^([ \t]*);/$1@/                                     && do { s/\$\$/\$/g; next };
+    # If substituted -- leave immediately !
+
+    s/@/,:/;
+    s/;/@/;
+    while ( /@.*'/ ) {
+      s/(@.*)'/$1/g;
+    }
+    s/\{FALSE\}/0/g;
+    s/\{TRUE\}/1/g;
+    s/\{(\w\w\w\w+)\}/$1/g;
+    s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
+    s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
+    s/\bIMPORT\b/.extern/;
+    s/\bEXPORT\b/.global/;
+    s/^(\s+)\[/$1IF/;
+    s/^(\s+)\|/$1ELSE/;
+    s/^(\s+)\]/$1ENDIF/;
+    s/IF *:DEF:/ .ifdef/;
+    s/IF *:LNOT: *:DEF:/ .ifndef/;
+    s/ELSE/ .else/;
+    s/ENDIF/ .endif/;
+
+    if( /\bIF\b/ ) {
+      s/\bIF\b/ .if/;
+      s/=/==/;
+    }
+    if ( $n == 2) {
+        s/\$/\\/g;
+    }
+    if ($n == 1) {
+        s/\$//g;
+        s/label//g;
+    $n = 2;
+      }
+    if ( /MACRO/ ) {
+      s/MACRO *\n/.macro/;
+      $n=1;
+    }
+    if ( /\bMEND\b/ ) {
+      s/\bMEND\b/.endm/;
+      $n=0;
+    }
+
+    # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
+    #
+    if ( /\bAREA\b/ ) {
+        my $align;
+        $align = "2";
+        if ( /ALIGN=(\d+)/ ) {
+            $align = $1;
+        }
+        if ( /CODE/ ) {
+            $nxstack = 1;
+        }
+        s/^(.+)CODE(.+)READONLY(.*)/    .text/;
+        s/^(.+)DATA(.+)READONLY(.*)/    .section .rdata/;
+        s/^(.+)\|\|\.data\|\|(.+)/    .data/;
+        s/^(.+)\|\|\.bss\|\|(.+)/    .bss/;
+        s/$/;   .p2align $align/;
+        # Enable NEON instructions but don't produce a binary that requires
+        # ARMv7. RVCT does not have equivalent directives, so we just do this
+        # for all CODE areas.
+        if ( /.text/ ) {
+            # Separating .arch, .fpu, etc., by semicolons does not work (gas
+            # thinks the semicolon is part of the arch name, even when there's
+            # whitespace separating them). Sadly this means our line numbers
+            # won't match the original source file (we could use the .line
+            # directive, which is documented to be obsolete, but then gdb will
+            # show the wrong line in the translated source file).
+            s/$/;   .arch armv7-a\n   .fpu neon\n   .object_arch armv4t/;
+        }
+    }
+
+    s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/;       # ||.constdata$3||
+    s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/;               # ||.bss$2||
+    s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/;             # ||.data$2||
+    s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
+    s/^(\s+)\%(\s)/    .space $1/;
+
+    s/\|(.+)\.(\d+)\|/\.$1_$2/;                     # |L80.123| -> .L80_123
+    s/\bCODE32\b/.code 32/ && do {$thumb = 0};
+    s/\bCODE16\b/.code 16/ && do {$thumb = 1};
+    if (/\bPROC\b/)
+    {
+        my $prefix;
+        my $proc;
+        /^([A-Za-z_\.]\w+)\b/;
+        $proc = $1;
+        $prefix = "";
+        if ($proc)
+        {
+            $prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc);
+            push(@proc_stack, $proc);
+            s/^[A-Za-z_\.]\w+/$&:/;
+        }
+        $prefix = $prefix."\t.thumb_func; " if ($thumb);
+        s/\bPROC\b/@ $&/;
+        $_ = $prefix.$_;
+    }
+    s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
+    s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
+    if (/\bENDP\b/)
+    {
+        my $proc;
+        s/\bENDP\b/@ $&/;
+        $proc = pop(@proc_stack);
+        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+    }
+    s/\bSUBT\b/@ $&/;
+    s/\bDATA\b/@ $&/;   # DATA directive is deprecated -- Asm guide, p.7-25
+    s/\bKEEP\b/@ $&/;
+    s/\bEXPORTAS\b/@ $&/;
+    s/\|\|(.)+\bEQU\b/@ $&/;
+    s/\|\|([\w\$]+)\|\|/$1/;
+    s/\bENTRY\b/@ $&/;
+    s/\bASSERT\b/@ $&/;
+    s/\bGBLL\b/@ $&/;
+    s/\bGBLA\b/@ $&/;
+    s/^\W+OPT\b/@ $&/;
+    s/:OR:/|/g;
+    s/:SHL:/<</g;
+    s/:SHR:/>>/g;
+    s/:AND:/&/g;
+    s/:LAND:/&&/g;
+    s/CPSR/cpsr/;
+    s/SPSR/spsr/;
+    s/ALIGN$/.balign 4/;
+    s/ALIGN\s+([0-9x]+)$/.balign $1/;
+    s/psr_cxsf/psr_all/;
+    s/LTORG/.ltorg/;
+    s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
+
+    #  {PC} + 0xdeadfeed  -->  . + 0xdeadfeed
+    s/\{PC\} \+/ \. +/;
+
+    # Single hex constant on the line !
+    #
+    # >>> NOTE <<<
+    #   Double-precision floats in gcc are always mixed-endian, which means
+    #   bytes in two words are little-endian, but words are big-endian.
+    #   So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
+    #   and 0xfeed0000 at high address.
+    #
+    s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+    s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
+
+    # Single hex constant on the line !
+#    s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+#    s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
+    s/\bDCFS[ \t]+0x/.word 0x/;
+    s/\bDCFS\b/.float/;
+
+    s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
+    s/\bDCD\b/.word/;
+    s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
+    s/\bDCW\b/.short/;
+    s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
+    s/\bDCB\b/.byte/;
+    s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
+    s/^[A-Za-z_\.]\w+/$&:/;
+    s/^(\d+)/$1:/;
+    s/\%(\d+)/$1b_or_f/;
+    s/\%[Bb](\d+)/$1b/;
+    s/\%[Ff](\d+)/$1f/;
+    s/\%[Ff][Tt](\d+)/$1f/;
+    s/&([\dA-Fa-f]+)/0x$1/;
+    if ( /\b2_[01]+\b/ ) {
+      s/\b2_([01]+)\b/conv$1&&&&/g;
+      while ( /[01][01][01][01]&&&&/ ) {
+        s/0000&&&&/&&&&0/g;
+        s/0001&&&&/&&&&1/g;
+        s/0010&&&&/&&&&2/g;
+        s/0011&&&&/&&&&3/g;
+        s/0100&&&&/&&&&4/g;
+        s/0101&&&&/&&&&5/g;
+        s/0110&&&&/&&&&6/g;
+        s/0111&&&&/&&&&7/g;
+        s/1000&&&&/&&&&8/g;
+        s/1001&&&&/&&&&9/g;
+        s/1010&&&&/&&&&A/g;
+        s/1011&&&&/&&&&B/g;
+        s/1100&&&&/&&&&C/g;
+        s/1101&&&&/&&&&D/g;
+        s/1110&&&&/&&&&E/g;
+        s/1111&&&&/&&&&F/g;
+      }
+      s/000&&&&/&&&&0/g;
+      s/001&&&&/&&&&1/g;
+      s/010&&&&/&&&&2/g;
+      s/011&&&&/&&&&3/g;
+      s/100&&&&/&&&&4/g;
+      s/101&&&&/&&&&5/g;
+      s/110&&&&/&&&&6/g;
+      s/111&&&&/&&&&7/g;
+      s/00&&&&/&&&&0/g;
+      s/01&&&&/&&&&1/g;
+      s/10&&&&/&&&&2/g;
+      s/11&&&&/&&&&3/g;
+      s/0&&&&/&&&&0/g;
+      s/1&&&&/&&&&1/g;
+      s/conv&&&&/0x/g;
+    }
+
+    if ( /commandline/)
+    {
+        if( /-bigend/)
+        {
+            $bigend=1;
+        }
+    }
+
+    if ( /\bDCDU\b/ )
+    {
+        my $cmd=$_;
+        my $value;
+        my $prefix;
+        my $w1;
+        my $w2;
+        my $w3;
+        my $w4;
+
+        s/\s+DCDU\b/@ $&/;
+
+        $cmd =~ /\bDCDU\b\s+0x(\d+)/;
+        $value = $1;
+        $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;
+        $w1 = $1;
+        $w2 = $2;
+        $w3 = $3;
+        $w4 = $4;
+
+        if( $bigend ne "")
+        {
+            # big endian
+            $prefix = "\t.byte\t0x".$w1.";".
+                      "\t.byte\t0x".$w2.";".
+                      "\t.byte\t0x".$w3.";".
+                      "\t.byte\t0x".$w4."; ";
+        }
+        else
+        {
+            # little endian
+            $prefix = "\t.byte\t0x".$w4.";".
+                      "\t.byte\t0x".$w3.";".
+                      "\t.byte\t0x".$w2.";".
+                      "\t.byte\t0x".$w1."; ";
+        }
+        $_=$prefix.$_;
+    }
+
+    if ( /\badrl\b/i )
+    {
+        s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
+        $addPadding = 1;
+    }
+    s/\bEND\b/@ END/;
+} continue {
+    printf ("%s", $_) if $printit;
+    if ($addPadding != 0)
+    {
+        printf ("   mov r0,r0\n");
+        $addPadding = 0;
+    }
+}
+#If we had a code section, mark that this object doesn't need an executable
+# stack.
+if ($nxstack) {
+    printf ("    .section\t.note.GNU-stack,\"\",\%\%progbits\n");
+}

+ 49 - 0
drivers/opus/celt/arm/arm_celt_map.c

@@ -0,0 +1,49 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "pitch.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+# if defined(OPUS_FIXED_POINT)
+opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+    const opus_val16 *, opus_val32 *, int , int) = {
+  celt_pitch_xcorr_c,               /* ARMv4 */
+  MAY_HAVE_EDSP(celt_pitch_xcorr),  /* EDSP */
+  MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
+  MAY_HAVE_NEON(celt_pitch_xcorr)   /* NEON */
+};
+# else
+#  error "Floating-point implementation is not supported by ARM asm yet." \
+ "Reconfigure with --disable-rtcd or send patches."
+# endif
+
+#endif

+ 174 - 0
drivers/opus/celt/arm/armcpu.c

@@ -0,0 +1,174 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Original code from libtheora modified to suit to Opus */
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#ifdef OPUS_HAVE_RTCD
+
+#include "armcpu.h"
+#include "cpu_support.h"
+#include "os_support.h"
+#include "opus_types.h"
+
+#define OPUS_CPU_ARM_V4    (1)
+#define OPUS_CPU_ARM_EDSP  (1<<1)
+#define OPUS_CPU_ARM_MEDIA (1<<2)
+#define OPUS_CPU_ARM_NEON  (1<<3)
+
+#if defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+
+static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
+  opus_uint32 flags;
+  flags=0;
+  /* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit
+   * instructions via their assembled hex code.
+   * All of these instructions should be essentially nops. */
+# if defined(OPUS_ARM_MAY_HAVE_EDSP)
+  __try{
+    /*PLD [r13]*/
+    __emit(0xF5DDF000);
+    flags|=OPUS_CPU_ARM_EDSP;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#  if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+  __try{
+    /*SHADD8 r3,r3,r3*/
+    __emit(0xE6333F93);
+    flags|=OPUS_CPU_ARM_MEDIA;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   if defined(OPUS_ARM_MAY_HAVE_NEON)
+  __try{
+    /*VORR q0,q0,q0*/
+    __emit(0xF2200150);
+    flags|=OPUS_CPU_ARM_NEON;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   endif
+#  endif
+# endif
+  return flags;
+}
+
+#elif defined(__linux__)
+/* Linux based */
+opus_uint32 opus_cpu_capabilities(void)
+{
+  opus_uint32 flags = 0;
+  FILE *cpuinfo;
+
+  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably on
+   * Android */
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+
+  if(cpuinfo != NULL)
+  {
+    /* 512 should be enough for anybody (it's even enough for all the flags that
+     * x86 has accumulated... so far). */
+    char buf[512];
+
+    while(fgets(buf, 512, cpuinfo) != NULL)
+    {
+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON)
+      /* Search for edsp and neon flag */
+      if(memcmp(buf, "Features", 8) == 0)
+      {
+        char *p;
+#  if defined(OPUS_ARM_MAY_HAVE_EDSP)
+        p = strstr(buf, " edsp");
+        if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
+          flags |= OPUS_CPU_ARM_EDSP;
+#  endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+        p = strstr(buf, " neon");
+        if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
+          flags |= OPUS_CPU_ARM_NEON;
+#  endif
+      }
+# endif
+
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+      /* Search for media capabilities (>= ARMv6) */
+      if(memcmp(buf, "CPU architecture:", 17) == 0)
+      {
+        int version;
+        version = atoi(buf+17);
+
+        if(version >= 6)
+          flags |= OPUS_CPU_ARM_MEDIA;
+      }
+# endif
+    }
+
+    fclose(cpuinfo);
+  }
+  return flags;
+}
+#else
+/* The feature registers which can tell us what the processor supports are
+ * accessible in priveleged modes only, so we can't have a general user-space
+ * detection method like on x86.*/
+# error "Configured to use ARM asm but no CPU detection method available for " \
+   "your platform.  Reconfigure with --disable-rtcd (or send patches)."
+#endif
+
+int opus_select_arch(void)
+{
+  opus_uint32 flags = opus_cpu_capabilities();
+  int arch = 0;
+
+  if(!(flags & OPUS_CPU_ARM_EDSP))
+    return arch;
+  arch++;
+
+  if(!(flags & OPUS_CPU_ARM_MEDIA))
+    return arch;
+  arch++;
+
+  if(!(flags & OPUS_CPU_ARM_NEON))
+    return arch;
+  arch++;
+
+  return arch;
+}
+
+#endif

+ 71 - 0
drivers/opus/celt/arm/armcpu.h

@@ -0,0 +1,71 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(ARMCPU_H)
+# define ARMCPU_H
+
+# if defined(OPUS_ARM_MAY_HAVE_EDSP)
+#  define MAY_HAVE_EDSP(name) name ## _edsp
+# else
+#  define MAY_HAVE_EDSP(name) name ## _c
+# endif
+
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+#  define MAY_HAVE_MEDIA(name) name ## _media
+# else
+#  define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name)
+# endif
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON)
+#  define MAY_HAVE_NEON(name) name ## _neon
+# else
+#  define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
+# endif
+
+# if defined(OPUS_ARM_PRESUME_EDSP)
+#  define PRESUME_EDSP(name) name ## _edsp
+# else
+#  define PRESUME_EDSP(name) name ## _c
+# endif
+
+# if defined(OPUS_ARM_PRESUME_MEDIA)
+#  define PRESUME_MEDIA(name) name ## _media
+# else
+#  define PRESUME_MEDIA(name) PRESUME_EDSP(name)
+# endif
+
+# if defined(OPUS_ARM_PRESUME_NEON)
+#  define PRESUME_NEON(name) name ## _neon
+# else
+#  define PRESUME_NEON(name) PRESUME_MEDIA(name)
+# endif
+
+# if defined(OPUS_HAVE_RTCD)
+int opus_select_arch(void);
+# endif
+
+#endif

+ 37 - 0
drivers/opus/celt/arm/armopts.s

@@ -0,0 +1,37 @@
+/* Copyright (C) 2013 Mozilla Corporation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OPUS_ARM_MAY_HAVE_EDSP  * 
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OPUS_ARM_MAY_HAVE_MEDIA * 
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OPUS_ARM_MAY_HAVE_NEON  * 
+
+END

+ 37 - 0
drivers/opus/celt/arm/armopts.s.in

@@ -0,0 +1,37 @@
+/* Copyright (C) 2013 Mozilla Corporation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OPUS_ARM_MAY_HAVE_EDSP  * @OPUS_ARM_MAY_HAVE_EDSP@
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OPUS_ARM_MAY_HAVE_NEON  * @OPUS_ARM_MAY_HAVE_NEON@
+
+END

+ 545 - 0
drivers/opus/celt/arm/celt_pitch_xcorr_arm.s

@@ -0,0 +1,545 @@
+; Copyright (c) 2007-2008 CSIRO
+; Copyright (c) 2007-2009 Xiph.Org Foundation
+; Copyright (c) 2013      Parrot
+; Written by Aurélien Zanelli
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+;
+; - Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; - Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  AREA  |.text|, CODE, READONLY
+
+  GET    celt/arm/armopts.s
+
+IF OPUS_ARM_MAY_HAVE_EDSP
+  EXPORT celt_pitch_xcorr_edsp
+ENDIF
+
+IF OPUS_ARM_MAY_HAVE_NEON
+  EXPORT celt_pitch_xcorr_neon
+ENDIF
+
+IF OPUS_ARM_MAY_HAVE_NEON
+
+; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
+xcorr_kernel_neon PROC
+  ; input:
+  ;   r3     = int         len
+  ;   r4     = opus_val16 *x
+  ;   r5     = opus_val16 *y
+  ;   q0     = opus_val32  sum[4]
+  ; output:
+  ;   q0     = opus_val32  sum[4]
+  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
+  ; internal usage:
+  ;   r12 = int j
+  ;   d3  = y_3|y_2|y_1|y_0
+  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
+  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
+  ;   q8  = scratch
+  ;
+  ; Load y[0...3]
+  ; This requires len>0 to always be valid (which we assert in the C code).
+  VLD1.16      {d5}, [r5]!
+  SUBS         r12, r3, #8
+  BLE xcorr_kernel_neon_process4
+; Process 8 samples at a time.
+; This loop loads one y value more than we actually need. Therefore we have to
+; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
+; reading past the end of the array.
+xcorr_kernel_neon_process8
+  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
+  ; - 2 cycles of ARM insrtuctions,
+  ; - 10 cycles of load/store/byte permute instructions, and
+  ; - 9 cycles of data processing instructions.
+  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
+  ; latter two categories, meaning the whole loop should run in 10 cycles per
+  ; iteration, barring cache misses.
+  ;
+  ; Load x[0...7]
+  VLD1.16      {d6, d7}, [r4]!
+  ; Unlike VMOV, VAND is a data processsing instruction (and doesn't get
+  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
+  VAND         d3, d5, d5
+  SUBS         r12, r12, #8
+  ; Load y[4...11]
+  VLD1.16      {d4, d5}, [r5]!
+  VMLAL.S16    q0, d3, d6[0]
+  VEXT.16      d16, d3, d4, #1
+  VMLAL.S16    q0, d4, d7[0]
+  VEXT.16      d17, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d3, d4, #2
+  VMLAL.S16    q0, d17, d7[1]
+  VEXT.16      d17, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d3, d4, #3
+  VMLAL.S16    q0, d17, d7[2]
+  VEXT.16      d17, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+  VMLAL.S16    q0, d17, d7[3]
+  BGT xcorr_kernel_neon_process8
+; Process 4 samples here if we have > 4 left (still reading one extra y value).
+xcorr_kernel_neon_process4
+  ADDS         r12, r12, #4
+  BLE xcorr_kernel_neon_process2
+  ; Load x[0...3]
+  VLD1.16      d6, [r4]!
+  ; Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #4
+  ; Load y[4...7]
+  VLD1.16      d5, [r5]!
+  VMLAL.S16    q0, d4, d6[0]
+  VEXT.16      d16, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+; Process 2 samples here if we have > 2 left (still reading one extra y value).
+xcorr_kernel_neon_process2
+  ADDS         r12, r12, #2
+  BLE xcorr_kernel_neon_process1
+  ; Load x[0...1]
+  VLD2.16      {d6[],d7[]}, [r4]!
+  ; Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #2
+  ; Load y[4...5]
+  VLD1.32      {d5[]}, [r5]!
+  VMLAL.S16    q0, d4, d6
+  VEXT.16      d16, d4, d5, #1
+  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
+  ; instead of VEXT, since it's a data-processing instruction.
+  VSRI.64      d5, d4, #32
+  VMLAL.S16    q0, d16, d7
+; Process 1 sample using the extra y value we loaded above.
+xcorr_kernel_neon_process1
+  ; Load next *x
+  VLD1.16      {d6[]}, [r4]!
+  ADDS         r12, r12, #1
+  ; y[0...3] are left in d5 from prior iteration(s) (if any)
+  VMLAL.S16    q0, d5, d6
+  MOVLE        pc, lr
+; Now process 1 last sample, not reading ahead.
+  ; Load last *y
+  VLD1.16      {d4[]}, [r5]!
+  VSRI.64      d4, d5, #16
+  ; Load last *x
+  VLD1.16      {d6[]}, [r4]!
+  VMLAL.S16    q0, d4, d6
+  MOV          pc, lr
+  ENDP
+
+; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
+;  opus_val32 *xcorr, int len, int max_pitch)
+celt_pitch_xcorr_neon PROC
+  ; input:
+  ;   r0  = opus_val16 *_x
+  ;   r1  = opus_val16 *_y
+  ;   r2  = opus_val32 *xcorr
+  ;   r3  = int         len
+  ; output:
+  ;   r0  = int         maxcorr
+  ; internal usage:
+  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
+  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
+  ;   r6  = int         max_pitch
+  ;   r12 = int         j
+  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+  STMFD        sp!, {r4-r6, lr}
+  LDR          r6, [sp, #16]
+  VMOV.S32     q15, #1
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+  SUBS         r6, r6, #4
+  BLT celt_pitch_xcorr_neon_process4_done
+celt_pitch_xcorr_neon_process4
+  ; xcorr_kernel_neon parameters:
+  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
+  MOV          r4, r0
+  MOV          r5, r1
+  VEOR         q0, q0, q0
+  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
+  ; So we don't save/restore any other registers.
+  BL xcorr_kernel_neon
+  SUBS         r6, r6, #4
+  VST1.32      {q0}, [r2]!
+  ; _y += 4
+  ADD          r1, r1, #8
+  VMAX.S32     q15, q15, q0
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+  BGE celt_pitch_xcorr_neon_process4
+; We have less than 4 sums left to compute.
+celt_pitch_xcorr_neon_process4_done
+  ADDS         r6, r6, #4
+  ; Reduce maxcorr to a single value
+  VMAX.S32     d30, d30, d31
+  VPMAX.S32    d30, d30, d30
+  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
+  BLE celt_pitch_xcorr_neon_done
+; Now compute each remaining sum one at a time.
+celt_pitch_xcorr_neon_process_remaining
+  MOV          r4, r0
+  MOV          r5, r1
+  VMOV.I32     q0, #0
+  SUBS         r12, r3, #8
+  BLT celt_pitch_xcorr_neon_process_remaining4
+; Sum terms 8 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop8
+  ; Load x[0...7]
+  VLD1.16      {q1}, [r4]!
+  ; Load y[0...7]
+  VLD1.16      {q2}, [r5]!
+  SUBS         r12, r12, #8
+  VMLAL.S16    q0, d4, d2
+  VMLAL.S16    q0, d5, d3
+  BGE celt_pitch_xcorr_neon_process_remaining_loop8
+; Sum terms 4 at a time.
+celt_pitch_xcorr_neon_process_remaining4
+  ADDS         r12, r12, #4
+  BLT celt_pitch_xcorr_neon_process_remaining4_done
+  ; Load x[0...3]
+  VLD1.16      {d2}, [r4]!
+  ; Load y[0...3]
+  VLD1.16      {d3}, [r5]!
+  SUB          r12, r12, #4
+  VMLAL.S16    q0, d3, d2
+celt_pitch_xcorr_neon_process_remaining4_done
+  ; Reduce the sum to a single value.
+  VADD.S32     d0, d0, d1
+  VPADDL.S32   d0, d0
+  ADDS         r12, r12, #4
+  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
+; Sum terms 1 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop1
+  VLD1.16      {d2[]}, [r4]!
+  VLD1.16      {d3[]}, [r5]!
+  SUBS         r12, r12, #1
+  VMLAL.S16    q0, d2, d3
+  BGT celt_pitch_xcorr_neon_process_remaining_loop1
+celt_pitch_xcorr_neon_process_remaining_loop_done
+  VST1.32      {d0[0]}, [r2]!
+  VMAX.S32     d30, d30, d0
+  SUBS         r6, r6, #1
+  ; _y++
+  ADD          r1, r1, #2
+  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
+  BGT celt_pitch_xcorr_neon_process_remaining
+celt_pitch_xcorr_neon_done
+  VMOV.32      r0, d30[0]
+  LDMFD        sp!, {r4-r6, pc}
+  ENDP
+
+ENDIF
+
+IF OPUS_ARM_MAY_HAVE_EDSP
+
+; This will get used on ARMv7 devices without NEON, so it has been optimized
+; to take advantage of dual-issuing where possible.
+xcorr_kernel_edsp PROC
+  ; input:
+  ;   r3      = int         len
+  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
+  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
+  ;   r6...r9 = opus_val32  sum[4]
+  ; output:
+  ;   r6...r9 = opus_val32  sum[4]
+  ; preserved: r0-r5
+  ; internal usage
+  ;   r2      = int         j
+  ;   r12,r14 = opus_val16  x[4]
+  ;   r10,r11 = opus_val16  y[4]
+  STMFD        sp!, {r2,r4,r5,lr}
+  LDR          r10, [r5], #4      ; Load y[0...1]
+  SUBS         r2, r3, #4         ; j = len-4
+  LDR          r11, [r5], #4      ; Load y[2...3]
+  BLE xcorr_kernel_edsp_process4_done
+  LDR          r12, [r4], #4      ; Load x[0...1]
+  ; Stall
+xcorr_kernel_edsp_process4
+  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
+  ; other. Every other instruction here dual-issues with a multiply, and is
+  ; thus "free". There should be no stalls in the body of the loop.
+  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
+  LDR          r14, [r4], #4      ; Load x[2...3]
+  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
+  SUBS         r2, r2, #4         ; j-=4
+  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
+  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
+  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
+  LDR          r10, [r5], #4      ; Load y[4...5]
+  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
+  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
+  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
+  LDRGT        r12, [r4], #4      ; Load x[0...1]
+  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
+  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
+  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
+  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
+  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
+  LDR          r11, [r5], #4      ; Load y[6...7]
+  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
+  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
+  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
+  BGT xcorr_kernel_edsp_process4
+xcorr_kernel_edsp_process4_done
+  ADDS         r2, r2, #4
+  BLE xcorr_kernel_edsp_done
+  LDRH         r12, [r4], #2      ; r12 = *x++
+  SUBS         r2, r2, #1         ; j--
+  ; Stall
+  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
+  LDRGTH       r14, [r4], #2      ; r14 = *x++
+  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
+  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
+  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
+  SUBS         r2, r2, #1         ; j--
+  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
+  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
+  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
+  LDRGTH       r12, [r4], #2      ; r12 = *x++
+  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
+  BLE xcorr_kernel_edsp_done
+  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
+  CMP          r2, #1             ; j--
+  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
+  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
+  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
+  LDRGTH       r14, [r4]          ; r14 = *x
+  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
+  LDRH         r11, [r5]          ; r11 = y_6 = *y
+  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
+  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
+  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
+xcorr_kernel_edsp_done
+  LDMFD        sp!, {r2,r4,r5,pc}
+  ENDP
+
+celt_pitch_xcorr_edsp PROC
+  ; input:
+  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
+  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
+  ;   r2  = opus_val32 *xcorr
+  ;   r3  = int         len
+  ; output:
+  ;   r0  = maxcorr
+  ; internal usage
+  ;   r4  = opus_val16 *x
+  ;   r5  = opus_val16 *y
+  ;   r6  = opus_val32  sum0
+  ;   r7  = opus_val32  sum1
+  ;   r8  = opus_val32  sum2
+  ;   r9  = opus_val32  sum3
+  ;   r1  = int         max_pitch
+  ;   r12 = int         j
+  STMFD        sp!, {r4-r11, lr}
+  MOV          r5, r1
+  LDR          r1, [sp, #36]
+  MOV          r4, r0
+  TST          r5, #3
+  ; maxcorr = 1
+  MOV          r0, #1
+  BEQ          celt_pitch_xcorr_edsp_process1u_done
+; Compute one sum at the start to make y 32-bit aligned.
+  SUBS         r12, r3, #4
+  ; r14 = sum = 0
+  MOV          r14, #0
+  LDRH         r8, [r5], #2
+  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
+  LDR          r6, [r4], #4
+  MOV          r8, r8, LSL #16
+celt_pitch_xcorr_edsp_process1u_loop4
+  LDR          r9, [r5], #4
+  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
+  LDRGT        r6, [r4], #4
+  BGT celt_pitch_xcorr_edsp_process1u_loop4
+  MOV          r8, r8, LSR #16
+celt_pitch_xcorr_edsp_process1u_loop4_done
+  ADDS         r12, r12, #4
+celt_pitch_xcorr_edsp_process1u_loop1
+  LDRGEH       r6, [r4], #2
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
+  SUBGES       r12, r12, #1
+  LDRGTH       r8, [r5], #2
+  BGT celt_pitch_xcorr_edsp_process1u_loop1
+  ; Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ; Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ; maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ADD          r5, r5, #2
+  MOVLT        r0, r14
+  SUBS         r1, r1, #1
+  ; xcorr[i] = sum
+  STR          r14, [r2], #4
+  BLE celt_pitch_xcorr_edsp_done
+celt_pitch_xcorr_edsp_process1u_done
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
+  SUBS         r1, r1, #4
+  BLT celt_pitch_xcorr_edsp_process2
+celt_pitch_xcorr_edsp_process4
+  ; xcorr_kernel_edsp parameters:
+  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
+  MOV          r6, #0
+  MOV          r7, #0
+  MOV          r8, #0
+  MOV          r9, #0
+  BL xcorr_kernel_edsp  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
+  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
+  CMP          r0, r6
+  ; _y+=4
+  ADD          r5, r5, #8
+  MOVLT        r0, r6
+  CMP          r0, r7
+  MOVLT        r0, r7
+  CMP          r0, r8
+  MOVLT        r0, r8
+  CMP          r0, r9
+  MOVLT        r0, r9
+  STMIA        r2!, {r6-r9}
+  SUBS         r1, r1, #4
+  BGE celt_pitch_xcorr_edsp_process4
+celt_pitch_xcorr_edsp_process2
+  ADDS         r1, r1, #2
+  BLT celt_pitch_xcorr_edsp_process1a
+  SUBS         r12, r3, #4
+  ; {r10, r11} = {sum0, sum1} = {0, 0}
+  MOV          r10, #0
+  MOV          r11, #0
+  LDR          r8, [r5], #4
+  BLE celt_pitch_xcorr_edsp_process2_loop_done
+  LDR          r6, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process2_loop4
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+  LDRGT        r6, [r4], #4
+  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
+  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
+  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
+  LDRGT        r9, [r5], #4
+  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
+  BGT celt_pitch_xcorr_edsp_process2_loop4
+celt_pitch_xcorr_edsp_process2_loop_done
+  ADDS         r12, r12, #2
+  BLE  celt_pitch_xcorr_edsp_process2_1
+  LDR          r6, [r4], #4
+  ; Stall
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r9, [r5], #4
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  SUB          r12, r12, #2
+  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  MOV          r8, r9
+  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+celt_pitch_xcorr_edsp_process2_1
+  LDRH         r6, [r4], #2
+  ADDS         r12, r12, #1
+  ; Stall
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDRGTH       r7, [r4], #2
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  BLE celt_pitch_xcorr_edsp_process2_done
+  LDRH         r9, [r5], #2
+  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
+  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
+celt_pitch_xcorr_edsp_process2_done
+  ; Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ; Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ; maxcorr = max(maxcorr, sum0)
+  CMP          r0, r10
+  ADD          r5, r5, #2
+  MOVLT        r0, r10
+  SUB          r1, r1, #2
+  ; maxcorr = max(maxcorr, sum1)
+  CMP          r0, r11
+  ; xcorr[i] = sum
+  STR          r10, [r2], #4
+  MOVLT        r0, r11
+  STR          r11, [r2], #4
+celt_pitch_xcorr_edsp_process1a
+  ADDS         r1, r1, #1
+  BLT celt_pitch_xcorr_edsp_done
+  SUBS         r12, r3, #4
+  ; r14 = sum = 0
+  MOV          r14, #0
+  BLT celt_pitch_xcorr_edsp_process1a_loop_done
+  LDR          r6, [r4], #4
+  LDR          r8, [r5], #4
+  LDR          r7, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process1a_loop4
+  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  LDRGE        r6, [r4], #4
+  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
+  LDRGE        r8, [r5], #4
+  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
+  LDRGE        r7, [r4], #4
+  LDRGE        r9, [r5], #4
+  BGE celt_pitch_xcorr_edsp_process1a_loop4
+celt_pitch_xcorr_edsp_process1a_loop_done
+  ADDS         r12, r12, #2
+  LDRGE        r6, [r4], #4
+  LDRGE        r8, [r5], #4
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  SUBGE        r12, r12, #2
+  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  ADDS         r12, r12, #1
+  LDRGEH       r6, [r4], #2
+  LDRGEH       r8, [r5], #2
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
+  ; maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ; xcorr[i] = sum
+  STR          r14, [r2], #4
+  MOVLT        r0, r14
+celt_pitch_xcorr_edsp_done
+  LDMFD        sp!, {r4-r11, pc}
+  ENDP
+
+ENDIF
+
+END

+ 76 - 0
drivers/opus/celt/arm/fixed_armv4.h

@@ -0,0 +1,76 @@
+/* Copyright (C) 2013 Xiph.Org Foundation and contributors */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARMv4_H
+#define FIXED_ARMv4_H
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
+#undef MULT16_32_Q16
+static OPUS_INLINE opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+      "#MULT16_32_Q16\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(b),"r"(a<<16)
+  );
+  return rd_hi;
+}
+#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv4(a, b))
+
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
+#undef MULT16_32_Q15
+static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+      "#MULT16_32_Q15\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(b), "r"(a<<16)
+  );
+  /*We intentionally don't OR in the high bit of rd_lo for speed.*/
+  return rd_hi<<1;
+}
+#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b))
+
+
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q15
+#define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))
+
+
+/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
+#undef MULT32_32_Q31
+#define MULT32_32_Q31(a,b) (opus_val32)((((opus_int64)(a)) * ((opus_int64)(b)))>>31)
+
+#endif

+ 116 - 0
drivers/opus/celt/arm/fixed_armv5e.h

@@ -0,0 +1,116 @@
+/* Copyright (C) 2007-2009 Xiph.Org Foundation
+   Copyright (C) 2003-2008 Jean-Marc Valin
+   Copyright (C) 2007-2008 CSIRO
+   Copyright (C) 2013      Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARMv5E_H
+#define FIXED_ARMv5E_H
+
+#include "fixed_armv4.h"
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
+#undef MULT16_32_Q16
+static OPUS_INLINE opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b)
+{
+  int res;
+  __asm__(
+      "#MULT16_32_Q16\n\t"
+      "smulwb %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(b),"r"(a)
+  );
+  return res;
+}
+#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv5e(a, b))
+
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
+#undef MULT16_32_Q15
+static OPUS_INLINE opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
+{
+  int res;
+  __asm__(
+      "#MULT16_32_Q15\n\t"
+      "smulwb %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(b), "r"(a)
+  );
+  return res<<1;
+}
+#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
+
+
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q15
+static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
+ opus_val32 b)
+{
+  int res;
+  __asm__(
+      "#MAC16_32_Q15\n\t"
+      "smlawb %0, %1, %2, %3;\n"
+      : "=r"(res)
+      : "r"(b<<1), "r"(a), "r"(c)
+  );
+  return res;
+}
+#define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))
+
+/** 16x16 multiply-add where the result fits in 32 bits */
+#undef MAC16_16
+static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
+ opus_val16 b)
+{
+  int res;
+  __asm__(
+      "#MAC16_16\n\t"
+      "smlabb %0, %1, %2, %3;\n"
+      : "=r"(res)
+      : "r"(a), "r"(b), "r"(c)
+  );
+  return res;
+}
+#define MAC16_16(c, a, b) (MAC16_16_armv5e(c, a, b))
+
+/** 16x16 multiplication where the result fits in 32 bits */
+#undef MULT16_16
+static OPUS_INLINE opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b)
+{
+  int res;
+  __asm__(
+      "#MULT16_16\n\t"
+      "smulbb %0, %1, %2;\n"
+      : "=r"(res)
+      : "r"(a), "r"(b)
+  );
+  return res;
+}
+#define MULT16_16(a, b) (MULT16_16_armv5e(a, b))
+
+#endif

+ 121 - 0
drivers/opus/celt/arm/kiss_fft_armv4.h

@@ -0,0 +1,121 @@
+/*Copyright (c) 2013, Xiph.Org Foundation and contributors.
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifndef KISS_FFT_ARMv4_H
+#define KISS_FFT_ARMv4_H
+
+#if !defined(KISS_FFT_GUTS_H)
+#error "This file should only be included from _kiss_fft_guts.h"
+#endif
+
+#ifdef OPUS_FIXED_POINT
+
+#undef C_MUL
+#define C_MUL(m,a,b) \
+    do{ \
+       int br__; \
+       int bi__; \
+       int tt__; \
+        __asm__ __volatile__( \
+            "#C_MUL\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mi], r1, %[br]\n\t" \
+            "smlal %[tt], %[mi], r0, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull %[br], %[mr], r0, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #15\n\t" \
+            "smlal %[br], %[mr], r1, %[bi]\n\t" \
+            "orr %[mi], %[tt], %[mi], lsl #17\n\t" \
+            "mov %[br], %[br], lsr #15\n\t" \
+            "orr %[mr], %[br], %[mr], lsl #17\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+#undef C_MUL4
+#define C_MUL4(m,a,b) \
+    do{ \
+       int br__; \
+       int bi__; \
+       int tt__; \
+        __asm__ __volatile__( \
+            "#C_MUL4\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mi], r1, %[br]\n\t" \
+            "smlal %[tt], %[mi], r0, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull %[br], %[mr], r0, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #17\n\t" \
+            "smlal %[br], %[mr], r1, %[bi]\n\t" \
+            "orr %[mi], %[tt], %[mi], lsl #15\n\t" \
+            "mov %[br], %[br], lsr #17\n\t" \
+            "orr %[mr], %[br], %[mr], lsl #15\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+#undef C_MULC
+#define C_MULC(m,a,b) \
+    do{ \
+       int br__; \
+       int bi__; \
+       int tt__; \
+        __asm__ __volatile__( \
+            "#C_MULC\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mr], r0, %[br]\n\t" \
+            "smlal %[tt], %[mr], r1, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull %[br], %[mi], r1, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #15\n\t" \
+            "smlal %[br], %[mi], r0, %[bi]\n\t" \
+            "orr %[mr], %[tt], %[mr], lsl #17\n\t" \
+            "mov %[br], %[br], lsr #15\n\t" \
+            "orr %[mi], %[br], %[mi], lsl #17\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+#endif /* OPUS_FIXED_POINT */
+
+#endif /* KISS_FFT_ARMv4_H */

+ 118 - 0
drivers/opus/celt/arm/kiss_fft_armv5e.h

@@ -0,0 +1,118 @@
+/*Copyright (c) 2013, Xiph.Org Foundation and contributors.
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifndef KISS_FFT_ARMv5E_H
+#define KISS_FFT_ARMv5E_H
+
+#if !defined(KISS_FFT_GUTS_H)
+#error "This file should only be included from _kiss_fft_guts.h"
+#endif
+
+#ifdef OPUS_FIXED_POINT
+
+#if defined(__thumb__)||defined(__thumb2__)
+#define LDRD_CONS "Q"
+#else
+#define LDRD_CONS "Uq"
+#endif
+
+#undef C_MUL
+#define C_MUL(m,a,b) \
+    do{ \
+        int mr1__; \
+        int mr2__; \
+        int mi__; \
+        long long aval__; \
+        int bval__; \
+        __asm__( \
+            "#C_MUL\n\t" \
+            "ldrd %[aval], %H[aval], %[ap]\n\t" \
+            "ldr %[bval], %[bp]\n\t" \
+            "smulwb %[mi], %H[aval], %[bval]\n\t" \
+            "smulwb %[mr1], %[aval], %[bval]\n\t" \
+            "smulwt %[mr2], %H[aval], %[bval]\n\t" \
+            "smlawt %[mi], %[aval], %[bval], %[mi]\n\t" \
+            : [mr1]"=r"(mr1__), [mr2]"=r"(mr2__), [mi]"=r"(mi__), \
+              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHL32(SUB32(mr1__, mr2__), 1); \
+        (m).i = SHL32(mi__, 1); \
+    } \
+    while(0)
+
+#undef C_MUL4
+#define C_MUL4(m,a,b) \
+    do{ \
+        int mr1__; \
+        int mr2__; \
+        int mi__; \
+        long long aval__; \
+        int bval__; \
+        __asm__( \
+            "#C_MUL4\n\t" \
+            "ldrd %[aval], %H[aval], %[ap]\n\t" \
+            "ldr %[bval], %[bp]\n\t" \
+            "smulwb %[mi], %H[aval], %[bval]\n\t" \
+            "smulwb %[mr1], %[aval], %[bval]\n\t" \
+            "smulwt %[mr2], %H[aval], %[bval]\n\t" \
+            "smlawt %[mi], %[aval], %[bval], %[mi]\n\t" \
+            : [mr1]"=r"(mr1__), [mr2]"=r"(mr2__), [mi]"=r"(mi__), \
+              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHR32(SUB32(mr1__, mr2__), 1); \
+        (m).i = SHR32(mi__, 1); \
+    } \
+    while(0)
+
+#undef C_MULC
+#define C_MULC(m,a,b) \
+    do{ \
+        int mr__; \
+        int mi1__; \
+        int mi2__; \
+        long long aval__; \
+        int bval__; \
+        __asm__( \
+            "#C_MULC\n\t" \
+            "ldrd %[aval], %H[aval], %[ap]\n\t" \
+            "ldr %[bval], %[bp]\n\t" \
+            "smulwb %[mr], %[aval], %[bval]\n\t" \
+            "smulwb %[mi1], %H[aval], %[bval]\n\t" \
+            "smulwt %[mi2], %[aval], %[bval]\n\t" \
+            "smlawt %[mr], %H[aval], %[bval], %[mr]\n\t" \
+            : [mr]"=r"(mr__), [mi1]"=r"(mi1__), [mi2]"=r"(mi2__), \
+              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHL32(mr__, 1); \
+        (m).i = SHL32(SUB32(mi1__, mi2__), 1); \
+    } \
+    while(0)
+
+#endif /* OPUS_FIXED_POINT */
+
+#endif /* KISS_FFT_GUTS_H */

+ 57 - 0
drivers/opus/celt/arm/pitch_arm.h

@@ -0,0 +1,57 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(PITCH_ARM_H)
+# define PITCH_ARM_H
+
+# include "armcpu.h"
+
+# if defined(OPUS_FIXED_POINT)
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
+    opus_val32 *xcorr, int len, int max_pitch);
+#  endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+#   define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr)
+#  endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_EDSP)
+opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
+    opus_val32 *xcorr, int len, int max_pitch);
+#  endif
+
+#  if !defined(OPUS_HAVE_RTCD)
+#   define OVERRIDE_PITCH_XCORR (1)
+#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
+#  endif
+
+# endif
+
+#endif

+ 1518 - 0
drivers/opus/celt/bands.c

@@ -0,0 +1,1518 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2008-2009 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include <math.h>
+#include "bands.h"
+#include "opus_modes.h"
+#include "vq.h"
+#include "cwrs.h"
+#include "stack_alloc.h"
+#include "os_support.h"
+#include "mathops.h"
+#include "rate.h"
+#include "quant_bands.h"
+#include "pitch.h"
+
+int hysteresis_decision(opus_val16 val, const opus_val16 *thresholds, const opus_val16 *hysteresis, int N, int prev)
+{
+   int i;
+   for (i=0;i<N;i++)
+   {
+      if (val < thresholds[i])
+         break;
+   }
+   if (i>prev && val < thresholds[prev]+hysteresis[prev])
+      i=prev;
+   if (i<prev && val > thresholds[prev-1]-hysteresis[prev-1])
+      i=prev;
+   return i;
+}
+
+opus_uint32 celt_lcg_rand(opus_uint32 seed)
+{
+   return 1664525 * seed + 1013904223;
+}
+
+/* This is a cos() approximation designed to be bit-exact on any platform. Bit exactness
+   with this approximation is important because it has an impact on the bit allocation */
+static opus_int16 bitexact_cos(opus_int16 x)
+{
+   opus_int32 tmp;
+   opus_int16 x2;
+   tmp = (4096+((opus_int32)(x)*(x)))>>13;
+   celt_assert(tmp<=32767);
+   x2 = tmp;
+   x2 = (32767-x2) + FRAC_MUL16(x2, (-7651 + FRAC_MUL16(x2, (8277 + FRAC_MUL16(-626, x2)))));
+   celt_assert(x2<=32766);
+   return 1+x2;
+}
+
+static int bitexact_log2tan(int isin,int icos)
+{
+   int lc;
+   int ls;
+   lc=EC_ILOG(icos);
+   ls=EC_ILOG(isin);
+   icos<<=15-lc;
+   isin<<=15-ls;
+   return (ls-lc)*(1<<11)
+         +FRAC_MUL16(isin, FRAC_MUL16(isin, -2597) + 7932)
+         -FRAC_MUL16(icos, FRAC_MUL16(icos, -2597) + 7932);
+}
+
+#ifdef OPUS_FIXED_POINT
+/* Compute the amplitude (sqrt energy) in each of the bands */
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M)
+{
+   int i, c, N;
+   const opus_int16 *eBands = m->eBands;
+   N = M*m->shortMdctSize;
+   c=0; do {
+      for (i=0;i<end;i++)
+      {
+         int j;
+         opus_val32 maxval=0;
+         opus_val32 sum = 0;
+
+         j=M*eBands[i]; do {
+            maxval = MAX32(maxval, X[j+c*N]);
+            maxval = MAX32(maxval, -X[j+c*N]);
+         } while (++j<M*eBands[i+1]);
+
+         if (maxval > 0)
+         {
+            int shift = celt_ilog2(maxval)-10;
+            j=M*eBands[i]; do {
+               sum = MAC16_16(sum, EXTRACT16(VSHR32(X[j+c*N],shift)),
+                                   EXTRACT16(VSHR32(X[j+c*N],shift)));
+            } while (++j<M*eBands[i+1]);
+            /* We're adding one here to ensure the normalized band isn't larger than unity norm */
+            bandE[i+c*m->nbEBands] = EPSILON+VSHR32(EXTEND32(celt_sqrt(sum)),-shift);
+         } else {
+            bandE[i+c*m->nbEBands] = EPSILON;
+         }
+         /*printf ("%f ", bandE[i+c*m->nbEBands]);*/
+      }
+   } while (++c<C);
+   /*printf ("\n");*/
+}
+
+/* Normalise each band such that the energy is one. */
+void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, celt_norm * OPUS_RESTRICT X, const celt_ener *bandE, int end, int C, int M)
+{
+   int i, c, N;
+   const opus_int16 *eBands = m->eBands;
+   N = M*m->shortMdctSize;
+   c=0; do {
+      i=0; do {
+         opus_val16 g;
+         int j,shift;
+         opus_val16 E;
+         shift = celt_zlog2(bandE[i+c*m->nbEBands])-13;
+         E = VSHR32(bandE[i+c*m->nbEBands], shift);
+         g = EXTRACT16(celt_rcp(SHL32(E,3)));
+         j=M*eBands[i]; do {
+            X[j+c*N] = MULT16_16_Q15(VSHR32(freq[j+c*N],shift-1),g);
+         } while (++j<M*eBands[i+1]);
+      } while (++i<end);
+   } while (++c<C);
+}
+
+#else /* OPUS_FIXED_POINT */
+/* Compute the amplitude (sqrt energy) in each of the bands */
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M)
+{
+   int i, c, N;
+   const opus_int16 *eBands = m->eBands;
+   N = M*m->shortMdctSize;
+   c=0; do {
+      for (i=0;i<end;i++)
+      {
+         int j;
+         opus_val32 sum = 1e-27f;
+         for (j=M*eBands[i];j<M*eBands[i+1];j++)
+            sum += X[j+c*N]*X[j+c*N];
+         bandE[i+c*m->nbEBands] = celt_sqrt(sum);
+         /*printf ("%f ", bandE[i+c*m->nbEBands]);*/
+      }
+   } while (++c<C);
+   /*printf ("\n");*/
+}
+
+/* Normalise each band such that the energy is one. */
+void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, celt_norm * OPUS_RESTRICT X, const celt_ener *bandE, int end, int C, int M)
+{
+   int i, c, N;
+   const opus_int16 *eBands = m->eBands;
+   N = M*m->shortMdctSize;
+   c=0; do {
+      for (i=0;i<end;i++)
+      {
+         int j;
+         opus_val16 g = 1.f/(1e-27f+bandE[i+c*m->nbEBands]);
+         for (j=M*eBands[i];j<M*eBands[i+1];j++)
+            X[j+c*N] = freq[j+c*N]*g;
+      }
+   } while (++c<C);
+}
+
+#endif /* OPUS_FIXED_POINT */
+
+/* De-normalise the energy to produce the synthesis from the unit-energy bands */
+void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
+      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandLogE, int start, int end, int C, int M)
+{
+   int i, c, N;
+   const opus_int16 *eBands = m->eBands;
+   N = M*m->shortMdctSize;
+   celt_assert2(C<=2, "denormalise_bands() not implemented for >2 channels");
+   c=0; do {
+      celt_sig * OPUS_RESTRICT f;
+      const celt_norm * OPUS_RESTRICT x;
+      f = freq+c*N;
+      x = X+c*N+M*eBands[start];
+      for (i=0;i<M*eBands[start];i++)
+         *f++ = 0;
+      for (i=start;i<end;i++)
+      {
+         int j, band_end;
+         opus_val16 g;
+         opus_val16 lg;
+#ifdef OPUS_FIXED_POINT
+         int shift;
+#endif
+         j=M*eBands[i];
+         band_end = M*eBands[i+1];
+         lg = ADD16(bandLogE[i+c*m->nbEBands], SHL16((opus_val16)eMeans[i],6));
+#ifndef OPUS_FIXED_POINT
+         g = celt_exp2(lg);
+#else
+         /* Handle the integer part of the log energy */
+         shift = 16-(lg>>DB_SHIFT);
+         if (shift>31)
+         {
+            shift=0;
+            g=0;
+         } else {
+            /* Handle the fractional part. */
+            g = celt_exp2_frac(lg&((1<<DB_SHIFT)-1));
+         }
+         /* Handle extreme gains with negative shift. */
+         if (shift<0)
+         {
+            /* For shift < -2 we'd be likely to overflow, so we're capping
+               the gain here. This shouldn't happen unless the bitstream is
+               already corrupted. */
+            if (shift < -2)
+            {
+               g = 32767;
+               shift = -2;
+            }
+            do {
+               *f++ = SHL32(MULT16_16(*x++, g), -shift);
+            } while (++j<band_end);
+         } else
+#endif
+         /* Be careful of the fixed-point "else" just above when changing this code */
+         do {
+            *f++ = SHR32(MULT16_16(*x++, g), shift);
+         } while (++j<band_end);
+      }
+      celt_assert(start <= end);
+      for (i=M*eBands[end];i<N;i++)
+         *f++ = 0;
+   } while (++c<C);
+}
+
+/* This prevents energy collapse for transients with multiple short MDCTs */
+void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
+      int start, int end, opus_val16 *logE, opus_val16 *prev1logE,
+      opus_val16 *prev2logE, int *pulses, opus_uint32 seed)
+{
+   int c, i, j, k;
+   for (i=start;i<end;i++)
+   {
+      int N0;
+      opus_val16 thresh, sqrt_1;
+      int depth;
+#ifdef OPUS_FIXED_POINT
+      int shift;
+      opus_val32 thresh32;
+#endif
+
+      N0 = m->eBands[i+1]-m->eBands[i];
+      /* depth in 1/8 bits */
+      depth = (1+pulses[i])/((m->eBands[i+1]-m->eBands[i])<<LM);
+
+#ifdef OPUS_FIXED_POINT
+      thresh32 = SHR32(celt_exp2(-SHL16(depth, 10-BITRES)),1);
+      thresh = MULT16_32_Q15(QCONST16(0.5f, 15), MIN32(32767,thresh32));
+      {
+         opus_val32 t;
+         t = N0<<LM;
+         shift = celt_ilog2(t)>>1;
+         t = SHL32(t, (7-shift)<<1);
+         sqrt_1 = celt_rsqrt_norm(t);
+      }
+#else
+      thresh = .5f*celt_exp2(-.125f*depth);
+      sqrt_1 = celt_rsqrt(N0<<LM);
+#endif
+
+      c=0; do
+      {
+         celt_norm *X;
+         opus_val16 prev1;
+         opus_val16 prev2;
+         opus_val32 Ediff;
+         opus_val16 r;
+         int renormalize=0;
+         prev1 = prev1logE[c*m->nbEBands+i];
+         prev2 = prev2logE[c*m->nbEBands+i];
+         if (C==1)
+         {
+            prev1 = MAX16(prev1,prev1logE[m->nbEBands+i]);
+            prev2 = MAX16(prev2,prev2logE[m->nbEBands+i]);
+         }
+         Ediff = EXTEND32(logE[c*m->nbEBands+i])-EXTEND32(MIN16(prev1,prev2));
+         Ediff = MAX32(0, Ediff);
+
+#ifdef OPUS_FIXED_POINT
+         if (Ediff < 16384)
+         {
+            opus_val32 r32 = SHR32(celt_exp2(-EXTRACT16(Ediff)),1);
+            r = 2*MIN16(16383,r32);
+         } else {
+            r = 0;
+         }
+         if (LM==3)
+            r = MULT16_16_Q14(23170, MIN32(23169, r));
+         r = SHR16(MIN16(thresh, r),1);
+         r = SHR32(MULT16_16_Q15(sqrt_1, r),shift);
+#else
+         /* r needs to be multiplied by 2 or 2*sqrt(2) depending on LM because
+            short blocks don't have the same energy as long */
+         r = 2.f*celt_exp2(-Ediff);
+         if (LM==3)
+            r *= 1.41421356f;
+         r = MIN16(thresh, r);
+         r = r*sqrt_1;
+#endif
+         X = X_+c*size+(m->eBands[i]<<LM);
+         for (k=0;k<1<<LM;k++)
+         {
+            /* Detect collapse */
+            if (!(collapse_masks[i*C+c]&1<<k))
+            {
+               /* Fill with noise */
+               for (j=0;j<N0;j++)
+               {
+                  seed = celt_lcg_rand(seed);
+                  X[(j<<LM)+k] = (seed&0x8000 ? r : -r);
+               }
+               renormalize = 1;
+            }
+         }
+         /* We just added some energy, so we need to renormalise */
+         if (renormalize)
+            renormalise_vector(X, N0<<LM, Q15ONE);
+      } while (++c<C);
+   }
+}
+
+static void intensity_stereo(const CELTMode *m, celt_norm *X, celt_norm *Y, const celt_ener *bandE, int bandID, int N)
+{
+   int i = bandID;
+   int j;
+   opus_val16 a1, a2;
+   opus_val16 left, right;
+   opus_val16 norm;
+#ifdef OPUS_FIXED_POINT
+   int shift = celt_zlog2(MAX32(bandE[i], bandE[i+m->nbEBands]))-13;
+#endif
+   left = VSHR32(bandE[i],shift);
+   right = VSHR32(bandE[i+m->nbEBands],shift);
+   norm = EPSILON + celt_sqrt(EPSILON+MULT16_16(left,left)+MULT16_16(right,right));
+   a1 = DIV32_16(SHL32(EXTEND32(left),14),norm);
+   a2 = DIV32_16(SHL32(EXTEND32(right),14),norm);
+   for (j=0;j<N;j++)
+   {
+      celt_norm r, l;
+      l = X[j];
+      r = Y[j];
+      X[j] = MULT16_16_Q14(a1,l) + MULT16_16_Q14(a2,r);
+      /* Side is not encoded, no need to calculate */
+   }
+}
+
+static void stereo_split(celt_norm *X, celt_norm *Y, int N)
+{
+   int j;
+   for (j=0;j<N;j++)
+   {
+      celt_norm r, l;
+      l = MULT16_16_Q15(QCONST16(.70710678f,15), X[j]);
+      r = MULT16_16_Q15(QCONST16(.70710678f,15), Y[j]);
+      X[j] = l+r;
+      Y[j] = r-l;
+   }
+}
+
+static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N)
+{
+   int j;
+   opus_val32 xp=0, side=0;
+   opus_val32 El, Er;
+   opus_val16 mid2;
+#ifdef OPUS_FIXED_POINT
+   int kl, kr;
+#endif
+   opus_val32 t, lgain, rgain;
+
+   /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
+   dual_inner_prod(Y, X, Y, N, &xp, &side);
+   /* Compensating for the mid normalization */
+   xp = MULT16_32_Q15(mid, xp);
+   /* mid and side are in Q15, not Q14 like X and Y */
+   mid2 = SHR32(mid, 1);
+   El = MULT16_16(mid2, mid2) + side - 2*xp;
+   Er = MULT16_16(mid2, mid2) + side + 2*xp;
+   if (Er < QCONST32(6e-4f, 28) || El < QCONST32(6e-4f, 28))
+   {
+      for (j=0;j<N;j++)
+         Y[j] = X[j];
+      return;
+   }
+
+#ifdef OPUS_FIXED_POINT
+   kl = celt_ilog2(El)>>1;
+   kr = celt_ilog2(Er)>>1;
+#endif
+   t = VSHR32(El, (kl-7)<<1);
+   lgain = celt_rsqrt_norm(t);
+   t = VSHR32(Er, (kr-7)<<1);
+   rgain = celt_rsqrt_norm(t);
+
+#ifdef OPUS_FIXED_POINT
+   if (kl < 7)
+      kl = 7;
+   if (kr < 7)
+      kr = 7;
+#endif
+
+   for (j=0;j<N;j++)
+   {
+      celt_norm r, l;
+      /* Apply mid scaling (side is already scaled) */
+      l = MULT16_16_Q15(mid, X[j]);
+      r = Y[j];
+      X[j] = EXTRACT16(PSHR32(MULT16_16(lgain, SUB16(l,r)), kl+1));
+      Y[j] = EXTRACT16(PSHR32(MULT16_16(rgain, ADD16(l,r)), kr+1));
+   }
+}
+
+/* Decide whether we should spread the pulses in the current frame */
+int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
+      int last_decision, int *hf_average, int *tapset_decision, int update_hf,
+      int end, int C, int M)
+{
+   int i, c, N0;
+   int sum = 0, nbBands=0;
+   const opus_int16 * OPUS_RESTRICT eBands = m->eBands;
+   int decision;
+   int hf_sum=0;
+
+   celt_assert(end>0);
+
+   N0 = M*m->shortMdctSize;
+
+   if (M*(eBands[end]-eBands[end-1]) <= 8)
+      return SPREAD_NONE;
+   c=0; do {
+      for (i=0;i<end;i++)
+      {
+         int j, N, tmp=0;
+         int tcount[3] = {0,0,0};
+         celt_norm * OPUS_RESTRICT x = X+M*eBands[i]+c*N0;
+         N = M*(eBands[i+1]-eBands[i]);
+         if (N<=8)
+            continue;
+         /* Compute rough CDF of |x[j]| */
+         for (j=0;j<N;j++)
+         {
+            opus_val32 x2N; /* Q13 */
+
+            x2N = MULT16_16(MULT16_16_Q15(x[j], x[j]), N);
+            if (x2N < QCONST16(0.25f,13))
+               tcount[0]++;
+            if (x2N < QCONST16(0.0625f,13))
+               tcount[1]++;
+            if (x2N < QCONST16(0.015625f,13))
+               tcount[2]++;
+         }
+
+         /* Only include four last bands (8 kHz and up) */
+         if (i>m->nbEBands-4)
+            hf_sum += 32*(tcount[1]+tcount[0])/N;
+         tmp = (2*tcount[2] >= N) + (2*tcount[1] >= N) + (2*tcount[0] >= N);
+         sum += tmp*256;
+         nbBands++;
+      }
+   } while (++c<C);
+
+   if (update_hf)
+   {
+      if (hf_sum)
+         hf_sum /= C*(4-m->nbEBands+end);
+      *hf_average = (*hf_average+hf_sum)>>1;
+      hf_sum = *hf_average;
+      if (*tapset_decision==2)
+         hf_sum += 4;
+      else if (*tapset_decision==0)
+         hf_sum -= 4;
+      if (hf_sum > 22)
+         *tapset_decision=2;
+      else if (hf_sum > 18)
+         *tapset_decision=1;
+      else
+         *tapset_decision=0;
+   }
+   /*printf("%d %d %d\n", hf_sum, *hf_average, *tapset_decision);*/
+   celt_assert(nbBands>0); /* end has to be non-zero */
+   sum /= nbBands;
+   /* Recursive averaging */
+   sum = (sum+*average)>>1;
+   *average = sum;
+   /* Hysteresis */
+   sum = (3*sum + (((3-last_decision)<<7) + 64) + 2)>>2;
+   if (sum < 80)
+   {
+      decision = SPREAD_AGGRESSIVE;
+   } else if (sum < 256)
+   {
+      decision = SPREAD_NORMAL;
+   } else if (sum < 384)
+   {
+      decision = SPREAD_LIGHT;
+   } else {
+      decision = SPREAD_NONE;
+   }
+#ifdef FUZZING
+   decision = rand()&0x3;
+   *tapset_decision=rand()%3;
+#endif
+   return decision;
+}
+
+/* Indexing table for converting from natural Hadamard to ordery Hadamard
+   This is essentially a bit-reversed Gray, on top of which we've added
+   an inversion of the order because we want the DC at the end rather than
+   the beginning. The lines are for N=2, 4, 8, 16 */
+static const int ordery_table[] = {
+       1,  0,
+       3,  0,  2,  1,
+       7,  0,  4,  3,  6,  1,  5,  2,
+      15,  0,  8,  7, 12,  3, 11,  4, 14,  1,  9,  6, 13,  2, 10,  5,
+};
+
+static void deinterleave_hadamard(celt_norm *X, int N0, int stride, int hadamard)
+{
+   int i,j;
+   VARDECL(celt_norm, tmp);
+   int N;
+   SAVE_STACK;
+   N = N0*stride;
+   ALLOC(tmp, N, celt_norm);
+   celt_assert(stride>0);
+   if (hadamard)
+   {
+      const int *ordery = ordery_table+stride-2;
+      for (i=0;i<stride;i++)
+      {
+         for (j=0;j<N0;j++)
+            tmp[ordery[i]*N0+j] = X[j*stride+i];
+      }
+   } else {
+      for (i=0;i<stride;i++)
+         for (j=0;j<N0;j++)
+            tmp[i*N0+j] = X[j*stride+i];
+   }
+   for (j=0;j<N;j++)
+      X[j] = tmp[j];
+   RESTORE_STACK;
+}
+
+static void interleave_hadamard(celt_norm *X, int N0, int stride, int hadamard)
+{
+   int i,j;
+   VARDECL(celt_norm, tmp);
+   int N;
+   SAVE_STACK;
+   N = N0*stride;
+   ALLOC(tmp, N, celt_norm);
+   if (hadamard)
+   {
+      const int *ordery = ordery_table+stride-2;
+      for (i=0;i<stride;i++)
+         for (j=0;j<N0;j++)
+            tmp[j*stride+i] = X[ordery[i]*N0+j];
+   } else {
+      for (i=0;i<stride;i++)
+         for (j=0;j<N0;j++)
+            tmp[j*stride+i] = X[i*N0+j];
+   }
+   for (j=0;j<N;j++)
+      X[j] = tmp[j];
+   RESTORE_STACK;
+}
+
+void haar1(celt_norm *X, int N0, int stride)
+{
+   int i, j;
+   N0 >>= 1;
+   for (i=0;i<stride;i++)
+      for (j=0;j<N0;j++)
+      {
+         celt_norm tmp1, tmp2;
+         tmp1 = MULT16_16_Q15(QCONST16(.70710678f,15), X[stride*2*j+i]);
+         tmp2 = MULT16_16_Q15(QCONST16(.70710678f,15), X[stride*(2*j+1)+i]);
+         X[stride*2*j+i] = tmp1 + tmp2;
+         X[stride*(2*j+1)+i] = tmp1 - tmp2;
+      }
+}
+
+static int compute_qn(int N, int b, int offset, int pulse_cap, int stereo)
+{
+   static const opus_int16 exp2_table8[8] =
+      {16384, 17866, 19483, 21247, 23170, 25267, 27554, 30048};
+   int qn, qb;
+   int N2 = 2*N-1;
+   if (stereo && N==2)
+      N2--;
+   /* The upper limit ensures that in a stereo split with itheta==16384, we'll
+       always have enough bits left over to code at least one pulse in the
+       side; otherwise it would collapse, since it doesn't get folded. */
+   qb = IMIN(b-pulse_cap-(4<<BITRES), (b+N2*offset)/N2);
+
+   qb = IMIN(8<<BITRES, qb);
+
+   if (qb<(1<<BITRES>>1)) {
+      qn = 1;
+   } else {
+      qn = exp2_table8[qb&0x7]>>(14-(qb>>BITRES));
+      qn = (qn+1)>>1<<1;
+   }
+   celt_assert(qn <= 256);
+   return qn;
+}
+
+struct band_ctx {
+   int encode;
+   const CELTMode *m;
+   int i;
+   int intensity;
+   int spread;
+   int tf_change;
+   ec_ctx *ec;
+   opus_int32 remaining_bits;
+   const celt_ener *bandE;
+   opus_uint32 seed;
+};
+
+struct split_ctx {
+   int inv;
+   int imid;
+   int iside;
+   int delta;
+   int itheta;
+   int qalloc;
+};
+
+static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
+      celt_norm *X, celt_norm *Y, int N, int *b, int B, int B0,
+      int LM,
+      int stereo, int *fill)
+{
+   int qn;
+   int itheta=0;
+   int delta;
+   int imid, iside;
+   int qalloc;
+   int pulse_cap;
+   int offset;
+   opus_int32 tell;
+   int inv=0;
+   int encode;
+   const CELTMode *m;
+   int i;
+   int intensity;
+   ec_ctx *ec;
+   const celt_ener *bandE;
+
+   encode = ctx->encode;
+   m = ctx->m;
+   i = ctx->i;
+   intensity = ctx->intensity;
+   ec = ctx->ec;
+   bandE = ctx->bandE;
+
+   /* Decide on the resolution to give to the split parameter theta */
+   pulse_cap = m->logN[i]+LM*(1<<BITRES);
+   offset = (pulse_cap>>1) - (stereo&&N==2 ? QTHETA_OFFSET_TWOPHASE : QTHETA_OFFSET);
+   qn = compute_qn(N, *b, offset, pulse_cap, stereo);
+   if (stereo && i>=intensity)
+      qn = 1;
+   if (encode)
+   {
+      /* theta is the atan() of the ratio between the (normalized)
+         side and mid. With just that parameter, we can re-scale both
+         mid and side because we know that 1) they have unit norm and
+         2) they are orthogonal. */
+      itheta = stereo_itheta(X, Y, stereo, N);
+   }
+   tell = ec_tell_frac(ec);
+   if (qn!=1)
+   {
+      if (encode)
+         itheta = (itheta*qn+8192)>>14;
+
+      /* Entropy coding of the angle. We use a uniform pdf for the
+         time split, a step for stereo, and a triangular one for the rest. */
+      if (stereo && N>2)
+      {
+         int p0 = 3;
+         int x = itheta;
+         int x0 = qn/2;
+         int ft = p0*(x0+1) + x0;
+         /* Use a probability of p0 up to itheta=8192 and then use 1 after */
+         if (encode)
+         {
+            ec_encode(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft);
+         } else {
+            int fs;
+            fs=ec_decode(ec,ft);
+            if (fs<(x0+1)*p0)
+               x=fs/p0;
+            else
+               x=x0+1+(fs-(x0+1)*p0);
+            ec_dec_update(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft);
+            itheta = x;
+         }
+      } else if (B0>1 || stereo) {
+         /* Uniform pdf */
+         if (encode)
+            ec_enc_uint(ec, itheta, qn+1);
+         else
+            itheta = ec_dec_uint(ec, qn+1);
+      } else {
+         int fs=1, ft;
+         ft = ((qn>>1)+1)*((qn>>1)+1);
+         if (encode)
+         {
+            int fl;
+
+            fs = itheta <= (qn>>1) ? itheta + 1 : qn + 1 - itheta;
+            fl = itheta <= (qn>>1) ? itheta*(itheta + 1)>>1 :
+             ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1);
+
+            ec_encode(ec, fl, fl+fs, ft);
+         } else {
+            /* Triangular pdf */
+            int fl=0;
+            int fm;
+            fm = ec_decode(ec, ft);
+
+            if (fm < ((qn>>1)*((qn>>1) + 1)>>1))
+            {
+               itheta = (isqrt32(8*(opus_uint32)fm + 1) - 1)>>1;
+               fs = itheta + 1;
+               fl = itheta*(itheta + 1)>>1;
+            }
+            else
+            {
+               itheta = (2*(qn + 1)
+                - isqrt32(8*(opus_uint32)(ft - fm - 1) + 1))>>1;
+               fs = qn + 1 - itheta;
+               fl = ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1);
+            }
+
+            ec_dec_update(ec, fl, fl+fs, ft);
+         }
+      }
+      itheta = (opus_int32)itheta*16384/qn;
+      if (encode && stereo)
+      {
+         if (itheta==0)
+            intensity_stereo(m, X, Y, bandE, i, N);
+         else
+            stereo_split(X, Y, N);
+      }
+      /* NOTE: Renormalising X and Y *may* help fixed-point a bit at very high rate.
+               Let's do that at higher complexity */
+   } else if (stereo) {
+      if (encode)
+      {
+         inv = itheta > 8192;
+         if (inv)
+         {
+            int j;
+            for (j=0;j<N;j++)
+               Y[j] = -Y[j];
+         }
+         intensity_stereo(m, X, Y, bandE, i, N);
+      }
+      if (*b>2<<BITRES && ctx->remaining_bits > 2<<BITRES)
+      {
+         if (encode)
+            ec_enc_bit_logp(ec, inv, 2);
+         else
+            inv = ec_dec_bit_logp(ec, 2);
+      } else
+         inv = 0;
+      itheta = 0;
+   }
+   qalloc = ec_tell_frac(ec) - tell;
+   *b -= qalloc;
+
+   if (itheta == 0)
+   {
+      imid = 32767;
+      iside = 0;
+      *fill &= (1<<B)-1;
+      delta = -16384;
+   } else if (itheta == 16384)
+   {
+      imid = 0;
+      iside = 32767;
+      *fill &= ((1<<B)-1)<<B;
+      delta = 16384;
+   } else {
+      imid = bitexact_cos((opus_int16)itheta);
+      iside = bitexact_cos((opus_int16)(16384-itheta));
+      /* This is the mid vs side allocation that minimizes squared error
+         in that band. */
+      delta = FRAC_MUL16((N-1)<<7,bitexact_log2tan(iside,imid));
+   }
+
+   sctx->inv = inv;
+   sctx->imid = imid;
+   sctx->iside = iside;
+   sctx->delta = delta;
+   sctx->itheta = itheta;
+   sctx->qalloc = qalloc;
+}
+static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b,
+      celt_norm *lowband_out)
+{
+#ifdef RESYNTH
+   int resynth = 1;
+#else
+   int resynth = !ctx->encode;
+#endif
+   int c;
+   int stereo;
+   celt_norm *x = X;
+   int encode;
+   ec_ctx *ec;
+
+   encode = ctx->encode;
+   ec = ctx->ec;
+
+   stereo = Y != NULL;
+   c=0; do {
+      int sign=0;
+      if (ctx->remaining_bits>=1<<BITRES)
+      {
+         if (encode)
+         {
+            sign = x[0]<0;
+            ec_enc_bits(ec, sign, 1);
+         } else {
+            sign = ec_dec_bits(ec, 1);
+         }
+         ctx->remaining_bits -= 1<<BITRES;
+         b-=1<<BITRES;
+      }
+      if (resynth)
+         x[0] = sign ? -NORM_SCALING : NORM_SCALING;
+      x = Y;
+   } while (++c<1+stereo);
+   if (lowband_out)
+      lowband_out[0] = SHR16(X[0],4);
+   return 1;
+}
+
+/* This function is responsible for encoding and decoding a mono partition.
+   It can split the band in two and transmit the energy difference with
+   the two half-bands. It can be called recursively so bands can end up being
+   split in 8 parts. */
+static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X,
+      int N, int b, int B, celt_norm *lowband,
+      int LM,
+      opus_val16 gain, int fill)
+{
+   const unsigned char *cache;
+   int q;
+   int curr_bits;
+   int imid=0, iside=0;
+   int B0=B;
+   opus_val16 mid=0, side=0;
+   unsigned cm=0;
+#ifdef RESYNTH
+   int resynth = 1;
+#else
+   int resynth = !ctx->encode;
+#endif
+   celt_norm *Y=NULL;
+   int encode;
+   const CELTMode *m;
+   int i;
+   int spread;
+   ec_ctx *ec;
+
+   encode = ctx->encode;
+   m = ctx->m;
+   i = ctx->i;
+   spread = ctx->spread;
+   ec = ctx->ec;
+
+   /* If we need 1.5 more bit than we can produce, split the band in two. */
+   cache = m->cache.bits + m->cache.index[(LM+1)*m->nbEBands+i];
+   if (LM != -1 && b > cache[cache[0]]+12 && N>2)
+   {
+      int mbits, sbits, delta;
+      int itheta;
+      int qalloc;
+      struct split_ctx sctx;
+      celt_norm *next_lowband2=NULL;
+      opus_int32 rebalance;
+
+      N >>= 1;
+      Y = X+N;
+      LM -= 1;
+      if (B==1)
+         fill = (fill&1)|(fill<<1);
+      B = (B+1)>>1;
+
+      compute_theta(ctx, &sctx, X, Y, N, &b, B, B0,
+            LM, 0, &fill);
+      imid = sctx.imid;
+      iside = sctx.iside;
+      delta = sctx.delta;
+      itheta = sctx.itheta;
+      qalloc = sctx.qalloc;
+#ifdef OPUS_FIXED_POINT
+      mid = imid;
+      side = iside;
+#else
+      mid = (1.f/32768)*imid;
+      side = (1.f/32768)*iside;
+#endif
+
+      /* Give more bits to low-energy MDCTs than they would otherwise deserve */
+      if (B0>1 && (itheta&0x3fff))
+      {
+         if (itheta > 8192)
+            /* Rough approximation for pre-echo masking */
+            delta -= delta>>(4-LM);
+         else
+            /* Corresponds to a forward-masking slope of 1.5 dB per 10 ms */
+            delta = IMIN(0, delta + (N<<BITRES>>(5-LM)));
+      }
+      mbits = IMAX(0, IMIN(b, (b-delta)/2));
+      sbits = b-mbits;
+      ctx->remaining_bits -= qalloc;
+
+      if (lowband)
+         next_lowband2 = lowband+N; /* >32-bit split case */
+
+      rebalance = ctx->remaining_bits;
+      if (mbits >= sbits)
+      {
+         cm = quant_partition(ctx, X, N, mbits, B,
+               lowband, LM,
+               MULT16_16_P15(gain,mid), fill);
+         rebalance = mbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=0)
+            sbits += rebalance - (3<<BITRES);
+         cm |= quant_partition(ctx, Y, N, sbits, B,
+               next_lowband2, LM,
+               MULT16_16_P15(gain,side), fill>>B)<<(B0>>1);
+      } else {
+         cm = quant_partition(ctx, Y, N, sbits, B,
+               next_lowband2, LM,
+               MULT16_16_P15(gain,side), fill>>B)<<(B0>>1);
+         rebalance = sbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=16384)
+            mbits += rebalance - (3<<BITRES);
+         cm |= quant_partition(ctx, X, N, mbits, B,
+               lowband, LM,
+               MULT16_16_P15(gain,mid), fill);
+      }
+   } else {
+      /* This is the basic no-split case */
+      q = bits2pulses(m, i, LM, b);
+      curr_bits = pulses2bits(m, i, LM, q);
+      ctx->remaining_bits -= curr_bits;
+
+      /* Ensures we can never bust the budget */
+      while (ctx->remaining_bits < 0 && q > 0)
+      {
+         ctx->remaining_bits += curr_bits;
+         q--;
+         curr_bits = pulses2bits(m, i, LM, q);
+         ctx->remaining_bits -= curr_bits;
+      }
+
+      if (q!=0)
+      {
+         int K = get_pulses(q);
+
+         /* Finally do the actual quantization */
+         if (encode)
+         {
+            cm = alg_quant(X, N, K, spread, B, ec
+#ifdef RESYNTH
+                 , gain
+#endif
+                 );
+         } else {
+            cm = alg_unquant(X, N, K, spread, B, ec, gain);
+         }
+      } else {
+         /* If there's no pulse, fill the band anyway */
+         int j;
+         if (resynth)
+         {
+            unsigned cm_mask;
+            /* B can be as large as 16, so this shift might overflow an int on a
+               16-bit platform; use a long to get defined behavior.*/
+            cm_mask = (unsigned)(1UL<<B)-1;
+            fill &= cm_mask;
+            if (!fill)
+            {
+               for (j=0;j<N;j++)
+                  X[j] = 0;
+            } else {
+               if (lowband == NULL)
+               {
+                  /* Noise */
+                  for (j=0;j<N;j++)
+                  {
+                     ctx->seed = celt_lcg_rand(ctx->seed);
+                     X[j] = (celt_norm)((opus_int32)ctx->seed>>20);
+                  }
+                  cm = cm_mask;
+               } else {
+                  /* Folded spectrum */
+                  for (j=0;j<N;j++)
+                  {
+                     opus_val16 tmp;
+                     ctx->seed = celt_lcg_rand(ctx->seed);
+                     /* About 48 dB below the "normal" folding level */
+                     tmp = QCONST16(1.0f/256, 10);
+                     tmp = (ctx->seed)&0x8000 ? tmp : -tmp;
+                     X[j] = lowband[j]+tmp;
+                  }
+                  cm = fill;
+               }
+               renormalise_vector(X, N, gain);
+            }
+         }
+      }
+   }
+
+   return cm;
+}
+
+
+/* This function is responsible for encoding and decoding a band for the mono case. */
+static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,
+      int N, int b, int B, celt_norm *lowband,
+      int LM, celt_norm *lowband_out,
+      opus_val16 gain, celt_norm *lowband_scratch, int fill)
+{
+   int N0=N;
+   int N_B=N;
+   int N_B0;
+   int B0=B;
+   int time_divide=0;
+   int recombine=0;
+   int longBlocks;
+   unsigned cm=0;
+#ifdef RESYNTH
+   int resynth = 1;
+#else
+   int resynth = !ctx->encode;
+#endif
+   int k;
+   int encode;
+   int tf_change;
+
+   encode = ctx->encode;
+   tf_change = ctx->tf_change;
+
+   longBlocks = B0==1;
+
+   N_B /= B;
+
+   /* Special case for one sample */
+   if (N==1)
+   {
+      return quant_band_n1(ctx, X, NULL, b, lowband_out);
+   }
+
+   if (tf_change>0)
+      recombine = tf_change;
+   /* Band recombining to increase frequency resolution */
+
+   if (lowband_scratch && lowband && (recombine || ((N_B&1) == 0 && tf_change<0) || B0>1))
+   {
+      int j;
+      for (j=0;j<N;j++)
+         lowband_scratch[j] = lowband[j];
+      lowband = lowband_scratch;
+   }
+
+   for (k=0;k<recombine;k++)
+   {
+      static const unsigned char bit_interleave_table[16]={
+            0,1,1,1,2,3,3,3,2,3,3,3,2,3,3,3
+      };
+      if (encode)
+         haar1(X, N>>k, 1<<k);
+      if (lowband)
+         haar1(lowband, N>>k, 1<<k);
+      fill = bit_interleave_table[fill&0xF]|bit_interleave_table[fill>>4]<<2;
+   }
+   B>>=recombine;
+   N_B<<=recombine;
+
+   /* Increasing the time resolution */
+   while ((N_B&1) == 0 && tf_change<0)
+   {
+      if (encode)
+         haar1(X, N_B, B);
+      if (lowband)
+         haar1(lowband, N_B, B);
+      fill |= fill<<B;
+      B <<= 1;
+      N_B >>= 1;
+      time_divide++;
+      tf_change++;
+   }
+   B0=B;
+   N_B0 = N_B;
+
+   /* Reorganize the samples in time order instead of frequency order */
+   if (B0>1)
+   {
+      if (encode)
+         deinterleave_hadamard(X, N_B>>recombine, B0<<recombine, longBlocks);
+      if (lowband)
+         deinterleave_hadamard(lowband, N_B>>recombine, B0<<recombine, longBlocks);
+   }
+
+   cm = quant_partition(ctx, X, N, b, B, lowband,
+         LM, gain, fill);
+
+   /* This code is used by the decoder and by the resynthesis-enabled encoder */
+   if (resynth)
+   {
+      /* Undo the sample reorganization going from time order to frequency order */
+      if (B0>1)
+         interleave_hadamard(X, N_B>>recombine, B0<<recombine, longBlocks);
+
+      /* Undo time-freq changes that we did earlier */
+      N_B = N_B0;
+      B = B0;
+      for (k=0;k<time_divide;k++)
+      {
+         B >>= 1;
+         N_B <<= 1;
+         cm |= cm>>B;
+         haar1(X, N_B, B);
+      }
+
+      for (k=0;k<recombine;k++)
+      {
+         static const unsigned char bit_deinterleave_table[16]={
+               0x00,0x03,0x0C,0x0F,0x30,0x33,0x3C,0x3F,
+               0xC0,0xC3,0xCC,0xCF,0xF0,0xF3,0xFC,0xFF
+         };
+         cm = bit_deinterleave_table[cm];
+         haar1(X, N0>>k, 1<<k);
+      }
+      B<<=recombine;
+
+      /* Scale output for later folding */
+      if (lowband_out)
+      {
+         int j;
+         opus_val16 n;
+         n = celt_sqrt(SHL32(EXTEND32(N0),22));
+         for (j=0;j<N0;j++)
+            lowband_out[j] = MULT16_16_Q15(n,X[j]);
+      }
+      cm &= (1<<B)-1;
+   }
+   return cm;
+}
+
+
+/* This function is responsible for encoding and decoding a band for the stereo case. */
+static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
+      int N, int b, int B, celt_norm *lowband,
+      int LM, celt_norm *lowband_out,
+      celt_norm *lowband_scratch, int fill)
+{
+   int imid=0, iside=0;
+   int inv = 0;
+   opus_val16 mid=0, side=0;
+   unsigned cm=0;
+#ifdef RESYNTH
+   int resynth = 1;
+#else
+   int resynth = !ctx->encode;
+#endif
+   int mbits, sbits, delta;
+   int itheta;
+   int qalloc;
+   struct split_ctx sctx;
+   int orig_fill;
+   int encode;
+   ec_ctx *ec;
+
+   encode = ctx->encode;
+   ec = ctx->ec;
+
+   /* Special case for one sample */
+   if (N==1)
+   {
+      return quant_band_n1(ctx, X, Y, b, lowband_out);
+   }
+
+   orig_fill = fill;
+
+   compute_theta(ctx, &sctx, X, Y, N, &b, B, B,
+         LM, 1, &fill);
+   inv = sctx.inv;
+   imid = sctx.imid;
+   iside = sctx.iside;
+   delta = sctx.delta;
+   itheta = sctx.itheta;
+   qalloc = sctx.qalloc;
+#ifdef OPUS_FIXED_POINT
+   mid = imid;
+   side = iside;
+#else
+   mid = (1.f/32768)*imid;
+   side = (1.f/32768)*iside;
+#endif
+
+   /* This is a special case for N=2 that only works for stereo and takes
+      advantage of the fact that mid and side are orthogonal to encode
+      the side with just one bit. */
+   if (N==2)
+   {
+      int c;
+      int sign=0;
+      celt_norm *x2, *y2;
+      mbits = b;
+      sbits = 0;
+      /* Only need one bit for the side. */
+      if (itheta != 0 && itheta != 16384)
+         sbits = 1<<BITRES;
+      mbits -= sbits;
+      c = itheta > 8192;
+      ctx->remaining_bits -= qalloc+sbits;
+
+      x2 = c ? Y : X;
+      y2 = c ? X : Y;
+      if (sbits)
+      {
+         if (encode)
+         {
+            /* Here we only need to encode a sign for the side. */
+            sign = x2[0]*y2[1] - x2[1]*y2[0] < 0;
+            ec_enc_bits(ec, sign, 1);
+         } else {
+            sign = ec_dec_bits(ec, 1);
+         }
+      }
+      sign = 1-2*sign;
+      /* We use orig_fill here because we want to fold the side, but if
+         itheta==16384, we'll have cleared the low bits of fill. */
+      cm = quant_band(ctx, x2, N, mbits, B, lowband,
+            LM, lowband_out, Q15ONE, lowband_scratch, orig_fill);
+      /* We don't split N=2 bands, so cm is either 1 or 0 (for a fold-collapse),
+         and there's no need to worry about mixing with the other channel. */
+      y2[0] = -sign*x2[1];
+      y2[1] = sign*x2[0];
+      if (resynth)
+      {
+         celt_norm tmp;
+         X[0] = MULT16_16_Q15(mid, X[0]);
+         X[1] = MULT16_16_Q15(mid, X[1]);
+         Y[0] = MULT16_16_Q15(side, Y[0]);
+         Y[1] = MULT16_16_Q15(side, Y[1]);
+         tmp = X[0];
+         X[0] = SUB16(tmp,Y[0]);
+         Y[0] = ADD16(tmp,Y[0]);
+         tmp = X[1];
+         X[1] = SUB16(tmp,Y[1]);
+         Y[1] = ADD16(tmp,Y[1]);
+      }
+   } else {
+      /* "Normal" split code */
+      opus_int32 rebalance;
+
+      mbits = IMAX(0, IMIN(b, (b-delta)/2));
+      sbits = b-mbits;
+      ctx->remaining_bits -= qalloc;
+
+      rebalance = ctx->remaining_bits;
+      if (mbits >= sbits)
+      {
+         /* In stereo mode, we do not apply a scaling to the mid because we need the normalized
+            mid for folding later. */
+         cm = quant_band(ctx, X, N, mbits, B,
+               lowband, LM, lowband_out,
+               Q15ONE, lowband_scratch, fill);
+         rebalance = mbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=0)
+            sbits += rebalance - (3<<BITRES);
+
+         /* For a stereo split, the high bits of fill are always zero, so no
+            folding will be done to the side. */
+         cm |= quant_band(ctx, Y, N, sbits, B,
+               NULL, LM, NULL,
+               side, NULL, fill>>B);
+      } else {
+         /* For a stereo split, the high bits of fill are always zero, so no
+            folding will be done to the side. */
+         cm = quant_band(ctx, Y, N, sbits, B,
+               NULL, LM, NULL,
+               side, NULL, fill>>B);
+         rebalance = sbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=16384)
+            mbits += rebalance - (3<<BITRES);
+         /* In stereo mode, we do not apply a scaling to the mid because we need the normalized
+            mid for folding later. */
+         cm |= quant_band(ctx, X, N, mbits, B,
+               lowband, LM, lowband_out,
+               Q15ONE, lowband_scratch, fill);
+      }
+   }
+
+
+   /* This code is used by the decoder and by the resynthesis-enabled encoder */
+   if (resynth)
+   {
+      if (N!=2)
+         stereo_merge(X, Y, mid, N);
+      if (inv)
+      {
+         int j;
+         for (j=0;j<N;j++)
+            Y[j] = -Y[j];
+      }
+   }
+   return cm;
+}
+
+
+void quant_all_bands(int encode, const CELTMode *m, int start, int end,
+      celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
+      int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
+      opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int LM, int codedBands, opus_uint32 *seed)
+{
+   int i;
+   opus_int32 remaining_bits;
+   const opus_int16 * OPUS_RESTRICT eBands = m->eBands;
+   celt_norm * OPUS_RESTRICT norm, * OPUS_RESTRICT norm2;
+   VARDECL(celt_norm, _norm);
+   celt_norm *lowband_scratch;
+   int B;
+   int M;
+   int lowband_offset;
+   int update_lowband = 1;
+   int C = Y_ != NULL ? 2 : 1;
+   int norm_offset;
+#ifdef RESYNTH
+   int resynth = 1;
+#else
+   int resynth = !encode;
+#endif
+   struct band_ctx ctx;
+   SAVE_STACK;
+
+   M = 1<<LM;
+   B = shortBlocks ? M : 1;
+   norm_offset = M*eBands[start];
+   /* No need to allocate norm for the last band because we don't need an
+      output in that band. */
+   ALLOC(_norm, C*(M*eBands[m->nbEBands-1]-norm_offset), celt_norm);
+   norm = _norm;
+   norm2 = norm + M*eBands[m->nbEBands-1]-norm_offset;
+   /* We can use the last band as scratch space because we don't need that
+      scratch space for the last band. */
+   lowband_scratch = X_+M*eBands[m->nbEBands-1];
+
+   lowband_offset = 0;
+   ctx.bandE = bandE;
+   ctx.ec = ec;
+   ctx.encode = encode;
+   ctx.intensity = intensity;
+   ctx.m = m;
+   ctx.seed = *seed;
+   ctx.spread = spread;
+   for (i=start;i<end;i++)
+   {
+      opus_int32 tell;
+      int b;
+      int N;
+      opus_int32 curr_balance;
+      int effective_lowband=-1;
+      celt_norm * OPUS_RESTRICT X, * OPUS_RESTRICT Y;
+      int tf_change=0;
+      unsigned x_cm;
+      unsigned y_cm;
+      int last;
+
+      ctx.i = i;
+      last = (i==end-1);
+
+      X = X_+M*eBands[i];
+      if (Y_!=NULL)
+         Y = Y_+M*eBands[i];
+      else
+         Y = NULL;
+      N = M*eBands[i+1]-M*eBands[i];
+      tell = ec_tell_frac(ec);
+
+      /* Compute how many bits we want to allocate to this band */
+      if (i != start)
+         balance -= tell;
+      remaining_bits = total_bits-tell-1;
+      ctx.remaining_bits = remaining_bits;
+      if (i <= codedBands-1)
+      {
+         curr_balance = balance / IMIN(3, codedBands-i);
+         b = IMAX(0, IMIN(16383, IMIN(remaining_bits+1,pulses[i]+curr_balance)));
+      } else {
+         b = 0;
+      }
+
+      if (resynth && M*eBands[i]-N >= M*eBands[start] && (update_lowband || lowband_offset==0))
+            lowband_offset = i;
+
+      tf_change = tf_res[i];
+      ctx.tf_change = tf_change;
+      if (i>=m->effEBands)
+      {
+         X=norm;
+         if (Y_!=NULL)
+            Y = norm;
+         lowband_scratch = NULL;
+      }
+      if (i==end-1)
+         lowband_scratch = NULL;
+
+      /* Get a conservative estimate of the collapse_mask's for the bands we're
+         going to be folding from. */
+      if (lowband_offset != 0 && (spread!=SPREAD_AGGRESSIVE || B>1 || tf_change<0))
+      {
+         int fold_start;
+         int fold_end;
+         int fold_i;
+         /* This ensures we never repeat spectral content within one band */
+         effective_lowband = IMAX(0, M*eBands[lowband_offset]-norm_offset-N);
+         fold_start = lowband_offset;
+         while(M*eBands[--fold_start] > effective_lowband+norm_offset);
+         fold_end = lowband_offset-1;
+         while(M*eBands[++fold_end] < effective_lowband+norm_offset+N);
+         x_cm = y_cm = 0;
+         fold_i = fold_start; do {
+           x_cm |= collapse_masks[fold_i*C+0];
+           y_cm |= collapse_masks[fold_i*C+C-1];
+         } while (++fold_i<fold_end);
+      }
+      /* Otherwise, we'll be using the LCG to fold, so all blocks will (almost
+         always) be non-zero. */
+      else
+         x_cm = y_cm = (1<<B)-1;
+
+      if (dual_stereo && i==intensity)
+      {
+         int j;
+
+         /* Switch off dual stereo to do intensity. */
+         dual_stereo = 0;
+         if (resynth)
+            for (j=0;j<M*eBands[i]-norm_offset;j++)
+               norm[j] = HALF32(norm[j]+norm2[j]);
+      }
+      if (dual_stereo)
+      {
+         x_cm = quant_band(&ctx, X, N, b/2, B,
+               effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+               last?NULL:norm+M*eBands[i]-norm_offset, Q15ONE, lowband_scratch, x_cm);
+         y_cm = quant_band(&ctx, Y, N, b/2, B,
+               effective_lowband != -1 ? norm2+effective_lowband : NULL, LM,
+               last?NULL:norm2+M*eBands[i]-norm_offset, Q15ONE, lowband_scratch, y_cm);
+      } else {
+         if (Y!=NULL)
+         {
+            x_cm = quant_band_stereo(&ctx, X, Y, N, b, B,
+                  effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+                        last?NULL:norm+M*eBands[i]-norm_offset, lowband_scratch, x_cm|y_cm);
+         } else {
+            x_cm = quant_band(&ctx, X, N, b, B,
+                  effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+                        last?NULL:norm+M*eBands[i]-norm_offset, Q15ONE, lowband_scratch, x_cm|y_cm);
+         }
+         y_cm = x_cm;
+      }
+      collapse_masks[i*C+0] = (unsigned char)x_cm;
+      collapse_masks[i*C+C-1] = (unsigned char)y_cm;
+      balance += pulses[i] + tell;
+
+      /* Update the folding position only as long as we have 1 bit/sample depth. */
+      update_lowband = b>(N<<BITRES);
+   }
+   *seed = ctx.seed;
+
+   RESTORE_STACK;
+}
+

+ 114 - 0
drivers/opus/celt/bands.h

@@ -0,0 +1,114 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2008-2009 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef BANDS_H
+#define BANDS_H
+
+#include "arch.h"
+#include "opus_modes.h"
+#include "entenc.h"
+#include "entdec.h"
+#include "rate.h"
+
+/** Compute the amplitude (sqrt energy) in each of the bands
+ * @param m Mode data
+ * @param X Spectrum
+ * @param bandE Square root of the energy for each band (returned)
+ */
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M);
+
+/*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const opus_val16 *tonality, celt_ener *bandE);*/
+
+/** Normalise each band of X such that the energy in each band is
+    equal to 1
+ * @param m Mode data
+ * @param X Spectrum (returned normalised)
+ * @param bandE Square root of the energy for each band
+ */
+void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, celt_norm * OPUS_RESTRICT X, const celt_ener *bandE, int end, int C, int M);
+
+/** Denormalise each band of X to restore full amplitude
+ * @param m Mode data
+ * @param X Spectrum (returned de-normalised)
+ * @param bandE Square root of the energy for each band
+ */
+void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
+      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandE, int start, int end, int C, int M);
+
+#define SPREAD_NONE       (0)
+#define SPREAD_LIGHT      (1)
+#define SPREAD_NORMAL     (2)
+#define SPREAD_AGGRESSIVE (3)
+
+int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
+      int last_decision, int *hf_average, int *tapset_decision, int update_hf,
+      int end, int C, int M);
+
+#ifdef MEASURE_NORM_MSE
+void measure_norm_mse(const CELTMode *m, float *X, float *X0, float *bandE, float *bandE0, int M, int N, int C);
+#endif
+
+void haar1(celt_norm *X, int N0, int stride);
+
+/** Quantisation/encoding of the residual spectrum
+ * @param encode flag that indicates whether we're encoding (1) or decoding (0)
+ * @param m Mode data
+ * @param start First band to process
+ * @param end Last band to process + 1
+ * @param X Residual (normalised)
+ * @param Y Residual (normalised) for second channel (or NULL for mono)
+ * @param collapse_masks Anti-collapse tracking mask
+ * @param bandE Square root of the energy for each band
+ * @param pulses Bit allocation (per band) for PVQ
+ * @param shortBlocks Zero for long blocks, non-zero for short blocks
+ * @param spread Amount of spreading to use
+ * @param dual_stereo Zero for MS stereo, non-zero for dual stereo
+ * @param intensity First band to use intensity stereo
+ * @param tf_res Time-frequency resolution change
+ * @param total_bits Total number of bits that can be used for the frame (including the ones already spent)
+ * @param balance Number of unallocated bits
+ * @param en Entropy coder state
+ * @param LM log2() of the number of 2.5 subframes in the frame
+ * @param codedBands Last band to receive bits + 1
+ * @param seed Random generator seed
+ */
+void quant_all_bands(int encode, const CELTMode *m, int start, int end,
+      celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
+      int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
+      opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed);
+
+void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
+      int start, int end, opus_val16 *logE, opus_val16 *prev1logE,
+      opus_val16 *prev2logE, int *pulses, opus_uint32 seed);
+
+opus_uint32 celt_lcg_rand(opus_uint32 seed);
+
+int hysteresis_decision(opus_val16 val, const opus_val16 *thresholds, const opus_val16 *hysteresis, int N, int prev);
+
+#endif /* BANDS_H */

+ 223 - 0
drivers/opus/celt/celt.c

@@ -0,0 +1,223 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#define CELT_C
+
+#include "os_support.h"
+#include "mdct.h"
+#include <math.h>
+#include "celt.h"
+#include "pitch.h"
+#include "bands.h"
+#include "opus_modes.h"
+#include "entcode.h"
+#include "quant_bands.h"
+#include "rate.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "float_cast.h"
+#include <stdarg.h>
+#include "celt_lpc.h"
+#include "vq.h"
+
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "unknown"
+#endif
+
+
+int resampling_factor(opus_int32 rate)
+{
+   int ret;
+   switch (rate)
+   {
+   case 48000:
+      ret = 1;
+      break;
+   case 24000:
+      ret = 2;
+      break;
+   case 16000:
+      ret = 3;
+      break;
+   case 12000:
+      ret = 4;
+      break;
+   case 8000:
+      ret = 6;
+      break;
+   default:
+#ifndef CUSTOM_MODES
+      celt_assert(0);
+#endif
+      ret = 0;
+      break;
+   }
+   return ret;
+}
+
+#ifndef OVERRIDE_COMB_FILTER_CONST
+static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   opus_val32 x0, x1, x2, x3, x4;
+   int i;
+   x4 = x[-T-2];
+   x3 = x[-T-1];
+   x2 = x[-T];
+   x1 = x[-T+1];
+   for (i=0;i<N;i++)
+   {
+      x0=x[i-T+2];
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x2)
+               + MULT16_32_Q15(g11,ADD32(x1,x3))
+               + MULT16_32_Q15(g12,ADD32(x0,x4));
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+
+}
+#endif
+
+void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
+      opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
+      const opus_val16 *window, int overlap)
+{
+   int i;
+   /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
+   opus_val16 g00, g01, g02, g10, g11, g12;
+   opus_val32 x0, x1, x2, x3, x4;
+   static const opus_val16 gains[3][3] = {
+         {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
+         {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
+         {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
+
+   if (g0==0 && g1==0)
+   {
+      /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
+      if (x!=y)
+         OPUS_MOVE(y, x, N);
+      return;
+   }
+   g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
+   g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
+   g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
+   g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
+   g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
+   g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
+   x1 = x[-T1+1];
+   x2 = x[-T1  ];
+   x3 = x[-T1-1];
+   x4 = x[-T1-2];
+   for (i=0;i<overlap;i++)
+   {
+      opus_val16 f;
+      x0=x[i-T1+2];
+      f = MULT16_16_Q15(window[i],window[i]);
+      y[i] = x[i]
+               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g00),x[i-T0])
+               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),ADD32(x[i-T0+1],x[i-T0-1]))
+               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),ADD32(x[i-T0+2],x[i-T0-2]))
+               + MULT16_32_Q15(MULT16_16_Q15(f,g10),x2)
+               + MULT16_32_Q15(MULT16_16_Q15(f,g11),ADD32(x1,x3))
+               + MULT16_32_Q15(MULT16_16_Q15(f,g12),ADD32(x0,x4));
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+
+   }
+   if (g1==0)
+   {
+      /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
+      if (x!=y)
+         OPUS_MOVE(y+overlap, x+overlap, N-overlap);
+      return;
+   }
+
+   /* Compute the part with the constant filter. */
+   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
+}
+
+const signed char tf_select_table[4][8] = {
+      {0, -1, 0, -1,    0,-1, 0,-1},
+      {0, -1, 0, -2,    1, 0, 1,-1},
+      {0, -2, 0, -3,    2, 0, 1,-1},
+      {0, -2, 0, -3,    3, 0, 1,-1},
+};
+
+
+void init_caps(const CELTMode *m,int *cap,int LM,int C)
+{
+   int i;
+   for (i=0;i<m->nbEBands;i++)
+   {
+      int N;
+      N=(m->eBands[i+1]-m->eBands[i])<<LM;
+      cap[i] = (m->cache.caps[m->nbEBands*(2*LM+C-1)+i]+64)*C*N>>2;
+   }
+}
+
+
+
+const char *opus_strerror(int error)
+{
+   static const char * const error_strings[8] = {
+      "success",
+      "invalid argument",
+      "buffer too small",
+      "internal error",
+      "corrupted stream",
+      "request not implemented",
+      "invalid state",
+      "memory allocation failed"
+   };
+   if (error > 0 || error < -7)
+      return "unknown error";
+   else
+      return error_strings[-error];
+}
+
+const char *opus_get_version_string(void)
+{
+    return "libopus " PACKAGE_VERSION
+#ifdef OPUS_FIXED_POINT
+          "-fixed"
+#endif
+#ifdef FUZZING
+          "-fuzzing"
+#endif
+          ;
+}

+ 218 - 0
drivers/opus/celt/celt.h

@@ -0,0 +1,218 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/**
+  @file celt.h
+  @brief Contains all the functions for encoding and decoding audio
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_H
+#define CELT_H
+
+#include "opus_types.h"
+#include "opus_defines.h"
+#include "opus_custom.h"
+#include "entenc.h"
+#include "entdec.h"
+#include "arch.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CELTEncoder OpusCustomEncoder
+#define CELTDecoder OpusCustomDecoder
+#define CELTMode OpusCustomMode
+
+typedef struct {
+   int valid;
+   float tonality;
+   float tonality_slope;
+   float noisiness;
+   float activity;
+   float music_prob;
+   int        bandwidth;
+}AnalysisInfo;
+
+#define __celt_check_mode_ptr_ptr(ptr) ((ptr) + ((ptr) - (const CELTMode**)(ptr)))
+
+#define __celt_check_analysis_ptr(ptr) ((ptr) + ((ptr) - (const AnalysisInfo*)(ptr)))
+
+/* Encoder/decoder Requests */
+
+/* Expose this option again when variable framesize actually works */
+#define OPUS_FRAMESIZE_VARIABLE              5010 /**< Optimize the frame size dynamically */
+
+
+#define CELT_SET_PREDICTION_REQUEST    10002
+/** Controls the use of interframe prediction.
+    0=Independent frames
+    1=Short term interframe prediction allowed
+    2=Long term prediction allowed
+ */
+#define CELT_SET_PREDICTION(x) CELT_SET_PREDICTION_REQUEST, __opus_check_int(x)
+
+#define CELT_SET_INPUT_CLIPPING_REQUEST    10004
+#define CELT_SET_INPUT_CLIPPING(x) CELT_SET_INPUT_CLIPPING_REQUEST, __opus_check_int(x)
+
+#define CELT_GET_AND_CLEAR_ERROR_REQUEST   10007
+#define CELT_GET_AND_CLEAR_ERROR(x) CELT_GET_AND_CLEAR_ERROR_REQUEST, __opus_check_int_ptr(x)
+
+#define CELT_SET_CHANNELS_REQUEST    10008
+#define CELT_SET_CHANNELS(x) CELT_SET_CHANNELS_REQUEST, __opus_check_int(x)
+
+
+/* Internal */
+#define CELT_SET_START_BAND_REQUEST    10010
+#define CELT_SET_START_BAND(x) CELT_SET_START_BAND_REQUEST, __opus_check_int(x)
+
+#define CELT_SET_END_BAND_REQUEST    10012
+#define CELT_SET_END_BAND(x) CELT_SET_END_BAND_REQUEST, __opus_check_int(x)
+
+#define CELT_GET_MODE_REQUEST    10015
+/** Get the CELTMode used by an encoder or decoder */
+#define CELT_GET_MODE(x) CELT_GET_MODE_REQUEST, __celt_check_mode_ptr_ptr(x)
+
+#define CELT_SET_SIGNALLING_REQUEST    10016
+#define CELT_SET_SIGNALLING(x) CELT_SET_SIGNALLING_REQUEST, __opus_check_int(x)
+
+#define CELT_SET_TONALITY_REQUEST    10018
+#define CELT_SET_TONALITY(x) CELT_SET_TONALITY_REQUEST, __opus_check_int(x)
+#define CELT_SET_TONALITY_SLOPE_REQUEST    10020
+#define CELT_SET_TONALITY_SLOPE(x) CELT_SET_TONALITY_SLOPE_REQUEST, __opus_check_int(x)
+
+#define CELT_SET_ANALYSIS_REQUEST    10022
+#define CELT_SET_ANALYSIS(x) CELT_SET_ANALYSIS_REQUEST, __celt_check_analysis_ptr(x)
+
+#define OPUS_SET_LFE_REQUEST    10024
+#define OPUS_SET_LFE(x) OPUS_SET_LFE_REQUEST, __opus_check_int(x)
+
+#define OPUS_SET_ENERGY_MASK_REQUEST    10026
+#define OPUS_SET_ENERGY_MASK(x) OPUS_SET_ENERGY_MASK_REQUEST, __opus_check_val16_ptr(x)
+
+/* Encoder stuff */
+
+int celt_encoder_get_size(int channels);
+
+int celt_encode_with_ec(OpusCustomEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc);
+
+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels,
+                      int arch);
+
+
+
+/* Decoder stuff */
+
+int celt_decoder_get_size(int channels);
+
+
+int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels);
+
+int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec);
+
+#define celt_encoder_ctl opus_custom_encoder_ctl
+#define celt_decoder_ctl opus_custom_decoder_ctl
+
+
+#ifdef CUSTOM_MODES
+#define OPUS_CUSTOM_NOSTATIC
+#else
+#define OPUS_CUSTOM_NOSTATIC static OPUS_INLINE
+#endif
+
+static const unsigned char trim_icdf[11] = {126, 124, 119, 109, 87, 41, 19, 9, 4, 2, 0};
+/* Probs: NONE: 21.875%, LIGHT: 6.25%, NORMAL: 65.625%, AGGRESSIVE: 6.25% */
+static const unsigned char spread_icdf[4] = {25, 23, 2, 0};
+
+static const unsigned char tapset_icdf[3]={2,1,0};
+
+#ifdef CUSTOM_MODES
+static const unsigned char toOpusTable[20] = {
+      0xE0, 0xE8, 0xF0, 0xF8,
+      0xC0, 0xC8, 0xD0, 0xD8,
+      0xA0, 0xA8, 0xB0, 0xB8,
+      0x00, 0x00, 0x00, 0x00,
+      0x80, 0x88, 0x90, 0x98,
+};
+
+static const unsigned char fromOpusTable[16] = {
+      0x80, 0x88, 0x90, 0x98,
+      0x40, 0x48, 0x50, 0x58,
+      0x20, 0x28, 0x30, 0x38,
+      0x00, 0x08, 0x10, 0x18
+};
+
+static OPUS_INLINE int toOpus(unsigned char c)
+{
+   int ret=0;
+   if (c<0xA0)
+      ret = toOpusTable[c>>3];
+   if (ret == 0)
+      return -1;
+   else
+      return ret|(c&0x7);
+}
+
+static OPUS_INLINE int fromOpus(unsigned char c)
+{
+   if (c<0x80)
+      return -1;
+   else
+      return fromOpusTable[(c>>3)-16] | (c&0x7);
+}
+#endif /* CUSTOM_MODES */
+
+#define COMBFILTER_MAXPERIOD 1024
+#define COMBFILTER_MINPERIOD 15
+
+extern const signed char tf_select_table[4][8];
+
+int resampling_factor(opus_int32 rate);
+
+void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
+                        int N, int CC, int upsample, const opus_val16 *coef, celt_sig *mem, int clip);
+
+void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
+      opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
+      const opus_val16 *window, int overlap);
+
+void init_caps(const CELTMode *m,int *cap,int LM,int C);
+
+#ifdef RESYNTH
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, celt_sig * OPUS_RESTRICT scratch);
+
+void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
+      celt_sig * OPUS_RESTRICT out_mem[], int C, int LM);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CELT_H */

+ 1195 - 0
drivers/opus/celt/celt_decoder.c

@@ -0,0 +1,1195 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#define CELT_DECODER_C
+
+#include "cpu_support.h"
+#include "os_support.h"
+#include "mdct.h"
+#include <math.h>
+#include "celt.h"
+#include "pitch.h"
+#include "bands.h"
+#include "opus_modes.h"
+#include "entcode.h"
+#include "quant_bands.h"
+#include "rate.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "float_cast.h"
+#include <stdarg.h>
+#include "celt_lpc.h"
+#include "vq.h"
+
+/**********************************************************************/
+/*                                                                    */
+/*                             DECODER                                */
+/*                                                                    */
+/**********************************************************************/
+#define DECODE_BUFFER_SIZE 2048
+
+/** Decoder state
+ @brief Decoder state
+ */
+struct OpusCustomDecoder {
+   const OpusCustomMode *mode;
+   int overlap;
+   int channels;
+   int stream_channels;
+
+   int downsample;
+   int start, end;
+   int signalling;
+   int arch;
+
+   /* Everything beyond this point gets cleared on a reset */
+#define DECODER_RESET_START rng
+
+   opus_uint32 rng;
+   int error;
+   int last_pitch_index;
+   int loss_count;
+   int postfilter_period;
+   int postfilter_period_old;
+   opus_val16 postfilter_gain;
+   opus_val16 postfilter_gain_old;
+   int postfilter_tapset;
+   int postfilter_tapset_old;
+
+   celt_sig preemph_memD[2];
+
+   celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */
+   /* opus_val16 lpc[],  Size = channels*LPC_ORDER */
+   /* opus_val16 oldEBands[], Size = 2*mode->nbEBands */
+   /* opus_val16 oldLogE[], Size = 2*mode->nbEBands */
+   /* opus_val16 oldLogE2[], Size = 2*mode->nbEBands */
+   /* opus_val16 backgroundLogE[], Size = 2*mode->nbEBands */
+};
+
+int celt_decoder_get_size(int channels)
+{
+   const CELTMode *mode = opus_custom_mode_create(48000, 960, NULL);
+   return opus_custom_decoder_get_size(mode, channels);
+}
+
+OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int channels)
+{
+   int size = sizeof(struct CELTDecoder)
+            + (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig)
+            + channels*LPC_ORDER*sizeof(opus_val16)
+            + 4*2*mode->nbEBands*sizeof(opus_val16);
+   return size;
+}
+
+#ifdef CUSTOM_MODES
+CELTDecoder *opus_custom_decoder_create(const CELTMode *mode, int channels, int *error)
+{
+   int ret;
+   CELTDecoder *st = (CELTDecoder *)opus_alloc(opus_custom_decoder_get_size(mode, channels));
+   ret = opus_custom_decoder_init(st, mode, channels);
+   if (ret != OPUS_OK)
+   {
+      opus_custom_decoder_destroy(st);
+      st = NULL;
+   }
+   if (error)
+      *error = ret;
+   return st;
+}
+#endif /* CUSTOM_MODES */
+
+int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels)
+{
+   int ret;
+   ret = opus_custom_decoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels);
+   if (ret != OPUS_OK)
+      return ret;
+   st->downsample = resampling_factor(sampling_rate);
+   if (st->downsample==0)
+      return OPUS_BAD_ARG;
+   else
+      return OPUS_OK;
+}
+
+OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_init(CELTDecoder *st, const CELTMode *mode, int channels)
+{
+   if (channels < 0 || channels > 2)
+      return OPUS_BAD_ARG;
+
+   if (st==NULL)
+      return OPUS_ALLOC_FAIL;
+
+   OPUS_CLEAR((char*)st, opus_custom_decoder_get_size(mode, channels));
+
+   st->mode = mode;
+   st->overlap = mode->overlap;
+   st->stream_channels = st->channels = channels;
+
+   st->downsample = 1;
+   st->start = 0;
+   st->end = st->mode->effEBands;
+   st->signalling = 1;
+   st->arch = opus_select_arch();
+
+   st->loss_count = 0;
+
+   opus_custom_decoder_ctl(st, OPUS_RESET_STATE);
+
+   return OPUS_OK;
+}
+
+#ifdef CUSTOM_MODES
+void opus_custom_decoder_destroy(CELTDecoder *st)
+{
+   opus_free(st);
+}
+#endif /* CUSTOM_MODES */
+
+static OPUS_INLINE opus_val16 SIG2WORD16(celt_sig x)
+{
+#ifdef OPUS_FIXED_POINT
+   x = PSHR32(x, SIG_SHIFT);
+   x = MAX32(x, -32768);
+   x = MIN32(x, 32767);
+   return EXTRACT16(x);
+#else
+   return (opus_val16)x;
+#endif
+}
+
+#ifndef RESYNTH
+static
+#endif
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, celt_sig * OPUS_RESTRICT scratch)
+{
+   int c;
+   int Nd;
+   int apply_downsampling=0;
+   opus_val16 coef0;
+
+   coef0 = coef[0];
+   Nd = N/downsample;
+   c=0; do {
+      int j;
+      celt_sig * OPUS_RESTRICT x;
+      opus_val16  * OPUS_RESTRICT y;
+      celt_sig m = mem[c];
+      x =in[c];
+      y = pcm+c;
+#ifdef CUSTOM_MODES
+      if (coef[1] != 0)
+      {
+         opus_val16 coef1 = coef[1];
+         opus_val16 coef3 = coef[3];
+         for (j=0;j<N;j++)
+         {
+            celt_sig tmp = x[j] + m + VERY_SMALL;
+            m = MULT16_32_Q15(coef0, tmp)
+                          - MULT16_32_Q15(coef1, x[j]);
+            tmp = SHL32(MULT16_32_Q15(coef3, tmp), 2);
+            scratch[j] = tmp;
+         }
+         apply_downsampling=1;
+      } else
+#endif
+      if (downsample>1)
+      {
+         /* Shortcut for the standard (non-custom modes) case */
+         for (j=0;j<N;j++)
+         {
+            celt_sig tmp = x[j] + m + VERY_SMALL;
+            m = MULT16_32_Q15(coef0, tmp);
+            scratch[j] = tmp;
+         }
+         apply_downsampling=1;
+      } else {
+         /* Shortcut for the standard (non-custom modes) case */
+         for (j=0;j<N;j++)
+         {
+            celt_sig tmp = x[j] + m + VERY_SMALL;
+            m = MULT16_32_Q15(coef0, tmp);
+            y[j*C] = SCALEOUT(SIG2WORD16(tmp));
+         }
+      }
+      mem[c] = m;
+
+      if (apply_downsampling)
+      {
+         /* Perform down-sampling */
+         for (j=0;j<Nd;j++)
+            y[j*C] = SCALEOUT(SIG2WORD16(scratch[j*downsample]));
+      }
+   } while (++c<C);
+}
+
+/** Compute the IMDCT and apply window for all sub-frames and
+    all channels in a frame */
+#ifndef RESYNTH
+static
+#endif
+void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
+      celt_sig * OPUS_RESTRICT out_mem[], int C, int LM)
+{
+   int b, c;
+   int B;
+   int N;
+   int shift;
+   const int overlap = OVERLAP(mode);
+
+   if (shortBlocks)
+   {
+      B = shortBlocks;
+      N = mode->shortMdctSize;
+      shift = mode->maxLM;
+   } else {
+      B = 1;
+      N = mode->shortMdctSize<<LM;
+      shift = mode->maxLM-LM;
+   }
+   c=0; do {
+      /* IMDCT on the interleaved the sub-frames, overlap-add is performed by the IMDCT */
+      for (b=0;b<B;b++)
+         clt_mdct_backward(&mode->mdct, &X[b+c*N*B], out_mem[c]+N*b, mode->window, overlap, shift, B);
+   } while (++c<C);
+}
+
+static void tf_decode(int start, int end, int isTransient, int *tf_res, int LM, ec_dec *dec)
+{
+   int i, curr, tf_select;
+   int tf_select_rsv;
+   int tf_changed;
+   int logp;
+   opus_uint32 budget;
+   opus_uint32 tell;
+
+   budget = dec->storage*8;
+   tell = ec_tell(dec);
+   logp = isTransient ? 2 : 4;
+   tf_select_rsv = LM>0 && tell+logp+1<=budget;
+   budget -= tf_select_rsv;
+   tf_changed = curr = 0;
+   for (i=start;i<end;i++)
+   {
+      if (tell+logp<=budget)
+      {
+         curr ^= ec_dec_bit_logp(dec, logp);
+         tell = ec_tell(dec);
+         tf_changed |= curr;
+      }
+      tf_res[i] = curr;
+      logp = isTransient ? 4 : 5;
+   }
+   tf_select = 0;
+   if (tf_select_rsv &&
+     tf_select_table[LM][4*isTransient+0+tf_changed] !=
+     tf_select_table[LM][4*isTransient+2+tf_changed])
+   {
+      tf_select = ec_dec_bit_logp(dec, 1);
+   }
+   for (i=start;i<end;i++)
+   {
+      tf_res[i] = tf_select_table[LM][4*isTransient+2*tf_select+tf_res[i]];
+   }
+}
+
+/* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save
+   CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The
+   current value corresponds to a pitch of 66.67 Hz. */
+#define PLC_PITCH_LAG_MAX (720)
+/* The minimum pitch lag to allow in the pitch-based PLC. This corresponds to a
+   pitch of 480 Hz. */
+#define PLC_PITCH_LAG_MIN (100)
+
+static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_RESTRICT pcm, int N, int LM)
+{
+   int c;
+   int i;
+   const int C = st->channels;
+   celt_sig *decode_mem[2];
+   celt_sig *out_syn[2];
+   opus_val16 *lpc;
+   opus_val16 *oldBandE, *oldLogE, *oldLogE2, *backgroundLogE;
+   const OpusCustomMode *mode;
+   int nbEBands;
+   int overlap;
+   int start;
+   int downsample;
+   int loss_count;
+   int noise_based;
+   const opus_int16 *eBands;
+   VARDECL(celt_sig, scratch);
+   SAVE_STACK;
+
+   mode = st->mode;
+   nbEBands = mode->nbEBands;
+   overlap = mode->overlap;
+   eBands = mode->eBands;
+
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
+      out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
+   } while (++c<C);
+   lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*C);
+   oldBandE = lpc+C*LPC_ORDER;
+   oldLogE = oldBandE + 2*nbEBands;
+   oldLogE2 = oldLogE + 2*nbEBands;
+   backgroundLogE = oldLogE2  + 2*nbEBands;
+
+   loss_count = st->loss_count;
+   start = st->start;
+   downsample = st->downsample;
+   noise_based = loss_count >= 5 || start != 0;
+   ALLOC(scratch, noise_based?N*C:N, celt_sig);
+   if (noise_based)
+   {
+      /* Noise-based PLC/CNG */
+      celt_sig *freq;
+      VARDECL(celt_norm, X);
+      opus_uint32 seed;
+      opus_val16 *plcLogE;
+      int end;
+      int effEnd;
+
+      end = st->end;
+      effEnd = IMAX(start, IMIN(end, mode->effEBands));
+
+      /* Share the interleaved signal MDCT coefficient buffer with the
+         deemphasis scratch buffer. */
+      freq = scratch;
+      ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+
+      if (loss_count >= 5)
+         plcLogE = backgroundLogE;
+      else {
+         /* Energy decay */
+         opus_val16 decay = loss_count==0 ?
+               QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
+         c=0; do
+         {
+            for (i=start;i<end;i++)
+               oldBandE[c*nbEBands+i] -= decay;
+         } while (++c<C);
+         plcLogE = oldBandE;
+      }
+      seed = st->rng;
+      for (c=0;c<C;c++)
+      {
+         for (i=start;i<effEnd;i++)
+         {
+            int j;
+            int boffs;
+            int blen;
+            boffs = N*c+(eBands[i]<<LM);
+            blen = (eBands[i+1]-eBands[i])<<LM;
+            for (j=0;j<blen;j++)
+            {
+               seed = celt_lcg_rand(seed);
+               X[boffs+j] = (celt_norm)((opus_int32)seed>>20);
+            }
+            renormalise_vector(X+boffs, blen, Q15ONE);
+         }
+      }
+      st->rng = seed;
+
+      denormalise_bands(mode, X, freq, plcLogE, start, effEnd, C, 1<<LM);
+
+      c=0; do {
+         int bound = eBands[effEnd]<<LM;
+         if (downsample!=1)
+            bound = IMIN(bound, N/downsample);
+         for (i=bound;i<N;i++)
+            freq[c*N+i] = 0;
+      } while (++c<C);
+      c=0; do {
+         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
+               DECODE_BUFFER_SIZE-N+(overlap>>1));
+      } while (++c<C);
+      compute_inv_mdcts(mode, 0, freq, out_syn, C, LM);
+   } else {
+      /* Pitch-based PLC */
+      const opus_val16 *window;
+      opus_val16 fade = Q15ONE;
+      int pitch_index;
+      VARDECL(opus_val32, etmp);
+      VARDECL(opus_val16, exc);
+
+      if (loss_count == 0)
+      {
+         VARDECL( opus_val16, lp_pitch_buf );
+         ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 );
+         pitch_downsample(decode_mem, lp_pitch_buf,
+               DECODE_BUFFER_SIZE, C, st->arch);
+         pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf,
+               DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX,
+               PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, st->arch);
+         pitch_index = PLC_PITCH_LAG_MAX-pitch_index;
+         st->last_pitch_index = pitch_index;
+      } else {
+         pitch_index = st->last_pitch_index;
+         fade = QCONST16(.8f,15);
+      }
+
+      ALLOC(etmp, overlap, opus_val32);
+      ALLOC(exc, MAX_PERIOD, opus_val16);
+      window = mode->window;
+      c=0; do {
+         opus_val16 decay;
+         opus_val16 attenuation;
+         opus_val32 S1=0;
+         celt_sig *buf;
+         int extrapolation_offset;
+         int extrapolation_len;
+         int exc_length;
+         int j;
+
+         buf = decode_mem[c];
+         for (i=0;i<MAX_PERIOD;i++) {
+            exc[i] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD+i], SIG_SHIFT);
+         }
+
+         if (loss_count == 0)
+         {
+            opus_val32 ac[LPC_ORDER+1];
+            /* Compute LPC coefficients for the last MAX_PERIOD samples before
+               the first loss so we can work in the excitation-filter domain. */
+            _celt_autocorr(exc, ac, window, overlap,
+                   LPC_ORDER, MAX_PERIOD, st->arch);
+            /* Add a noise floor of -40 dB. */
+#ifdef OPUS_FIXED_POINT
+            ac[0] += SHR32(ac[0],13);
+#else
+            ac[0] *= 1.0001f;
+#endif
+            /* Use lag windowing to stabilize the Levinson-Durbin recursion. */
+            for (i=1;i<=LPC_ORDER;i++)
+            {
+               /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
+#ifdef OPUS_FIXED_POINT
+               ac[i] -= MULT16_32_Q15(2*i*i, ac[i]);
+#else
+               ac[i] -= ac[i]*(0.008f*0.008f)*i*i;
+#endif
+            }
+            _celt_lpc(lpc+c*LPC_ORDER, ac, LPC_ORDER);
+         }
+         /* We want the excitation for 2 pitch periods in order to look for a
+            decaying signal, but we can't get more than MAX_PERIOD. */
+         exc_length = IMIN(2*pitch_index, MAX_PERIOD);
+         /* Initialize the LPC history with the samples just before the start
+            of the region for which we're computing the excitation. */
+         {
+            opus_val16 lpc_mem[LPC_ORDER];
+            for (i=0;i<LPC_ORDER;i++)
+            {
+               lpc_mem[i] =
+                     ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-1-i], SIG_SHIFT);
+            }
+            /* Compute the excitation for exc_length samples before the loss. */
+            celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
+                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem);
+         }
+
+         /* Check if the waveform is decaying, and if so how fast.
+            We do this to avoid adding energy when concealing in a segment
+            with decaying energy. */
+         {
+            opus_val32 E1=1, E2=1;
+            int decay_length;
+#ifdef OPUS_FIXED_POINT
+            int shift = IMAX(0,2*celt_zlog2(celt_maxabs16(&exc[MAX_PERIOD-exc_length], exc_length))-20);
+#endif
+            decay_length = exc_length>>1;
+            for (i=0;i<decay_length;i++)
+            {
+               opus_val16 e;
+               e = exc[MAX_PERIOD-decay_length+i];
+               E1 += SHR32(MULT16_16(e, e), shift);
+               e = exc[MAX_PERIOD-2*decay_length+i];
+               E2 += SHR32(MULT16_16(e, e), shift);
+            }
+            E1 = MIN32(E1, E2);
+            decay = celt_sqrt(frac_div32(SHR32(E1, 1), E2));
+         }
+
+         /* Move the decoder memory one frame to the left to give us room to
+            add the data for the new frame. We ignore the overlap that extends
+            past the end of the buffer, because we aren't going to use it. */
+         OPUS_MOVE(buf, buf+N, DECODE_BUFFER_SIZE-N);
+
+         /* Extrapolate from the end of the excitation with a period of
+            "pitch_index", scaling down each period by an additional factor of
+            "decay". */
+         extrapolation_offset = MAX_PERIOD-pitch_index;
+         /* We need to extrapolate enough samples to cover a complete MDCT
+            window (including overlap/2 samples on both sides). */
+         extrapolation_len = N+overlap;
+         /* We also apply fading if this is not the first loss. */
+         attenuation = MULT16_16_Q15(fade, decay);
+         for (i=j=0;i<extrapolation_len;i++,j++)
+         {
+            opus_val16 tmp;
+            if (j >= pitch_index) {
+               j -= pitch_index;
+               attenuation = MULT16_16_Q15(attenuation, decay);
+            }
+            buf[DECODE_BUFFER_SIZE-N+i] =
+                  SHL32(EXTEND32(MULT16_16_Q15(attenuation,
+                        exc[extrapolation_offset+j])), SIG_SHIFT);
+            /* Compute the energy of the previously decoded signal whose
+               excitation we're copying. */
+            tmp = ROUND16(
+                  buf[DECODE_BUFFER_SIZE-MAX_PERIOD-N+extrapolation_offset+j],
+                  SIG_SHIFT);
+            S1 += SHR32(MULT16_16(tmp, tmp), 8);
+         }
+
+         {
+            opus_val16 lpc_mem[LPC_ORDER];
+            /* Copy the last decoded samples (prior to the overlap region) to
+               synthesis filter memory so we can have a continuous signal. */
+            for (i=0;i<LPC_ORDER;i++)
+               lpc_mem[i] = ROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
+            /* Apply the synthesis filter to convert the excitation back into
+               the signal domain. */
+            celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
+                  buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER,
+                  lpc_mem);
+         }
+
+         /* Check if the synthesis energy is higher than expected, which can
+            happen with the signal changes during our window. If so,
+            attenuate. */
+         {
+            opus_val32 S2=0;
+            for (i=0;i<extrapolation_len;i++)
+            {
+               opus_val16 tmp = ROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
+               S2 += SHR32(MULT16_16(tmp, tmp), 8);
+            }
+            /* This checks for an "explosion" in the synthesis. */
+#ifdef OPUS_FIXED_POINT
+            if (!(S1 > SHR32(S2,2)))
+#else
+            /* The float test is written this way to catch NaNs in the output
+               of the IIR filter at the same time. */
+            if (!(S1 > 0.2f*S2))
+#endif
+            {
+               for (i=0;i<extrapolation_len;i++)
+                  buf[DECODE_BUFFER_SIZE-N+i] = 0;
+            } else if (S1 < S2)
+            {
+               opus_val16 ratio = celt_sqrt(frac_div32(SHR32(S1,1)+1,S2+1));
+               for (i=0;i<overlap;i++)
+               {
+                  opus_val16 tmp_g = Q15ONE
+                        - MULT16_16_Q15(window[i], Q15ONE-ratio);
+                  buf[DECODE_BUFFER_SIZE-N+i] =
+                        MULT16_32_Q15(tmp_g, buf[DECODE_BUFFER_SIZE-N+i]);
+               }
+               for (i=overlap;i<extrapolation_len;i++)
+               {
+                  buf[DECODE_BUFFER_SIZE-N+i] =
+                        MULT16_32_Q15(ratio, buf[DECODE_BUFFER_SIZE-N+i]);
+               }
+            }
+         }
+
+         /* Apply the pre-filter to the MDCT overlap for the next frame because
+            the post-filter will be re-applied in the decoder after the MDCT
+            overlap. */
+         comb_filter(etmp, buf+DECODE_BUFFER_SIZE,
+              st->postfilter_period, st->postfilter_period, overlap,
+              -st->postfilter_gain, -st->postfilter_gain,
+              st->postfilter_tapset, st->postfilter_tapset, NULL, 0);
+
+         /* Simulate TDAC on the concealed audio so that it blends with the
+            MDCT of the next frame. */
+         for (i=0;i<overlap/2;i++)
+         {
+            buf[DECODE_BUFFER_SIZE+i] =
+               MULT16_32_Q15(window[i], etmp[overlap-1-i])
+               + MULT16_32_Q15(window[overlap-i-1], etmp[i]);
+         }
+      } while (++c<C);
+   }
+
+   deemphasis(out_syn, pcm, N, C, downsample,
+         mode->preemph, st->preemph_memD, scratch);
+
+   st->loss_count = loss_count+1;
+
+   RESTORE_STACK;
+}
+
+int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec)
+{
+   int c, i, N;
+   int spread_decision;
+   opus_int32 bits;
+   ec_dec _dec;
+   VARDECL(celt_sig, freq);
+   VARDECL(celt_norm, X);
+   VARDECL(int, fine_quant);
+   VARDECL(int, pulses);
+   VARDECL(int, cap);
+   VARDECL(int, offsets);
+   VARDECL(int, fine_priority);
+   VARDECL(int, tf_res);
+   VARDECL(unsigned char, collapse_masks);
+   celt_sig *decode_mem[2];
+   celt_sig *out_syn[2];
+   opus_val16 *lpc;
+   opus_val16 *oldBandE, *oldLogE, *oldLogE2, *backgroundLogE;
+
+   int shortBlocks;
+   int isTransient;
+   int intra_ener;
+   const int CC = st->channels;
+   int LM, M;
+   int effEnd;
+   int codedBands;
+   int alloc_trim;
+   int postfilter_pitch;
+   opus_val16 postfilter_gain;
+   int intensity=0;
+   int dual_stereo=0;
+   opus_int32 total_bits;
+   opus_int32 balance;
+   opus_int32 tell;
+   int dynalloc_logp;
+   int postfilter_tapset;
+   int anti_collapse_rsv;
+   int anti_collapse_on=0;
+   int silence;
+   int C = st->stream_channels;
+   const OpusCustomMode *mode;
+   int nbEBands;
+   int overlap;
+   const opus_int16 *eBands;
+   ALLOC_STACK;
+
+   mode = st->mode;
+   nbEBands = mode->nbEBands;
+   overlap = mode->overlap;
+   eBands = mode->eBands;
+   frame_size *= st->downsample;
+
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
+   } while (++c<CC);
+   lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC);
+   oldBandE = lpc+CC*LPC_ORDER;
+   oldLogE = oldBandE + 2*nbEBands;
+   oldLogE2 = oldLogE + 2*nbEBands;
+   backgroundLogE = oldLogE2  + 2*nbEBands;
+
+#ifdef CUSTOM_MODES
+   if (st->signalling && data!=NULL)
+   {
+      int data0=data[0];
+      /* Convert "standard mode" to Opus header */
+      if (mode->Fs==48000 && mode->shortMdctSize==120)
+      {
+         data0 = fromOpus(data0);
+         if (data0<0)
+            return OPUS_INVALID_PACKET;
+      }
+      st->end = IMAX(1, mode->effEBands-2*(data0>>5));
+      LM = (data0>>3)&0x3;
+      C = 1 + ((data0>>2)&0x1);
+      data++;
+      len--;
+      if (LM>mode->maxLM)
+         return OPUS_INVALID_PACKET;
+      if (frame_size < mode->shortMdctSize<<LM)
+         return OPUS_BUFFER_TOO_SMALL;
+      else
+         frame_size = mode->shortMdctSize<<LM;
+   } else {
+#else
+   {
+#endif
+      for (LM=0;LM<=mode->maxLM;LM++)
+         if (mode->shortMdctSize<<LM==frame_size)
+            break;
+      if (LM>mode->maxLM)
+         return OPUS_BAD_ARG;
+   }
+   M=1<<LM;
+
+   if (len<0 || len>1275 || pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   N = M*mode->shortMdctSize;
+
+   effEnd = st->end;
+   if (effEnd > mode->effEBands)
+      effEnd = mode->effEBands;
+
+   if (data == NULL || len<=1)
+   {
+      celt_decode_lost(st, pcm, N, LM);
+      RESTORE_STACK;
+      return frame_size/st->downsample;
+   }
+
+   if (dec == NULL)
+   {
+      ec_dec_init(&_dec,(unsigned char*)data,len);
+      dec = &_dec;
+   }
+
+   if (C==1)
+   {
+      for (i=0;i<nbEBands;i++)
+         oldBandE[i]=MAX16(oldBandE[i],oldBandE[nbEBands+i]);
+   }
+
+   total_bits = len*8;
+   tell = ec_tell(dec);
+
+   if (tell >= total_bits)
+      silence = 1;
+   else if (tell==1)
+      silence = ec_dec_bit_logp(dec, 15);
+   else
+      silence = 0;
+   if (silence)
+   {
+      /* Pretend we've read all the remaining bits */
+      tell = len*8;
+      dec->nbits_total+=tell-ec_tell(dec);
+   }
+
+   postfilter_gain = 0;
+   postfilter_pitch = 0;
+   postfilter_tapset = 0;
+   if (st->start==0 && tell+16 <= total_bits)
+   {
+      if(ec_dec_bit_logp(dec, 1))
+      {
+         int qg, octave;
+         octave = ec_dec_uint(dec, 6);
+         postfilter_pitch = (16<<octave)+ec_dec_bits(dec, 4+octave)-1;
+         qg = ec_dec_bits(dec, 3);
+         if (ec_tell(dec)+2<=total_bits)
+            postfilter_tapset = ec_dec_icdf(dec, tapset_icdf, 2);
+         postfilter_gain = QCONST16(.09375f,15)*(qg+1);
+      }
+      tell = ec_tell(dec);
+   }
+
+   if (LM > 0 && tell+3 <= total_bits)
+   {
+      isTransient = ec_dec_bit_logp(dec, 3);
+      tell = ec_tell(dec);
+   }
+   else
+      isTransient = 0;
+
+   if (isTransient)
+      shortBlocks = M;
+   else
+      shortBlocks = 0;
+
+   /* Decode the global flags (first symbols in the stream) */
+   intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0;
+   /* Get band energies */
+   unquant_coarse_energy(mode, st->start, st->end, oldBandE,
+         intra_ener, dec, C, LM);
+
+   ALLOC(tf_res, nbEBands, int);
+   tf_decode(st->start, st->end, isTransient, tf_res, LM, dec);
+
+   tell = ec_tell(dec);
+   spread_decision = SPREAD_NORMAL;
+   if (tell+4 <= total_bits)
+      spread_decision = ec_dec_icdf(dec, spread_icdf, 5);
+
+   ALLOC(cap, nbEBands, int);
+
+   init_caps(mode,cap,LM,C);
+
+   ALLOC(offsets, nbEBands, int);
+
+   dynalloc_logp = 6;
+   total_bits<<=BITRES;
+   tell = ec_tell_frac(dec);
+   for (i=st->start;i<st->end;i++)
+   {
+      int width, quanta;
+      int dynalloc_loop_logp;
+      int boost;
+      width = C*(eBands[i+1]-eBands[i])<<LM;
+      /* quanta is 6 bits, but no more than 1 bit/sample
+         and no less than 1/8 bit/sample */
+      quanta = IMIN(width<<BITRES, IMAX(6<<BITRES, width));
+      dynalloc_loop_logp = dynalloc_logp;
+      boost = 0;
+      while (tell+(dynalloc_loop_logp<<BITRES) < total_bits && boost < cap[i])
+      {
+         int flag;
+         flag = ec_dec_bit_logp(dec, dynalloc_loop_logp);
+         tell = ec_tell_frac(dec);
+         if (!flag)
+            break;
+         boost += quanta;
+         total_bits -= quanta;
+         dynalloc_loop_logp = 1;
+      }
+      offsets[i] = boost;
+      /* Making dynalloc more likely */
+      if (boost>0)
+         dynalloc_logp = IMAX(2, dynalloc_logp-1);
+   }
+
+   ALLOC(fine_quant, nbEBands, int);
+   alloc_trim = tell+(6<<BITRES) <= total_bits ?
+         ec_dec_icdf(dec, trim_icdf, 7) : 5;
+
+   bits = (((opus_int32)len*8)<<BITRES) - ec_tell_frac(dec) - 1;
+   anti_collapse_rsv = isTransient&&LM>=2&&bits>=((LM+2)<<BITRES) ? (1<<BITRES) : 0;
+   bits -= anti_collapse_rsv;
+
+   ALLOC(pulses, nbEBands, int);
+   ALLOC(fine_priority, nbEBands, int);
+
+   codedBands = compute_allocation(mode, st->start, st->end, offsets, cap,
+         alloc_trim, &intensity, &dual_stereo, bits, &balance, pulses,
+         fine_quant, fine_priority, C, LM, dec, 0, 0, 0);
+
+   unquant_fine_energy(mode, st->start, st->end, oldBandE, fine_quant, dec, C);
+
+   /* Decode fixed codebook */
+   ALLOC(collapse_masks, C*nbEBands, unsigned char);
+   ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+
+   quant_all_bands(0, mode, st->start, st->end, X, C==2 ? X+N : NULL, collapse_masks,
+         NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
+         len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng);
+
+   if (anti_collapse_rsv > 0)
+   {
+      anti_collapse_on = ec_dec_bits(dec, 1);
+   }
+
+   unquant_energy_finalise(mode, st->start, st->end, oldBandE,
+         fine_quant, fine_priority, len*8-ec_tell(dec), dec, C);
+
+   if (anti_collapse_on)
+      anti_collapse(mode, X, collapse_masks, LM, C, N,
+            st->start, st->end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
+
+   ALLOC(freq, IMAX(CC,C)*N, celt_sig); /**< Interleaved signal MDCTs */
+
+   if (silence)
+   {
+      for (i=0;i<C*nbEBands;i++)
+         oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
+      for (i=0;i<C*N;i++)
+         freq[i] = 0;
+   } else {
+      /* Synthesis */
+      denormalise_bands(mode, X, freq, oldBandE, st->start, effEnd, C, M);
+   }
+   c=0; do {
+      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap/2);
+   } while (++c<CC);
+
+   c=0; do {
+      int bound = M*eBands[effEnd];
+      if (st->downsample!=1)
+         bound = IMIN(bound, N/st->downsample);
+      for (i=bound;i<N;i++)
+         freq[c*N+i] = 0;
+   } while (++c<C);
+
+   c=0; do {
+      out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
+   } while (++c<CC);
+
+   if (CC==2&&C==1)
+   {
+      for (i=0;i<N;i++)
+         freq[N+i] = freq[i];
+   }
+   if (CC==1&&C==2)
+   {
+      for (i=0;i<N;i++)
+         freq[i] = HALF32(ADD32(freq[i],freq[N+i]));
+   }
+
+   /* Compute inverse MDCTs */
+   compute_inv_mdcts(mode, shortBlocks, freq, out_syn, CC, LM);
+
+   c=0; do {
+      st->postfilter_period=IMAX(st->postfilter_period, COMBFILTER_MINPERIOD);
+      st->postfilter_period_old=IMAX(st->postfilter_period_old, COMBFILTER_MINPERIOD);
+      comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old, st->postfilter_period, mode->shortMdctSize,
+            st->postfilter_gain_old, st->postfilter_gain, st->postfilter_tapset_old, st->postfilter_tapset,
+            mode->window, overlap);
+      if (LM!=0)
+         comb_filter(out_syn[c]+mode->shortMdctSize, out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch, N-mode->shortMdctSize,
+               st->postfilter_gain, postfilter_gain, st->postfilter_tapset, postfilter_tapset,
+               mode->window, overlap);
+
+   } while (++c<CC);
+   st->postfilter_period_old = st->postfilter_period;
+   st->postfilter_gain_old = st->postfilter_gain;
+   st->postfilter_tapset_old = st->postfilter_tapset;
+   st->postfilter_period = postfilter_pitch;
+   st->postfilter_gain = postfilter_gain;
+   st->postfilter_tapset = postfilter_tapset;
+   if (LM!=0)
+   {
+      st->postfilter_period_old = st->postfilter_period;
+      st->postfilter_gain_old = st->postfilter_gain;
+      st->postfilter_tapset_old = st->postfilter_tapset;
+   }
+
+   if (C==1) {
+      for (i=0;i<nbEBands;i++)
+         oldBandE[nbEBands+i]=oldBandE[i];
+   }
+
+   /* In case start or end were to change */
+   if (!isTransient)
+   {
+      for (i=0;i<2*nbEBands;i++)
+         oldLogE2[i] = oldLogE[i];
+      for (i=0;i<2*nbEBands;i++)
+         oldLogE[i] = oldBandE[i];
+      for (i=0;i<2*nbEBands;i++)
+         backgroundLogE[i] = MIN16(backgroundLogE[i] + M*QCONST16(0.001f,DB_SHIFT), oldBandE[i]);
+   } else {
+      for (i=0;i<2*nbEBands;i++)
+         oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
+   }
+   c=0; do
+   {
+      for (i=0;i<st->start;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
+      }
+      for (i=st->end;i<nbEBands;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
+      }
+   } while (++c<2);
+   st->rng = dec->rng;
+
+   /* We reuse freq[] as scratch space for the de-emphasis */
+   deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, freq);
+   st->loss_count = 0;
+   RESTORE_STACK;
+   if (ec_tell(dec) > 8*len)
+      return OPUS_INTERNAL_ERROR;
+   if(ec_get_error(dec))
+      st->error = 1;
+   return frame_size/st->downsample;
+}
+
+
+#ifdef CUSTOM_MODES
+
+#ifdef OPUS_FIXED_POINT
+int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
+{
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL);
+}
+
+#ifndef DISABLE_FLOAT_API
+int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size)
+{
+   int j, ret, C, N;
+   VARDECL(opus_int16, out);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+
+   ALLOC(out, C*N, opus_int16);
+   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL);
+   if (ret>0)
+      for (j=0;j<C*ret;j++)
+         pcm[j]=out[j]*(1.f/32768.f);
+
+   RESTORE_STACK;
+   return ret;
+}
+#endif /* DISABLE_FLOAT_API */
+
+#else
+
+int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size)
+{
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL);
+}
+
+int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
+{
+   int j, ret, C, N;
+   VARDECL(celt_sig, out);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+   ALLOC(out, C*N, celt_sig);
+
+   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL);
+
+   if (ret>0)
+      for (j=0;j<C*ret;j++)
+         pcm[j] = FLOAT2INT16 (out[j]);
+
+   RESTORE_STACK;
+   return ret;
+}
+
+#endif
+#endif /* CUSTOM_MODES */
+
+int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...)
+{
+   va_list ap;
+
+   va_start(ap, request);
+   switch (request)
+   {
+      case CELT_SET_START_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<0 || value>=st->mode->nbEBands)
+            goto bad_arg;
+         st->start = value;
+      }
+      break;
+      case CELT_SET_END_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>st->mode->nbEBands)
+            goto bad_arg;
+         st->end = value;
+      }
+      break;
+      case CELT_SET_CHANNELS_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>2)
+            goto bad_arg;
+         st->stream_channels = value;
+      }
+      break;
+      case CELT_GET_AND_CLEAR_ERROR_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (value==NULL)
+            goto bad_arg;
+         *value=st->error;
+         st->error = 0;
+      }
+      break;
+      case OPUS_GET_LOOKAHEAD_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (value==NULL)
+            goto bad_arg;
+         *value = st->overlap/st->downsample;
+      }
+      break;
+      case OPUS_RESET_STATE:
+      {
+         int i;
+         opus_val16 *lpc, *oldBandE, *oldLogE, *oldLogE2;
+         lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels);
+         oldBandE = lpc+st->channels*LPC_ORDER;
+         oldLogE = oldBandE + 2*st->mode->nbEBands;
+         oldLogE2 = oldLogE + 2*st->mode->nbEBands;
+         OPUS_CLEAR((char*)&st->DECODER_RESET_START,
+               opus_custom_decoder_get_size(st->mode, st->channels)-
+               ((char*)&st->DECODER_RESET_START - (char*)st));
+         for (i=0;i<2*st->mode->nbEBands;i++)
+            oldLogE[i]=oldLogE2[i]=-QCONST16(28.f,DB_SHIFT);
+      }
+      break;
+      case OPUS_GET_PITCH_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (value==NULL)
+            goto bad_arg;
+         *value = st->postfilter_period;
+      }
+      break;
+      case CELT_GET_MODE_REQUEST:
+      {
+         const CELTMode ** value = va_arg(ap, const CELTMode**);
+         if (value==0)
+            goto bad_arg;
+         *value=st->mode;
+      }
+      break;
+      case CELT_SET_SIGNALLING_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->signalling = value;
+      }
+      break;
+      case OPUS_GET_FINAL_RANGE_REQUEST:
+      {
+         opus_uint32 * value = va_arg(ap, opus_uint32 *);
+         if (value==0)
+            goto bad_arg;
+         *value=st->rng;
+      }
+      break;
+      default:
+         goto bad_request;
+   }
+   va_end(ap);
+   return OPUS_OK;
+bad_arg:
+   va_end(ap);
+   return OPUS_BAD_ARG;
+bad_request:
+      va_end(ap);
+  return OPUS_UNIMPLEMENTED;
+}

+ 2353 - 0
drivers/opus/celt/celt_encoder.c

@@ -0,0 +1,2353 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#define CELT_ENCODER_C
+
+#include "cpu_support.h"
+#include "os_support.h"
+#include "mdct.h"
+#include <math.h>
+#include "celt.h"
+#include "pitch.h"
+#include "bands.h"
+#include "opus_modes.h"
+#include "entcode.h"
+#include "quant_bands.h"
+#include "rate.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "float_cast.h"
+#include <stdarg.h>
+#include "celt_lpc.h"
+#include "vq.h"
+
+
+/** Encoder state
+ @brief Encoder state
+ */
+struct OpusCustomEncoder {
+   const OpusCustomMode *mode;     /**< Mode used by the encoder */
+   int overlap;
+   int channels;
+   int stream_channels;
+
+   int force_intra;
+   int clip;
+   int disable_pf;
+   int complexity;
+   int upsample;
+   int start, end;
+
+   opus_int32 bitrate;
+   int vbr;
+   int signalling;
+   int constrained_vbr;      /* If zero, VBR can do whatever it likes with the rate */
+   int loss_rate;
+   int lsb_depth;
+   int variable_duration;
+   int lfe;
+   int arch;
+
+   /* Everything beyond this point gets cleared on a reset */
+#define ENCODER_RESET_START rng
+
+   opus_uint32 rng;
+   int spread_decision;
+   opus_val32 delayedIntra;
+   int tonal_average;
+   int lastCodedBands;
+   int hf_average;
+   int tapset_decision;
+
+   int prefilter_period;
+   opus_val16 prefilter_gain;
+   int prefilter_tapset;
+#ifdef RESYNTH
+   int prefilter_period_old;
+   opus_val16 prefilter_gain_old;
+   int prefilter_tapset_old;
+#endif
+   int consec_transient;
+   AnalysisInfo analysis;
+
+   opus_val32 preemph_memE[2];
+   opus_val32 preemph_memD[2];
+
+   /* VBR-related parameters */
+   opus_int32 vbr_reservoir;
+   opus_int32 vbr_drift;
+   opus_int32 vbr_offset;
+   opus_int32 vbr_count;
+   opus_val32 overlap_max;
+   opus_val16 stereo_saving;
+   int intensity;
+   opus_val16 *energy_mask;
+   opus_val16 spec_avg;
+
+#ifdef RESYNTH
+   /* +MAX_PERIOD/2 to make space for overlap */
+   celt_sig syn_mem[2][2*MAX_PERIOD+MAX_PERIOD/2];
+#endif
+
+   celt_sig in_mem[1]; /* Size = channels*mode->overlap */
+   /* celt_sig prefilter_mem[],  Size = channels*COMBFILTER_MAXPERIOD */
+   /* opus_val16 oldBandE[],     Size = channels*mode->nbEBands */
+   /* opus_val16 oldLogE[],      Size = channels*mode->nbEBands */
+   /* opus_val16 oldLogE2[],     Size = channels*mode->nbEBands */
+};
+
+int celt_encoder_get_size(int channels)
+{
+   CELTMode *mode = opus_custom_mode_create(48000, 960, NULL);
+   return opus_custom_encoder_get_size(mode, channels);
+}
+
+OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_get_size(const CELTMode *mode, int channels)
+{
+   int size = sizeof(struct CELTEncoder)
+         + (channels*mode->overlap-1)*sizeof(celt_sig)    /* celt_sig in_mem[channels*mode->overlap]; */
+         + channels*COMBFILTER_MAXPERIOD*sizeof(celt_sig) /* celt_sig prefilter_mem[channels*COMBFILTER_MAXPERIOD]; */
+         + 3*channels*mode->nbEBands*sizeof(opus_val16);  /* opus_val16 oldBandE[channels*mode->nbEBands]; */
+                                                          /* opus_val16 oldLogE[channels*mode->nbEBands]; */
+                                                          /* opus_val16 oldLogE2[channels*mode->nbEBands]; */
+   return size;
+}
+
+#ifdef CUSTOM_MODES
+CELTEncoder *opus_custom_encoder_create(const CELTMode *mode, int channels, int *error)
+{
+   int ret;
+   CELTEncoder *st = (CELTEncoder *)opus_alloc(opus_custom_encoder_get_size(mode, channels));
+   /* init will handle the NULL case */
+   ret = opus_custom_encoder_init(st, mode, channels);
+   if (ret != OPUS_OK)
+   {
+      opus_custom_encoder_destroy(st);
+      st = NULL;
+   }
+   if (error)
+      *error = ret;
+   return st;
+}
+#endif /* CUSTOM_MODES */
+
+static int opus_custom_encoder_init_arch(CELTEncoder *st, const CELTMode *mode,
+                                         int channels, int arch)
+{
+   if (channels < 0 || channels > 2)
+      return OPUS_BAD_ARG;
+
+   if (st==NULL || mode==NULL)
+      return OPUS_ALLOC_FAIL;
+
+   OPUS_CLEAR((char*)st, opus_custom_encoder_get_size(mode, channels));
+
+   st->mode = mode;
+   st->overlap = mode->overlap;
+   st->stream_channels = st->channels = channels;
+
+   st->upsample = 1;
+   st->start = 0;
+   st->end = st->mode->effEBands;
+   st->signalling = 1;
+
+   st->arch = arch;
+
+   st->constrained_vbr = 1;
+   st->clip = 1;
+
+   st->bitrate = OPUS_BITRATE_MAX;
+   st->vbr = 0;
+   st->force_intra  = 0;
+   st->complexity = 5;
+   st->lsb_depth=24;
+
+   opus_custom_encoder_ctl(st, OPUS_RESET_STATE);
+
+   return OPUS_OK;
+}
+
+#ifdef CUSTOM_MODES
+int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels)
+{
+   return opus_custom_encoder_init_arch(st, mode, channels, opus_select_arch());
+}
+#endif
+
+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels,
+                      int arch)
+{
+   int ret;
+   ret = opus_custom_encoder_init_arch(st,
+           opus_custom_mode_create(48000, 960, NULL), channels, arch);
+   if (ret != OPUS_OK)
+      return ret;
+   st->upsample = resampling_factor(sampling_rate);
+   return OPUS_OK;
+}
+
+#ifdef CUSTOM_MODES
+void opus_custom_encoder_destroy(CELTEncoder *st)
+{
+   opus_free(st);
+}
+#endif /* CUSTOM_MODES */
+
+
+static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int C,
+                              opus_val16 *tf_estimate, int *tf_chan)
+{
+   int i;
+   VARDECL(opus_val16, tmp);
+   opus_val32 mem0,mem1;
+   int is_transient = 0;
+   opus_int32 mask_metric = 0;
+   int c;
+   opus_val16 tf_max;
+   int len2;
+   /* Table of 6*64/x, trained on real data to minimize the average error */
+   static const unsigned char inv_table[128] = {
+         255,255,156,110, 86, 70, 59, 51, 45, 40, 37, 33, 31, 28, 26, 25,
+          23, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12,
+          12, 12, 11, 11, 11, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,
+           8,  8,  8,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,
+           6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,
+           5,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+           4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  3,  3,
+           3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,
+   };
+   SAVE_STACK;
+   ALLOC(tmp, len, opus_val16);
+
+   len2=len/2;
+   for (c=0;c<C;c++)
+   {
+      opus_val32 mean;
+      opus_int32 unmask=0;
+      opus_val32 norm;
+      opus_val16 maxE;
+      mem0=0;
+      mem1=0;
+      /* High-pass filter: (1 - 2*z^-1 + z^-2) / (1 - z^-1 + .5*z^-2) */
+      for (i=0;i<len;i++)
+      {
+         opus_val32 x,y;
+         x = SHR32(in[i+c*len],SIG_SHIFT);
+         y = ADD32(mem0, x);
+#ifdef OPUS_FIXED_POINT
+         mem0 = mem1 + y - SHL32(x,1);
+         mem1 = x - SHR32(y,1);
+#else
+         mem0 = mem1 + y - 2*x;
+         mem1 = x - .5f*y;
+#endif
+         tmp[i] = EXTRACT16(SHR32(y,2));
+         /*printf("%f ", tmp[i]);*/
+      }
+      /*printf("\n");*/
+      /* First few samples are bad because we don't propagate the memory */
+      for (i=0;i<12;i++)
+         tmp[i] = 0;
+
+#ifdef OPUS_FIXED_POINT
+      /* Normalize tmp to max range */
+      {
+         int shift=0;
+         shift = 14-celt_ilog2(1+celt_maxabs16(tmp, len));
+         if (shift!=0)
+         {
+            for (i=0;i<len;i++)
+               tmp[i] = SHL16(tmp[i], shift);
+         }
+      }
+#endif
+
+      mean=0;
+      mem0=0;
+      /* Grouping by two to reduce complexity */
+      /* Forward pass to compute the post-echo threshold*/
+      for (i=0;i<len2;i++)
+      {
+         opus_val16 x2 = PSHR32(MULT16_16(tmp[2*i],tmp[2*i]) + MULT16_16(tmp[2*i+1],tmp[2*i+1]),16);
+         mean += x2;
+#ifdef OPUS_FIXED_POINT
+         /* FIXME: Use PSHR16() instead */
+         tmp[i] = mem0 + PSHR32(x2-mem0,4);
+#else
+         tmp[i] = mem0 + MULT16_16_P15(QCONST16(.0625f,15),x2-mem0);
+#endif
+         mem0 = tmp[i];
+      }
+
+      mem0=0;
+      maxE=0;
+      /* Backward pass to compute the pre-echo threshold */
+      for (i=len2-1;i>=0;i--)
+      {
+#ifdef OPUS_FIXED_POINT
+         /* FIXME: Use PSHR16() instead */
+         tmp[i] = mem0 + PSHR32(tmp[i]-mem0,3);
+#else
+         tmp[i] = mem0 + MULT16_16_P15(QCONST16(0.125f,15),tmp[i]-mem0);
+#endif
+         mem0 = tmp[i];
+         maxE = MAX16(maxE, mem0);
+      }
+      /*for (i=0;i<len2;i++)printf("%f ", tmp[i]/mean);printf("\n");*/
+
+      /* Compute the ratio of the "frame energy" over the harmonic mean of the energy.
+         This essentially corresponds to a bitrate-normalized temporal noise-to-mask
+         ratio */
+
+      /* As a compromise with the old transient detector, frame energy is the
+         geometric mean of the energy and half the max */
+#ifdef OPUS_FIXED_POINT
+      /* Costs two sqrt() to avoid overflows */
+      mean = MULT16_16(celt_sqrt(mean), celt_sqrt(MULT16_16(maxE,len2>>1)));
+#else
+      mean = celt_sqrt(mean * maxE*.5*len2);
+#endif
+      /* Inverse of the mean energy in Q15+6 */
+      norm = SHL32(EXTEND32(len2),6+14)/ADD32(EPSILON,SHR32(mean,1));
+      /* Compute harmonic mean discarding the unreliable boundaries
+         The data is smooth, so we only take 1/4th of the samples */
+      unmask=0;
+      for (i=12;i<len2-5;i+=4)
+      {
+         int id;
+#ifdef OPUS_FIXED_POINT
+         id = IMAX(0,IMIN(127,MULT16_32_Q15(tmp[i],norm))); /* Do not round to nearest */
+#else
+         id = IMAX(0,IMIN(127,(int)floor(64*norm*tmp[i]))); /* Do not round to nearest */
+#endif
+         unmask += inv_table[id];
+      }
+      /*printf("%d\n", unmask);*/
+      /* Normalize, compensate for the 1/4th of the sample and the factor of 6 in the inverse table */
+      unmask = 64*unmask*4/(6*(len2-17));
+      if (unmask>mask_metric)
+      {
+         *tf_chan = c;
+         mask_metric = unmask;
+      }
+   }
+   is_transient = mask_metric>200;
+
+   /* Arbitrary metric for VBR boost */
+   tf_max = MAX16(0,celt_sqrt(27*mask_metric)-42);
+   /* *tf_estimate = 1 + MIN16(1, sqrt(MAX16(0, tf_max-30))/20); */
+   *tf_estimate = celt_sqrt(MAX16(0, SHL32(MULT16_16(QCONST16(0.0069,14),MIN16(163,tf_max)),14)-QCONST32(0.139,28)));
+   /*printf("%d %f\n", tf_max, mask_metric);*/
+   RESTORE_STACK;
+#ifdef FUZZING
+   is_transient = rand()&0x1;
+#endif
+   /*printf("%d %f %d\n", is_transient, (float)*tf_estimate, tf_max);*/
+   return is_transient;
+}
+
+/* Looks for sudden increases of energy to decide whether we need to patch
+   the transient decision */
+int patch_transient_decision(opus_val16 *newE, opus_val16 *oldE, int nbEBands,
+      int end, int C)
+{
+   int i, c;
+   opus_val32 mean_diff=0;
+   opus_val16 spread_old[26];
+   /* Apply an aggressive (-6 dB/Bark) spreading function to the old frame to
+      avoid false detection caused by irrelevant bands */
+   if (C==1)
+   {
+      spread_old[0] = oldE[0];
+      for (i=1;i<end;i++)
+         spread_old[i] = MAX16(spread_old[i-1]-QCONST16(1.0f, DB_SHIFT), oldE[i]);
+   } else {
+      spread_old[0] = MAX16(oldE[0],oldE[nbEBands]);
+      for (i=1;i<end;i++)
+         spread_old[i] = MAX16(spread_old[i-1]-QCONST16(1.0f, DB_SHIFT),
+                               MAX16(oldE[i],oldE[i+nbEBands]));
+   }
+   for (i=end-2;i>=0;i--)
+      spread_old[i] = MAX16(spread_old[i], spread_old[i+1]-QCONST16(1.0f, DB_SHIFT));
+   /* Compute mean increase */
+   c=0; do {
+      for (i=2;i<end-1;i++)
+      {
+         opus_val16 x1, x2;
+         x1 = MAX16(0, newE[i]);
+         x2 = MAX16(0, spread_old[i]);
+         mean_diff = ADD32(mean_diff, EXTEND32(MAX16(0, SUB16(x1, x2))));
+      }
+   } while (++c<C);
+   mean_diff = DIV32(mean_diff, C*(end-3));
+   /*printf("%f %f %d\n", mean_diff, max_diff, count);*/
+   return mean_diff > QCONST16(1.f, DB_SHIFT);
+}
+
+/** Apply window and compute the MDCT for all sub-frames and
+    all channels in a frame */
+static void compute_mdcts(const CELTMode *mode, int shortBlocks, celt_sig * OPUS_RESTRICT in,
+                          celt_sig * OPUS_RESTRICT out, int C, int CC, int LM, int upsample)
+{
+   const int overlap = OVERLAP(mode);
+   int N;
+   int B;
+   int shift;
+   int i, b, c;
+   if (shortBlocks)
+   {
+      B = shortBlocks;
+      N = mode->shortMdctSize;
+      shift = mode->maxLM;
+   } else {
+      B = 1;
+      N = mode->shortMdctSize<<LM;
+      shift = mode->maxLM-LM;
+   }
+   c=0; do {
+      for (b=0;b<B;b++)
+      {
+         /* Interleaving the sub-frames while doing the MDCTs */
+         clt_mdct_forward(&mode->mdct, in+c*(B*N+overlap)+b*N, &out[b+c*N*B], mode->window, overlap, shift, B);
+      }
+   } while (++c<CC);
+   if (CC==2&&C==1)
+   {
+      for (i=0;i<B*N;i++)
+         out[i] = ADD32(HALF32(out[i]), HALF32(out[B*N+i]));
+   }
+   if (upsample != 1)
+   {
+      c=0; do
+      {
+         int bound = B*N/upsample;
+         for (i=0;i<bound;i++)
+            out[c*B*N+i] *= upsample;
+         for (;i<B*N;i++)
+            out[c*B*N+i] = 0;
+      } while (++c<C);
+   }
+}
+
+
+void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
+                        int N, int CC, int upsample, const opus_val16 *coef, celt_sig *mem, int clip)
+{
+   int i;
+   opus_val16 coef0;
+   celt_sig m;
+   int Nu;
+
+   coef0 = coef[0];
+
+
+   Nu = N/upsample;
+   if (upsample!=1)
+   {
+      for (i=0;i<N;i++)
+         inp[i] = 0;
+   }
+   for (i=0;i<Nu;i++)
+   {
+      celt_sig x;
+
+      x = SCALEIN(pcmp[CC*i]);
+#ifndef OPUS_FIXED_POINT
+      /* Replace NaNs with zeros */
+      if (!(x==x))
+         x = 0;
+#endif
+      inp[i*upsample] = x;
+   }
+
+#ifndef OPUS_FIXED_POINT
+   if (clip)
+   {
+      /* Clip input to avoid encoding non-portable files */
+      for (i=0;i<Nu;i++)
+         inp[i*upsample] = MAX32(-65536.f, MIN32(65536.f,inp[i*upsample]));
+   }
+#else
+   (void)clip; /* Avoids a warning about clip being unused. */
+#endif
+   m = *mem;
+#ifdef CUSTOM_MODES
+   if (coef[1] != 0)
+   {
+      opus_val16 coef1 = coef[1];
+      opus_val16 coef2 = coef[2];
+      for (i=0;i<N;i++)
+      {
+         celt_sig x, tmp;
+         x = inp[i];
+         /* Apply pre-emphasis */
+         tmp = MULT16_16(coef2, x);
+         inp[i] = tmp + m;
+         m = MULT16_32_Q15(coef1, inp[i]) - MULT16_32_Q15(coef0, tmp);
+      }
+   } else
+#endif
+   {
+      for (i=0;i<N;i++)
+      {
+         celt_sig x;
+         x = SHL32(inp[i], SIG_SHIFT);
+         /* Apply pre-emphasis */
+         inp[i] = x + m;
+         m = - MULT16_32_Q15(coef0, x);
+      }
+   }
+   *mem = m;
+}
+
+
+
+static opus_val32 l1_metric(const celt_norm *tmp, int N, int LM, opus_val16 bias)
+{
+   int i;
+   opus_val32 L1;
+   L1 = 0;
+   for (i=0;i<N;i++)
+      L1 += EXTEND32(ABS16(tmp[i]));
+   /* When in doubt, prefer good freq resolution */
+   L1 = MAC16_32_Q15(L1, LM*bias, L1);
+   return L1;
+
+}
+
+static int tf_analysis(const CELTMode *m, int len, int isTransient,
+      int *tf_res, int lambda, celt_norm *X, int N0, int LM,
+      int *tf_sum, opus_val16 tf_estimate, int tf_chan)
+{
+   int i;
+   VARDECL(int, metric);
+   int cost0;
+   int cost1;
+   VARDECL(int, path0);
+   VARDECL(int, path1);
+   VARDECL(celt_norm, tmp);
+   VARDECL(celt_norm, tmp_1);
+   int sel;
+   int selcost[2];
+   int tf_select=0;
+   opus_val16 bias;
+
+   SAVE_STACK;
+   bias = MULT16_16_Q14(QCONST16(.04f,15), MAX16(-QCONST16(.25f,14), QCONST16(.5f,14)-tf_estimate));
+   /*printf("%f ", bias);*/
+
+   ALLOC(metric, len, int);
+   ALLOC(tmp, (m->eBands[len]-m->eBands[len-1])<<LM, celt_norm);
+   ALLOC(tmp_1, (m->eBands[len]-m->eBands[len-1])<<LM, celt_norm);
+   ALLOC(path0, len, int);
+   ALLOC(path1, len, int);
+
+   *tf_sum = 0;
+   for (i=0;i<len;i++)
+   {
+      int j, k, N;
+      int narrow;
+      opus_val32 L1, best_L1;
+      int best_level=0;
+      N = (m->eBands[i+1]-m->eBands[i])<<LM;
+      /* band is too narrow to be split down to LM=-1 */
+      narrow = (m->eBands[i+1]-m->eBands[i])==1;
+      for (j=0;j<N;j++)
+         tmp[j] = X[tf_chan*N0 + j+(m->eBands[i]<<LM)];
+      /* Just add the right channel if we're in stereo */
+      /*if (C==2)
+         for (j=0;j<N;j++)
+            tmp[j] = ADD16(SHR16(tmp[j], 1),SHR16(X[N0+j+(m->eBands[i]<<LM)], 1));*/
+      L1 = l1_metric(tmp, N, isTransient ? LM : 0, bias);
+      best_L1 = L1;
+      /* Check the -1 case for transients */
+      if (isTransient && !narrow)
+      {
+         for (j=0;j<N;j++)
+            tmp_1[j] = tmp[j];
+         haar1(tmp_1, N>>LM, 1<<LM);
+         L1 = l1_metric(tmp_1, N, LM+1, bias);
+         if (L1<best_L1)
+         {
+            best_L1 = L1;
+            best_level = -1;
+         }
+      }
+      /*printf ("%f ", L1);*/
+      for (k=0;k<LM+!(isTransient||narrow);k++)
+      {
+         int B;
+
+         if (isTransient)
+            B = (LM-k-1);
+         else
+            B = k+1;
+
+         haar1(tmp, N>>k, 1<<k);
+
+         L1 = l1_metric(tmp, N, B, bias);
+
+         if (L1 < best_L1)
+         {
+            best_L1 = L1;
+            best_level = k+1;
+         }
+      }
+      /*printf ("%d ", isTransient ? LM-best_level : best_level);*/
+      /* metric is in Q1 to be able to select the mid-point (-0.5) for narrower bands */
+      if (isTransient)
+         metric[i] = 2*best_level;
+      else
+         metric[i] = -2*best_level;
+      *tf_sum += (isTransient ? LM : 0) - metric[i]/2;
+      /* For bands that can't be split to -1, set the metric to the half-way point to avoid
+         biasing the decision */
+      if (narrow && (metric[i]==0 || metric[i]==-2*LM))
+         metric[i]-=1;
+      /*printf("%d ", metric[i]);*/
+   }
+   /*printf("\n");*/
+   /* Search for the optimal tf resolution, including tf_select */
+   tf_select = 0;
+   for (sel=0;sel<2;sel++)
+   {
+      cost0 = 0;
+      cost1 = isTransient ? 0 : lambda;
+      for (i=1;i<len;i++)
+      {
+         int curr0, curr1;
+         curr0 = IMIN(cost0, cost1 + lambda);
+         curr1 = IMIN(cost0 + lambda, cost1);
+         cost0 = curr0 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*sel+0]);
+         cost1 = curr1 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*sel+1]);
+      }
+      cost0 = IMIN(cost0, cost1);
+      selcost[sel]=cost0;
+   }
+   /* For now, we're conservative and only allow tf_select=1 for transients.
+    * If tests confirm it's useful for non-transients, we could allow it. */
+   if (selcost[1]<selcost[0] && isTransient)
+      tf_select=1;
+   cost0 = 0;
+   cost1 = isTransient ? 0 : lambda;
+   /* Viterbi forward pass */
+   for (i=1;i<len;i++)
+   {
+      int curr0, curr1;
+      int from0, from1;
+
+      from0 = cost0;
+      from1 = cost1 + lambda;
+      if (from0 < from1)
+      {
+         curr0 = from0;
+         path0[i]= 0;
+      } else {
+         curr0 = from1;
+         path0[i]= 1;
+      }
+
+      from0 = cost0 + lambda;
+      from1 = cost1;
+      if (from0 < from1)
+      {
+         curr1 = from0;
+         path1[i]= 0;
+      } else {
+         curr1 = from1;
+         path1[i]= 1;
+      }
+      cost0 = curr0 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*tf_select+0]);
+      cost1 = curr1 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*tf_select+1]);
+   }
+   tf_res[len-1] = cost0 < cost1 ? 0 : 1;
+   /* Viterbi backward pass to check the decisions */
+   for (i=len-2;i>=0;i--)
+   {
+      if (tf_res[i+1] == 1)
+         tf_res[i] = path1[i+1];
+      else
+         tf_res[i] = path0[i+1];
+   }
+   /*printf("%d %f\n", *tf_sum, tf_estimate);*/
+   RESTORE_STACK;
+#ifdef FUZZING
+   tf_select = rand()&0x1;
+   tf_res[0] = rand()&0x1;
+   for (i=1;i<len;i++)
+      tf_res[i] = tf_res[i-1] ^ ((rand()&0xF) == 0);
+#endif
+   return tf_select;
+}
+
+static void tf_encode(int start, int end, int isTransient, int *tf_res, int LM, int tf_select, ec_enc *enc)
+{
+   int curr, i;
+   int tf_select_rsv;
+   int tf_changed;
+   int logp;
+   opus_uint32 budget;
+   opus_uint32 tell;
+   budget = enc->storage*8;
+   tell = ec_tell(enc);
+   logp = isTransient ? 2 : 4;
+   /* Reserve space to code the tf_select decision. */
+   tf_select_rsv = LM>0 && tell+logp+1 <= budget;
+   budget -= tf_select_rsv;
+   curr = tf_changed = 0;
+   for (i=start;i<end;i++)
+   {
+      if (tell+logp<=budget)
+      {
+         ec_enc_bit_logp(enc, tf_res[i] ^ curr, logp);
+         tell = ec_tell(enc);
+         curr = tf_res[i];
+         tf_changed |= curr;
+      }
+      else
+         tf_res[i] = curr;
+      logp = isTransient ? 4 : 5;
+   }
+   /* Only code tf_select if it would actually make a difference. */
+   if (tf_select_rsv &&
+         tf_select_table[LM][4*isTransient+0+tf_changed]!=
+         tf_select_table[LM][4*isTransient+2+tf_changed])
+      ec_enc_bit_logp(enc, tf_select, 1);
+   else
+      tf_select = 0;
+   for (i=start;i<end;i++)
+      tf_res[i] = tf_select_table[LM][4*isTransient+2*tf_select+tf_res[i]];
+   /*for(i=0;i<end;i++)printf("%d ", isTransient ? tf_res[i] : LM+tf_res[i]);printf("\n");*/
+}
+
+
+static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
+      const opus_val16 *bandLogE, int end, int LM, int C, int N0,
+      AnalysisInfo *analysis, opus_val16 *stereo_saving, opus_val16 tf_estimate,
+      int intensity, opus_val16 surround_trim)
+{
+   int i;
+   opus_val32 diff=0;
+   int c;
+   int trim_index = 5;
+   opus_val16 trim = QCONST16(5.f, 8);
+   opus_val16 logXC, logXC2;
+   if (C==2)
+   {
+      opus_val16 sum = 0; /* Q10 */
+      opus_val16 minXC; /* Q10 */
+      /* Compute inter-channel correlation for low frequencies */
+      for (i=0;i<8;i++)
+      {
+         int j;
+         opus_val32 partial = 0;
+         for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
+            partial = MAC16_16(partial, X[j], X[N0+j]);
+         sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
+      }
+      sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
+      sum = MIN16(QCONST16(1.f, 10), ABS16(sum));
+      minXC = sum;
+      for (i=8;i<intensity;i++)
+      {
+         int j;
+         opus_val32 partial = 0;
+         for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
+            partial = MAC16_16(partial, X[j], X[N0+j]);
+         minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18))));
+      }
+      minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC));
+      /*printf ("%f\n", sum);*/
+      if (sum > QCONST16(.995f,10))
+         trim_index-=4;
+      else if (sum > QCONST16(.92f,10))
+         trim_index-=3;
+      else if (sum > QCONST16(.85f,10))
+         trim_index-=2;
+      else if (sum > QCONST16(.8f,10))
+         trim_index-=1;
+      /* mid-side savings estimations based on the LF average*/
+      logXC = celt_log2(QCONST32(1.001f, 20)-MULT16_16(sum, sum));
+      /* mid-side savings estimations based on min correlation */
+      logXC2 = MAX16(HALF16(logXC), celt_log2(QCONST32(1.001f, 20)-MULT16_16(minXC, minXC)));
+#ifdef OPUS_FIXED_POINT
+      /* Compensate for Q20 vs Q14 input and convert output to Q8 */
+      logXC = PSHR32(logXC-QCONST16(6.f, DB_SHIFT),DB_SHIFT-8);
+      logXC2 = PSHR32(logXC2-QCONST16(6.f, DB_SHIFT),DB_SHIFT-8);
+#endif
+
+      trim += MAX16(-QCONST16(4.f, 8), MULT16_16_Q15(QCONST16(.75f,15),logXC));
+      *stereo_saving = MIN16(*stereo_saving + QCONST16(0.25f, 8), -HALF16(logXC2));
+   }
+
+   /* Estimate spectral tilt */
+   c=0; do {
+      for (i=0;i<end-1;i++)
+      {
+         diff += bandLogE[i+c*m->nbEBands]*(opus_int32)(2+2*i-end);
+      }
+   } while (++c<C);
+   diff /= C*(end-1);
+   /*printf("%f\n", diff);*/
+   if (diff > QCONST16(2.f, DB_SHIFT))
+      trim_index--;
+   if (diff > QCONST16(8.f, DB_SHIFT))
+      trim_index--;
+   if (diff < -QCONST16(4.f, DB_SHIFT))
+      trim_index++;
+   if (diff < -QCONST16(10.f, DB_SHIFT))
+      trim_index++;
+   trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8), SHR16(diff+QCONST16(1.f, DB_SHIFT),DB_SHIFT-8)/6 ));
+   trim -= SHR16(surround_trim, DB_SHIFT-8);
+   trim -= 2*SHR16(tf_estimate, 14-8);
+#ifndef DISABLE_FLOAT_API
+   if (analysis->valid)
+   {
+      trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8),
+            (opus_val16)(QCONST16(2.f, 8)*(analysis->tonality_slope+.05f))));
+   }
+#endif
+
+#ifdef OPUS_FIXED_POINT
+   trim_index = PSHR32(trim, 8);
+#else
+   trim_index = (int)floor(.5f+trim);
+#endif
+   if (trim_index<0)
+      trim_index = 0;
+   if (trim_index>10)
+      trim_index = 10;
+   /*printf("%d\n", trim_index);*/
+#ifdef FUZZING
+   trim_index = rand()%11;
+#endif
+   return trim_index;
+}
+
+static int stereo_analysis(const CELTMode *m, const celt_norm *X,
+      int LM, int N0)
+{
+   int i;
+   int thetas;
+   opus_val32 sumLR = EPSILON, sumMS = EPSILON;
+
+   /* Use the L1 norm to model the entropy of the L/R signal vs the M/S signal */
+   for (i=0;i<13;i++)
+   {
+      int j;
+      for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
+      {
+         opus_val32 L, R, M, S;
+         /* We cast to 32-bit first because of the -32768 case */
+         L = EXTEND32(X[j]);
+         R = EXTEND32(X[N0+j]);
+         M = ADD32(L, R);
+         S = SUB32(L, R);
+         sumLR = ADD32(sumLR, ADD32(ABS32(L), ABS32(R)));
+         sumMS = ADD32(sumMS, ADD32(ABS32(M), ABS32(S)));
+      }
+   }
+   sumMS = MULT16_32_Q15(QCONST16(0.707107f, 15), sumMS);
+   thetas = 13;
+   /* We don't need thetas for lower bands with LM<=1 */
+   if (LM<=1)
+      thetas -= 8;
+   return MULT16_32_Q15((m->eBands[13]<<(LM+1))+thetas, sumMS)
+         > MULT16_32_Q15(m->eBands[13]<<(LM+1), sumLR);
+}
+
+static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2,
+      int nbEBands, int start, int end, int C, int *offsets, int lsb_depth, const opus_int16 *logN,
+      int isTransient, int vbr, int constrained_vbr, const opus_int16 *eBands, int LM,
+      int effectiveBytes, opus_int32 *tot_boost_, int lfe, opus_val16 *surround_dynalloc)
+{
+   int i, c;
+   opus_int32 tot_boost=0;
+   opus_val16 maxDepth;
+   VARDECL(opus_val16, follower);
+   VARDECL(opus_val16, noise_floor);
+   SAVE_STACK;
+   ALLOC(follower, C*nbEBands, opus_val16);
+   ALLOC(noise_floor, C*nbEBands, opus_val16);
+   for (i=0;i<nbEBands;i++)
+      offsets[i] = 0;
+   /* Dynamic allocation code */
+   maxDepth=-QCONST16(31.9f, DB_SHIFT);
+   for (i=0;i<end;i++)
+   {
+      /* Noise floor must take into account eMeans, the depth, the width of the bands
+         and the preemphasis filter (approx. square of bark band ID) */
+      noise_floor[i] = MULT16_16(QCONST16(0.0625f, DB_SHIFT),logN[i])
+            +QCONST16(.5f,DB_SHIFT)+SHL16(9-lsb_depth,DB_SHIFT)-SHL16(eMeans[i],6)
+            +MULT16_16(QCONST16(.0062,DB_SHIFT),(i+5)*(i+5));
+   }
+   c=0;do
+   {
+      for (i=0;i<end;i++)
+         maxDepth = MAX16(maxDepth, bandLogE[c*nbEBands+i]-noise_floor[i]);
+   } while (++c<C);
+   /* Make sure that dynamic allocation can't make us bust the budget */
+   if (effectiveBytes > 50 && LM>=1 && !lfe)
+   {
+      int last=0;
+      c=0;do
+      {
+         follower[c*nbEBands] = bandLogE2[c*nbEBands];
+         for (i=1;i<end;i++)
+         {
+            /* The last band to be at least 3 dB higher than the previous one
+               is the last we'll consider. Otherwise, we run into problems on
+               bandlimited signals. */
+            if (bandLogE2[c*nbEBands+i] > bandLogE2[c*nbEBands+i-1]+QCONST16(.5f,DB_SHIFT))
+               last=i;
+            follower[c*nbEBands+i] = MIN16(follower[c*nbEBands+i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]);
+         }
+         for (i=last-1;i>=0;i--)
+            follower[c*nbEBands+i] = MIN16(follower[c*nbEBands+i], MIN16(follower[c*nbEBands+i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i]));
+         for (i=0;i<end;i++)
+            follower[c*nbEBands+i] = MAX16(follower[c*nbEBands+i], noise_floor[i]);
+      } while (++c<C);
+      if (C==2)
+      {
+         for (i=start;i<end;i++)
+         {
+            /* Consider 24 dB "cross-talk" */
+            follower[nbEBands+i] = MAX16(follower[nbEBands+i], follower[         i]-QCONST16(4.f,DB_SHIFT));
+            follower[         i] = MAX16(follower[         i], follower[nbEBands+i]-QCONST16(4.f,DB_SHIFT));
+            follower[i] = HALF16(MAX16(0, bandLogE[i]-follower[i]) + MAX16(0, bandLogE[nbEBands+i]-follower[nbEBands+i]));
+         }
+      } else {
+         for (i=start;i<end;i++)
+         {
+            follower[i] = MAX16(0, bandLogE[i]-follower[i]);
+         }
+      }
+      for (i=start;i<end;i++)
+         follower[i] = MAX16(follower[i], surround_dynalloc[i]);
+      /* For non-transient CBR/CVBR frames, halve the dynalloc contribution */
+      if ((!vbr || constrained_vbr)&&!isTransient)
+      {
+         for (i=start;i<end;i++)
+            follower[i] = HALF16(follower[i]);
+      }
+      for (i=start;i<end;i++)
+      {
+         int width;
+         int boost;
+         int boost_bits;
+
+         if (i<8)
+            follower[i] *= 2;
+         if (i>=12)
+            follower[i] = HALF16(follower[i]);
+         follower[i] = MIN16(follower[i], QCONST16(4, DB_SHIFT));
+
+         width = C*(eBands[i+1]-eBands[i])<<LM;
+         if (width<6)
+         {
+            boost = (int)SHR32(EXTEND32(follower[i]),DB_SHIFT);
+            boost_bits = boost*width<<BITRES;
+         } else if (width > 48) {
+            boost = (int)SHR32(EXTEND32(follower[i])*8,DB_SHIFT);
+            boost_bits = (boost*width<<BITRES)/8;
+         } else {
+            boost = (int)SHR32(EXTEND32(follower[i])*width/6,DB_SHIFT);
+            boost_bits = boost*6<<BITRES;
+         }
+         /* For CBR and non-transient CVBR frames, limit dynalloc to 1/4 of the bits */
+         if ((!vbr || (constrained_vbr&&!isTransient))
+               && (tot_boost+boost_bits)>>BITRES>>3 > effectiveBytes/4)
+         {
+            opus_int32 cap = ((effectiveBytes/4)<<BITRES<<3);
+            offsets[i] = cap-tot_boost;
+            tot_boost = cap;
+            break;
+         } else {
+            offsets[i] = boost;
+            tot_boost += boost_bits;
+         }
+      }
+   }
+   *tot_boost_ = tot_boost;
+   RESTORE_STACK;
+   return maxDepth;
+}
+
+
+static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, int CC, int N,
+      int prefilter_tapset, int *pitch, opus_val16 *gain, int *qgain, int enabled, int nbAvailableBytes)
+{
+   int c;
+   VARDECL(celt_sig, _pre);
+   celt_sig *pre[2];
+   const CELTMode *mode;
+   int pitch_index;
+   opus_val16 gain1;
+   opus_val16 pf_threshold;
+   int pf_on;
+   int qg;
+   SAVE_STACK;
+
+   mode = st->mode;
+   ALLOC(_pre, CC*(N+COMBFILTER_MAXPERIOD), celt_sig);
+
+   pre[0] = _pre;
+   pre[1] = _pre + (N+COMBFILTER_MAXPERIOD);
+
+
+   c=0; do {
+      OPUS_COPY(pre[c], prefilter_mem+c*COMBFILTER_MAXPERIOD, COMBFILTER_MAXPERIOD);
+      OPUS_COPY(pre[c]+COMBFILTER_MAXPERIOD, in+c*(N+st->overlap)+st->overlap, N);
+   } while (++c<CC);
+
+   if (enabled)
+   {
+      VARDECL(opus_val16, pitch_buf);
+      ALLOC(pitch_buf, (COMBFILTER_MAXPERIOD+N)>>1, opus_val16);
+
+      pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC, st->arch);
+      /* Don't search for the fir last 1.5 octave of the range because
+         there's too many false-positives due to short-term correlation */
+      pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N,
+            COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index,
+            st->arch);
+      pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
+
+      gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
+            N, &pitch_index, st->prefilter_period, st->prefilter_gain);
+      if (pitch_index > COMBFILTER_MAXPERIOD-2)
+         pitch_index = COMBFILTER_MAXPERIOD-2;
+      gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1);
+      /*printf("%d %d %f %f\n", pitch_change, pitch_index, gain1, st->analysis.tonality);*/
+      if (st->loss_rate>2)
+         gain1 = HALF32(gain1);
+      if (st->loss_rate>4)
+         gain1 = HALF32(gain1);
+      if (st->loss_rate>8)
+         gain1 = 0;
+   } else {
+      gain1 = 0;
+      pitch_index = COMBFILTER_MINPERIOD;
+   }
+
+   /* Gain threshold for enabling the prefilter/postfilter */
+   pf_threshold = QCONST16(.2f,15);
+
+   /* Adjusting the threshold based on rate and continuity */
+   if (abs(pitch_index-st->prefilter_period)*10>pitch_index)
+      pf_threshold += QCONST16(.2f,15);
+   if (nbAvailableBytes<25)
+      pf_threshold += QCONST16(.1f,15);
+   if (nbAvailableBytes<35)
+      pf_threshold += QCONST16(.1f,15);
+   if (st->prefilter_gain > QCONST16(.4f,15))
+      pf_threshold -= QCONST16(.1f,15);
+   if (st->prefilter_gain > QCONST16(.55f,15))
+      pf_threshold -= QCONST16(.1f,15);
+
+   /* Hard threshold at 0.2 */
+   pf_threshold = MAX16(pf_threshold, QCONST16(.2f,15));
+   if (gain1<pf_threshold)
+   {
+      gain1 = 0;
+      pf_on = 0;
+      qg = 0;
+   } else {
+      /*This block is not gated by a total bits check only because
+        of the nbAvailableBytes check above.*/
+      if (ABS16(gain1-st->prefilter_gain)<QCONST16(.1f,15))
+         gain1=st->prefilter_gain;
+
+#ifdef OPUS_FIXED_POINT
+      qg = ((gain1+1536)>>10)/3-1;
+#else
+      qg = (int)floor(.5f+gain1*32/3)-1;
+#endif
+      qg = IMAX(0, IMIN(7, qg));
+      gain1 = QCONST16(0.09375f,15)*(qg+1);
+      pf_on = 1;
+   }
+   /*printf("%d %f\n", pitch_index, gain1);*/
+
+   c=0; do {
+      int offset = mode->shortMdctSize-st->overlap;
+      st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD);
+      OPUS_COPY(in+c*(N+st->overlap), st->in_mem+c*(st->overlap), st->overlap);
+      if (offset)
+         comb_filter(in+c*(N+st->overlap)+st->overlap, pre[c]+COMBFILTER_MAXPERIOD,
+               st->prefilter_period, st->prefilter_period, offset, -st->prefilter_gain, -st->prefilter_gain,
+               st->prefilter_tapset, st->prefilter_tapset, NULL, 0);
+
+      comb_filter(in+c*(N+st->overlap)+st->overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset,
+            st->prefilter_period, pitch_index, N-offset, -st->prefilter_gain, -gain1,
+            st->prefilter_tapset, prefilter_tapset, mode->window, st->overlap);
+      OPUS_COPY(st->in_mem+c*(st->overlap), in+c*(N+st->overlap)+N, st->overlap);
+
+      if (N>COMBFILTER_MAXPERIOD)
+      {
+         OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD, pre[c]+N, COMBFILTER_MAXPERIOD);
+      } else {
+         OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD, prefilter_mem+c*COMBFILTER_MAXPERIOD+N, COMBFILTER_MAXPERIOD-N);
+         OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD+COMBFILTER_MAXPERIOD-N, pre[c]+COMBFILTER_MAXPERIOD, N);
+      }
+   } while (++c<CC);
+
+   RESTORE_STACK;
+   *gain = gain1;
+   *pitch = pitch_index;
+   *qgain = qg;
+   return pf_on;
+}
+
+static int compute_vbr(const CELTMode *mode, AnalysisInfo *analysis, opus_int32 base_target,
+      int LM, opus_int32 bitrate, int lastCodedBands, int C, int intensity,
+      int constrained_vbr, opus_val16 stereo_saving, int tot_boost,
+      opus_val16 tf_estimate, int pitch_change, opus_val16 maxDepth,
+      int variable_duration, int lfe, int has_surround_mask, opus_val16 surround_masking,
+      opus_val16 temporal_vbr)
+{
+   /* The target rate in 8th bits per frame */
+   opus_int32 target;
+   int coded_bins;
+   int coded_bands;
+   opus_val16 tf_calibration;
+   int nbEBands;
+   const opus_int16 *eBands;
+
+   nbEBands = mode->nbEBands;
+   eBands = mode->eBands;
+
+   coded_bands = lastCodedBands ? lastCodedBands : nbEBands;
+   coded_bins = eBands[coded_bands]<<LM;
+   if (C==2)
+      coded_bins += eBands[IMIN(intensity, coded_bands)]<<LM;
+
+   target = base_target;
+
+   /*printf("%f %f %f %f %d %d ", st->analysis.activity, st->analysis.tonality, tf_estimate, st->stereo_saving, tot_boost, coded_bands);*/
+#ifndef DISABLE_FLOAT_API
+   if (analysis->valid && analysis->activity<.4)
+      target -= (opus_int32)((coded_bins<<BITRES)*(.4f-analysis->activity));
+#endif
+   /* Stereo savings */
+   if (C==2)
+   {
+      int coded_stereo_bands;
+      int coded_stereo_dof;
+      opus_val16 max_frac;
+      coded_stereo_bands = IMIN(intensity, coded_bands);
+      coded_stereo_dof = (eBands[coded_stereo_bands]<<LM)-coded_stereo_bands;
+      /* Maximum fraction of the bits we can save if the signal is mono. */
+      max_frac = DIV32_16(MULT16_16(QCONST16(0.8f, 15), coded_stereo_dof), coded_bins);
+      stereo_saving = MIN16(stereo_saving, QCONST16(1.f, 8));
+      /*printf("%d %d %d ", coded_stereo_dof, coded_bins, tot_boost);*/
+      target -= (opus_int32)MIN32(MULT16_32_Q15(max_frac,target),
+                      SHR32(MULT16_16(stereo_saving-QCONST16(0.1f,8),(coded_stereo_dof<<BITRES)),8));
+   }
+   /* Boost the rate according to dynalloc (minus the dynalloc average for calibration). */
+   target += tot_boost-(16<<LM);
+   /* Apply transient boost, compensating for average boost. */
+   tf_calibration = variable_duration==OPUS_FRAMESIZE_VARIABLE ?
+                    QCONST16(0.02f,14) : QCONST16(0.04f,14);
+   target += (opus_int32)SHL32(MULT16_32_Q15(tf_estimate-tf_calibration, target),1);
+
+#ifndef DISABLE_FLOAT_API
+   /* Apply tonality boost */
+   if (analysis->valid && !lfe)
+   {
+      opus_int32 tonal_target;
+      float tonal;
+
+      /* Tonality boost (compensating for the average). */
+      tonal = MAX16(0.f,analysis->tonality-.15f)-0.09f;
+      tonal_target = target + (opus_int32)((coded_bins<<BITRES)*1.2f*tonal);
+      if (pitch_change)
+         tonal_target +=  (opus_int32)((coded_bins<<BITRES)*.8f);
+      /*printf("%f %f ", analysis->tonality, tonal);*/
+      target = tonal_target;
+   }
+#endif
+
+   if (has_surround_mask&&!lfe)
+   {
+      opus_int32 surround_target = target + (opus_int32)SHR32(MULT16_16(surround_masking,coded_bins<<BITRES), DB_SHIFT);
+      /*printf("%f %d %d %d %d %d %d ", surround_masking, coded_bins, st->end, st->intensity, surround_target, target, st->bitrate);*/
+      target = IMAX(target/4, surround_target);
+   }
+
+   {
+      opus_int32 floor_depth;
+      int bins;
+      bins = eBands[nbEBands-2]<<LM;
+      /*floor_depth = SHR32(MULT16_16((C*bins<<BITRES),celt_log2(SHL32(MAX16(1,sample_max),13))), DB_SHIFT);*/
+      floor_depth = (opus_int32)SHR32(MULT16_16((C*bins<<BITRES),maxDepth), DB_SHIFT);
+      floor_depth = IMAX(floor_depth, target>>2);
+      target = IMIN(target, floor_depth);
+      /*printf("%f %d\n", maxDepth, floor_depth);*/
+   }
+
+   if ((!has_surround_mask||lfe) && (constrained_vbr || bitrate<64000))
+   {
+      opus_val16 rate_factor;
+#ifdef OPUS_FIXED_POINT
+      rate_factor = MAX16(0,(bitrate-32000));
+#else
+      rate_factor = MAX16(0,(1.f/32768)*(bitrate-32000));
+#endif
+      if (constrained_vbr)
+         rate_factor = MIN16(rate_factor, QCONST16(0.67f, 15));
+      target = base_target + (opus_int32)MULT16_32_Q15(rate_factor, target-base_target);
+
+   }
+
+   if (!has_surround_mask && tf_estimate < QCONST16(.2f, 14))
+   {
+      opus_val16 amount;
+      opus_val16 tvbr_factor;
+      amount = MULT16_16_Q15(QCONST16(.0000031f, 30), IMAX(0, IMIN(32000, 96000-bitrate)));
+      tvbr_factor = SHR32(MULT16_16(temporal_vbr, amount), DB_SHIFT);
+      target += (opus_int32)MULT16_32_Q15(tvbr_factor, target);
+   }
+
+   /* Don't allow more than doubling the rate */
+   target = IMIN(2*base_target, target);
+
+   return target;
+}
+
+int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc)
+{
+   int i, c, N;
+   opus_int32 bits;
+   ec_enc _enc;
+   VARDECL(celt_sig, in);
+   VARDECL(celt_sig, freq);
+   VARDECL(celt_norm, X);
+   VARDECL(celt_ener, bandE);
+   VARDECL(opus_val16, bandLogE);
+   VARDECL(opus_val16, bandLogE2);
+   VARDECL(int, fine_quant);
+   VARDECL(opus_val16, error);
+   VARDECL(int, pulses);
+   VARDECL(int, cap);
+   VARDECL(int, offsets);
+   VARDECL(int, fine_priority);
+   VARDECL(int, tf_res);
+   VARDECL(unsigned char, collapse_masks);
+   celt_sig *prefilter_mem;
+   opus_val16 *oldBandE, *oldLogE, *oldLogE2;
+   int shortBlocks=0;
+   int isTransient=0;
+   const int CC = st->channels;
+   const int C = st->stream_channels;
+   int LM, M;
+   int tf_select;
+   int nbFilledBytes, nbAvailableBytes;
+   int effEnd;
+   int codedBands;
+   int tf_sum;
+   int alloc_trim;
+   int pitch_index=COMBFILTER_MINPERIOD;
+   opus_val16 gain1 = 0;
+   int dual_stereo=0;
+   int effectiveBytes;
+   int dynalloc_logp;
+   opus_int32 vbr_rate;
+   opus_int32 total_bits;
+   opus_int32 total_boost;
+   opus_int32 balance;
+   opus_int32 tell;
+   int prefilter_tapset=0;
+   int pf_on;
+   int anti_collapse_rsv;
+   int anti_collapse_on=0;
+   int silence=0;
+   int tf_chan = 0;
+   opus_val16 tf_estimate;
+   int pitch_change=0;
+   opus_int32 tot_boost;
+   opus_val32 sample_max;
+   opus_val16 maxDepth;
+   const OpusCustomMode *mode;
+   int nbEBands;
+   int overlap;
+   const opus_int16 *eBands;
+   int secondMdct;
+   int signalBandwidth;
+   int transient_got_disabled=0;
+   opus_val16 surround_masking=0;
+   opus_val16 temporal_vbr=0;
+   opus_val16 surround_trim = 0;
+   opus_int32 equiv_rate = 510000;
+   VARDECL(opus_val16, surround_dynalloc);
+   ALLOC_STACK;
+
+   mode = st->mode;
+   nbEBands = mode->nbEBands;
+   overlap = mode->overlap;
+   eBands = mode->eBands;
+   tf_estimate = 0;
+   if (nbCompressedBytes<2 || pcm==NULL)
+   {
+      RESTORE_STACK;
+      return OPUS_BAD_ARG;
+   }
+
+   frame_size *= st->upsample;
+   for (LM=0;LM<=mode->maxLM;LM++)
+      if (mode->shortMdctSize<<LM==frame_size)
+         break;
+   if (LM>mode->maxLM)
+   {
+      RESTORE_STACK;
+      return OPUS_BAD_ARG;
+   }
+   M=1<<LM;
+   N = M*mode->shortMdctSize;
+
+   prefilter_mem = st->in_mem+CC*(st->overlap);
+   oldBandE = (opus_val16*)(st->in_mem+CC*(st->overlap+COMBFILTER_MAXPERIOD));
+   oldLogE = oldBandE + CC*nbEBands;
+   oldLogE2 = oldLogE + CC*nbEBands;
+
+   if (enc==NULL)
+   {
+      tell=1;
+      nbFilledBytes=0;
+   } else {
+      tell=ec_tell(enc);
+      nbFilledBytes=(tell+4)>>3;
+   }
+
+#ifdef CUSTOM_MODES
+   if (st->signalling && enc==NULL)
+   {
+      int tmp = (mode->effEBands-st->end)>>1;
+      st->end = IMAX(1, mode->effEBands-tmp);
+      compressed[0] = tmp<<5;
+      compressed[0] |= LM<<3;
+      compressed[0] |= (C==2)<<2;
+      /* Convert "standard mode" to Opus header */
+      if (mode->Fs==48000 && mode->shortMdctSize==120)
+      {
+         int c0 = toOpus(compressed[0]);
+         if (c0<0)
+         {
+            RESTORE_STACK;
+            return OPUS_BAD_ARG;
+         }
+         compressed[0] = c0;
+      }
+      compressed++;
+      nbCompressedBytes--;
+   }
+#else
+   celt_assert(st->signalling==0);
+#endif
+
+   /* Can't produce more than 1275 output bytes */
+   nbCompressedBytes = IMIN(nbCompressedBytes,1275);
+   nbAvailableBytes = nbCompressedBytes - nbFilledBytes;
+
+   if (st->vbr && st->bitrate!=OPUS_BITRATE_MAX)
+   {
+      opus_int32 den=mode->Fs>>BITRES;
+      vbr_rate=(st->bitrate*frame_size+(den>>1))/den;
+#ifdef CUSTOM_MODES
+      if (st->signalling)
+         vbr_rate -= 8<<BITRES;
+#endif
+      effectiveBytes = vbr_rate>>(3+BITRES);
+   } else {
+      opus_int32 tmp;
+      vbr_rate = 0;
+      tmp = st->bitrate*frame_size;
+      if (tell>1)
+         tmp += tell;
+      if (st->bitrate!=OPUS_BITRATE_MAX)
+         nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes,
+               (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling));
+      effectiveBytes = nbCompressedBytes;
+   }
+   if (st->bitrate != OPUS_BITRATE_MAX)
+      equiv_rate = st->bitrate - (40*C+20)*((400>>LM) - 50);
+
+   if (enc==NULL)
+   {
+      ec_enc_init(&_enc, compressed, nbCompressedBytes);
+      enc = &_enc;
+   }
+
+   if (vbr_rate>0)
+   {
+      /* Computes the max bit-rate allowed in VBR mode to avoid violating the
+          target rate and buffering.
+         We must do this up front so that bust-prevention logic triggers
+          correctly if we don't have enough bits. */
+      if (st->constrained_vbr)
+      {
+         opus_int32 vbr_bound;
+         opus_int32 max_allowed;
+         /* We could use any multiple of vbr_rate as bound (depending on the
+             delay).
+            This is clamped to ensure we use at least two bytes if the encoder
+             was entirely empty, but to allow 0 in hybrid mode. */
+         vbr_bound = vbr_rate;
+         max_allowed = IMIN(IMAX(tell==1?2:0,
+               (vbr_rate+vbr_bound-st->vbr_reservoir)>>(BITRES+3)),
+               nbAvailableBytes);
+         if(max_allowed < nbAvailableBytes)
+         {
+            nbCompressedBytes = nbFilledBytes+max_allowed;
+            nbAvailableBytes = max_allowed;
+            ec_enc_shrink(enc, nbCompressedBytes);
+         }
+      }
+   }
+   total_bits = nbCompressedBytes*8;
+
+   effEnd = st->end;
+   if (effEnd > mode->effEBands)
+      effEnd = mode->effEBands;
+
+   ALLOC(in, CC*(N+st->overlap), celt_sig);
+
+   sample_max=MAX32(st->overlap_max, celt_maxabs16(pcm, C*(N-overlap)/st->upsample));
+   st->overlap_max=celt_maxabs16(pcm+C*(N-overlap)/st->upsample, C*overlap/st->upsample);
+   sample_max=MAX32(sample_max, st->overlap_max);
+#ifdef OPUS_FIXED_POINT
+   silence = (sample_max==0);
+#else
+   silence = (sample_max <= (opus_val16)1/(1<<st->lsb_depth));
+#endif
+#ifdef FUZZING
+   if ((rand()&0x3F)==0)
+      silence = 1;
+#endif
+   if (tell==1)
+      ec_enc_bit_logp(enc, silence, 15);
+   else
+      silence=0;
+   if (silence)
+   {
+      /*In VBR mode there is no need to send more than the minimum. */
+      if (vbr_rate>0)
+      {
+         effectiveBytes=nbCompressedBytes=IMIN(nbCompressedBytes, nbFilledBytes+2);
+         total_bits=nbCompressedBytes*8;
+         nbAvailableBytes=2;
+         ec_enc_shrink(enc, nbCompressedBytes);
+      }
+      /* Pretend we've filled all the remaining bits with zeros
+            (that's what the initialiser did anyway) */
+      tell = nbCompressedBytes*8;
+      enc->nbits_total+=tell-ec_tell(enc);
+   }
+   c=0; do {
+      celt_preemphasis(pcm+c, in+c*(N+st->overlap)+st->overlap, N, CC, st->upsample,
+                  mode->preemph, st->preemph_memE+c, st->clip);
+   } while (++c<CC);
+
+
+
+   /* Find pitch period and gain */
+   {
+      int enabled;
+      int qg;
+      enabled = ((st->lfe&&nbAvailableBytes>3) || nbAvailableBytes>12*C) && st->start==0 && !silence && !st->disable_pf
+            && st->complexity >= 5 && !(st->consec_transient && LM!=3 && st->variable_duration==OPUS_FRAMESIZE_VARIABLE);
+
+      prefilter_tapset = st->tapset_decision;
+      pf_on = run_prefilter(st, in, prefilter_mem, CC, N, prefilter_tapset, &pitch_index, &gain1, &qg, enabled, nbAvailableBytes);
+      if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && (!st->analysis.valid || st->analysis.tonality > .3)
+            && (pitch_index > 1.26*st->prefilter_period || pitch_index < .79*st->prefilter_period))
+         pitch_change = 1;
+      if (pf_on==0)
+      {
+         if(st->start==0 && tell+16<=total_bits)
+            ec_enc_bit_logp(enc, 0, 1);
+      } else {
+         /*This block is not gated by a total bits check only because
+           of the nbAvailableBytes check above.*/
+         int octave;
+         ec_enc_bit_logp(enc, 1, 1);
+         pitch_index += 1;
+         octave = EC_ILOG(pitch_index)-5;
+         ec_enc_uint(enc, octave, 6);
+         ec_enc_bits(enc, pitch_index-(16<<octave), 4+octave);
+         pitch_index -= 1;
+         ec_enc_bits(enc, qg, 3);
+         ec_enc_icdf(enc, prefilter_tapset, tapset_icdf, 2);
+      }
+   }
+
+   isTransient = 0;
+   shortBlocks = 0;
+   if (st->complexity >= 1 && !st->lfe)
+   {
+      isTransient = transient_analysis(in, N+st->overlap, CC,
+            &tf_estimate, &tf_chan);
+   }
+   if (LM>0 && ec_tell(enc)+3<=total_bits)
+   {
+      if (isTransient)
+         shortBlocks = M;
+   } else {
+      isTransient = 0;
+      transient_got_disabled=1;
+   }
+
+   ALLOC(freq, CC*N, celt_sig); /**< Interleaved signal MDCTs */
+   ALLOC(bandE,nbEBands*CC, celt_ener);
+   ALLOC(bandLogE,nbEBands*CC, opus_val16);
+
+   secondMdct = shortBlocks && st->complexity>=8;
+   ALLOC(bandLogE2, C*nbEBands, opus_val16);
+   if (secondMdct)
+   {
+      compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample);
+      compute_band_energies(mode, freq, bandE, effEnd, C, M);
+      amp2Log2(mode, effEnd, st->end, bandE, bandLogE2, C);
+      for (i=0;i<C*nbEBands;i++)
+         bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+   }
+
+   compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample);
+   if (CC==2&&C==1)
+      tf_chan = 0;
+   compute_band_energies(mode, freq, bandE, effEnd, C, M);
+
+   if (st->lfe)
+   {
+      for (i=2;i<st->end;i++)
+      {
+         bandE[i] = IMIN(bandE[i], MULT16_32_Q15(QCONST16(1e-4f,15),bandE[0]));
+         bandE[i] = MAX32(bandE[i], EPSILON);
+      }
+   }
+   amp2Log2(mode, effEnd, st->end, bandE, bandLogE, C);
+
+   ALLOC(surround_dynalloc, C*nbEBands, opus_val16);
+   for(i=0;i<st->end;i++)
+      surround_dynalloc[i] = 0;
+   /* This computes how much masking takes place between surround channels */
+   if (st->start==0&&st->energy_mask&&!st->lfe)
+   {
+      int mask_end;
+      int midband;
+      int count_dynalloc;
+      opus_val32 mask_avg=0;
+      opus_val32 diff=0;
+      int count=0;
+      mask_end = IMAX(2,st->lastCodedBands);
+      for (c=0;c<C;c++)
+      {
+         for(i=0;i<mask_end;i++)
+         {
+            opus_val16 mask;
+            mask = MAX16(MIN16(st->energy_mask[nbEBands*c+i],
+                   QCONST16(.25f, DB_SHIFT)), -QCONST16(2.0f, DB_SHIFT));
+            if (mask > 0)
+               mask = HALF16(mask);
+            mask_avg += MULT16_16(mask, eBands[i+1]-eBands[i]);
+            count += eBands[i+1]-eBands[i];
+            diff += MULT16_16(mask, 1+2*i-mask_end);
+         }
+      }
+      mask_avg = DIV32_16(mask_avg,count);
+      mask_avg += QCONST16(.2f, DB_SHIFT);
+      diff = diff*6/(C*(mask_end-1)*(mask_end+1)*mask_end);
+      /* Again, being conservative */
+      diff = HALF32(diff);
+      diff = MAX32(MIN32(diff, QCONST32(.031f, DB_SHIFT)), -QCONST32(.031f, DB_SHIFT));
+      /* Find the band that's in the middle of the coded spectrum */
+      for (midband=0;eBands[midband+1] < eBands[mask_end]/2;midband++);
+      count_dynalloc=0;
+      for(i=0;i<mask_end;i++)
+      {
+         opus_val32 lin;
+         opus_val16 unmask;
+         lin = mask_avg + diff*(i-midband);
+         if (C==2)
+            unmask = MAX16(st->energy_mask[i], st->energy_mask[nbEBands+i]);
+         else
+            unmask = st->energy_mask[i];
+         unmask = MIN16(unmask, QCONST16(.0f, DB_SHIFT));
+         unmask -= lin;
+         if (unmask > QCONST16(.25f, DB_SHIFT))
+         {
+            surround_dynalloc[i] = unmask - QCONST16(.25f, DB_SHIFT);
+            count_dynalloc++;
+         }
+      }
+      if (count_dynalloc>=3)
+      {
+         /* If we need dynalloc in many bands, it's probably because our
+            initial masking rate was too low. */
+         mask_avg += QCONST16(.25f, DB_SHIFT);
+         if (mask_avg>0)
+         {
+            /* Something went really wrong in the original calculations,
+               disabling masking. */
+            mask_avg = 0;
+            diff = 0;
+            for(i=0;i<mask_end;i++)
+               surround_dynalloc[i] = 0;
+         } else {
+            for(i=0;i<mask_end;i++)
+               surround_dynalloc[i] = MAX16(0, surround_dynalloc[i]-QCONST16(.25f, DB_SHIFT));
+         }
+      }
+      mask_avg += QCONST16(.2f, DB_SHIFT);
+      /* Convert to 1/64th units used for the trim */
+      surround_trim = 64*diff;
+      /*printf("%d %d ", mask_avg, surround_trim);*/
+      surround_masking = mask_avg;
+   }
+   /* Temporal VBR (but not for LFE) */
+   if (!st->lfe)
+   {
+      opus_val16 follow=-QCONST16(10.0f,DB_SHIFT);
+      opus_val32 frame_avg=0;
+      opus_val16 offset = shortBlocks?HALF16(SHL16(LM, DB_SHIFT)):0;
+      for(i=st->start;i<st->end;i++)
+      {
+         follow = MAX16(follow-QCONST16(1.f, DB_SHIFT), bandLogE[i]-offset);
+         if (C==2)
+            follow = MAX16(follow, bandLogE[i+nbEBands]-offset);
+         frame_avg += follow;
+      }
+      frame_avg /= (st->end-st->start);
+      temporal_vbr = SUB16(frame_avg,st->spec_avg);
+      temporal_vbr = MIN16(QCONST16(3.f, DB_SHIFT), MAX16(-QCONST16(1.5f, DB_SHIFT), temporal_vbr));
+      st->spec_avg += MULT16_16_Q15(QCONST16(.02f, 15), temporal_vbr);
+   }
+   /*for (i=0;i<21;i++)
+      printf("%f ", bandLogE[i]);
+   printf("\n");*/
+
+   if (!secondMdct)
+   {
+      for (i=0;i<C*nbEBands;i++)
+         bandLogE2[i] = bandLogE[i];
+   }
+
+   /* Last chance to catch any transient we might have missed in the
+      time-domain analysis */
+   if (LM>0 && ec_tell(enc)+3<=total_bits && !isTransient && st->complexity>=5 && !st->lfe)
+   {
+      if (patch_transient_decision(bandLogE, oldBandE, nbEBands, st->end, C))
+      {
+         isTransient = 1;
+         shortBlocks = M;
+         compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample);
+         compute_band_energies(mode, freq, bandE, effEnd, C, M);
+         amp2Log2(mode, effEnd, st->end, bandE, bandLogE, C);
+         /* Compensate for the scaling of short vs long mdcts */
+         for (i=0;i<C*nbEBands;i++)
+            bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+         tf_estimate = QCONST16(.2f,14);
+      }
+   }
+
+   if (LM>0 && ec_tell(enc)+3<=total_bits)
+      ec_enc_bit_logp(enc, isTransient, 3);
+
+   ALLOC(X, C*N, celt_norm);         /**< Interleaved normalised MDCTs */
+
+   /* Band normalisation */
+   normalise_bands(mode, freq, X, bandE, effEnd, C, M);
+
+   ALLOC(tf_res, nbEBands, int);
+   /* Disable variable tf resolution for hybrid and at very low bitrate */
+   if (effectiveBytes>=15*C && st->start==0 && st->complexity>=2 && !st->lfe)
+   {
+      int lambda;
+      if (effectiveBytes<40)
+         lambda = 12;
+      else if (effectiveBytes<60)
+         lambda = 6;
+      else if (effectiveBytes<100)
+         lambda = 4;
+      else
+         lambda = 3;
+      lambda*=2;
+      tf_select = tf_analysis(mode, effEnd, isTransient, tf_res, lambda, X, N, LM, &tf_sum, tf_estimate, tf_chan);
+      for (i=effEnd;i<st->end;i++)
+         tf_res[i] = tf_res[effEnd-1];
+   } else {
+      tf_sum = 0;
+      for (i=0;i<st->end;i++)
+         tf_res[i] = isTransient;
+      tf_select=0;
+   }
+
+   ALLOC(error, C*nbEBands, opus_val16);
+   quant_coarse_energy(mode, st->start, st->end, effEnd, bandLogE,
+         oldBandE, total_bits, error, enc,
+         C, LM, nbAvailableBytes, st->force_intra,
+         &st->delayedIntra, st->complexity >= 4, st->loss_rate, st->lfe);
+
+   tf_encode(st->start, st->end, isTransient, tf_res, LM, tf_select, enc);
+
+   if (ec_tell(enc)+4<=total_bits)
+   {
+      if (st->lfe)
+      {
+         st->tapset_decision = 0;
+         st->spread_decision = SPREAD_NORMAL;
+      } else if (shortBlocks || st->complexity < 3 || nbAvailableBytes < 10*C || st->start != 0)
+      {
+         if (st->complexity == 0)
+            st->spread_decision = SPREAD_NONE;
+         else
+            st->spread_decision = SPREAD_NORMAL;
+      } else {
+         /* Disable new spreading+tapset estimator until we can show it works
+            better than the old one. So far it seems like spreading_decision()
+            works best. */
+#if 0
+         if (st->analysis.valid)
+         {
+            static const opus_val16 spread_thresholds[3] = {-QCONST16(.6f, 15), -QCONST16(.2f, 15), -QCONST16(.07f, 15)};
+            static const opus_val16 spread_histeresis[3] = {QCONST16(.15f, 15), QCONST16(.07f, 15), QCONST16(.02f, 15)};
+            static const opus_val16 tapset_thresholds[2] = {QCONST16(.0f, 15), QCONST16(.15f, 15)};
+            static const opus_val16 tapset_histeresis[2] = {QCONST16(.1f, 15), QCONST16(.05f, 15)};
+            st->spread_decision = hysteresis_decision(-st->analysis.tonality, spread_thresholds, spread_histeresis, 3, st->spread_decision);
+            st->tapset_decision = hysteresis_decision(st->analysis.tonality_slope, tapset_thresholds, tapset_histeresis, 2, st->tapset_decision);
+         } else
+#endif
+         {
+            st->spread_decision = spreading_decision(mode, X,
+                  &st->tonal_average, st->spread_decision, &st->hf_average,
+                  &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M);
+         }
+         /*printf("%d %d\n", st->tapset_decision, st->spread_decision);*/
+         /*printf("%f %d %f %d\n\n", st->analysis.tonality, st->spread_decision, st->analysis.tonality_slope, st->tapset_decision);*/
+      }
+      ec_enc_icdf(enc, st->spread_decision, spread_icdf, 5);
+   }
+
+   ALLOC(offsets, nbEBands, int);
+
+   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, st->start, st->end, C, offsets,
+         st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr,
+         eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc);
+   /* For LFE, everything interesting is in the first band */
+   if (st->lfe)
+      offsets[0] = IMIN(8, effectiveBytes/3);
+   ALLOC(cap, nbEBands, int);
+   init_caps(mode,cap,LM,C);
+
+   dynalloc_logp = 6;
+   total_bits<<=BITRES;
+   total_boost = 0;
+   tell = ec_tell_frac(enc);
+   for (i=st->start;i<st->end;i++)
+   {
+      int width, quanta;
+      int dynalloc_loop_logp;
+      int boost;
+      int j;
+      width = C*(eBands[i+1]-eBands[i])<<LM;
+      /* quanta is 6 bits, but no more than 1 bit/sample
+         and no less than 1/8 bit/sample */
+      quanta = IMIN(width<<BITRES, IMAX(6<<BITRES, width));
+      dynalloc_loop_logp = dynalloc_logp;
+      boost = 0;
+      for (j = 0; tell+(dynalloc_loop_logp<<BITRES) < total_bits-total_boost
+            && boost < cap[i]; j++)
+      {
+         int flag;
+         flag = j<offsets[i];
+         ec_enc_bit_logp(enc, flag, dynalloc_loop_logp);
+         tell = ec_tell_frac(enc);
+         if (!flag)
+            break;
+         boost += quanta;
+         total_boost += quanta;
+         dynalloc_loop_logp = 1;
+      }
+      /* Making dynalloc more likely */
+      if (j)
+         dynalloc_logp = IMAX(2, dynalloc_logp-1);
+      offsets[i] = boost;
+   }
+
+   if (C==2)
+   {
+      static const opus_val16 intensity_thresholds[21]=
+      /* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19  20  off*/
+        {  1, 2, 3, 4, 5, 6, 7, 8,16,24,36,44,50,56,62,67,72,79,88,106,134};
+      static const opus_val16 intensity_histeresis[21]=
+        {  1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, 5, 6,  8, 8};
+
+      /* Always use MS for 2.5 ms frames until we can do a better analysis */
+      if (LM!=0)
+         dual_stereo = stereo_analysis(mode, X, LM, N);
+
+      st->intensity = hysteresis_decision((opus_val16)(equiv_rate/1000),
+            intensity_thresholds, intensity_histeresis, 21, st->intensity);
+      st->intensity = IMIN(st->end,IMAX(st->start, st->intensity));
+   }
+
+   alloc_trim = 5;
+   if (tell+(6<<BITRES) <= total_bits - total_boost)
+   {
+      if (st->lfe)
+         alloc_trim = 5;
+      else
+         alloc_trim = alloc_trim_analysis(mode, X, bandLogE,
+            st->end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate, st->intensity, surround_trim);
+      ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
+      tell = ec_tell_frac(enc);
+   }
+
+   /* Variable bitrate */
+   if (vbr_rate>0)
+   {
+     opus_val16 alpha;
+     opus_int32 delta;
+     /* The target rate in 8th bits per frame */
+     opus_int32 target, base_target;
+     opus_int32 min_allowed;
+     int lm_diff = mode->maxLM - LM;
+
+     /* Don't attempt to use more than 510 kb/s, even for frames smaller than 20 ms.
+        The CELT allocator will just not be able to use more than that anyway. */
+     nbCompressedBytes = IMIN(nbCompressedBytes,1275>>(3-LM));
+     base_target = vbr_rate - ((40*C+20)<<BITRES);
+
+     if (st->constrained_vbr)
+        base_target += (st->vbr_offset>>lm_diff);
+
+     target = compute_vbr(mode, &st->analysis, base_target, LM, equiv_rate,
+           st->lastCodedBands, C, st->intensity, st->constrained_vbr,
+           st->stereo_saving, tot_boost, tf_estimate, pitch_change, maxDepth,
+           st->variable_duration, st->lfe, st->energy_mask!=NULL, surround_masking,
+           temporal_vbr);
+
+     /* The current offset is removed from the target and the space used
+        so far is added*/
+     target=target+tell;
+     /* In VBR mode the frame size must not be reduced so much that it would
+         result in the encoder running out of bits.
+        The margin of 2 bytes ensures that none of the bust-prevention logic
+         in the decoder will have triggered so far. */
+     min_allowed = ((tell+total_boost+(1<<(BITRES+3))-1)>>(BITRES+3)) + 2 - nbFilledBytes;
+
+     nbAvailableBytes = (target+(1<<(BITRES+2)))>>(BITRES+3);
+     nbAvailableBytes = IMAX(min_allowed,nbAvailableBytes);
+     nbAvailableBytes = IMIN(nbCompressedBytes,nbAvailableBytes+nbFilledBytes) - nbFilledBytes;
+
+     /* By how much did we "miss" the target on that frame */
+     delta = target - vbr_rate;
+
+     target=nbAvailableBytes<<(BITRES+3);
+
+     /*If the frame is silent we don't adjust our drift, otherwise
+       the encoder will shoot to very high rates after hitting a
+       span of silence, but we do allow the bitres to refill.
+       This means that we'll undershoot our target in CVBR/VBR modes
+       on files with lots of silence. */
+     if(silence)
+     {
+       nbAvailableBytes = 2;
+       target = 2*8<<BITRES;
+       delta = 0;
+     }
+
+     if (st->vbr_count < 970)
+     {
+        st->vbr_count++;
+        alpha = celt_rcp(SHL32(EXTEND32(st->vbr_count+20),16));
+     } else
+        alpha = QCONST16(.001f,15);
+     /* How many bits have we used in excess of what we're allowed */
+     if (st->constrained_vbr)
+        st->vbr_reservoir += target - vbr_rate;
+     /*printf ("%d\n", st->vbr_reservoir);*/
+
+     /* Compute the offset we need to apply in order to reach the target */
+     if (st->constrained_vbr)
+     {
+        st->vbr_drift += (opus_int32)MULT16_32_Q15(alpha,(delta*(1<<lm_diff))-st->vbr_offset-st->vbr_drift);
+        st->vbr_offset = -st->vbr_drift;
+     }
+     /*printf ("%d\n", st->vbr_drift);*/
+
+     if (st->constrained_vbr && st->vbr_reservoir < 0)
+     {
+        /* We're under the min value -- increase rate */
+        int adjust = (-st->vbr_reservoir)/(8<<BITRES);
+        /* Unless we're just coding silence */
+        nbAvailableBytes += silence?0:adjust;
+        st->vbr_reservoir = 0;
+        /*printf ("+%d\n", adjust);*/
+     }
+     nbCompressedBytes = IMIN(nbCompressedBytes,nbAvailableBytes+nbFilledBytes);
+     /*printf("%d\n", nbCompressedBytes*50*8);*/
+     /* This moves the raw bits to take into account the new compressed size */
+     ec_enc_shrink(enc, nbCompressedBytes);
+   }
+
+   /* Bit allocation */
+   ALLOC(fine_quant, nbEBands, int);
+   ALLOC(pulses, nbEBands, int);
+   ALLOC(fine_priority, nbEBands, int);
+
+   /* bits =           packet size                    - where we are - safety*/
+   bits = (((opus_int32)nbCompressedBytes*8)<<BITRES) - ec_tell_frac(enc) - 1;
+   anti_collapse_rsv = isTransient&&LM>=2&&bits>=((LM+2)<<BITRES) ? (1<<BITRES) : 0;
+   bits -= anti_collapse_rsv;
+   signalBandwidth = st->end-1;
+#ifndef DISABLE_FLOAT_API
+   if (st->analysis.valid)
+   {
+      int min_bandwidth;
+      if (equiv_rate < (opus_int32)32000*C)
+         min_bandwidth = 13;
+      else if (equiv_rate < (opus_int32)48000*C)
+         min_bandwidth = 16;
+      else if (equiv_rate < (opus_int32)60000*C)
+         min_bandwidth = 18;
+      else  if (equiv_rate < (opus_int32)80000*C)
+         min_bandwidth = 19;
+      else
+         min_bandwidth = 20;
+      signalBandwidth = IMAX(st->analysis.bandwidth, min_bandwidth);
+   }
+#endif
+   if (st->lfe)
+      signalBandwidth = 1;
+   codedBands = compute_allocation(mode, st->start, st->end, offsets, cap,
+         alloc_trim, &st->intensity, &dual_stereo, bits, &balance, pulses,
+         fine_quant, fine_priority, C, LM, enc, 1, st->lastCodedBands, signalBandwidth);
+   if (st->lastCodedBands)
+      st->lastCodedBands = IMIN(st->lastCodedBands+1,IMAX(st->lastCodedBands-1,codedBands));
+   else
+      st->lastCodedBands = codedBands;
+
+   quant_fine_energy(mode, st->start, st->end, oldBandE, error, fine_quant, enc, C);
+
+   /* Residual quantisation */
+   ALLOC(collapse_masks, C*nbEBands, unsigned char);
+   quant_all_bands(1, mode, st->start, st->end, X, C==2 ? X+N : NULL, collapse_masks,
+         bandE, pulses, shortBlocks, st->spread_decision, dual_stereo, st->intensity, tf_res,
+         nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv, balance, enc, LM, codedBands, &st->rng);
+
+   if (anti_collapse_rsv > 0)
+   {
+      anti_collapse_on = st->consec_transient<2;
+#ifdef FUZZING
+      anti_collapse_on = rand()&0x1;
+#endif
+      ec_enc_bits(enc, anti_collapse_on, 1);
+   }
+   quant_energy_finalise(mode, st->start, st->end, oldBandE, error, fine_quant, fine_priority, nbCompressedBytes*8-ec_tell(enc), enc, C);
+
+   if (silence)
+   {
+      for (i=0;i<C*nbEBands;i++)
+         oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
+   }
+
+#ifdef RESYNTH
+   /* Re-synthesis of the coded audio if required */
+   {
+      celt_sig *out_mem[2];
+
+      if (anti_collapse_on)
+      {
+         anti_collapse(mode, X, collapse_masks, LM, C, N,
+               st->start, st->end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
+      }
+
+      if (silence)
+      {
+         for (i=0;i<C*N;i++)
+            freq[i] = 0;
+      } else {
+         /* Synthesis */
+         denormalise_bands(mode, X, freq, oldBandE, st->start, effEnd, C, M);
+      }
+
+      c=0; do {
+         OPUS_MOVE(st->syn_mem[c], st->syn_mem[c]+N, 2*MAX_PERIOD-N+overlap/2);
+      } while (++c<CC);
+
+      if (CC==2&&C==1)
+      {
+         for (i=0;i<N;i++)
+            freq[N+i] = freq[i];
+      }
+
+      c=0; do {
+         out_mem[c] = st->syn_mem[c]+2*MAX_PERIOD-N;
+      } while (++c<CC);
+
+      compute_inv_mdcts(mode, shortBlocks, freq, out_mem, CC, LM);
+
+      c=0; do {
+         st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD);
+         st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD);
+         comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize,
+               st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset,
+               mode->window, st->overlap);
+         if (LM!=0)
+            comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize,
+                  st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset,
+                  mode->window, overlap);
+      } while (++c<CC);
+
+      /* We reuse freq[] as scratch space for the de-emphasis */
+      deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD, freq);
+      st->prefilter_period_old = st->prefilter_period;
+      st->prefilter_gain_old = st->prefilter_gain;
+      st->prefilter_tapset_old = st->prefilter_tapset;
+   }
+#endif
+
+   st->prefilter_period = pitch_index;
+   st->prefilter_gain = gain1;
+   st->prefilter_tapset = prefilter_tapset;
+#ifdef RESYNTH
+   if (LM!=0)
+   {
+      st->prefilter_period_old = st->prefilter_period;
+      st->prefilter_gain_old = st->prefilter_gain;
+      st->prefilter_tapset_old = st->prefilter_tapset;
+   }
+#endif
+
+   if (CC==2&&C==1) {
+      for (i=0;i<nbEBands;i++)
+         oldBandE[nbEBands+i]=oldBandE[i];
+   }
+
+   if (!isTransient)
+   {
+      for (i=0;i<CC*nbEBands;i++)
+         oldLogE2[i] = oldLogE[i];
+      for (i=0;i<CC*nbEBands;i++)
+         oldLogE[i] = oldBandE[i];
+   } else {
+      for (i=0;i<CC*nbEBands;i++)
+         oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
+   }
+   /* In case start or end were to change */
+   c=0; do
+   {
+      for (i=0;i<st->start;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
+      }
+      for (i=st->end;i<nbEBands;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
+      }
+   } while (++c<CC);
+
+   if (isTransient || transient_got_disabled)
+      st->consec_transient++;
+   else
+      st->consec_transient=0;
+   st->rng = enc->rng;
+
+   /* If there's any room left (can only happen for very high rates),
+      it's already filled with zeros */
+   ec_enc_done(enc);
+
+#ifdef CUSTOM_MODES
+   if (st->signalling)
+      nbCompressedBytes++;
+#endif
+
+   RESTORE_STACK;
+   if (ec_get_error(enc))
+      return OPUS_INTERNAL_ERROR;
+   else
+      return nbCompressedBytes;
+}
+
+
+#ifdef CUSTOM_MODES
+
+#ifdef OPUS_FIXED_POINT
+int opus_custom_encode(CELTEncoder * OPUS_RESTRICT st, const opus_int16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   return celt_encode_with_ec(st, pcm, frame_size, compressed, nbCompressedBytes, NULL);
+}
+
+#ifndef DISABLE_FLOAT_API
+int opus_custom_encode_float(CELTEncoder * OPUS_RESTRICT st, const float * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   int j, ret, C, N;
+   VARDECL(opus_int16, in);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+   ALLOC(in, C*N, opus_int16);
+
+   for (j=0;j<C*N;j++)
+     in[j] = FLOAT2INT16(pcm[j]);
+
+   ret=celt_encode_with_ec(st,in,frame_size,compressed,nbCompressedBytes, NULL);
+#ifdef RESYNTH
+   for (j=0;j<C*N;j++)
+      ((float*)pcm)[j]=in[j]*(1.f/32768.f);
+#endif
+   RESTORE_STACK;
+   return ret;
+}
+#endif /* DISABLE_FLOAT_API */
+#else
+
+int opus_custom_encode(CELTEncoder * OPUS_RESTRICT st, const opus_int16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   int j, ret, C, N;
+   VARDECL(celt_sig, in);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C=st->channels;
+   N=frame_size;
+   ALLOC(in, C*N, celt_sig);
+   for (j=0;j<C*N;j++) {
+     in[j] = SCALEOUT(pcm[j]);
+   }
+
+   ret = celt_encode_with_ec(st,in,frame_size,compressed,nbCompressedBytes, NULL);
+#ifdef RESYNTH
+   for (j=0;j<C*N;j++)
+      ((opus_int16*)pcm)[j] = FLOAT2INT16(in[j]);
+#endif
+   RESTORE_STACK;
+   return ret;
+}
+
+int opus_custom_encode_float(CELTEncoder * OPUS_RESTRICT st, const float * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   return celt_encode_with_ec(st, pcm, frame_size, compressed, nbCompressedBytes, NULL);
+}
+
+#endif
+
+#endif /* CUSTOM_MODES */
+
+int opus_custom_encoder_ctl(CELTEncoder * OPUS_RESTRICT st, int request, ...)
+{
+   va_list ap;
+
+   va_start(ap, request);
+   switch (request)
+   {
+      case OPUS_SET_COMPLEXITY_REQUEST:
+      {
+         int value = va_arg(ap, opus_int32);
+         if (value<0 || value>10)
+            goto bad_arg;
+         st->complexity = value;
+      }
+      break;
+      case CELT_SET_START_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<0 || value>=st->mode->nbEBands)
+            goto bad_arg;
+         st->start = value;
+      }
+      break;
+      case CELT_SET_END_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>st->mode->nbEBands)
+            goto bad_arg;
+         st->end = value;
+      }
+      break;
+      case CELT_SET_PREDICTION_REQUEST:
+      {
+         int value = va_arg(ap, opus_int32);
+         if (value<0 || value>2)
+            goto bad_arg;
+         st->disable_pf = value<=1;
+         st->force_intra = value==0;
+      }
+      break;
+      case OPUS_SET_PACKET_LOSS_PERC_REQUEST:
+      {
+         int value = va_arg(ap, opus_int32);
+         if (value<0 || value>100)
+            goto bad_arg;
+         st->loss_rate = value;
+      }
+      break;
+      case OPUS_SET_VBR_CONSTRAINT_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->constrained_vbr = value;
+      }
+      break;
+      case OPUS_SET_VBR_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->vbr = value;
+      }
+      break;
+      case OPUS_SET_BITRATE_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<=500 && value!=OPUS_BITRATE_MAX)
+            goto bad_arg;
+         value = IMIN(value, 260000*st->channels);
+         st->bitrate = value;
+      }
+      break;
+      case CELT_SET_CHANNELS_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>2)
+            goto bad_arg;
+         st->stream_channels = value;
+      }
+      break;
+      case OPUS_SET_LSB_DEPTH_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          if (value<8 || value>24)
+             goto bad_arg;
+          st->lsb_depth=value;
+      }
+      break;
+      case OPUS_GET_LSB_DEPTH_REQUEST:
+      {
+          opus_int32 *value = va_arg(ap, opus_int32*);
+          *value=st->lsb_depth;
+      }
+      break;
+      case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          st->variable_duration = value;
+      }
+      break;
+      case OPUS_RESET_STATE:
+      {
+         int i;
+         opus_val16 *oldBandE, *oldLogE, *oldLogE2;
+         oldBandE = (opus_val16*)(st->in_mem+st->channels*(st->overlap+COMBFILTER_MAXPERIOD));
+         oldLogE = oldBandE + st->channels*st->mode->nbEBands;
+         oldLogE2 = oldLogE + st->channels*st->mode->nbEBands;
+         OPUS_CLEAR((char*)&st->ENCODER_RESET_START,
+               opus_custom_encoder_get_size(st->mode, st->channels)-
+               ((char*)&st->ENCODER_RESET_START - (char*)st));
+         for (i=0;i<st->channels*st->mode->nbEBands;i++)
+            oldLogE[i]=oldLogE2[i]=-QCONST16(28.f,DB_SHIFT);
+         st->vbr_offset = 0;
+         st->delayedIntra = 1;
+         st->spread_decision = SPREAD_NORMAL;
+         st->tonal_average = 256;
+         st->hf_average = 0;
+         st->tapset_decision = 0;
+      }
+      break;
+#ifdef CUSTOM_MODES
+      case CELT_SET_INPUT_CLIPPING_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->clip = value;
+      }
+      break;
+#endif
+      case CELT_SET_SIGNALLING_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->signalling = value;
+      }
+      break;
+      case CELT_SET_ANALYSIS_REQUEST:
+      {
+         AnalysisInfo *info = va_arg(ap, AnalysisInfo *);
+         if (info)
+            OPUS_COPY(&st->analysis, info, 1);
+      }
+      break;
+      case CELT_GET_MODE_REQUEST:
+      {
+         const CELTMode ** value = va_arg(ap, const CELTMode**);
+         if (value==0)
+            goto bad_arg;
+         *value=st->mode;
+      }
+      break;
+      case OPUS_GET_FINAL_RANGE_REQUEST:
+      {
+         opus_uint32 * value = va_arg(ap, opus_uint32 *);
+         if (value==0)
+            goto bad_arg;
+         *value=st->rng;
+      }
+      break;
+      case OPUS_SET_LFE_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          st->lfe = value;
+      }
+      break;
+      case OPUS_SET_ENERGY_MASK_REQUEST:
+      {
+          opus_val16 *value = va_arg(ap, opus_val16*);
+          st->energy_mask = value;
+      }
+      break;
+      default:
+         goto bad_request;
+   }
+   va_end(ap);
+   return OPUS_OK;
+bad_arg:
+   va_end(ap);
+   return OPUS_BAD_ARG;
+bad_request:
+   va_end(ap);
+   return OPUS_UNIMPLEMENTED;
+}

+ 309 - 0
drivers/opus/celt/celt_lpc.c

@@ -0,0 +1,309 @@
+/* Copyright (c) 2009-2010 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+void _celt_lpc(
+      opus_val16       *_lpc, /* out: [0...p-1] LPC coefficients      */
+const opus_val32 *ac,  /* in:  [0...p] autocorrelation values  */
+int          p
+)
+{
+   int i, j;
+   opus_val32 r;
+   opus_val32 error = ac[0];
+#ifdef OPUS_FIXED_POINT
+   opus_val32 lpc[LPC_ORDER];
+#else
+   float *lpc = _lpc;
+#endif
+
+   for (i = 0; i < p; i++)
+      lpc[i] = 0;
+   if (ac[0] != 0)
+   {
+      for (i = 0; i < p; i++) {
+         /* Sum up this iteration's reflection coefficient */
+         opus_val32 rr = 0;
+         for (j = 0; j < i; j++)
+            rr += MULT32_32_Q31(lpc[j],ac[i - j]);
+         rr += SHR32(ac[i + 1],3);
+         r = -frac_div32(SHL32(rr,3), error);
+         /*  Update LPC coefficients and total error */
+         lpc[i] = SHR32(r,3);
+         for (j = 0; j < (i+1)>>1; j++)
+         {
+            opus_val32 tmp1, tmp2;
+            tmp1 = lpc[j];
+            tmp2 = lpc[i-1-j];
+            lpc[j]     = tmp1 + MULT32_32_Q31(r,tmp2);
+            lpc[i-1-j] = tmp2 + MULT32_32_Q31(r,tmp1);
+         }
+
+         error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error);
+         /* Bail out once we get 30 dB gain */
+#ifdef OPUS_FIXED_POINT
+         if (error<SHR32(ac[0],10))
+            break;
+#else
+         if (error<.001f*ac[0])
+            break;
+#endif
+      }
+   }
+#ifdef OPUS_FIXED_POINT
+   for (i=0;i<p;i++)
+      _lpc[i] = ROUND16(lpc[i],16);
+#endif
+}
+
+void celt_fir(const opus_val16 *_x,
+         const opus_val16 *num,
+         opus_val16 *_y,
+         int N,
+         int ord,
+         opus_val16 *mem)
+{
+   int i,j;
+   VARDECL(opus_val16, rnum);
+   VARDECL(opus_val16, x);
+   SAVE_STACK;
+
+   ALLOC(rnum, ord, opus_val16);
+   ALLOC(x, N+ord, opus_val16);
+   for(i=0;i<ord;i++)
+      rnum[i] = num[ord-i-1];
+   for(i=0;i<ord;i++)
+      x[i] = mem[ord-i-1];
+   for (i=0;i<N;i++)
+      x[i+ord]=_x[i];
+   for(i=0;i<ord;i++)
+      mem[i] = _x[N-i-1];
+#ifdef SMALL_FOOTPRINT
+   for (i=0;i<N;i++)
+   {
+      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+      for (j=0;j<ord;j++)
+      {
+         sum = MAC16_16(sum,rnum[j],x[i+j]);
+      }
+      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+   }
+#else
+   for (i=0;i<N-3;i+=4)
+   {
+      opus_val32 sum[4]={0,0,0,0};
+      xcorr_kernel(rnum, x+i, sum, ord);
+      _y[i  ] = SATURATE16(ADD32(EXTEND32(_x[i  ]), PSHR32(sum[0], SIG_SHIFT)));
+      _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
+      _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
+      _y[i+3] = SATURATE16(ADD32(EXTEND32(_x[i+3]), PSHR32(sum[3], SIG_SHIFT)));
+   }
+   for (;i<N;i++)
+   {
+      opus_val32 sum = 0;
+      for (j=0;j<ord;j++)
+         sum = MAC16_16(sum,rnum[j],x[i+j]);
+      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+   }
+#endif
+   RESTORE_STACK;
+}
+
+void celt_iir(const opus_val32 *_x,
+         const opus_val16 *den,
+         opus_val32 *_y,
+         int N,
+         int ord,
+         opus_val16 *mem)
+{
+#ifdef SMALL_FOOTPRINT
+   int i,j;
+   for (i=0;i<N;i++)
+   {
+      opus_val32 sum = _x[i];
+      for (j=0;j<ord;j++)
+      {
+         sum -= MULT16_16(den[j],mem[j]);
+      }
+      for (j=ord-1;j>=1;j--)
+      {
+         mem[j]=mem[j-1];
+      }
+      mem[0] = ROUND16(sum,SIG_SHIFT);
+      _y[i] = sum;
+   }
+#else
+   int i,j;
+   VARDECL(opus_val16, rden);
+   VARDECL(opus_val16, y);
+   SAVE_STACK;
+
+   celt_assert((ord&3)==0);
+   ALLOC(rden, ord, opus_val16);
+   ALLOC(y, N+ord, opus_val16);
+   for(i=0;i<ord;i++)
+      rden[i] = den[ord-i-1];
+   for(i=0;i<ord;i++)
+      y[i] = -mem[ord-i-1];
+   for(;i<N+ord;i++)
+      y[i]=0;
+   for (i=0;i<N-3;i+=4)
+   {
+      /* Unroll by 4 as if it were an FIR filter */
+      opus_val32 sum[4];
+      sum[0]=_x[i];
+      sum[1]=_x[i+1];
+      sum[2]=_x[i+2];
+      sum[3]=_x[i+3];
+      xcorr_kernel(rden, y+i, sum, ord);
+
+      /* Patch up the result to compensate for the fact that this is an IIR */
+      y[i+ord  ] = -ROUND16(sum[0],SIG_SHIFT);
+      _y[i  ] = sum[0];
+      sum[1] = MAC16_16(sum[1], y[i+ord  ], den[0]);
+      y[i+ord+1] = -ROUND16(sum[1],SIG_SHIFT);
+      _y[i+1] = sum[1];
+      sum[2] = MAC16_16(sum[2], y[i+ord+1], den[0]);
+      sum[2] = MAC16_16(sum[2], y[i+ord  ], den[1]);
+      y[i+ord+2] = -ROUND16(sum[2],SIG_SHIFT);
+      _y[i+2] = sum[2];
+
+      sum[3] = MAC16_16(sum[3], y[i+ord+2], den[0]);
+      sum[3] = MAC16_16(sum[3], y[i+ord+1], den[1]);
+      sum[3] = MAC16_16(sum[3], y[i+ord  ], den[2]);
+      y[i+ord+3] = -ROUND16(sum[3],SIG_SHIFT);
+      _y[i+3] = sum[3];
+   }
+   for (;i<N;i++)
+   {
+      opus_val32 sum = _x[i];
+      for (j=0;j<ord;j++)
+         sum -= MULT16_16(rden[j],y[i+j]);
+      y[i+ord] = ROUND16(sum,SIG_SHIFT);
+      _y[i] = sum;
+   }
+   for(i=0;i<ord;i++)
+      mem[i] = _y[N-i-1];
+   RESTORE_STACK;
+#endif
+}
+
+int _celt_autocorr(
+                   const opus_val16 *x,   /*  in: [0...n-1] samples x   */
+                   opus_val32       *ac,  /* out: [0...lag-1] ac values */
+                   const opus_val16       *window,
+                   int          overlap,
+                   int          lag,
+                   int          n,
+                   int          arch
+                  )
+{
+   opus_val32 d;
+   int i, k;
+   int fastN=n-lag;
+   int shift;
+   const opus_val16 *xptr;
+   VARDECL(opus_val16, xx);
+   SAVE_STACK;
+   ALLOC(xx, n, opus_val16);
+   celt_assert(n>0);
+   celt_assert(overlap>=0);
+   if (overlap == 0)
+   {
+      xptr = x;
+   } else {
+      for (i=0;i<n;i++)
+         xx[i] = x[i];
+      for (i=0;i<overlap;i++)
+      {
+         xx[i] = MULT16_16_Q15(x[i],window[i]);
+         xx[n-i-1] = MULT16_16_Q15(x[n-i-1],window[i]);
+      }
+      xptr = xx;
+   }
+   shift=0;
+#ifdef OPUS_FIXED_POINT
+   {
+      opus_val32 ac0;
+      ac0 = 1+(n<<7);
+      if (n&1) ac0 += SHR32(MULT16_16(xptr[0],xptr[0]),9);
+      for(i=(n&1);i<n;i+=2)
+      {
+         ac0 += SHR32(MULT16_16(xptr[i],xptr[i]),9);
+         ac0 += SHR32(MULT16_16(xptr[i+1],xptr[i+1]),9);
+      }
+
+      shift = celt_ilog2(ac0)-30+10;
+      shift = (shift)/2;
+      if (shift>0)
+      {
+         for(i=0;i<n;i++)
+            xx[i] = PSHR32(xptr[i], shift);
+         xptr = xx;
+      } else
+         shift = 0;
+   }
+#endif
+   celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1, arch);
+   for (k=0;k<=lag;k++)
+   {
+      for (i = k+fastN, d = 0; i < n; i++)
+         d = MAC16_16(d, xptr[i], xptr[i-k]);
+      ac[k] += d;
+   }
+#ifdef OPUS_FIXED_POINT
+   shift = 2*shift;
+   if (shift<=0)
+      ac[0] += SHL32((opus_int32)1, -shift);
+   if (ac[0] < 268435456)
+   {
+      int shift2 = 29 - EC_ILOG(ac[0]);
+      for (i=0;i<=lag;i++)
+         ac[i] = SHL32(ac[i], shift2);
+      shift -= shift2;
+   } else if (ac[0] >= 536870912)
+   {
+      int shift2=1;
+      if (ac[0] >= 1073741824)
+         shift2++;
+      for (i=0;i<=lag;i++)
+         ac[i] = SHR32(ac[i], shift2);
+      shift += shift2;
+   }
+#endif
+
+   RESTORE_STACK;
+   return shift;
+}

+ 54 - 0
drivers/opus/celt/celt_lpc.h

@@ -0,0 +1,54 @@
+/* Copyright (c) 2009-2010 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PLC_H
+#define PLC_H
+
+#include "arch.h"
+
+#define LPC_ORDER 24
+
+void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
+
+void celt_fir(const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         opus_val16 *mem);
+
+void celt_iir(const opus_val32 *x,
+         const opus_val16 *den,
+         opus_val32 *y,
+         int N,
+         int ord,
+         opus_val16 *mem);
+
+int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
+         const opus_val16 *window, int overlap, int lag, int n, int arch);
+
+#endif /* PLC_H */

+ 54 - 0
drivers/opus/celt/cpu_support.h

@@ -0,0 +1,54 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CPU_SUPPORT_H
+#define CPU_SUPPORT_H
+
+#include "opus_types.h"
+#include "opus_defines.h"
+
+#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
+#include "arm/armcpu.h"
+
+/* We currently support 4 ARM variants:
+ * arch[0] -> ARMv4
+ * arch[1] -> ARMv5E
+ * arch[2] -> ARMv6
+ * arch[3] -> NEON
+ */
+#define OPUS_ARCHMASK 3
+
+#else
+#define OPUS_ARCHMASK 0
+
+static OPUS_INLINE int opus_select_arch(void)
+{
+  return 0;
+}
+#endif
+
+#endif

+ 697 - 0
drivers/opus/celt/cwrs.c

@@ -0,0 +1,697 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2007-2009 Timothy B. Terriberry
+   Written by Timothy B. Terriberry and Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "os_support.h"
+#include "cwrs.h"
+#include "mathops.h"
+#include "arch.h"
+
+#ifdef CUSTOM_MODES
+
+/*Guaranteed to return a conservatively large estimate of the binary logarithm
+   with frac bits of fractional precision.
+  Tested for all possible 32-bit inputs with frac=4, where the maximum
+   overestimation is 0.06254243 bits.*/
+int log2_frac(opus_uint32 val, int frac)
+{
+  int l;
+  l=EC_ILOG(val);
+  if(val&(val-1)){
+    /*This is (val>>l-16), but guaranteed to round up, even if adding a bias
+       before the shift would cause overflow (e.g., for 0xFFFFxxxx).
+       Doesn't work for val=0, but that case fails the test above.*/
+    if(l>16)val=((val-1)>>(l-16))+1;
+    else val<<=16-l;
+    l=(l-1)<<frac;
+    /*Note that we always need one iteration, since the rounding up above means
+       that we might need to adjust the integer part of the logarithm.*/
+    do{
+      int b;
+      b=(int)(val>>16);
+      l+=b<<frac;
+      val=(val+b)>>b;
+      val=(val*val+0x7FFF)>>15;
+    }
+    while(frac-->0);
+    /*If val is not exactly 0x8000, then we have to round up the remainder.*/
+    return l+(val>0x8000);
+  }
+  /*Exact powers of two require no rounding.*/
+  else return (l-1)<<frac;
+}
+#endif
+
+/*Although derived separately, the pulse vector coding scheme is equivalent to
+   a Pyramid Vector Quantizer \cite{Fis86}.
+  Some additional notes about an early version appear at
+   http://people.xiph.org/~tterribe/notes/cwrs.html, but the codebook ordering
+   and the definitions of some terms have evolved since that was written.
+
+  The conversion from a pulse vector to an integer index (encoding) and back
+   (decoding) is governed by two related functions, V(N,K) and U(N,K).
+
+  V(N,K) = the number of combinations, with replacement, of N items, taken K
+   at a time, when a sign bit is added to each item taken at least once (i.e.,
+   the number of N-dimensional unit pulse vectors with K pulses).
+  One way to compute this is via
+    V(N,K) = K>0 ? sum(k=1...K,2**k*choose(N,k)*choose(K-1,k-1)) : 1,
+   where choose() is the binomial function.
+  A table of values for N<10 and K<10 looks like:
+  V[10][10] = {
+    {1,  0,   0,    0,    0,     0,     0,      0,      0,       0},
+    {1,  2,   2,    2,    2,     2,     2,      2,      2,       2},
+    {1,  4,   8,   12,   16,    20,    24,     28,     32,      36},
+    {1,  6,  18,   38,   66,   102,   146,    198,    258,     326},
+    {1,  8,  32,   88,  192,   360,   608,    952,   1408,    1992},
+    {1, 10,  50,  170,  450,  1002,  1970,   3530,   5890,    9290},
+    {1, 12,  72,  292,  912,  2364,  5336,  10836,  20256,   35436},
+    {1, 14,  98,  462, 1666,  4942, 12642,  28814,  59906,  115598},
+    {1, 16, 128,  688, 2816,  9424, 27008,  68464, 157184,  332688},
+    {1, 18, 162,  978, 4482, 16722, 53154, 148626, 374274,  864146}
+  };
+
+  U(N,K) = the number of such combinations wherein N-1 objects are taken at
+   most K-1 at a time.
+  This is given by
+    U(N,K) = sum(k=0...K-1,V(N-1,k))
+           = K>0 ? (V(N-1,K-1) + V(N,K-1))/2 : 0.
+  The latter expression also makes clear that U(N,K) is half the number of such
+   combinations wherein the first object is taken at least once.
+  Although it may not be clear from either of these definitions, U(N,K) is the
+   natural function to work with when enumerating the pulse vector codebooks,
+   not V(N,K).
+  U(N,K) is not well-defined for N=0, but with the extension
+    U(0,K) = K>0 ? 0 : 1,
+   the function becomes symmetric: U(N,K) = U(K,N), with a similar table:
+  U[10][10] = {
+    {1, 0,  0,   0,    0,    0,     0,     0,      0,      0},
+    {0, 1,  1,   1,    1,    1,     1,     1,      1,      1},
+    {0, 1,  3,   5,    7,    9,    11,    13,     15,     17},
+    {0, 1,  5,  13,   25,   41,    61,    85,    113,    145},
+    {0, 1,  7,  25,   63,  129,   231,   377,    575,    833},
+    {0, 1,  9,  41,  129,  321,   681,  1289,   2241,   3649},
+    {0, 1, 11,  61,  231,  681,  1683,  3653,   7183,  13073},
+    {0, 1, 13,  85,  377, 1289,  3653,  8989,  19825,  40081},
+    {0, 1, 15, 113,  575, 2241,  7183, 19825,  48639, 108545},
+    {0, 1, 17, 145,  833, 3649, 13073, 40081, 108545, 265729}
+  };
+
+  With this extension, V(N,K) may be written in terms of U(N,K):
+    V(N,K) = U(N,K) + U(N,K+1)
+   for all N>=0, K>=0.
+  Thus U(N,K+1) represents the number of combinations where the first element
+   is positive or zero, and U(N,K) represents the number of combinations where
+   it is negative.
+  With a large enough table of U(N,K) values, we could write O(N) encoding
+   and O(min(N*log(K),N+K)) decoding routines, but such a table would be
+   prohibitively large for small embedded devices (K may be as large as 32767
+   for small N, and N may be as large as 200).
+
+  Both functions obey the same recurrence relation:
+    V(N,K) = V(N-1,K) + V(N,K-1) + V(N-1,K-1),
+    U(N,K) = U(N-1,K) + U(N,K-1) + U(N-1,K-1),
+   for all N>0, K>0, with different initial conditions at N=0 or K=0.
+  This allows us to construct a row of one of the tables above given the
+   previous row or the next row.
+  Thus we can derive O(NK) encoding and decoding routines with O(K) memory
+   using only addition and subtraction.
+
+  When encoding, we build up from the U(2,K) row and work our way forwards.
+  When decoding, we need to start at the U(N,K) row and work our way backwards,
+   which requires a means of computing U(N,K).
+  U(N,K) may be computed from two previous values with the same N:
+    U(N,K) = ((2*N-1)*U(N,K-1) - U(N,K-2))/(K-1) + U(N,K-2)
+   for all N>1, and since U(N,K) is symmetric, a similar relation holds for two
+   previous values with the same K:
+    U(N,K>1) = ((2*K-1)*U(N-1,K) - U(N-2,K))/(N-1) + U(N-2,K)
+   for all K>1.
+  This allows us to construct an arbitrary row of the U(N,K) table by starting
+   with the first two values, which are constants.
+  This saves roughly 2/3 the work in our O(NK) decoding routine, but costs O(K)
+   multiplications.
+  Similar relations can be derived for V(N,K), but are not used here.
+
+  For N>0 and K>0, U(N,K) and V(N,K) take on the form of an (N-1)-degree
+   polynomial for fixed N.
+  The first few are
+    U(1,K) = 1,
+    U(2,K) = 2*K-1,
+    U(3,K) = (2*K-2)*K+1,
+    U(4,K) = (((4*K-6)*K+8)*K-3)/3,
+    U(5,K) = ((((2*K-4)*K+10)*K-8)*K+3)/3,
+   and
+    V(1,K) = 2,
+    V(2,K) = 4*K,
+    V(3,K) = 4*K*K+2,
+    V(4,K) = 8*(K*K+2)*K/3,
+    V(5,K) = ((4*K*K+20)*K*K+6)/3,
+   for all K>0.
+  This allows us to derive O(N) encoding and O(N*log(K)) decoding routines for
+   small N (and indeed decoding is also O(N) for N<3).
+
+  @ARTICLE{Fis86,
+    author="Thomas R. Fischer",
+    title="A Pyramid Vector Quantizer",
+    journal="IEEE Transactions on Information Theory",
+    volume="IT-32",
+    number=4,
+    pages="568--583",
+    month=Jul,
+    year=1986
+  }*/
+
+#if !defined(SMALL_FOOTPRINT)
+
+/*U(N,K) = U(K,N) := N>0?K>0?U(N-1,K)+U(N,K-1)+U(N-1,K-1):0:K>0?1:0*/
+# define CELT_PVQ_U(_n,_k) (CELT_PVQ_U_ROW[IMIN(_n,_k)][IMAX(_n,_k)])
+/*V(N,K) := U(N,K)+U(N,K+1) = the number of PVQ codewords for a band of size N
+   with K pulses allocated to it.*/
+# define CELT_PVQ_V(_n,_k) (CELT_PVQ_U(_n,_k)+CELT_PVQ_U(_n,(_k)+1))
+
+/*For each V(N,K) supported, we will access element U(min(N,K+1),max(N,K+1)).
+  Thus, the number of entries in row I is the larger of the maximum number of
+   pulses we will ever allocate for a given N=I (K=128, or however many fit in
+   32 bits, whichever is smaller), plus one, and the maximum N for which
+   K=I-1 pulses fit in 32 bits.
+  The largest band size in an Opus Custom mode is 208.
+  Otherwise, we can limit things to the set of N which can be achieved by
+   splitting a band from a standard Opus mode: 176, 144, 96, 88, 72, 64, 48,
+   44, 36, 32, 24, 22, 18, 16, 8, 4, 2).*/
+#if defined(CUSTOM_MODES)
+static const opus_uint32 CELT_PVQ_U_DATA[1488]={
+#else
+static const opus_uint32 CELT_PVQ_U_DATA[1272]={
+#endif
+  /*N=0, K=0...176:*/
+  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0,
+#endif
+  /*N=1, K=1...176:*/
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1,
+#endif
+  /*N=2, K=2...176:*/
+  3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
+  43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79,
+  81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113,
+  115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139, 141, 143,
+  145, 147, 149, 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173,
+  175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203,
+  205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233,
+  235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, 257, 259, 261, 263,
+  265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293,
+  295, 297, 299, 301, 303, 305, 307, 309, 311, 313, 315, 317, 319, 321, 323,
+  325, 327, 329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 377, 379, 381,
+  383, 385, 387, 389, 391, 393, 395, 397, 399, 401, 403, 405, 407, 409, 411,
+  413, 415,
+#endif
+  /*N=3, K=3...176:*/
+  13, 25, 41, 61, 85, 113, 145, 181, 221, 265, 313, 365, 421, 481, 545, 613,
+  685, 761, 841, 925, 1013, 1105, 1201, 1301, 1405, 1513, 1625, 1741, 1861,
+  1985, 2113, 2245, 2381, 2521, 2665, 2813, 2965, 3121, 3281, 3445, 3613, 3785,
+  3961, 4141, 4325, 4513, 4705, 4901, 5101, 5305, 5513, 5725, 5941, 6161, 6385,
+  6613, 6845, 7081, 7321, 7565, 7813, 8065, 8321, 8581, 8845, 9113, 9385, 9661,
+  9941, 10225, 10513, 10805, 11101, 11401, 11705, 12013, 12325, 12641, 12961,
+  13285, 13613, 13945, 14281, 14621, 14965, 15313, 15665, 16021, 16381, 16745,
+  17113, 17485, 17861, 18241, 18625, 19013, 19405, 19801, 20201, 20605, 21013,
+  21425, 21841, 22261, 22685, 23113, 23545, 23981, 24421, 24865, 25313, 25765,
+  26221, 26681, 27145, 27613, 28085, 28561, 29041, 29525, 30013, 30505, 31001,
+  31501, 32005, 32513, 33025, 33541, 34061, 34585, 35113, 35645, 36181, 36721,
+  37265, 37813, 38365, 38921, 39481, 40045, 40613, 41185, 41761, 42341, 42925,
+  43513, 44105, 44701, 45301, 45905, 46513, 47125, 47741, 48361, 48985, 49613,
+  50245, 50881, 51521, 52165, 52813, 53465, 54121, 54781, 55445, 56113, 56785,
+  57461, 58141, 58825, 59513, 60205, 60901, 61601,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  62305, 63013, 63725, 64441, 65161, 65885, 66613, 67345, 68081, 68821, 69565,
+  70313, 71065, 71821, 72581, 73345, 74113, 74885, 75661, 76441, 77225, 78013,
+  78805, 79601, 80401, 81205, 82013, 82825, 83641, 84461, 85285, 86113,
+#endif
+  /*N=4, K=4...176:*/
+  63, 129, 231, 377, 575, 833, 1159, 1561, 2047, 2625, 3303, 4089, 4991, 6017,
+  7175, 8473, 9919, 11521, 13287, 15225, 17343, 19649, 22151, 24857, 27775,
+  30913, 34279, 37881, 41727, 45825, 50183, 54809, 59711, 64897, 70375, 76153,
+  82239, 88641, 95367, 102425, 109823, 117569, 125671, 134137, 142975, 152193,
+  161799, 171801, 182207, 193025, 204263, 215929, 228031, 240577, 253575,
+  267033, 280959, 295361, 310247, 325625, 341503, 357889, 374791, 392217,
+  410175, 428673, 447719, 467321, 487487, 508225, 529543, 551449, 573951,
+  597057, 620775, 645113, 670079, 695681, 721927, 748825, 776383, 804609,
+  833511, 863097, 893375, 924353, 956039, 988441, 1021567, 1055425, 1090023,
+  1125369, 1161471, 1198337, 1235975, 1274393, 1313599, 1353601, 1394407,
+  1436025, 1478463, 1521729, 1565831, 1610777, 1656575, 1703233, 1750759,
+  1799161, 1848447, 1898625, 1949703, 2001689, 2054591, 2108417, 2163175,
+  2218873, 2275519, 2333121, 2391687, 2451225, 2511743, 2573249, 2635751,
+  2699257, 2763775, 2829313, 2895879, 2963481, 3032127, 3101825, 3172583,
+  3244409, 3317311, 3391297, 3466375, 3542553, 3619839, 3698241, 3777767,
+  3858425, 3940223, 4023169, 4107271, 4192537, 4278975, 4366593, 4455399,
+  4545401, 4636607, 4729025, 4822663, 4917529, 5013631, 5110977, 5209575,
+  5309433, 5410559, 5512961, 5616647, 5721625, 5827903, 5935489, 6044391,
+  6154617, 6266175, 6379073, 6493319, 6608921, 6725887, 6844225, 6963943,
+  7085049, 7207551,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  7331457, 7456775, 7583513, 7711679, 7841281, 7972327, 8104825, 8238783,
+  8374209, 8511111, 8649497, 8789375, 8930753, 9073639, 9218041, 9363967,
+  9511425, 9660423, 9810969, 9963071, 10116737, 10271975, 10428793, 10587199,
+  10747201, 10908807, 11072025, 11236863, 11403329, 11571431, 11741177,
+  11912575,
+#endif
+  /*N=5, K=5...176:*/
+  321, 681, 1289, 2241, 3649, 5641, 8361, 11969, 16641, 22569, 29961, 39041,
+  50049, 63241, 78889, 97281, 118721, 143529, 172041, 204609, 241601, 283401,
+  330409, 383041, 441729, 506921, 579081, 658689, 746241, 842249, 947241,
+  1061761, 1186369, 1321641, 1468169, 1626561, 1797441, 1981449, 2179241,
+  2391489, 2618881, 2862121, 3121929, 3399041, 3694209, 4008201, 4341801,
+  4695809, 5071041, 5468329, 5888521, 6332481, 6801089, 7295241, 7815849,
+  8363841, 8940161, 9545769, 10181641, 10848769, 11548161, 12280841, 13047849,
+  13850241, 14689089, 15565481, 16480521, 17435329, 18431041, 19468809,
+  20549801, 21675201, 22846209, 24064041, 25329929, 26645121, 28010881,
+  29428489, 30899241, 32424449, 34005441, 35643561, 37340169, 39096641,
+  40914369, 42794761, 44739241, 46749249, 48826241, 50971689, 53187081,
+  55473921, 57833729, 60268041, 62778409, 65366401, 68033601, 70781609,
+  73612041, 76526529, 79526721, 82614281, 85790889, 89058241, 92418049,
+  95872041, 99421961, 103069569, 106816641, 110664969, 114616361, 118672641,
+  122835649, 127107241, 131489289, 135983681, 140592321, 145317129, 150160041,
+  155123009, 160208001, 165417001, 170752009, 176215041, 181808129, 187533321,
+  193392681, 199388289, 205522241, 211796649, 218213641, 224775361, 231483969,
+  238341641, 245350569, 252512961, 259831041, 267307049, 274943241, 282741889,
+  290705281, 298835721, 307135529, 315607041, 324252609, 333074601, 342075401,
+  351257409, 360623041, 370174729, 379914921, 389846081, 399970689, 410291241,
+  420810249, 431530241, 442453761, 453583369, 464921641, 476471169, 488234561,
+  500214441, 512413449, 524834241, 537479489, 550351881, 563454121, 576788929,
+  590359041, 604167209, 618216201, 632508801,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  647047809, 661836041, 676876329, 692171521, 707724481, 723538089, 739615241,
+  755958849, 772571841, 789457161, 806617769, 824056641, 841776769, 859781161,
+  878072841, 896654849, 915530241, 934702089, 954173481, 973947521, 994027329,
+  1014416041, 1035116809, 1056132801, 1077467201, 1099123209, 1121104041,
+  1143412929, 1166053121, 1189027881, 1212340489, 1235994241,
+#endif
+  /*N=6, K=6...96:*/
+  1683, 3653, 7183, 13073, 22363, 36365, 56695, 85305, 124515, 177045, 246047,
+  335137, 448427, 590557, 766727, 982729, 1244979, 1560549, 1937199, 2383409,
+  2908411, 3522221, 4235671, 5060441, 6009091, 7095093, 8332863, 9737793,
+  11326283, 13115773, 15124775, 17372905, 19880915, 22670725, 25765455,
+  29189457, 32968347, 37129037, 41699767, 46710137, 52191139, 58175189,
+  64696159, 71789409, 79491819, 87841821, 96879431, 106646281, 117185651,
+  128542501, 140763503, 153897073, 167993403, 183104493, 199284183, 216588185,
+  235074115, 254801525, 275831935, 298228865, 322057867, 347386557, 374284647,
+  402823977, 433078547, 465124549, 499040399, 534906769, 572806619, 612825229,
+  655050231, 699571641, 746481891, 795875861, 847850911, 902506913, 959946283,
+  1020274013, 1083597703, 1150027593, 1219676595, 1292660325, 1369097135,
+  1449108145, 1532817275, 1620351277, 1711839767, 1807415257, 1907213187,
+  2011371957, 2120032959,
+#if defined(CUSTOM_MODES)
+  /*...109:*/
+  2233340609U, 2351442379U, 2474488829U, 2602633639U, 2736033641U, 2874848851U,
+  3019242501U, 3169381071U, 3325434321U, 3487575323U, 3655980493U, 3830829623U,
+  4012305913U,
+#endif
+  /*N=7, K=7...54*/
+  8989, 19825, 40081, 75517, 134245, 227305, 369305, 579125, 880685, 1303777,
+  1884961, 2668525, 3707509, 5064793, 6814249, 9041957, 11847485, 15345233,
+  19665841, 24957661, 31388293, 39146185, 48442297, 59511829, 72616013,
+  88043969, 106114625, 127178701, 151620757, 179861305, 212358985, 249612805,
+  292164445, 340600625, 395555537, 457713341, 527810725, 606639529, 695049433,
+  793950709, 904317037, 1027188385, 1163673953, 1314955181, 1482288821,
+  1667010073, 1870535785, 2094367717,
+#if defined(CUSTOM_MODES)
+  /*...60:*/
+  2340095869U, 2609401873U, 2904062449U, 3225952925U, 3577050821U, 3959439497U,
+#endif
+  /*N=8, K=8...37*/
+  48639, 108545, 224143, 433905, 795455, 1392065, 2340495, 3800305, 5984767,
+  9173505, 13726991, 20103025, 28875327, 40754369, 56610575, 77500017,
+  104692735, 139703809, 184327311, 240673265, 311207743, 398796225, 506750351,
+  638878193, 799538175, 993696769, 1226990095, 1505789553, 1837271615,
+  2229491905U,
+#if defined(CUSTOM_MODES)
+  /*...40:*/
+  2691463695U, 3233240945U, 3866006015U,
+#endif
+  /*N=9, K=9...28:*/
+  265729, 598417, 1256465, 2485825, 4673345, 8405905, 14546705, 24331777,
+  39490049, 62390545, 96220561, 145198913, 214828609, 312193553, 446304145,
+  628496897, 872893441, 1196924561, 1621925137, 2173806145U,
+#if defined(CUSTOM_MODES)
+  /*...29:*/
+  2883810113U,
+#endif
+  /*N=10, K=10...24:*/
+  1462563, 3317445, 7059735, 14218905, 27298155, 50250765, 89129247, 152951073,
+  254831667, 413442773, 654862247, 1014889769, 1541911931, 2300409629U,
+  3375210671U,
+  /*N=11, K=11...19:*/
+  8097453, 18474633, 39753273, 81270333, 158819253, 298199265, 540279585,
+  948062325, 1616336765,
+#if defined(CUSTOM_MODES)
+  /*...20:*/
+  2684641785U,
+#endif
+  /*N=12, K=12...18:*/
+  45046719, 103274625, 224298231, 464387817, 921406335, 1759885185,
+  3248227095U,
+  /*N=13, K=13...16:*/
+  251595969, 579168825, 1267854873, 2653649025U,
+  /*N=14, K=14:*/
+  1409933619
+};
+
+#if defined(CUSTOM_MODES)
+static const opus_uint32 *const CELT_PVQ_U_ROW[15]={
+  CELT_PVQ_U_DATA+   0,CELT_PVQ_U_DATA+ 208,CELT_PVQ_U_DATA+ 415,
+  CELT_PVQ_U_DATA+ 621,CELT_PVQ_U_DATA+ 826,CELT_PVQ_U_DATA+1030,
+  CELT_PVQ_U_DATA+1233,CELT_PVQ_U_DATA+1336,CELT_PVQ_U_DATA+1389,
+  CELT_PVQ_U_DATA+1421,CELT_PVQ_U_DATA+1441,CELT_PVQ_U_DATA+1455,
+  CELT_PVQ_U_DATA+1464,CELT_PVQ_U_DATA+1470,CELT_PVQ_U_DATA+1473
+};
+#else
+static const opus_uint32 *const CELT_PVQ_U_ROW[15]={
+  CELT_PVQ_U_DATA+   0,CELT_PVQ_U_DATA+ 176,CELT_PVQ_U_DATA+ 351,
+  CELT_PVQ_U_DATA+ 525,CELT_PVQ_U_DATA+ 698,CELT_PVQ_U_DATA+ 870,
+  CELT_PVQ_U_DATA+1041,CELT_PVQ_U_DATA+1131,CELT_PVQ_U_DATA+1178,
+  CELT_PVQ_U_DATA+1207,CELT_PVQ_U_DATA+1226,CELT_PVQ_U_DATA+1240,
+  CELT_PVQ_U_DATA+1248,CELT_PVQ_U_DATA+1254,CELT_PVQ_U_DATA+1257
+};
+#endif
+
+#if defined(CUSTOM_MODES)
+void get_required_bits(opus_int16 *_bits,int _n,int _maxk,int _frac){
+  int k;
+  /*_maxk==0 => there's nothing to do.*/
+  celt_assert(_maxk>0);
+  _bits[0]=0;
+  for(k=1;k<=_maxk;k++)_bits[k]=log2_frac(CELT_PVQ_V(_n,k),_frac);
+}
+#endif
+
+static opus_uint32 icwrs(int _n,const int *_y){
+  opus_uint32 i;
+  int         j;
+  int         k;
+  celt_assert(_n>=2);
+  j=_n-1;
+  i=_y[j]<0;
+  k=abs(_y[j]);
+  do{
+    j--;
+    i+=CELT_PVQ_U(_n-j,k);
+    k+=abs(_y[j]);
+    if(_y[j]<0)i+=CELT_PVQ_U(_n-j,k+1);
+  }
+  while(j>0);
+  return i;
+}
+
+void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){
+  celt_assert(_k>0);
+  ec_enc_uint(_enc,icwrs(_n,_y),CELT_PVQ_V(_n,_k));
+}
+
+static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
+  opus_uint32 p;
+  int         s;
+  int         k0;
+  celt_assert(_k>0);
+  celt_assert(_n>1);
+  while(_n>2){
+    opus_uint32 q;
+    /*Lots of pulses case:*/
+    if(_k>=_n){
+      const opus_uint32 *row;
+      row=CELT_PVQ_U_ROW[_n];
+      /*Are the pulses in this dimension negative?*/
+      p=row[_k+1];
+      s=-(_i>=p);
+      _i-=p&s;
+      /*Count how many pulses were placed in this dimension.*/
+      k0=_k;
+      q=row[_n];
+      if(q>_i){
+        celt_assert(p>q);
+        _k=_n;
+        do p=CELT_PVQ_U_ROW[--_k][_n];
+        while(p>_i);
+      }
+      else for(p=row[_k];p>_i;p=row[_k])_k--;
+      _i-=p;
+      *_y++=(k0-_k+s)^s;
+    }
+    /*Lots of dimensions case:*/
+    else{
+      /*Are there any pulses in this dimension at all?*/
+      p=CELT_PVQ_U_ROW[_k][_n];
+      q=CELT_PVQ_U_ROW[_k+1][_n];
+      if(p<=_i&&_i<q){
+        _i-=p;
+        *_y++=0;
+      }
+      else{
+        /*Are the pulses in this dimension negative?*/
+        s=-(_i>=q);
+        _i-=q&s;
+        /*Count how many pulses were placed in this dimension.*/
+        k0=_k;
+        do p=CELT_PVQ_U_ROW[--_k][_n];
+        while(p>_i);
+        _i-=p;
+        *_y++=(k0-_k+s)^s;
+      }
+    }
+    _n--;
+  }
+  /*_n==2*/
+  p=2*_k+1;
+  s=-(_i>=p);
+  _i-=p&s;
+  k0=_k;
+  _k=(_i+1)>>1;
+  if(_k)_i-=2*_k-1;
+  *_y++=(k0-_k+s)^s;
+  /*_n==1*/
+  s=-(int)_i;
+  *_y=(_k+s)^s;
+}
+
+void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
+  cwrsi(_n,_k,ec_dec_uint(_dec,CELT_PVQ_V(_n,_k)),_y);
+}
+
+#else /* SMALL_FOOTPRINT */
+
+/*Computes the next row/column of any recurrence that obeys the relation
+   u[i][j]=u[i-1][j]+u[i][j-1]+u[i-1][j-1].
+  _ui0 is the base case for the new row/column.*/
+static OPUS_INLINE void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){
+  opus_uint32 ui1;
+  unsigned      j;
+  /*This do-while will overrun the array if we don't have storage for at least
+     2 values.*/
+  j=1; do {
+    ui1=UADD32(UADD32(_ui[j],_ui[j-1]),_ui0);
+    _ui[j-1]=_ui0;
+    _ui0=ui1;
+  } while (++j<_len);
+  _ui[j-1]=_ui0;
+}
+
+/*Computes the previous row/column of any recurrence that obeys the relation
+   u[i-1][j]=u[i][j]-u[i][j-1]-u[i-1][j-1].
+  _ui0 is the base case for the new row/column.*/
+static OPUS_INLINE void uprev(opus_uint32 *_ui,unsigned _n,opus_uint32 _ui0){
+  opus_uint32 ui1;
+  unsigned      j;
+  /*This do-while will overrun the array if we don't have storage for at least
+     2 values.*/
+  j=1; do {
+    ui1=USUB32(USUB32(_ui[j],_ui[j-1]),_ui0);
+    _ui[j-1]=_ui0;
+    _ui0=ui1;
+  } while (++j<_n);
+  _ui[j-1]=_ui0;
+}
+
+/*Compute V(_n,_k), as well as U(_n,0..._k+1).
+  _u: On exit, _u[i] contains U(_n,i) for i in [0..._k+1].*/
+static opus_uint32 ncwrs_urow(unsigned _n,unsigned _k,opus_uint32 *_u){
+  opus_uint32 um2;
+  unsigned      len;
+  unsigned      k;
+  len=_k+2;
+  /*We require storage at least 3 values (e.g., _k>0).*/
+  celt_assert(len>=3);
+  _u[0]=0;
+  _u[1]=um2=1;
+  /*If _n==0, _u[0] should be 1 and the rest should be 0.*/
+  /*If _n==1, _u[i] should be 1 for i>1.*/
+  celt_assert(_n>=2);
+  /*If _k==0, the following do-while loop will overflow the buffer.*/
+  celt_assert(_k>0);
+  k=2;
+  do _u[k]=(k<<1)-1;
+  while(++k<len);
+  for(k=2;k<_n;k++)unext(_u+1,_k+1,1);
+  return _u[_k]+_u[_k+1];
+}
+
+/*Returns the _i'th combination of _k elements chosen from a set of size _n
+   with associated sign bits.
+  _y: Returns the vector of pulses.
+  _u: Must contain entries [0..._k+1] of row _n of U() on input.
+      Its contents will be destructively modified.*/
+static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
+  int j;
+  celt_assert(_n>0);
+  j=0;
+  do{
+    opus_uint32 p;
+    int           s;
+    int           yj;
+    p=_u[_k+1];
+    s=-(_i>=p);
+    _i-=p&s;
+    yj=_k;
+    p=_u[_k];
+    while(p>_i)p=_u[--_k];
+    _i-=p;
+    yj-=_k;
+    _y[j]=(yj+s)^s;
+    uprev(_u,_k+2,0);
+  }
+  while(++j<_n);
+}
+
+/*Returns the index of the given combination of K elements chosen from a set
+   of size 1 with associated sign bits.
+  _y: The vector of pulses, whose sum of absolute values is K.
+  _k: Returns K.*/
+static OPUS_INLINE opus_uint32 icwrs1(const int *_y,int *_k){
+  *_k=abs(_y[0]);
+  return _y[0]<0;
+}
+
+/*Returns the index of the given combination of K elements chosen from a set
+   of size _n with associated sign bits.
+  _y:  The vector of pulses, whose sum of absolute values must be _k.
+  _nc: Returns V(_n,_k).*/
+static OPUS_INLINE opus_uint32 icwrs(int _n,int _k,opus_uint32 *_nc,const int *_y,
+ opus_uint32 *_u){
+  opus_uint32 i;
+  int         j;
+  int         k;
+  /*We can't unroll the first two iterations of the loop unless _n>=2.*/
+  celt_assert(_n>=2);
+  _u[0]=0;
+  for(k=1;k<=_k+1;k++)_u[k]=(k<<1)-1;
+  i=icwrs1(_y+_n-1,&k);
+  j=_n-2;
+  i+=_u[k];
+  k+=abs(_y[j]);
+  if(_y[j]<0)i+=_u[k+1];
+  while(j-->0){
+    unext(_u,_k+2,0);
+    i+=_u[k];
+    k+=abs(_y[j]);
+    if(_y[j]<0)i+=_u[k+1];
+  }
+  *_nc=_u[k]+_u[k+1];
+  return i;
+}
+
+#ifdef CUSTOM_MODES
+void get_required_bits(opus_int16 *_bits,int _n,int _maxk,int _frac){
+  int k;
+  /*_maxk==0 => there's nothing to do.*/
+  celt_assert(_maxk>0);
+  _bits[0]=0;
+  if (_n==1)
+  {
+    for (k=1;k<=_maxk;k++)
+      _bits[k] = 1<<_frac;
+  }
+  else {
+    VARDECL(opus_uint32,u);
+    SAVE_STACK;
+    ALLOC(u,_maxk+2U,opus_uint32);
+    ncwrs_urow(_n,_maxk,u);
+    for(k=1;k<=_maxk;k++)
+      _bits[k]=log2_frac(u[k]+u[k+1],_frac);
+    RESTORE_STACK;
+  }
+}
+#endif /* CUSTOM_MODES */
+
+void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){
+  opus_uint32 i;
+  VARDECL(opus_uint32,u);
+  opus_uint32 nc;
+  SAVE_STACK;
+  celt_assert(_k>0);
+  ALLOC(u,_k+2U,opus_uint32);
+  i=icwrs(_n,_k,&nc,_y,u);
+  ec_enc_uint(_enc,i,nc);
+  RESTORE_STACK;
+}
+
+void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
+  VARDECL(opus_uint32,u);
+  SAVE_STACK;
+  celt_assert(_k>0);
+  ALLOC(u,_k+2U,opus_uint32);
+  cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
+  RESTORE_STACK;
+}
+
+#endif /* SMALL_FOOTPRINT */

+ 48 - 0
drivers/opus/celt/cwrs.h

@@ -0,0 +1,48 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2007-2009 Timothy B. Terriberry
+   Written by Timothy B. Terriberry and Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CWRS_H
+#define CWRS_H
+
+#include "arch.h"
+#include "stack_alloc.h"
+#include "entenc.h"
+#include "entdec.h"
+
+#ifdef CUSTOM_MODES
+int log2_frac(opus_uint32 val, int frac);
+#endif
+
+void get_required_bits(opus_int16 *bits, int N, int K, int frac);
+
+void encode_pulses(const int *_y, int N, int K, ec_enc *enc);
+
+void decode_pulses(int *_y, int N, int K, ec_dec *dec);
+
+#endif /* CWRS_H */

+ 87 - 0
drivers/opus/celt/ecintrin.h

@@ -0,0 +1,87 @@
+/* Copyright (c) 2003-2008 Timothy B. Terriberry
+   Copyright (c) 2008 Xiph.Org Foundation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*Some common macros for potential platform-specific optimization.*/
+#include "opus_types.h"
+#include <math.h>
+#include <limits.h>
+#include "arch.h"
+#if !defined(_ecintrin_H)
+# define _ecintrin_H (1)
+
+/*Some specific platforms may have optimized intrinsic or OPUS_INLINE assembly
+   versions of these functions which can substantially improve performance.
+  We define macros for them to allow easy incorporation of these non-ANSI
+   features.*/
+
+/*Modern gcc (4.x) can compile the naive versions of min and max with cmov if
+   given an appropriate architecture, but the branchless bit-twiddling versions
+   are just as fast, and do not require any special target architecture.
+  Earlier gcc versions (3.x) compiled both code to the same assembly
+   instructions, because of the way they represented ((_b)>(_a)) internally.*/
+# define EC_MINI(_a,_b)      ((_a)+(((_b)-(_a))&-((_b)<(_a))))
+
+/*Count leading zeros.
+  This macro should only be used for implementing ec_ilog(), if it is defined.
+  All other code should use EC_ILOG() instead.*/
+#if defined(_MSC_VER) && (_MSC_VER >= 1400)
+# include <intrin.h>
+/*In _DEBUG mode this is not an intrinsic by default.*/
+# pragma intrinsic(_BitScanReverse)
+
+static __inline int ec_bsr(unsigned long _x){
+  unsigned long ret;
+  _BitScanReverse(&ret,_x);
+  return (int)ret;
+}
+# define EC_CLZ0    (1)
+# define EC_CLZ(_x) (-ec_bsr(_x))
+#elif defined(ENABLE_TI_DSPLIB)
+# include "dsplib.h"
+# define EC_CLZ0    (31)
+# define EC_CLZ(_x) (_lnorm(_x))
+#elif __GNUC_PREREQ(3,4)
+# if INT_MAX>=2147483647
+#  define EC_CLZ0    ((int)sizeof(unsigned)*CHAR_BIT)
+#  define EC_CLZ(_x) (__builtin_clz(_x))
+# elif LONG_MAX>=2147483647L
+#  define EC_CLZ0    ((int)sizeof(unsigned long)*CHAR_BIT)
+#  define EC_CLZ(_x) (__builtin_clzl(_x))
+# endif
+#endif
+
+#if defined(EC_CLZ)
+/*Note that __builtin_clz is not defined when _x==0, according to the gcc
+   documentation (and that of the BSR instruction that implements it on x86).
+  The majority of the time we can never pass it zero.
+  When we need to, it can be special cased.*/
+# define EC_ILOG(_x) (EC_CLZ0-EC_CLZ(_x))
+#else
+int ec_ilog(opus_uint32 _v);
+# define EC_ILOG(_x) (ec_ilog(_x))
+#endif
+#endif

+ 93 - 0
drivers/opus/celt/entcode.c

@@ -0,0 +1,93 @@
+/* Copyright (c) 2001-2011 Timothy B. Terriberry
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "entcode.h"
+#include "arch.h"
+
+#if !defined(EC_CLZ)
+/*This is a fallback for systems where we don't know how to access
+   a BSR or CLZ instruction (see ecintrin.h).
+  If you are optimizing Opus on a new platform and it has a native CLZ or
+   BZR (e.g. cell, MIPS, x86, etc) then making it available to Opus will be
+   an easy performance win.*/
+int ec_ilog(opus_uint32 _v){
+  /*On a Pentium M, this branchless version tested as the fastest on
+     1,000,000,000 random 32-bit integers, edging out a similar version with
+     branches, and a 256-entry LUT version.*/
+  int ret;
+  int m;
+  ret=!!_v;
+  m=!!(_v&0xFFFF0000)<<4;
+  _v>>=m;
+  ret|=m;
+  m=!!(_v&0xFF00)<<3;
+  _v>>=m;
+  ret|=m;
+  m=!!(_v&0xF0)<<2;
+  _v>>=m;
+  ret|=m;
+  m=!!(_v&0xC)<<1;
+  _v>>=m;
+  ret|=m;
+  ret+=!!(_v&0x2);
+  return ret;
+}
+#endif
+
+opus_uint32 ec_tell_frac(ec_ctx *_this){
+  opus_uint32 nbits;
+  opus_uint32 r;
+  int         l;
+  int         i;
+  /*To handle the non-integral number of bits still left in the encoder/decoder
+     state, we compute the worst-case number of bits of val that must be
+     encoded to ensure that the value is inside the range for any possible
+     subsequent bits.
+    The computation here is independent of val itself (the decoder does not
+     even track that value), even though the real number of bits used after
+     ec_enc_done() may be 1 smaller if rng is a power of two and the
+     corresponding trailing bits of val are all zeros.
+    If we did try to track that special case, then coding a value with a
+     probability of 1/(1<<n) might sometimes appear to use more than n bits.
+    This may help explain the surprising result that a newly initialized
+     encoder or decoder claims to have used 1 bit.*/
+  nbits=_this->nbits_total<<BITRES;
+  l=EC_ILOG(_this->rng);
+  r=_this->rng>>(l-16);
+  for(i=BITRES;i-->0;){
+    int b;
+    r=r*r>>15;
+    b=(int)(r>>16);
+    l=l<<1|b;
+    r>>=b;
+  }
+  return nbits-l;
+}

+ 117 - 0
drivers/opus/celt/entcode.h

@@ -0,0 +1,117 @@
+/* Copyright (c) 2001-2011 Timothy B. Terriberry
+   Copyright (c) 2008-2009 Xiph.Org Foundation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "opus_types.h"
+#include "opus_defines.h"
+
+#if !defined(_entcode_H)
+# define _entcode_H (1)
+# include <limits.h>
+# include <stddef.h>
+# include "ecintrin.h"
+
+/*OPT: ec_window must be at least 32 bits, but if you have fast arithmetic on a
+   larger type, you can speed up the decoder by using it here.*/
+typedef opus_uint32           ec_window;
+typedef struct ec_ctx         ec_ctx;
+typedef struct ec_ctx         ec_enc;
+typedef struct ec_ctx         ec_dec;
+
+# define EC_WINDOW_SIZE ((int)sizeof(ec_window)*CHAR_BIT)
+
+/*The number of bits to use for the range-coded part of unsigned integers.*/
+# define EC_UINT_BITS   (8)
+
+/*The resolution of fractional-precision bit usage measurements, i.e.,
+   3 => 1/8th bits.*/
+# define BITRES 3
+
+/*The entropy encoder/decoder context.
+  We use the same structure for both, so that common functions like ec_tell()
+   can be used on either one.*/
+struct ec_ctx{
+   /*Buffered input/output.*/
+   unsigned char *buf;
+   /*The size of the buffer.*/
+   opus_uint32    storage;
+   /*The offset at which the last byte containing raw bits was read/written.*/
+   opus_uint32    end_offs;
+   /*Bits that will be read from/written at the end.*/
+   ec_window      end_window;
+   /*Number of valid bits in end_window.*/
+   int            nend_bits;
+   /*The total number of whole bits read/written.
+     This does not include partial bits currently in the range coder.*/
+   int            nbits_total;
+   /*The offset at which the next range coder byte will be read/written.*/
+   opus_uint32    offs;
+   /*The number of values in the current range.*/
+   opus_uint32    rng;
+   /*In the decoder: the difference between the top of the current range and
+      the input value, minus one.
+     In the encoder: the low end of the current range.*/
+   opus_uint32    val;
+   /*In the decoder: the saved normalization factor from ec_decode().
+     In the encoder: the number of oustanding carry propagating symbols.*/
+   opus_uint32    ext;
+   /*A buffered input/output symbol, awaiting carry propagation.*/
+   int            rem;
+   /*Nonzero if an error occurred.*/
+   int            error;
+};
+
+static OPUS_INLINE opus_uint32 ec_range_bytes(ec_ctx *_this){
+  return _this->offs;
+}
+
+static OPUS_INLINE unsigned char *ec_get_buffer(ec_ctx *_this){
+  return _this->buf;
+}
+
+static OPUS_INLINE int ec_get_error(ec_ctx *_this){
+  return _this->error;
+}
+
+/*Returns the number of bits "used" by the encoded or decoded symbols so far.
+  This same number can be computed in either the encoder or the decoder, and is
+   suitable for making coding decisions.
+  Return: The number of bits.
+          This will always be slightly larger than the exact value (e.g., all
+           rounding error is in the positive direction).*/
+static OPUS_INLINE int ec_tell(ec_ctx *_this){
+  return _this->nbits_total-EC_ILOG(_this->rng);
+}
+
+/*Returns the number of bits "used" by the encoded or decoded symbols so far.
+  This same number can be computed in either the encoder or the decoder, and is
+   suitable for making coding decisions.
+  Return: The number of bits scaled by 2**BITRES.
+          This will always be slightly larger than the exact value (e.g., all
+           rounding error is in the positive direction).*/
+opus_uint32 ec_tell_frac(ec_ctx *_this);
+
+#endif

+ 245 - 0
drivers/opus/celt/entdec.c

@@ -0,0 +1,245 @@
+/* Copyright (c) 2001-2011 Timothy B. Terriberry
+   Copyright (c) 2008-2009 Xiph.Org Foundation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include <stddef.h>
+#include "os_support.h"
+#include "arch.h"
+#include "entdec.h"
+#include "mfrngcod.h"
+
+/*A range decoder.
+  This is an entropy decoder based upon \cite{Mar79}, which is itself a
+   rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
+  It is very similar to arithmetic encoding, except that encoding is done with
+   digits in any base, instead of with bits, and so it is faster when using
+   larger bases (i.e.: a byte).
+  The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
+   is the base, longer than the theoretical optimum, but to my knowledge there
+   is no published justification for this claim.
+  This only seems true when using near-infinite precision arithmetic so that
+   the process is carried out with no rounding errors.
+
+  An excellent description of implementation details is available at
+   http://www.arturocampos.com/ac_range.html
+  A recent work \cite{MNW98} which proposes several changes to arithmetic
+   encoding for efficiency actually re-discovers many of the principles
+   behind range encoding, and presents a good theoretical analysis of them.
+
+  End of stream is handled by writing out the smallest number of bits that
+   ensures that the stream will be correctly decoded regardless of the value of
+   any subsequent bits.
+  ec_tell() can be used to determine how many bits were needed to decode
+   all the symbols thus far; other data can be packed in the remaining bits of
+   the input buffer.
+  @PHDTHESIS{Pas76,
+    author="Richard Clark Pasco",
+    title="Source coding algorithms for fast data compression",
+    school="Dept. of Electrical Engineering, Stanford University",
+    address="Stanford, CA",
+    month=May,
+    year=1976
+  }
+  @INPROCEEDINGS{Mar79,
+   author="Martin, G.N.N.",
+   title="Range encoding: an algorithm for removing redundancy from a digitised
+    message",
+   booktitle="Video & Data Recording Conference",
+   year=1979,
+   address="Southampton",
+   month=Jul
+  }
+  @ARTICLE{MNW98,
+   author="Alistair Moffat and Radford Neal and Ian H. Witten",
+   title="Arithmetic Coding Revisited",
+   journal="{ACM} Transactions on Information Systems",
+   year=1998,
+   volume=16,
+   number=3,
+   pages="256--294",
+   month=Jul,
+   URL="http://www.stanford.edu/class/ee398a/handouts/papers/Moffat98ArithmCoding.pdf"
+  }*/
+
+static int ec_read_byte(ec_dec *_this){
+  return _this->offs<_this->storage?_this->buf[_this->offs++]:0;
+}
+
+static int ec_read_byte_from_end(ec_dec *_this){
+  return _this->end_offs<_this->storage?
+   _this->buf[_this->storage-++(_this->end_offs)]:0;
+}
+
+/*Normalizes the contents of val and rng so that rng lies entirely in the
+   high-order symbol.*/
+static void ec_dec_normalize(ec_dec *_this){
+  /*If the range is too small, rescale it and input some bits.*/
+  while(_this->rng<=EC_CODE_BOT){
+    int sym;
+    _this->nbits_total+=EC_SYM_BITS;
+    _this->rng<<=EC_SYM_BITS;
+    /*Use up the remaining bits from our last symbol.*/
+    sym=_this->rem;
+    /*Read the next value from the input.*/
+    _this->rem=ec_read_byte(_this);
+    /*Take the rest of the bits we need from this new symbol.*/
+    sym=(sym<<EC_SYM_BITS|_this->rem)>>(EC_SYM_BITS-EC_CODE_EXTRA);
+    /*And subtract them from val, capped to be less than EC_CODE_TOP.*/
+    _this->val=((_this->val<<EC_SYM_BITS)+(EC_SYM_MAX&~sym))&(EC_CODE_TOP-1);
+  }
+}
+
+void ec_dec_init(ec_dec *_this,unsigned char *_buf,opus_uint32 _storage){
+  _this->buf=_buf;
+  _this->storage=_storage;
+  _this->end_offs=0;
+  _this->end_window=0;
+  _this->nend_bits=0;
+  /*This is the offset from which ec_tell() will subtract partial bits.
+    The final value after the ec_dec_normalize() call will be the same as in
+     the encoder, but we have to compensate for the bits that are added there.*/
+  _this->nbits_total=EC_CODE_BITS+1
+   -((EC_CODE_BITS-EC_CODE_EXTRA)/EC_SYM_BITS)*EC_SYM_BITS;
+  _this->offs=0;
+  _this->rng=1U<<EC_CODE_EXTRA;
+  _this->rem=ec_read_byte(_this);
+  _this->val=_this->rng-1-(_this->rem>>(EC_SYM_BITS-EC_CODE_EXTRA));
+  _this->error=0;
+  /*Normalize the interval.*/
+  ec_dec_normalize(_this);
+}
+
+unsigned ec_decode(ec_dec *_this,unsigned _ft){
+  unsigned s;
+  _this->ext=_this->rng/_ft;
+  s=(unsigned)(_this->val/_this->ext);
+  return _ft-EC_MINI(s+1,_ft);
+}
+
+unsigned ec_decode_bin(ec_dec *_this,unsigned _bits){
+   unsigned s;
+   _this->ext=_this->rng>>_bits;
+   s=(unsigned)(_this->val/_this->ext);
+   return (1U<<_bits)-EC_MINI(s+1U,1U<<_bits);
+}
+
+void ec_dec_update(ec_dec *_this,unsigned _fl,unsigned _fh,unsigned _ft){
+  opus_uint32 s;
+  s=IMUL32(_this->ext,_ft-_fh);
+  _this->val-=s;
+  _this->rng=_fl>0?IMUL32(_this->ext,_fh-_fl):_this->rng-s;
+  ec_dec_normalize(_this);
+}
+
+/*The probability of having a "one" is 1/(1<<_logp).*/
+int ec_dec_bit_logp(ec_dec *_this,unsigned _logp){
+  opus_uint32 r;
+  opus_uint32 d;
+  opus_uint32 s;
+  int         ret;
+  r=_this->rng;
+  d=_this->val;
+  s=r>>_logp;
+  ret=d<s;
+  if(!ret)_this->val=d-s;
+  _this->rng=ret?s:r-s;
+  ec_dec_normalize(_this);
+  return ret;
+}
+
+int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb){
+  opus_uint32 r;
+  opus_uint32 d;
+  opus_uint32 s;
+  opus_uint32 t;
+  int         ret;
+  s=_this->rng;
+  d=_this->val;
+  r=s>>_ftb;
+  ret=-1;
+  do{
+    t=s;
+    s=IMUL32(r,_icdf[++ret]);
+  }
+  while(d<s);
+  _this->val=d-s;
+  _this->rng=t-s;
+  ec_dec_normalize(_this);
+  return ret;
+}
+
+opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft){
+  unsigned ft;
+  unsigned s;
+  int      ftb;
+  /*In order to optimize EC_ILOG(), it is undefined for the value 0.*/
+  celt_assert(_ft>1);
+  _ft--;
+  ftb=EC_ILOG(_ft);
+  if(ftb>EC_UINT_BITS){
+    opus_uint32 t;
+    ftb-=EC_UINT_BITS;
+    ft=(unsigned)(_ft>>ftb)+1;
+    s=ec_decode(_this,ft);
+    ec_dec_update(_this,s,s+1,ft);
+    t=(opus_uint32)s<<ftb|ec_dec_bits(_this,ftb);
+    if(t<=_ft)return t;
+    _this->error=1;
+    return _ft;
+  }
+  else{
+    _ft++;
+    s=ec_decode(_this,(unsigned)_ft);
+    ec_dec_update(_this,s,s+1,(unsigned)_ft);
+    return s;
+  }
+}
+
+opus_uint32 ec_dec_bits(ec_dec *_this,unsigned _bits){
+  ec_window   window;
+  int         available;
+  opus_uint32 ret;
+  window=_this->end_window;
+  available=_this->nend_bits;
+  if((unsigned)available<_bits){
+    do{
+      window|=(ec_window)ec_read_byte_from_end(_this)<<available;
+      available+=EC_SYM_BITS;
+    }
+    while(available<=EC_WINDOW_SIZE-EC_SYM_BITS);
+  }
+  ret=(opus_uint32)window&(((opus_uint32)1<<_bits)-1U);
+  window>>=_bits;
+  available-=_bits;
+  _this->end_window=window;
+  _this->nend_bits=available;
+  _this->nbits_total+=_bits;
+  return ret;
+}

+ 100 - 0
drivers/opus/celt/entdec.h

@@ -0,0 +1,100 @@
+/* Copyright (c) 2001-2011 Timothy B. Terriberry
+   Copyright (c) 2008-2009 Xiph.Org Foundation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(_entdec_H)
+# define _entdec_H (1)
+# include <limits.h>
+# include "entcode.h"
+
+/*Initializes the decoder.
+  _buf: The input buffer to use.
+  Return: 0 on success, or a negative value on error.*/
+void ec_dec_init(ec_dec *_this,unsigned char *_buf,opus_uint32 _storage);
+
+/*Calculates the cumulative frequency for the next symbol.
+  This can then be fed into the probability model to determine what that
+   symbol is, and the additional frequency information required to advance to
+   the next symbol.
+  This function cannot be called more than once without a corresponding call to
+   ec_dec_update(), or decoding will not proceed correctly.
+  _ft: The total frequency of the symbols in the alphabet the next symbol was
+        encoded with.
+  Return: A cumulative frequency representing the encoded symbol.
+          If the cumulative frequency of all the symbols before the one that
+           was encoded was fl, and the cumulative frequency of all the symbols
+           up to and including the one encoded is fh, then the returned value
+           will fall in the range [fl,fh).*/
+unsigned ec_decode(ec_dec *_this,unsigned _ft);
+
+/*Equivalent to ec_decode() with _ft==1<<_bits.*/
+unsigned ec_decode_bin(ec_dec *_this,unsigned _bits);
+
+/*Advance the decoder past the next symbol using the frequency information the
+   symbol was encoded with.
+  Exactly one call to ec_decode() must have been made so that all necessary
+   intermediate calculations are performed.
+  _fl:  The cumulative frequency of all symbols that come before the symbol
+         decoded.
+  _fh:  The cumulative frequency of all symbols up to and including the symbol
+         decoded.
+        Together with _fl, this defines the range [_fl,_fh) in which the value
+         returned above must fall.
+  _ft:  The total frequency of the symbols in the alphabet the symbol decoded
+         was encoded in.
+        This must be the same as passed to the preceding call to ec_decode().*/
+void ec_dec_update(ec_dec *_this,unsigned _fl,unsigned _fh,unsigned _ft);
+
+/* Decode a bit that has a 1/(1<<_logp) probability of being a one */
+int ec_dec_bit_logp(ec_dec *_this,unsigned _logp);
+
+/*Decodes a symbol given an "inverse" CDF table.
+  No call to ec_dec_update() is necessary after this call.
+  _icdf: The "inverse" CDF, such that symbol s falls in the range
+          [s>0?ft-_icdf[s-1]:0,ft-_icdf[s]), where ft=1<<_ftb.
+         The values must be monotonically non-increasing, and the last value
+          must be 0.
+  _ftb: The number of bits of precision in the cumulative distribution.
+  Return: The decoded symbol s.*/
+int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb);
+
+/*Extracts a raw unsigned integer with a non-power-of-2 range from the stream.
+  The bits must have been encoded with ec_enc_uint().
+  No call to ec_dec_update() is necessary after this call.
+  _ft: The number of integers that can be decoded (one more than the max).
+       This must be at least one, and no more than 2**32-1.
+  Return: The decoded bits.*/
+opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft);
+
+/*Extracts a sequence of raw bits from the stream.
+  The bits must have been encoded with ec_enc_bits().
+  No call to ec_dec_update() is necessary after this call.
+  _ftb: The number of bits to extract.
+        This must be between 0 and 25, inclusive.
+  Return: The decoded bits.*/
+opus_uint32 ec_dec_bits(ec_dec *_this,unsigned _ftb);
+
+#endif

+ 294 - 0
drivers/opus/celt/entenc.c

@@ -0,0 +1,294 @@
+/* Copyright (c) 2001-2011 Timothy B. Terriberry
+   Copyright (c) 2008-2009 Xiph.Org Foundation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(OPUS_HAVE_CONFIG_H)
+# include "opus_config.h"
+#endif
+#include "os_support.h"
+#include "arch.h"
+#include "entenc.h"
+#include "mfrngcod.h"
+
+/*A range encoder.
+  See entdec.c and the references for implementation details \cite{Mar79,MNW98}.
+
+  @INPROCEEDINGS{Mar79,
+   author="Martin, G.N.N.",
+   title="Range encoding: an algorithm for removing redundancy from a digitised
+    message",
+   booktitle="Video \& Data Recording Conference",
+   year=1979,
+   address="Southampton",
+   month=Jul
+  }
+  @ARTICLE{MNW98,
+   author="Alistair Moffat and Radford Neal and Ian H. Witten",
+   title="Arithmetic Coding Revisited",
+   journal="{ACM} Transactions on Information Systems",
+   year=1998,
+   volume=16,
+   number=3,
+   pages="256--294",
+   month=Jul,
+   URL="http://www.stanford.edu/class/ee398/handouts/papers/Moffat98ArithmCoding.pdf"
+  }*/
+
+static int ec_write_byte(ec_enc *_this,unsigned _value){
+  if(_this->offs+_this->end_offs>=_this->storage)return -1;
+  _this->buf[_this->offs++]=(unsigned char)_value;
+  return 0;
+}
+
+static int ec_write_byte_at_end(ec_enc *_this,unsigned _value){
+  if(_this->offs+_this->end_offs>=_this->storage)return -1;
+  _this->buf[_this->storage-++(_this->end_offs)]=(unsigned char)_value;
+  return 0;
+}
+
+/*Outputs a symbol, with a carry bit.
+  If there is a potential to propagate a carry over several symbols, they are
+   buffered until it can be determined whether or not an actual carry will
+   occur.
+  If the counter for the buffered symbols overflows, then the stream becomes
+   undecodable.
+  This gives a theoretical limit of a few billion symbols in a single packet on
+   32-bit systems.
+  The alternative is to truncate the range in order to force a carry, but
+   requires similar carry tracking in the decoder, needlessly slowing it down.*/
+static void ec_enc_carry_out(ec_enc *_this,int _c){
+  if(_c!=EC_SYM_MAX){
+    /*No further carry propagation possible, flush buffer.*/
+    int carry;
+    carry=_c>>EC_SYM_BITS;
+    /*Don't output a byte on the first write.
+      This compare should be taken care of by branch-prediction thereafter.*/
+    if(_this->rem>=0)_this->error|=ec_write_byte(_this,_this->rem+carry);
+    if(_this->ext>0){
+      unsigned sym;
+      sym=(EC_SYM_MAX+carry)&EC_SYM_MAX;
+      do _this->error|=ec_write_byte(_this,sym);
+      while(--(_this->ext)>0);
+    }
+    _this->rem=_c&EC_SYM_MAX;
+  }
+  else _this->ext++;
+}
+
+static void ec_enc_normalize(ec_enc *_this){
+  /*If the range is too small, output some bits and rescale it.*/
+  while(_this->rng<=EC_CODE_BOT){
+    ec_enc_carry_out(_this,(int)(_this->val>>EC_CODE_SHIFT));
+    /*Move the next-to-high-order symbol into the high-order position.*/
+    _this->val=(_this->val<<EC_SYM_BITS)&(EC_CODE_TOP-1);
+    _this->rng<<=EC_SYM_BITS;
+    _this->nbits_total+=EC_SYM_BITS;
+  }
+}
+
+void ec_enc_init(ec_enc *_this,unsigned char *_buf,opus_uint32 _size){
+  _this->buf=_buf;
+  _this->end_offs=0;
+  _this->end_window=0;
+  _this->nend_bits=0;
+  /*This is the offset from which ec_tell() will subtract partial bits.*/
+  _this->nbits_total=EC_CODE_BITS+1;
+  _this->offs=0;
+  _this->rng=EC_CODE_TOP;
+  _this->rem=-1;
+  _this->val=0;
+  _this->ext=0;
+  _this->storage=_size;
+  _this->error=0;
+}
+
+void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){
+  opus_uint32 r;
+  r=_this->rng/_ft;
+  if(_fl>0){
+    _this->val+=_this->rng-IMUL32(r,(_ft-_fl));
+    _this->rng=IMUL32(r,(_fh-_fl));
+  }
+  else _this->rng-=IMUL32(r,(_ft-_fh));
+  ec_enc_normalize(_this);
+}
+
+void ec_encode_bin(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _bits){
+  opus_uint32 r;
+  r=_this->rng>>_bits;
+  if(_fl>0){
+    _this->val+=_this->rng-IMUL32(r,((1U<<_bits)-_fl));
+    _this->rng=IMUL32(r,(_fh-_fl));
+  }
+  else _this->rng-=IMUL32(r,((1U<<_bits)-_fh));
+  ec_enc_normalize(_this);
+}
+
+/*The probability of having a "one" is 1/(1<<_logp).*/
+void ec_enc_bit_logp(ec_enc *_this,int _val,unsigned _logp){
+  opus_uint32 r;
+  opus_uint32 s;
+  opus_uint32 l;
+  r=_this->rng;
+  l=_this->val;
+  s=r>>_logp;
+  r-=s;
+  if(_val)_this->val=l+r;
+  _this->rng=_val?s:r;
+  ec_enc_normalize(_this);
+}
+
+void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb){
+  opus_uint32 r;
+  r=_this->rng>>_ftb;
+  if(_s>0){
+    _this->val+=_this->rng-IMUL32(r,_icdf[_s-1]);
+    _this->rng=IMUL32(r,_icdf[_s-1]-_icdf[_s]);
+  }
+  else _this->rng-=IMUL32(r,_icdf[_s]);
+  ec_enc_normalize(_this);
+}
+
+void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft){
+  unsigned  ft;
+  unsigned  fl;
+  int       ftb;
+  /*In order to optimize EC_ILOG(), it is undefined for the value 0.*/
+  celt_assert(_ft>1);
+  _ft--;
+  ftb=EC_ILOG(_ft);
+  if(ftb>EC_UINT_BITS){
+    ftb-=EC_UINT_BITS;
+    ft=(_ft>>ftb)+1;
+    fl=(unsigned)(_fl>>ftb);
+    ec_encode(_this,fl,fl+1,ft);
+    ec_enc_bits(_this,_fl&(((opus_uint32)1<<ftb)-1U),ftb);
+  }
+  else ec_encode(_this,_fl,_fl+1,_ft+1);
+}
+
+void ec_enc_bits(ec_enc *_this,opus_uint32 _fl,unsigned _bits){
+  ec_window window;
+  int       used;
+  window=_this->end_window;
+  used=_this->nend_bits;
+  celt_assert(_bits>0);
+  if(used+_bits>EC_WINDOW_SIZE){
+    do{
+      _this->error|=ec_write_byte_at_end(_this,(unsigned)window&EC_SYM_MAX);
+      window>>=EC_SYM_BITS;
+      used-=EC_SYM_BITS;
+    }
+    while(used>=EC_SYM_BITS);
+  }
+  window|=(ec_window)_fl<<used;
+  used+=_bits;
+  _this->end_window=window;
+  _this->nend_bits=used;
+  _this->nbits_total+=_bits;
+}
+
+void ec_enc_patch_initial_bits(ec_enc *_this,unsigned _val,unsigned _nbits){
+  int      shift;
+  unsigned mask;
+  celt_assert(_nbits<=EC_SYM_BITS);
+  shift=EC_SYM_BITS-_nbits;
+  mask=((1<<_nbits)-1)<<shift;
+  if(_this->offs>0){
+    /*The first byte has been finalized.*/
+    _this->buf[0]=(unsigned char)((_this->buf[0]&~mask)|_val<<shift);
+  }
+  else if(_this->rem>=0){
+    /*The first byte is still awaiting carry propagation.*/
+    _this->rem=(_this->rem&~mask)|_val<<shift;
+  }
+  else if(_this->rng<=(EC_CODE_TOP>>_nbits)){
+    /*The renormalization loop has never been run.*/
+    _this->val=(_this->val&~((opus_uint32)mask<<EC_CODE_SHIFT))|
+     (opus_uint32)_val<<(EC_CODE_SHIFT+shift);
+  }
+  /*The encoder hasn't even encoded _nbits of data yet.*/
+  else _this->error=-1;
+}
+
+void ec_enc_shrink(ec_enc *_this,opus_uint32 _size){
+  celt_assert(_this->offs+_this->end_offs<=_size);
+  OPUS_MOVE(_this->buf+_size-_this->end_offs,
+   _this->buf+_this->storage-_this->end_offs,_this->end_offs);
+  _this->storage=_size;
+}
+
+void ec_enc_done(ec_enc *_this){
+  ec_window   window;
+  int         used;
+  opus_uint32 msk;
+  opus_uint32 end;
+  int         l;
+  /*We output the minimum number of bits that ensures that the symbols encoded
+     thus far will be decoded correctly regardless of the bits that follow.*/
+  l=EC_CODE_BITS-EC_ILOG(_this->rng);
+  msk=(EC_CODE_TOP-1)>>l;
+  end=(_this->val+msk)&~msk;
+  if((end|msk)>=_this->val+_this->rng){
+    l++;
+    msk>>=1;
+    end=(_this->val+msk)&~msk;
+  }
+  while(l>0){
+    ec_enc_carry_out(_this,(int)(end>>EC_CODE_SHIFT));
+    end=(end<<EC_SYM_BITS)&(EC_CODE_TOP-1);
+    l-=EC_SYM_BITS;
+  }
+  /*If we have a buffered byte flush it into the output buffer.*/
+  if(_this->rem>=0||_this->ext>0)ec_enc_carry_out(_this,0);
+  /*If we have buffered extra bits, flush them as well.*/
+  window=_this->end_window;
+  used=_this->nend_bits;
+  while(used>=EC_SYM_BITS){
+    _this->error|=ec_write_byte_at_end(_this,(unsigned)window&EC_SYM_MAX);
+    window>>=EC_SYM_BITS;
+    used-=EC_SYM_BITS;
+  }
+  /*Clear any excess space and add any remaining extra bits to the last byte.*/
+  if(!_this->error){
+    OPUS_CLEAR(_this->buf+_this->offs,
+     _this->storage-_this->offs-_this->end_offs);
+    if(used>0){
+      /*If there's no range coder data at all, give up.*/
+      if(_this->end_offs>=_this->storage)_this->error=-1;
+      else{
+        l=-l;
+        /*If we've busted, don't add too many extra bits to the last byte; it
+           would corrupt the range coder data, and that's more important.*/
+        if(_this->offs+_this->end_offs>=_this->storage&&l<used){
+          window&=(1<<l)-1;
+          _this->error=-1;
+        }
+        _this->buf[_this->storage-_this->end_offs-1]|=(unsigned char)window;
+      }
+    }
+  }
+}

+ 110 - 0
drivers/opus/celt/entenc.h

@@ -0,0 +1,110 @@
+/* Copyright (c) 2001-2011 Timothy B. Terriberry
+   Copyright (c) 2008-2009 Xiph.Org Foundation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(_entenc_H)
+# define _entenc_H (1)
+# include <stddef.h>
+# include "entcode.h"
+
+/*Initializes the encoder.
+  _buf:  The buffer to store output bytes in.
+  _size: The size of the buffer, in chars.*/
+void ec_enc_init(ec_enc *_this,unsigned char *_buf,opus_uint32 _size);
+/*Encodes a symbol given its frequency information.
+  The frequency information must be discernable by the decoder, assuming it
+   has read only the previous symbols from the stream.
+  It is allowable to change the frequency information, or even the entire
+   source alphabet, so long as the decoder can tell from the context of the
+   previously encoded information that it is supposed to do so as well.
+  _fl: The cumulative frequency of all symbols that come before the one to be
+        encoded.
+  _fh: The cumulative frequency of all symbols up to and including the one to
+        be encoded.
+       Together with _fl, this defines the range [_fl,_fh) in which the
+        decoded value will fall.
+  _ft: The sum of the frequencies of all the symbols*/
+void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft);
+
+/*Equivalent to ec_encode() with _ft==1<<_bits.*/
+void ec_encode_bin(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _bits);
+
+/* Encode a bit that has a 1/(1<<_logp) probability of being a one */
+void ec_enc_bit_logp(ec_enc *_this,int _val,unsigned _logp);
+
+/*Encodes a symbol given an "inverse" CDF table.
+  _s:    The index of the symbol to encode.
+  _icdf: The "inverse" CDF, such that symbol _s falls in the range
+          [_s>0?ft-_icdf[_s-1]:0,ft-_icdf[_s]), where ft=1<<_ftb.
+         The values must be monotonically non-increasing, and the last value
+          must be 0.
+  _ftb: The number of bits of precision in the cumulative distribution.*/
+void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb);
+
+/*Encodes a raw unsigned integer in the stream.
+  _fl: The integer to encode.
+  _ft: The number of integers that can be encoded (one more than the max).
+       This must be at least one, and no more than 2**32-1.*/
+void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft);
+
+/*Encodes a sequence of raw bits in the stream.
+  _fl:  The bits to encode.
+  _ftb: The number of bits to encode.
+        This must be between 1 and 25, inclusive.*/
+void ec_enc_bits(ec_enc *_this,opus_uint32 _fl,unsigned _ftb);
+
+/*Overwrites a few bits at the very start of an existing stream, after they
+   have already been encoded.
+  This makes it possible to have a few flags up front, where it is easy for
+   decoders to access them without parsing the whole stream, even if their
+   values are not determined until late in the encoding process, without having
+   to buffer all the intermediate symbols in the encoder.
+  In order for this to work, at least _nbits bits must have already been
+   encoded using probabilities that are an exact power of two.
+  The encoder can verify the number of encoded bits is sufficient, but cannot
+   check this latter condition.
+  _val:   The bits to encode (in the least _nbits significant bits).
+          They will be decoded in order from most-significant to least.
+  _nbits: The number of bits to overwrite.
+          This must be no more than 8.*/
+void ec_enc_patch_initial_bits(ec_enc *_this,unsigned _val,unsigned _nbits);
+
+/*Compacts the data to fit in the target size.
+  This moves up the raw bits at the end of the current buffer so they are at
+   the end of the new buffer size.
+  The caller must ensure that the amount of data that's already been written
+   will fit in the new size.
+  _size: The number of bytes in the new buffer.
+         This must be large enough to contain the bits already written, and
+          must be no larger than the existing size.*/
+void ec_enc_shrink(ec_enc *_this,opus_uint32 _size);
+
+/*Indicates that there are no more symbols to encode.
+  All reamining output bytes are flushed to the output buffer.
+  ec_enc_init() must be called before the encoder can be used again.*/
+void ec_enc_done(ec_enc *_this);
+
+#endif

+ 773 - 0
drivers/opus/celt/fixed_debug.h

@@ -0,0 +1,773 @@
+/* Copyright (C) 2003-2008 Jean-Marc Valin
+   Copyright (C) 2007-2012 Xiph.Org Foundation */
+/**
+   @file fixed_debug.h
+   @brief Fixed-point operations with debugging
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_DEBUG_H
+#define FIXED_DEBUG_H
+
+#include <stdio.h>
+#include "opus_defines.h"
+
+#ifdef CELT_C
+OPUS_EXPORT opus_int64 celt_mips=0;
+#else
+extern opus_int64 celt_mips;
+#endif
+
+#define MULT16_16SU(a,b) ((opus_val32)(opus_val16)(a)*(opus_val32)(opus_uint16)(b))
+#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL32(MULT16_16(SHR32((a),16),SHR((b),16)),1), SHR32(MULT16_16SU(SHR32((a),16),((b)&0x0000ffff)),15)), SHR32(MULT16_16SU(SHR32((b),16),((a)&0x0000ffff)),15))
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
+#define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR32((b),16)), SHR32(MULT16_16SU((a),((b)&0x0000ffff)),16))
+
+#define MULT16_32_P16(a,b) MULT16_32_PX(a,b,16)
+
+#define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits))))
+#define QCONST32(x,bits) ((opus_val32)(.5+(x)*(((opus_val32)1)<<(bits))))
+
+#define VERIFY_SHORT(x) ((x)<=32767&&(x)>=-32768)
+#define VERIFY_INT(x) ((x)<=2147483647LL&&(x)>=-2147483648LL)
+#define VERIFY_UINT(x) ((x)<=(2147483647LLU<<1))
+
+#define SHR(a,b) SHR32(a,b)
+#define PSHR(a,b) PSHR32(a,b)
+
+static OPUS_INLINE short NEG16(int x)
+{
+   int res;
+   if (!VERIFY_SHORT(x))
+   {
+      fprintf (stderr, "NEG16: input is not short: %d\n", (int)x);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = -x;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "NEG16: output is not short: %d\n", (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips++;
+   return res;
+}
+static OPUS_INLINE int NEG32(opus_int64 x)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(x))
+   {
+      fprintf (stderr, "NEG16: input is not int: %d\n", (int)x);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = -x;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "NEG16: output is not int: %d\n", (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=2;
+   return res;
+}
+
+#define EXTRACT16(x) EXTRACT16_(x, __FILE__, __LINE__)
+static OPUS_INLINE short EXTRACT16_(int x, char *file, int line)
+{
+   int res;
+   if (!VERIFY_SHORT(x))
+   {
+      fprintf (stderr, "EXTRACT16: input is not short: %d in %s: line %d\n", x, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = x;
+   celt_mips++;
+   return res;
+}
+
+#define EXTEND32(x) EXTEND32_(x, __FILE__, __LINE__)
+static OPUS_INLINE int EXTEND32_(int x, char *file, int line)
+{
+   int res;
+   if (!VERIFY_SHORT(x))
+   {
+      fprintf (stderr, "EXTEND32: input is not short: %d in %s: line %d\n", x, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = x;
+   celt_mips++;
+   return res;
+}
+
+#define SHR16(a, shift) SHR16_(a, shift, __FILE__, __LINE__)
+static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line)
+{
+   int res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
+   {
+      fprintf (stderr, "SHR16: inputs are not short: %d >> %d in %s: line %d\n", a, shift, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a>>shift;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "SHR16: output is not short: %d in %s: line %d\n", res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips++;
+   return res;
+}
+#define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__)
+static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
+{
+   int res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
+   {
+      fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a<<shift;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips++;
+   return res;
+}
+
+static OPUS_INLINE int SHR32(opus_int64 a, int shift)
+{
+   opus_int64  res;
+   if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
+   {
+      fprintf (stderr, "SHR32: inputs are not int: %d %d\n", (int)a, shift);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a>>shift;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "SHR32: output is not int: %d\n", (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=2;
+   return res;
+}
+#define SHL32(a, shift) SHL32_(a, shift, __FILE__, __LINE__)
+static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
+{
+   opus_int64  res;
+   if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
+   {
+      fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", a, shift, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a<<shift;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", a, shift, res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=2;
+   return res;
+}
+
+#define PSHR32(a,shift) (celt_mips--,SHR32(ADD32((a),(((opus_val32)(1)<<((shift))>>1))),shift))
+#define VSHR32(a, shift) (((shift)>0) ? SHR32(a, shift) : SHL32(a, -(shift)))
+
+#define ROUND16(x,a) (celt_mips--,EXTRACT16(PSHR32((x),(a))))
+#define HALF16(x)  (SHR16(x,1))
+#define HALF32(x)  (SHR32(x,1))
+
+//#define SHR(a,shift) ((a) >> (shift))
+//#define SHL(a,shift) ((a) << (shift))
+
+#define ADD16(a, b) ADD16_(a, b, __FILE__, __LINE__)
+static OPUS_INLINE short ADD16_(int a, int b, char *file, int line)
+{
+   int res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "ADD16: inputs are not short: %d %d in %s: line %d\n", a, b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a+b;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "ADD16: output is not short: %d+%d=%d in %s: line %d\n", a,b,res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips++;
+   return res;
+}
+
+#define SUB16(a, b) SUB16_(a, b, __FILE__, __LINE__)
+static OPUS_INLINE short SUB16_(int a, int b, char *file, int line)
+{
+   int res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "SUB16: inputs are not short: %d %d in %s: line %d\n", a, b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a-b;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "SUB16: output is not short: %d in %s: line %d\n", res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips++;
+   return res;
+}
+
+#define ADD32(a, b) ADD32_(a, b, __FILE__, __LINE__)
+static OPUS_INLINE int ADD32_(opus_int64 a, opus_int64 b, char *file, int line)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "ADD32: inputs are not int: %d %d in %s: line %d\n", (int)a, (int)b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a+b;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "ADD32: output is not int: %d in %s: line %d\n", (int)res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=2;
+   return res;
+}
+
+#define SUB32(a, b) SUB32_(a, b, __FILE__, __LINE__)
+static OPUS_INLINE int SUB32_(opus_int64 a, opus_int64 b, char *file, int line)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "SUB32: inputs are not int: %d %d in %s: line %d\n", (int)a, (int)b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a-b;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "SUB32: output is not int: %d in %s: line %d\n", (int)res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=2;
+   return res;
+}
+
+#undef UADD32
+#define UADD32(a, b) UADD32_(a, b, __FILE__, __LINE__)
+static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file, int line)
+{
+   opus_uint64 res;
+   if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
+   {
+      fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a+b;
+   if (!VERIFY_UINT(res))
+   {
+      fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=2;
+   return res;
+}
+
+#undef USUB32
+#define USUB32(a, b) USUB32_(a, b, __FILE__, __LINE__)
+static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file, int line)
+{
+   opus_uint64 res;
+   if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
+   {
+      fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   if (a<b)
+   {
+      fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", a, b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a-b;
+   if (!VERIFY_UINT(res))
+   {
+      fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", a, b, res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=2;
+   return res;
+}
+
+/* result fits in 16 bits */
+static OPUS_INLINE short MULT16_16_16(int a, int b)
+{
+   int res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "MULT16_16_16: inputs are not short: %d %d\n", a, b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a*b;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "MULT16_16_16: output is not short: %d\n", res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips++;
+   return res;
+}
+
+#define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__)
+static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line)
+{
+   opus_int64 res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "MULT16_16: inputs are not short: %d %d in %s: line %d\n", a, b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)a)*b;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT16_16: output is not int: %d in %s: line %d\n", (int)res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips++;
+   return res;
+}
+
+#define MAC16_16(c,a,b)     (celt_mips-=2,ADD32((c),MULT16_16((a),(b))))
+
+#define MULT16_32_QX(a, b, Q) MULT16_32_QX_(a, b, Q, __FILE__, __LINE__)
+static OPUS_INLINE int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int line)
+{
+   opus_int64 res;
+   if (!VERIFY_SHORT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT16_32_Q%d: inputs are not short+int: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   if (ABS32(b)>=((opus_val32)(1)<<(15+Q)))
+   {
+      fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = (((opus_int64)a)*(opus_int64)b) >> Q;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT16_32_Q%d: output is not int: %d*%d=%d in %s: line %d\n", Q, (int)a, (int)b,(int)res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   if (Q==15)
+      celt_mips+=3;
+   else
+      celt_mips+=4;
+   return res;
+}
+
+#define MULT16_32_PX(a, b, Q) MULT16_32_PX_(a, b, Q, __FILE__, __LINE__)
+static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int line)
+{
+   opus_int64 res;
+   if (!VERIFY_SHORT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT16_32_P%d: inputs are not short+int: %d %d in %s: line %d\n\n", Q, (int)a, (int)b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   if (ABS32(b)>=((opus_int64)(1)<<(15+Q)))
+   {
+      fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n\n", Q, (int)a, (int)b,file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((((opus_int64)a)*(opus_int64)b) + (((opus_val32)(1)<<Q)>>1))>> Q;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT16_32_P%d: output is not int: %d*%d=%d in %s: line %d\n\n", Q, (int)a, (int)b,(int)res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   if (Q==15)
+      celt_mips+=4;
+   else
+      celt_mips+=5;
+   return res;
+}
+
+#define MULT16_32_Q15(a,b) MULT16_32_QX(a,b,15)
+#define MAC16_32_Q15(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q15((a),(b))))
+
+static OPUS_INLINE int SATURATE(int a, int b)
+{
+   if (a>b)
+      a=b;
+   if (a<-b)
+      a = -b;
+   celt_mips+=3;
+   return a;
+}
+
+static OPUS_INLINE opus_int16 SATURATE16(opus_int32 a)
+{
+   celt_mips+=3;
+   if (a>32767)
+      return 32767;
+   else if (a<-32768)
+      return -32768;
+   else return a;
+}
+
+static OPUS_INLINE int MULT16_16_Q11_32(int a, int b)
+{
+   opus_int64 res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "MULT16_16_Q11: inputs are not short: %d %d\n", a, b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)a)*b;
+   res >>= 11;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT16_16_Q11: output is not short: %d*%d=%d\n", (int)a, (int)b, (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=3;
+   return res;
+}
+static OPUS_INLINE short MULT16_16_Q13(int a, int b)
+{
+   opus_int64 res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "MULT16_16_Q13: inputs are not short: %d %d\n", a, b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)a)*b;
+   res >>= 13;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "MULT16_16_Q13: output is not short: %d*%d=%d\n", a, b, (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=3;
+   return res;
+}
+static OPUS_INLINE short MULT16_16_Q14(int a, int b)
+{
+   opus_int64 res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "MULT16_16_Q14: inputs are not short: %d %d\n", a, b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)a)*b;
+   res >>= 14;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "MULT16_16_Q14: output is not short: %d\n", (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=3;
+   return res;
+}
+
+#define MULT16_16_Q15(a, b) MULT16_16_Q15_(a, b, __FILE__, __LINE__)
+static OPUS_INLINE short MULT16_16_Q15_(int a, int b, char *file, int line)
+{
+   opus_int64 res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "MULT16_16_Q15: inputs are not short: %d %d in %s: line %d\n", a, b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)a)*b;
+   res >>= 15;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "MULT16_16_Q15: output is not short: %d in %s: line %d\n", (int)res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=1;
+   return res;
+}
+
+static OPUS_INLINE short MULT16_16_P13(int a, int b)
+{
+   opus_int64 res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "MULT16_16_P13: inputs are not short: %d %d\n", a, b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)a)*b;
+   res += 4096;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT16_16_P13: overflow: %d*%d=%d\n", a, b, (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res >>= 13;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "MULT16_16_P13: output is not short: %d*%d=%d\n", a, b, (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=4;
+   return res;
+}
+static OPUS_INLINE short MULT16_16_P14(int a, int b)
+{
+   opus_int64 res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "MULT16_16_P14: inputs are not short: %d %d\n", a, b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)a)*b;
+   res += 8192;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT16_16_P14: overflow: %d*%d=%d\n", a, b, (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res >>= 14;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "MULT16_16_P14: output is not short: %d*%d=%d\n", a, b, (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=4;
+   return res;
+}
+static OPUS_INLINE short MULT16_16_P15(int a, int b)
+{
+   opus_int64 res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "MULT16_16_P15: inputs are not short: %d %d\n", a, b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)a)*b;
+   res += 16384;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT16_16_P15: overflow: %d*%d=%d\n", a, b, (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res >>= 15;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "MULT16_16_P15: output is not short: %d*%d=%d\n", a, b, (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=2;
+   return res;
+}
+
+#define DIV32_16(a, b) DIV32_16_(a, b, __FILE__, __LINE__)
+
+static OPUS_INLINE int DIV32_16_(opus_int64 a, opus_int64 b, char *file, int line)
+{
+   opus_int64 res;
+   if (b==0)
+   {
+      fprintf(stderr, "DIV32_16: divide by zero: %d/%d in %s: line %d\n", (int)a, (int)b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+      return 0;
+   }
+   if (!VERIFY_INT(a) || !VERIFY_SHORT(b))
+   {
+      fprintf (stderr, "DIV32_16: inputs are not int/short: %d %d in %s: line %d\n", (int)a, (int)b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a/b;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "DIV32_16: output is not short: %d / %d = %d in %s: line %d\n", (int)a,(int)b,(int)res, file, line);
+      if (res>32767)
+         res = 32767;
+      if (res<-32768)
+         res = -32768;
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=35;
+   return res;
+}
+
+#define DIV32(a, b) DIV32_(a, b, __FILE__, __LINE__)
+static OPUS_INLINE int DIV32_(opus_int64 a, opus_int64 b, char *file, int line)
+{
+   opus_int64 res;
+   if (b==0)
+   {
+      fprintf(stderr, "DIV32: divide by zero: %d/%d in %s: line %d\n", (int)a, (int)b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+      return 0;
+   }
+
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "DIV32: inputs are not int/short: %d %d in %s: line %d\n", (int)a, (int)b, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a/b;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "DIV32: output is not int: %d in %s: line %d\n", (int)res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=70;
+   return res;
+}
+
+#undef PRINT_MIPS
+#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0);
+
+#endif

+ 134 - 0
drivers/opus/celt/fixed_generic.h

@@ -0,0 +1,134 @@
+/* Copyright (C) 2007-2009 Xiph.Org Foundation
+   Copyright (C) 2003-2008 Jean-Marc Valin
+   Copyright (C) 2007-2008 CSIRO */
+/**
+   @file fixed_generic.h
+   @brief Generic fixed-point operations
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_GENERIC_H
+#define FIXED_GENERIC_H
+
+/** Multiply a 16-bit signed value by a 16-bit unsigned value. The result is a 32-bit signed value */
+#define MULT16_16SU(a,b) ((opus_val32)(opus_val16)(a)*(opus_val32)(opus_uint16)(b))
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
+#define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16))
+
+/** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). Results fits in 32 bits */
+#define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16SU((a),((b)&0x0000ffff)),16))
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
+#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
+
+/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
+#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))
+
+/** Compile-time conversion of float constant to 16-bit value */
+#define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits))))
+
+/** Compile-time conversion of float constant to 32-bit value */
+#define QCONST32(x,bits) ((opus_val32)(.5+(x)*(((opus_val32)1)<<(bits))))
+
+/** Negate a 16-bit value */
+#define NEG16(x) (-(x))
+/** Negate a 32-bit value */
+#define NEG32(x) (-(x))
+
+/** Change a 32-bit value into a 16-bit value. The value is assumed to fit in 16-bit, otherwise the result is undefined */
+#define EXTRACT16(x) ((opus_val16)(x))
+/** Change a 16-bit value into a 32-bit value */
+#define EXTEND32(x) ((opus_val32)(x))
+
+/** Arithmetic shift-right of a 16-bit value */
+#define SHR16(a,shift) ((a) >> (shift))
+/** Arithmetic shift-left of a 16-bit value */
+#define SHL16(a,shift) ((opus_int16)((opus_uint16)(a)<<(shift)))
+/** Arithmetic shift-right of a 32-bit value */
+#define SHR32(a,shift) ((a) >> (shift))
+/** Arithmetic shift-left of a 32-bit value */
+#define SHL32(a,shift) ((opus_int32)((opus_uint32)(a)<<(shift)))
+
+/** 32-bit arithmetic shift right with rounding-to-nearest instead of rounding down */
+#define PSHR32(a,shift) (SHR32((a)+((EXTEND32(1)<<((shift))>>1)),shift))
+/** 32-bit arithmetic shift right where the argument can be negative */
+#define VSHR32(a, shift) (((shift)>0) ? SHR32(a, shift) : SHL32(a, -(shift)))
+
+/** "RAW" macros, should not be used outside of this header file */
+#define SHR(a,shift) ((a) >> (shift))
+#define SHL(a,shift) SHL32(a,shift)
+#define PSHR(a,shift) (SHR((a)+((EXTEND32(1)<<((shift))>>1)),shift))
+#define SATURATE(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
+
+#define SATURATE16(x) (EXTRACT16((x)>32767 ? 32767 : (x)<-32768 ? -32768 : (x)))
+
+/** Shift by a and round-to-neareast 32-bit value. Result is a 16-bit value */
+#define ROUND16(x,a) (EXTRACT16(PSHR32((x),(a))))
+/** Divide by two */
+#define HALF16(x)  (SHR16(x,1))
+#define HALF32(x)  (SHR32(x,1))
+
+/** Add two 16-bit values */
+#define ADD16(a,b) ((opus_val16)((opus_val16)(a)+(opus_val16)(b)))
+/** Subtract two 16-bit values */
+#define SUB16(a,b) ((opus_val16)(a)-(opus_val16)(b))
+/** Add two 32-bit values */
+#define ADD32(a,b) ((opus_val32)(a)+(opus_val32)(b))
+/** Subtract two 32-bit values */
+#define SUB32(a,b) ((opus_val32)(a)-(opus_val32)(b))
+
+/** 16x16 multiplication where the result fits in 16 bits */
+#define MULT16_16_16(a,b)     ((((opus_val16)(a))*((opus_val16)(b))))
+
+/* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */
+/** 16x16 multiplication where the result fits in 32 bits */
+#define MULT16_16(a,b)     (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b)))
+
+/** 16x16 multiply-add where the result fits in 32 bits */
+#define MAC16_16(c,a,b) (ADD32((c),MULT16_16((a),(b))))
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
+#define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
+
+#define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11))
+#define MULT16_16_Q11(a,b) (SHR(MULT16_16((a),(b)),11))
+#define MULT16_16_Q13(a,b) (SHR(MULT16_16((a),(b)),13))
+#define MULT16_16_Q14(a,b) (SHR(MULT16_16((a),(b)),14))
+#define MULT16_16_Q15(a,b) (SHR(MULT16_16((a),(b)),15))
+
+#define MULT16_16_P13(a,b) (SHR(ADD32(4096,MULT16_16((a),(b))),13))
+#define MULT16_16_P14(a,b) (SHR(ADD32(8192,MULT16_16((a),(b))),14))
+#define MULT16_16_P15(a,b) (SHR(ADD32(16384,MULT16_16((a),(b))),15))
+
+/** Divide a 32-bit value by a 16-bit value. Result fits in 16 bits */
+#define DIV32_16(a,b) ((opus_val16)(((opus_val32)(a))/((opus_val16)(b))))
+
+/** Divide a 32-bit value by a 32-bit value. Result fits in 32 bits */
+#define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b)))
+
+#endif

+ 140 - 0
drivers/opus/celt/float_cast.h

@@ -0,0 +1,140 @@
+/* Copyright (C) 2001 Erik de Castro Lopo <erikd AT mega-nerd DOT com> */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Version 1.1 */
+
+#ifndef FLOAT_CAST_H
+#define FLOAT_CAST_H
+
+
+#include "arch.h"
+
+/*============================================================================
+**      On Intel Pentium processors (especially PIII and probably P4), converting
+**      from float to int is very slow. To meet the C specs, the code produced by
+**      most C compilers targeting Pentium needs to change the FPU rounding mode
+**      before the float to int conversion is performed.
+**
+**      Changing the FPU rounding mode causes the FPU pipeline to be flushed. It
+**      is this flushing of the pipeline which is so slow.
+**
+**      Fortunately the ISO C99 specifications define the functions lrint, lrintf,
+**      llrint and llrintf which fix this problem as a side effect.
+**
+**      On Unix-like systems, the configure process should have detected the
+**      presence of these functions. If they weren't found we have to replace them
+**      here with a standard C cast.
+*/
+
+/*
+**      The C99 prototypes for lrint and lrintf are as follows:
+**
+**              long int lrintf (float x) ;
+**              long int lrint  (double x) ;
+*/
+
+/*      The presence of the required functions are detected during the configure
+**      process and the values HAVE_LRINT and HAVE_LRINTF are set accordingly in
+**      the config.h file.
+*/
+
+#if (HAVE_LRINTF)
+
+/*      These defines enable functionality introduced with the 1999 ISO C
+**      standard. They must be defined before the inclusion of math.h to
+**      engage them. If optimisation is enabled, these functions will be
+**      inlined. With optimisation switched off, you have to link in the
+**      maths library using -lm.
+*/
+
+#define _ISOC9X_SOURCE 1
+#define _ISOC99_SOURCE 1
+
+#define __USE_ISOC9X 1
+#define __USE_ISOC99 1
+
+#include <math.h>
+#define float2int(x) lrintf(x)
+
+#elif (defined(HAVE_LRINT))
+
+#define _ISOC9X_SOURCE 1
+#define _ISOC99_SOURCE 1
+
+#define __USE_ISOC9X 1
+#define __USE_ISOC99 1
+
+#include <math.h>
+#define float2int(x) lrint(x)
+
+#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && (defined (WIN64) || defined (_WIN64))
+        #include <xmmintrin.h>
+
+        __inline long int float2int(float value)
+        {
+                return _mm_cvtss_si32(_mm_load_ss(&value));
+        }
+#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && (defined (WIN32) || defined (_WIN32))
+        #include <math.h>
+
+        /*      Win32 doesn't seem to have these functions.
+        **      Therefore implement OPUS_INLINE versions of these functions here.
+        */
+
+        __inline long int
+        float2int (float flt)
+        {       int intgr;
+
+                _asm
+                {       fld flt
+                        fistp intgr
+                } ;
+
+                return intgr ;
+        }
+
+#else
+
+#if (defined(__GNUC__) && defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L)
+        /* supported by gcc in C99 mode, but not by all other compilers */
+        #warning "Don't have the functions lrint() and lrintf ()."
+        #warning "Replacing these functions with a standard C cast."
+#endif /* __STDC_VERSION__ >= 199901L */
+        #include <math.h>
+        #define float2int(flt) ((int)(floor(.5+flt)))
+#endif
+
+#ifndef DISABLE_FLOAT_API
+static OPUS_INLINE opus_int16 FLOAT2INT16(float x)
+{
+   x = x*CELT_SIG_SCALE;
+   x = MAX32(x, -32768);
+   x = MIN32(x, 32767);
+   return (opus_int16)float2int(x);
+}
+#endif /* DISABLE_FLOAT_API */
+
+#endif /* FLOAT_CAST_H */

+ 719 - 0
drivers/opus/celt/kiss_fft.c

@@ -0,0 +1,719 @@
+/*Copyright (c) 2003-2004, Mark Borgerding
+  Lots of modifications by Jean-Marc Valin
+  Copyright (c) 2005-2007, Xiph.Org Foundation
+  Copyright (c) 2008,      Xiph.Org Foundation, CSIRO
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.*/
+
+/* This code is originally from Mark Borgerding's KISS-FFT but has been
+   heavily modified to better suit Opus */
+
+#ifndef SKIP_CONFIG_H
+#  ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#  endif
+#endif
+
+#include "_kiss_fft_guts.h"
+#include "arch.h"
+#include "os_support.h"
+#include "mathops.h"
+#include "stack_alloc.h"
+
+/* The guts header contains all the multiplication and addition macros that are defined for
+   complex numbers.  It also delares the kf_ internal functions.
+*/
+
+static void kf_bfly2(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   kiss_fft_cpx * Fout2;
+   const kiss_twiddle_cpx * tw1;
+   int i,j;
+   kiss_fft_cpx * Fout_beg = Fout;
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      Fout2 = Fout + m;
+      tw1 = st->twiddles;
+      for(j=0;j<m;j++)
+      {
+         kiss_fft_cpx t;
+         Fout->r = SHR32(Fout->r, 1);Fout->i = SHR32(Fout->i, 1);
+         Fout2->r = SHR32(Fout2->r, 1);Fout2->i = SHR32(Fout2->i, 1);
+         C_MUL (t,  *Fout2 , *tw1);
+         tw1 += fstride;
+         C_SUB( *Fout2 ,  *Fout , t );
+         C_ADDTO( *Fout ,  t );
+         ++Fout2;
+         ++Fout;
+      }
+   }
+}
+
+static void ki_bfly2(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   kiss_fft_cpx * Fout2;
+   const kiss_twiddle_cpx * tw1;
+   kiss_fft_cpx t;
+   int i,j;
+   kiss_fft_cpx * Fout_beg = Fout;
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      Fout2 = Fout + m;
+      tw1 = st->twiddles;
+      for(j=0;j<m;j++)
+      {
+         C_MULC (t,  *Fout2 , *tw1);
+         tw1 += fstride;
+         C_SUB( *Fout2 ,  *Fout , t );
+         C_ADDTO( *Fout ,  t );
+         ++Fout2;
+         ++Fout;
+      }
+   }
+}
+
+static void kf_bfly4(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   const kiss_twiddle_cpx *tw1,*tw2,*tw3;
+   kiss_fft_cpx scratch[6];
+   const size_t m2=2*m;
+   const size_t m3=3*m;
+   int i, j;
+
+   kiss_fft_cpx * Fout_beg = Fout;
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      tw3 = tw2 = tw1 = st->twiddles;
+      for (j=0;j<m;j++)
+      {
+         C_MUL4(scratch[0],Fout[m] , *tw1 );
+         C_MUL4(scratch[1],Fout[m2] , *tw2 );
+         C_MUL4(scratch[2],Fout[m3] , *tw3 );
+
+         Fout->r = PSHR32(Fout->r, 2);
+         Fout->i = PSHR32(Fout->i, 2);
+         C_SUB( scratch[5] , *Fout, scratch[1] );
+         C_ADDTO(*Fout, scratch[1]);
+         C_ADD( scratch[3] , scratch[0] , scratch[2] );
+         C_SUB( scratch[4] , scratch[0] , scratch[2] );
+         C_SUB( Fout[m2], *Fout, scratch[3] );
+         tw1 += fstride;
+         tw2 += fstride*2;
+         tw3 += fstride*3;
+         C_ADDTO( *Fout , scratch[3] );
+
+         Fout[m].r = scratch[5].r + scratch[4].i;
+         Fout[m].i = scratch[5].i - scratch[4].r;
+         Fout[m3].r = scratch[5].r - scratch[4].i;
+         Fout[m3].i = scratch[5].i + scratch[4].r;
+         ++Fout;
+      }
+   }
+}
+
+static void ki_bfly4(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   const kiss_twiddle_cpx *tw1,*tw2,*tw3;
+   kiss_fft_cpx scratch[6];
+   const size_t m2=2*m;
+   const size_t m3=3*m;
+   int i, j;
+
+   kiss_fft_cpx * Fout_beg = Fout;
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      tw3 = tw2 = tw1 = st->twiddles;
+      for (j=0;j<m;j++)
+      {
+         C_MULC(scratch[0],Fout[m] , *tw1 );
+         C_MULC(scratch[1],Fout[m2] , *tw2 );
+         C_MULC(scratch[2],Fout[m3] , *tw3 );
+
+         C_SUB( scratch[5] , *Fout, scratch[1] );
+         C_ADDTO(*Fout, scratch[1]);
+         C_ADD( scratch[3] , scratch[0] , scratch[2] );
+         C_SUB( scratch[4] , scratch[0] , scratch[2] );
+         C_SUB( Fout[m2], *Fout, scratch[3] );
+         tw1 += fstride;
+         tw2 += fstride*2;
+         tw3 += fstride*3;
+         C_ADDTO( *Fout , scratch[3] );
+
+         Fout[m].r = scratch[5].r - scratch[4].i;
+         Fout[m].i = scratch[5].i + scratch[4].r;
+         Fout[m3].r = scratch[5].r + scratch[4].i;
+         Fout[m3].i = scratch[5].i - scratch[4].r;
+         ++Fout;
+      }
+   }
+}
+
+#ifndef RADIX_TWO_ONLY
+
+static void kf_bfly3(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   int i;
+   size_t k;
+   const size_t m2 = 2*m;
+   const kiss_twiddle_cpx *tw1,*tw2;
+   kiss_fft_cpx scratch[5];
+   kiss_twiddle_cpx epi3;
+
+   kiss_fft_cpx * Fout_beg = Fout;
+   epi3 = st->twiddles[fstride*m];
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      tw1=tw2=st->twiddles;
+      k=m;
+      do {
+         C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
+
+         C_MUL(scratch[1],Fout[m] , *tw1);
+         C_MUL(scratch[2],Fout[m2] , *tw2);
+
+         C_ADD(scratch[3],scratch[1],scratch[2]);
+         C_SUB(scratch[0],scratch[1],scratch[2]);
+         tw1 += fstride;
+         tw2 += fstride*2;
+
+         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
+         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
+
+         C_MULBYSCALAR( scratch[0] , epi3.i );
+
+         C_ADDTO(*Fout,scratch[3]);
+
+         Fout[m2].r = Fout[m].r + scratch[0].i;
+         Fout[m2].i = Fout[m].i - scratch[0].r;
+
+         Fout[m].r -= scratch[0].i;
+         Fout[m].i += scratch[0].r;
+
+         ++Fout;
+      } while(--k);
+   }
+}
+
+static void ki_bfly3(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   int i, k;
+   const size_t m2 = 2*m;
+   const kiss_twiddle_cpx *tw1,*tw2;
+   kiss_fft_cpx scratch[5];
+   kiss_twiddle_cpx epi3;
+
+   kiss_fft_cpx * Fout_beg = Fout;
+   epi3 = st->twiddles[fstride*m];
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      tw1=tw2=st->twiddles;
+      k=m;
+      do{
+
+         C_MULC(scratch[1],Fout[m] , *tw1);
+         C_MULC(scratch[2],Fout[m2] , *tw2);
+
+         C_ADD(scratch[3],scratch[1],scratch[2]);
+         C_SUB(scratch[0],scratch[1],scratch[2]);
+         tw1 += fstride;
+         tw2 += fstride*2;
+
+         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
+         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
+
+         C_MULBYSCALAR( scratch[0] , -epi3.i );
+
+         C_ADDTO(*Fout,scratch[3]);
+
+         Fout[m2].r = Fout[m].r + scratch[0].i;
+         Fout[m2].i = Fout[m].i - scratch[0].r;
+
+         Fout[m].r -= scratch[0].i;
+         Fout[m].i += scratch[0].r;
+
+         ++Fout;
+      }while(--k);
+   }
+}
+
+static void kf_bfly5(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+   int i, u;
+   kiss_fft_cpx scratch[13];
+   const kiss_twiddle_cpx * twiddles = st->twiddles;
+   const kiss_twiddle_cpx *tw;
+   kiss_twiddle_cpx ya,yb;
+   kiss_fft_cpx * Fout_beg = Fout;
+
+   ya = twiddles[fstride*m];
+   yb = twiddles[fstride*2*m];
+   tw=st->twiddles;
+
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      Fout0=Fout;
+      Fout1=Fout0+m;
+      Fout2=Fout0+2*m;
+      Fout3=Fout0+3*m;
+      Fout4=Fout0+4*m;
+
+      for ( u=0; u<m; ++u ) {
+         C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
+         scratch[0] = *Fout0;
+
+         C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
+         C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
+         C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
+         C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
+
+         C_ADD( scratch[7],scratch[1],scratch[4]);
+         C_SUB( scratch[10],scratch[1],scratch[4]);
+         C_ADD( scratch[8],scratch[2],scratch[3]);
+         C_SUB( scratch[9],scratch[2],scratch[3]);
+
+         Fout0->r += scratch[7].r + scratch[8].r;
+         Fout0->i += scratch[7].i + scratch[8].i;
+
+         scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
+         scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
+
+         scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
+         scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
+
+         C_SUB(*Fout1,scratch[5],scratch[6]);
+         C_ADD(*Fout4,scratch[5],scratch[6]);
+
+         scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
+         scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
+         scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
+         scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
+
+         C_ADD(*Fout2,scratch[11],scratch[12]);
+         C_SUB(*Fout3,scratch[11],scratch[12]);
+
+         ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+      }
+   }
+}
+
+static void ki_bfly5(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+   int i, u;
+   kiss_fft_cpx scratch[13];
+   const kiss_twiddle_cpx * twiddles = st->twiddles;
+   const kiss_twiddle_cpx *tw;
+   kiss_twiddle_cpx ya,yb;
+   kiss_fft_cpx * Fout_beg = Fout;
+
+   ya = twiddles[fstride*m];
+   yb = twiddles[fstride*2*m];
+   tw=st->twiddles;
+
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      Fout0=Fout;
+      Fout1=Fout0+m;
+      Fout2=Fout0+2*m;
+      Fout3=Fout0+3*m;
+      Fout4=Fout0+4*m;
+
+      for ( u=0; u<m; ++u ) {
+         scratch[0] = *Fout0;
+
+         C_MULC(scratch[1] ,*Fout1, tw[u*fstride]);
+         C_MULC(scratch[2] ,*Fout2, tw[2*u*fstride]);
+         C_MULC(scratch[3] ,*Fout3, tw[3*u*fstride]);
+         C_MULC(scratch[4] ,*Fout4, tw[4*u*fstride]);
+
+         C_ADD( scratch[7],scratch[1],scratch[4]);
+         C_SUB( scratch[10],scratch[1],scratch[4]);
+         C_ADD( scratch[8],scratch[2],scratch[3]);
+         C_SUB( scratch[9],scratch[2],scratch[3]);
+
+         Fout0->r += scratch[7].r + scratch[8].r;
+         Fout0->i += scratch[7].i + scratch[8].i;
+
+         scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
+         scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
+
+         scratch[6].r = -S_MUL(scratch[10].i,ya.i) - S_MUL(scratch[9].i,yb.i);
+         scratch[6].i =  S_MUL(scratch[10].r,ya.i) + S_MUL(scratch[9].r,yb.i);
+
+         C_SUB(*Fout1,scratch[5],scratch[6]);
+         C_ADD(*Fout4,scratch[5],scratch[6]);
+
+         scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
+         scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
+         scratch[12].r =  S_MUL(scratch[10].i,yb.i) - S_MUL(scratch[9].i,ya.i);
+         scratch[12].i = -S_MUL(scratch[10].r,yb.i) + S_MUL(scratch[9].r,ya.i);
+
+         C_ADD(*Fout2,scratch[11],scratch[12]);
+         C_SUB(*Fout3,scratch[11],scratch[12]);
+
+         ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+      }
+   }
+}
+
+#endif
+
+
+#ifdef CUSTOM_MODES
+
+static
+void compute_bitrev_table(
+         int Fout,
+         opus_int16 *f,
+         const size_t fstride,
+         int in_stride,
+         opus_int16 * factors,
+         const kiss_fft_state *st
+            )
+{
+   const int p=*factors++; /* the radix  */
+   const int m=*factors++; /* stage's fft length/p */
+
+    /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
+   if (m==1)
+   {
+      int j;
+      for (j=0;j<p;j++)
+      {
+         *f = Fout+j;
+         f += fstride*in_stride;
+      }
+   } else {
+      int j;
+      for (j=0;j<p;j++)
+      {
+         compute_bitrev_table( Fout , f, fstride*p, in_stride, factors,st);
+         f += fstride*in_stride;
+         Fout += m;
+      }
+   }
+}
+
+/*  facbuf is populated by p1,m1,p2,m2, ...
+    where
+    p[i] * m[i] = m[i-1]
+    m0 = n                  */
+static
+int kf_factor(int n,opus_int16 * facbuf)
+{
+    int p=4;
+
+    /*factor out powers of 4, powers of 2, then any remaining primes */
+    do {
+        while (n % p) {
+            switch (p) {
+                case 4: p = 2; break;
+                case 2: p = 3; break;
+                default: p += 2; break;
+            }
+            if (p>32000 || (opus_int32)p*(opus_int32)p > n)
+                p = n;          /* no more factors, skip to end */
+        }
+        n /= p;
+#ifdef RADIX_TWO_ONLY
+        if (p!=2 && p != 4)
+#else
+        if (p>5)
+#endif
+        {
+           return 0;
+        }
+        *facbuf++ = p;
+        *facbuf++ = n;
+    } while (n > 1);
+    return 1;
+}
+
+static void compute_twiddles(kiss_twiddle_cpx *twiddles, int nfft)
+{
+   int i;
+#ifdef OPUS_FIXED_POINT
+   for (i=0;i<nfft;++i) {
+      opus_val32 phase = -i;
+      kf_cexp2(twiddles+i, DIV32(SHL32(phase,17),nfft));
+   }
+#else
+   for (i=0;i<nfft;++i) {
+      const double pi=3.14159265358979323846264338327;
+      double phase = ( -2*pi /nfft ) * i;
+      kf_cexp(twiddles+i, phase );
+   }
+#endif
+}
+
+/*
+ *
+ * Allocates all necessary storage space for the fft and ifft.
+ * The return value is a contiguous block of memory.  As such,
+ * It can be freed with free().
+ * */
+kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,  const kiss_fft_state *base)
+{
+    kiss_fft_state *st=NULL;
+    size_t memneeded = sizeof(struct kiss_fft_state); /* twiddle factors*/
+
+    if ( lenmem==NULL ) {
+        st = ( kiss_fft_state*)KISS_FFT_MALLOC( memneeded );
+    }else{
+        if (mem != NULL && *lenmem >= memneeded)
+            st = (kiss_fft_state*)mem;
+        *lenmem = memneeded;
+    }
+    if (st) {
+        opus_int16 *bitrev;
+        kiss_twiddle_cpx *twiddles;
+
+        st->nfft=nfft;
+#ifndef OPUS_FIXED_POINT
+        st->scale = 1.f/nfft;
+#endif
+        if (base != NULL)
+        {
+           st->twiddles = base->twiddles;
+           st->shift = 0;
+           while (nfft<<st->shift != base->nfft && st->shift < 32)
+              st->shift++;
+           if (st->shift>=32)
+              goto fail;
+        } else {
+           st->twiddles = twiddles = (kiss_twiddle_cpx*)KISS_FFT_MALLOC(sizeof(kiss_twiddle_cpx)*nfft);
+           compute_twiddles(twiddles, nfft);
+           st->shift = -1;
+        }
+        if (!kf_factor(nfft,st->factors))
+        {
+           goto fail;
+        }
+
+        /* bitrev */
+        st->bitrev = bitrev = (opus_int16*)KISS_FFT_MALLOC(sizeof(opus_int16)*nfft);
+        if (st->bitrev==NULL)
+            goto fail;
+        compute_bitrev_table(0, bitrev, 1,1, st->factors,st);
+    }
+    return st;
+fail:
+    opus_fft_free(st);
+    return NULL;
+}
+
+kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem )
+{
+   return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL);
+}
+
+void opus_fft_free(const kiss_fft_state *cfg)
+{
+   if (cfg)
+   {
+      opus_free((opus_int16*)cfg->bitrev);
+      if (cfg->shift < 0)
+         opus_free((kiss_twiddle_cpx*)cfg->twiddles);
+      opus_free((kiss_fft_state*)cfg);
+   }
+}
+
+#endif /* CUSTOM_MODES */
+
+void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
+{
+    int m2, m;
+    int p;
+    int L;
+    int fstride[MAXFACTORS];
+    int i;
+    int shift;
+
+    /* st->shift can be -1 */
+    shift = st->shift>0 ? st->shift : 0;
+
+    celt_assert2 (fin != fout, "In-place FFT not supported");
+    /* Bit-reverse the input */
+    for (i=0;i<st->nfft;i++)
+    {
+       fout[st->bitrev[i]] = fin[i];
+#ifndef OPUS_FIXED_POINT
+       fout[st->bitrev[i]].r *= st->scale;
+       fout[st->bitrev[i]].i *= st->scale;
+#endif
+    }
+
+    fstride[0] = 1;
+    L=0;
+    do {
+       p = st->factors[2*L];
+       m = st->factors[2*L+1];
+       fstride[L+1] = fstride[L]*p;
+       L++;
+    } while(m!=1);
+    m = st->factors[2*L-1];
+    for (i=L-1;i>=0;i--)
+    {
+       if (i!=0)
+          m2 = st->factors[2*i-1];
+       else
+          m2 = 1;
+       switch (st->factors[2*i])
+       {
+       case 2:
+          kf_bfly2(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+          break;
+       case 4:
+          kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+          break;
+ #ifndef RADIX_TWO_ONLY
+       case 3:
+          kf_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+          break;
+       case 5:
+          kf_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+          break;
+ #endif
+       }
+       m = m2;
+    }
+}
+
+void opus_ifft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
+{
+   int m2, m;
+   int p;
+   int L;
+   int fstride[MAXFACTORS];
+   int i;
+   int shift;
+
+   /* st->shift can be -1 */
+   shift = st->shift>0 ? st->shift : 0;
+   celt_assert2 (fin != fout, "In-place FFT not supported");
+   /* Bit-reverse the input */
+   for (i=0;i<st->nfft;i++)
+      fout[st->bitrev[i]] = fin[i];
+
+   fstride[0] = 1;
+   L=0;
+   do {
+      p = st->factors[2*L];
+      m = st->factors[2*L+1];
+      fstride[L+1] = fstride[L]*p;
+      L++;
+   } while(m!=1);
+   m = st->factors[2*L-1];
+   for (i=L-1;i>=0;i--)
+   {
+      if (i!=0)
+         m2 = st->factors[2*i-1];
+      else
+         m2 = 1;
+      switch (st->factors[2*i])
+      {
+      case 2:
+         ki_bfly2(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+         break;
+      case 4:
+         ki_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+         break;
+#ifndef RADIX_TWO_ONLY
+      case 3:
+         ki_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+         break;
+      case 5:
+         ki_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+         break;
+#endif
+      }
+      m = m2;
+   }
+}
+

+ 139 - 0
drivers/opus/celt/kiss_fft.h

@@ -0,0 +1,139 @@
+/*Copyright (c) 2003-2004, Mark Borgerding
+  Lots of modifications by Jean-Marc Valin
+  Copyright (c) 2005-2007, Xiph.Org Foundation
+  Copyright (c) 2008,      Xiph.Org Foundation, CSIRO
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifndef KISS_FFT_H
+#define KISS_FFT_H
+
+#include <stdlib.h>
+#include <math.h>
+#include "arch.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef USE_SIMD
+# include <xmmintrin.h>
+# define kiss_fft_scalar __m128
+#define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes)
+#else
+#define KISS_FFT_MALLOC opus_alloc
+#endif
+
+#ifdef OPUS_FIXED_POINT
+#include "arch.h"
+
+#  define kiss_fft_scalar opus_int32
+#  define kiss_twiddle_scalar opus_int16
+
+
+#else
+# ifndef kiss_fft_scalar
+/*  default is float */
+#   define kiss_fft_scalar float
+#   define kiss_twiddle_scalar float
+#   define KF_SUFFIX _celt_single
+# endif
+#endif
+
+typedef struct {
+    kiss_fft_scalar r;
+    kiss_fft_scalar i;
+}kiss_fft_cpx;
+
+typedef struct {
+   kiss_twiddle_scalar r;
+   kiss_twiddle_scalar i;
+}kiss_twiddle_cpx;
+
+#define MAXFACTORS 8
+/* e.g. an fft of length 128 has 4 factors
+ as far as kissfft is concerned
+ 4*4*4*2
+ */
+
+typedef struct kiss_fft_state{
+    int nfft;
+#ifndef OPUS_FIXED_POINT
+    kiss_fft_scalar scale;
+#endif
+    int shift;
+    opus_int16 factors[2*MAXFACTORS];
+    const opus_int16 *bitrev;
+    const kiss_twiddle_cpx *twiddles;
+} kiss_fft_state;
+
+/*typedef struct kiss_fft_state* kiss_fft_cfg;*/
+
+/**
+ *  opus_fft_alloc
+ *
+ *  Initialize a FFT (or IFFT) algorithm's cfg/state buffer.
+ *
+ *  typical usage:      kiss_fft_cfg mycfg=opus_fft_alloc(1024,0,NULL,NULL);
+ *
+ *  The return value from fft_alloc is a cfg buffer used internally
+ *  by the fft routine or NULL.
+ *
+ *  If lenmem is NULL, then opus_fft_alloc will allocate a cfg buffer using malloc.
+ *  The returned value should be free()d when done to avoid memory leaks.
+ *
+ *  The state can be placed in a user supplied buffer 'mem':
+ *  If lenmem is not NULL and mem is not NULL and *lenmem is large enough,
+ *      then the function places the cfg in mem and the size used in *lenmem
+ *      and returns mem.
+ *
+ *  If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough),
+ *      then the function returns NULL and places the minimum cfg
+ *      buffer size in *lenmem.
+ * */
+
+kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base);
+
+kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem);
+
+/**
+ * opus_fft(cfg,in_out_buf)
+ *
+ * Perform an FFT on a complex input buffer.
+ * for a forward FFT,
+ * fin should be  f[0] , f[1] , ... ,f[nfft-1]
+ * fout will be   F[0] , F[1] , ... ,F[nfft-1]
+ * Note that each element is complex and can be accessed like
+    f[k].r and f[k].i
+ * */
+void opus_fft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+void opus_ifft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+
+void opus_fft_free(const kiss_fft_state *cfg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

+ 134 - 0
drivers/opus/celt/laplace.c

@@ -0,0 +1,134 @@
+/* Copyright (c) 2007 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "laplace.h"
+#include "mathops.h"
+
+/* The minimum probability of an energy delta (out of 32768). */
+#define LAPLACE_LOG_MINP (0)
+#define LAPLACE_MINP (1<<LAPLACE_LOG_MINP)
+/* The minimum number of guaranteed representable energy deltas (in one
+    direction). */
+#define LAPLACE_NMIN (16)
+
+/* When called, decay is positive and at most 11456. */
+static unsigned ec_laplace_get_freq1(unsigned fs0, int decay)
+{
+   unsigned ft;
+   ft = 32768 - LAPLACE_MINP*(2*LAPLACE_NMIN) - fs0;
+   return ft*(opus_int32)(16384-decay)>>15;
+}
+
+void ec_laplace_encode(ec_enc *enc, int *value, unsigned fs, int decay)
+{
+   unsigned fl;
+   int val = *value;
+   fl = 0;
+   if (val)
+   {
+      int s;
+      int i;
+      s = -(val<0);
+      val = (val+s)^s;
+      fl = fs;
+      fs = ec_laplace_get_freq1(fs, decay);
+      /* Search the decaying part of the PDF.*/
+      for (i=1; fs > 0 && i < val; i++)
+      {
+         fs *= 2;
+         fl += fs+2*LAPLACE_MINP;
+         fs = (fs*(opus_int32)decay)>>15;
+      }
+      /* Everything beyond that has probability LAPLACE_MINP. */
+      if (!fs)
+      {
+         int di;
+         int ndi_max;
+         ndi_max = (32768-fl+LAPLACE_MINP-1)>>LAPLACE_LOG_MINP;
+         ndi_max = (ndi_max-s)>>1;
+         di = IMIN(val - i, ndi_max - 1);
+         fl += (2*di+1+s)*LAPLACE_MINP;
+         fs = IMIN(LAPLACE_MINP, 32768-fl);
+         *value = (i+di+s)^s;
+      }
+      else
+      {
+         fs += LAPLACE_MINP;
+         fl += fs&~s;
+      }
+      celt_assert(fl+fs<=32768);
+      celt_assert(fs>0);
+   }
+   ec_encode_bin(enc, fl, fl+fs, 15);
+}
+
+int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay)
+{
+   int val=0;
+   unsigned fl;
+   unsigned fm;
+   fm = ec_decode_bin(dec, 15);
+   fl = 0;
+   if (fm >= fs)
+   {
+      val++;
+      fl = fs;
+      fs = ec_laplace_get_freq1(fs, decay)+LAPLACE_MINP;
+      /* Search the decaying part of the PDF.*/
+      while(fs > LAPLACE_MINP && fm >= fl+2*fs)
+      {
+         fs *= 2;
+         fl += fs;
+         fs = ((fs-2*LAPLACE_MINP)*(opus_int32)decay)>>15;
+         fs += LAPLACE_MINP;
+         val++;
+      }
+      /* Everything beyond that has probability LAPLACE_MINP. */
+      if (fs <= LAPLACE_MINP)
+      {
+         int di;
+         di = (fm-fl)>>(LAPLACE_LOG_MINP+1);
+         val += di;
+         fl += 2*di*LAPLACE_MINP;
+      }
+      if (fm < fl+fs)
+         val = -val;
+      else
+         fl += fs;
+   }
+   celt_assert(fl<32768);
+   celt_assert(fs>0);
+   celt_assert(fl<=fm);
+   celt_assert(fm<IMIN(fl+fs,32768));
+   ec_dec_update(dec, fl, IMIN(fl+fs,32768), 32768);
+   return val;
+}

+ 48 - 0
drivers/opus/celt/laplace.h

@@ -0,0 +1,48 @@
+/* Copyright (c) 2007 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "entenc.h"
+#include "entdec.h"
+
+/** Encode a value that is assumed to be the realisation of a
+    Laplace-distributed random process
+ @param enc Entropy encoder state
+ @param value Value to encode
+ @param fs Probability of 0, multiplied by 32768
+ @param decay Probability of the value +/- 1, multiplied by 16384
+*/
+void ec_laplace_encode(ec_enc *enc, int *value, unsigned fs, int decay);
+
+/** Decode a value that is assumed to be the realisation of a
+    Laplace-distributed random process
+ @param dec Entropy decoder state
+ @param fs Probability of 0, multiplied by 32768
+ @param decay Probability of the value +/- 1, multiplied by 16384
+ @return Value decoded
+ */
+int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay);

+ 208 - 0
drivers/opus/celt/mathops.c

@@ -0,0 +1,208 @@
+/* Copyright (c) 2002-2008 Jean-Marc Valin
+   Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/**
+   @file mathops.h
+   @brief Various math functions
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "mathops.h"
+
+/*Compute floor(sqrt(_val)) with exact arithmetic.
+  This has been tested on all possible 32-bit inputs.*/
+unsigned isqrt32(opus_uint32 _val){
+  unsigned b;
+  unsigned g;
+  int      bshift;
+  /*Uses the second method from
+     http://www.azillionmonkeys.com/qed/sqroot.html
+    The main idea is to search for the largest binary digit b such that
+     (g+b)*(g+b) <= _val, and add it to the solution g.*/
+  g=0;
+  bshift=(EC_ILOG(_val)-1)>>1;
+  b=1U<<bshift;
+  do{
+    opus_uint32 t;
+    t=(((opus_uint32)g<<1)+b)<<bshift;
+    if(t<=_val){
+      g+=b;
+      _val-=t;
+    }
+    b>>=1;
+    bshift--;
+  }
+  while(bshift>=0);
+  return g;
+}
+
+#ifdef OPUS_FIXED_POINT
+
+opus_val32 frac_div32(opus_val32 a, opus_val32 b)
+{
+   opus_val16 rcp;
+   opus_val32 result, rem;
+   int shift = celt_ilog2(b)-29;
+   a = VSHR32(a,shift);
+   b = VSHR32(b,shift);
+   /* 16-bit reciprocal */
+   rcp = ROUND16(celt_rcp(ROUND16(b,16)),3);
+   result = MULT16_32_Q15(rcp, a);
+   rem = PSHR32(a,2)-MULT32_32_Q31(result, b);
+   result = ADD32(result, SHL32(MULT16_32_Q15(rcp, rem),2));
+   if (result >= 536870912)       /*  2^29 */
+      return 2147483647;          /*  2^31 - 1 */
+   else if (result <= -536870912) /* -2^29 */
+      return -2147483647;         /* -2^31 */
+   else
+      return SHL32(result, 2);
+}
+
+/** Reciprocal sqrt approximation in the range [0.25,1) (Q16 in, Q14 out) */
+opus_val16 celt_rsqrt_norm(opus_val32 x)
+{
+   opus_val16 n;
+   opus_val16 r;
+   opus_val16 r2;
+   opus_val16 y;
+   /* Range of n is [-16384,32767] ([-0.5,1) in Q15). */
+   n = x-32768;
+   /* Get a rough initial guess for the root.
+      The optimal minimax quadratic approximation (using relative error) is
+       r = 1.437799046117536+n*(-0.823394375837328+n*0.4096419668459485).
+      Coefficients here, and the final result r, are Q14.*/
+   r = ADD16(23557, MULT16_16_Q15(n, ADD16(-13490, MULT16_16_Q15(n, 6713))));
+   /* We want y = x*r*r-1 in Q15, but x is 32-bit Q16 and r is Q14.
+      We can compute the result from n and r using Q15 multiplies with some
+       adjustment, carefully done to avoid overflow.
+      Range of y is [-1564,1594]. */
+   r2 = MULT16_16_Q15(r, r);
+   y = SHL16(SUB16(ADD16(MULT16_16_Q15(r2, n), r2), 16384), 1);
+   /* Apply a 2nd-order Householder iteration: r += r*y*(y*0.375-0.5).
+      This yields the Q14 reciprocal square root of the Q16 x, with a maximum
+       relative error of 1.04956E-4, a (relative) RMSE of 2.80979E-5, and a
+       peak absolute error of 2.26591/16384. */
+   return ADD16(r, MULT16_16_Q15(r, MULT16_16_Q15(y,
+              SUB16(MULT16_16_Q15(y, 12288), 16384))));
+}
+
+/** Sqrt approximation (QX input, QX/2 output) */
+opus_val32 celt_sqrt(opus_val32 x)
+{
+   int k;
+   opus_val16 n;
+   opus_val32 rt;
+   static const opus_val16 C[5] = {23175, 11561, -3011, 1699, -664};
+   if (x==0)
+      return 0;
+   else if (x>=1073741824)
+      return 32767;
+   k = (celt_ilog2(x)>>1)-7;
+   x = VSHR32(x, 2*k);
+   n = x-32768;
+   rt = ADD16(C[0], MULT16_16_Q15(n, ADD16(C[1], MULT16_16_Q15(n, ADD16(C[2],
+              MULT16_16_Q15(n, ADD16(C[3], MULT16_16_Q15(n, (C[4])))))))));
+   rt = VSHR32(rt,7-k);
+   return rt;
+}
+
+#define L1 32767
+#define L2 -7651
+#define L3 8277
+#define L4 -626
+
+static OPUS_INLINE opus_val16 _celt_cos_pi_2(opus_val16 x)
+{
+   opus_val16 x2;
+
+   x2 = MULT16_16_P15(x,x);
+   return ADD16(1,MIN16(32766,ADD32(SUB16(L1,x2), MULT16_16_P15(x2, ADD32(L2, MULT16_16_P15(x2, ADD32(L3, MULT16_16_P15(L4, x2
+                                                                                ))))))));
+}
+
+#undef L1
+#undef L2
+#undef L3
+#undef L4
+
+opus_val16 celt_cos_norm(opus_val32 x)
+{
+   x = x&0x0001ffff;
+   if (x>SHL32(EXTEND32(1), 16))
+      x = SUB32(SHL32(EXTEND32(1), 17),x);
+   if (x&0x00007fff)
+   {
+      if (x<SHL32(EXTEND32(1), 15))
+      {
+         return _celt_cos_pi_2(EXTRACT16(x));
+      } else {
+         return NEG32(_celt_cos_pi_2(EXTRACT16(65536-x)));
+      }
+   } else {
+      if (x&0x0000ffff)
+         return 0;
+      else if (x&0x0001ffff)
+         return -32767;
+      else
+         return 32767;
+   }
+}
+
+/** Reciprocal approximation (Q15 input, Q16 output) */
+opus_val32 celt_rcp(opus_val32 x)
+{
+   int i;
+   opus_val16 n;
+   opus_val16 r;
+   celt_assert2(x>0, "celt_rcp() only defined for positive values");
+   i = celt_ilog2(x);
+   /* n is Q15 with range [0,1). */
+   n = VSHR32(x,i-15)-32768;
+   /* Start with a linear approximation:
+      r = 1.8823529411764706-0.9411764705882353*n.
+      The coefficients and the result are Q14 in the range [15420,30840].*/
+   r = ADD16(30840, MULT16_16_Q15(-15420, n));
+   /* Perform two Newton iterations:
+      r -= r*((r*n)-1.Q15)
+         = r*((r*n)+(r-1.Q15)). */
+   r = SUB16(r, MULT16_16_Q15(r,
+             ADD16(MULT16_16_Q15(r, n), ADD16(r, -32768))));
+   /* We subtract an extra 1 in the second iteration to avoid overflow; it also
+       neatly compensates for truncation error in the rest of the process. */
+   r = SUB16(r, ADD16(1, MULT16_16_Q15(r,
+             ADD16(MULT16_16_Q15(r, n), ADD16(r, -32768)))));
+   /* r is now the Q15 solution to 2/(n+1), with a maximum relative error
+       of 7.05346E-5, a (relative) RMSE of 2.14418E-5, and a peak absolute
+       error of 1.24665/32768. */
+   return VSHR32(EXTEND32(r),i-16);
+}
+
+#endif

+ 258 - 0
drivers/opus/celt/mathops.h

@@ -0,0 +1,258 @@
+/* Copyright (c) 2002-2008 Jean-Marc Valin
+   Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/**
+   @file mathops.h
+   @brief Various math functions
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef MATHOPS_H
+#define MATHOPS_H
+
+#include "arch.h"
+#include "entcode.h"
+#include "os_support.h"
+
+/* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */
+#define FRAC_MUL16(a,b) ((16384+((opus_int32)(opus_int16)(a)*(opus_int16)(b)))>>15)
+
+unsigned isqrt32(opus_uint32 _val);
+
+#ifndef OVERRIDE_CELT_MAXABS16
+static OPUS_INLINE opus_val32 celt_maxabs16(const opus_val16 *x, int len)
+{
+   int i;
+   opus_val16 maxval = 0;
+   opus_val16 minval = 0;
+   for (i=0;i<len;i++)
+   {
+      maxval = MAX16(maxval, x[i]);
+      minval = MIN16(minval, x[i]);
+   }
+   return MAX32(EXTEND32(maxval),-EXTEND32(minval));
+}
+#endif
+
+#ifndef OVERRIDE_CELT_MAXABS32
+#ifdef OPUS_FIXED_POINT
+static OPUS_INLINE opus_val32 celt_maxabs32(const opus_val32 *x, int len)
+{
+   int i;
+   opus_val32 maxval = 0;
+   opus_val32 minval = 0;
+   for (i=0;i<len;i++)
+   {
+      maxval = MAX32(maxval, x[i]);
+      minval = MIN32(minval, x[i]);
+   }
+   return MAX32(maxval, -minval);
+}
+#else
+#define celt_maxabs32(x,len) celt_maxabs16(x,len)
+#endif
+#endif
+
+
+#ifndef OPUS_FIXED_POINT
+
+#define PI 3.141592653f
+#define celt_sqrt(x) ((float)sqrt(x))
+#define celt_rsqrt(x) (1.f/celt_sqrt(x))
+#define celt_rsqrt_norm(x) (celt_rsqrt(x))
+#define celt_cos_norm(x) ((float)cos((.5f*PI)*(x)))
+#define celt_rcp(x) (1.f/(x))
+#define celt_div(a,b) ((a)/(b))
+#define frac_div32(a,b) ((float)(a)/(b))
+
+#ifdef FLOAT_APPROX
+
+/* Note: This assumes radix-2 floating point with the exponent at bits 23..30 and an offset of 127
+         denorm, +/- inf and NaN are *not* handled */
+
+/** Base-2 log approximation (log2(x)). */
+static OPUS_INLINE float celt_log2(float x)
+{
+   int integer;
+   float frac;
+   union {
+      float f;
+      opus_uint32 i;
+   } in;
+   in.f = x;
+   integer = (in.i>>23)-127;
+   in.i -= integer<<23;
+   frac = in.f - 1.5f;
+   frac = -0.41445418f + frac*(0.95909232f
+          + frac*(-0.33951290f + frac*0.16541097f));
+   return 1+integer+frac;
+}
+
+/** Base-2 exponential approximation (2^x). */
+static OPUS_INLINE float celt_exp2(float x)
+{
+   int integer;
+   float frac;
+   union {
+      float f;
+      opus_uint32 i;
+   } res;
+   integer = floor(x);
+   if (integer < -50)
+      return 0;
+   frac = x-integer;
+   /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
+   res.f = 0.99992522f + frac * (0.69583354f
+           + frac * (0.22606716f + 0.078024523f*frac));
+   res.i = (res.i + (integer<<23)) & 0x7fffffff;
+   return res.f;
+}
+
+#else
+#define celt_log2(x) ((float)(1.442695040888963387*log(x)))
+#define celt_exp2(x) ((float)exp(0.6931471805599453094*(x)))
+#endif
+
+#endif
+
+#ifdef OPUS_FIXED_POINT
+
+#include "os_support.h"
+
+#ifndef OVERRIDE_CELT_ILOG2
+/** Integer log in base2. Undefined for zero and negative numbers */
+static OPUS_INLINE opus_int16 celt_ilog2(opus_int32 x)
+{
+   celt_assert2(x>0, "celt_ilog2() only defined for strictly positive numbers");
+   return EC_ILOG(x)-1;
+}
+#endif
+
+
+/** Integer log in base2. Defined for zero, but not for negative numbers */
+static OPUS_INLINE opus_int16 celt_zlog2(opus_val32 x)
+{
+   return x <= 0 ? 0 : celt_ilog2(x);
+}
+
+opus_val16 celt_rsqrt_norm(opus_val32 x);
+
+opus_val32 celt_sqrt(opus_val32 x);
+
+opus_val16 celt_cos_norm(opus_val32 x);
+
+/** Base-2 logarithm approximation (log2(x)). (Q14 input, Q10 output) */
+static OPUS_INLINE opus_val16 celt_log2(opus_val32 x)
+{
+   int i;
+   opus_val16 n, frac;
+   /* -0.41509302963303146, 0.9609890551383969, -0.31836011537636605,
+       0.15530808010959576, -0.08556153059057618 */
+   static const opus_val16 C[5] = {-6801+(1<<(13-DB_SHIFT)), 15746, -5217, 2545, -1401};
+   if (x==0)
+      return -32767;
+   i = celt_ilog2(x);
+   n = VSHR32(x,i-15)-32768-16384;
+   frac = ADD16(C[0], MULT16_16_Q15(n, ADD16(C[1], MULT16_16_Q15(n, ADD16(C[2], MULT16_16_Q15(n, ADD16(C[3], MULT16_16_Q15(n, C[4]))))))));
+   return SHL16(i-13,DB_SHIFT)+SHR16(frac,14-DB_SHIFT);
+}
+
+/*
+ K0 = 1
+ K1 = log(2)
+ K2 = 3-4*log(2)
+ K3 = 3*log(2) - 2
+*/
+#define D0 16383
+#define D1 22804
+#define D2 14819
+#define D3 10204
+
+static OPUS_INLINE opus_val32 celt_exp2_frac(opus_val16 x)
+{
+   opus_val16 frac;
+   frac = SHL16(x, 4);
+   return ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac))))));
+}
+/** Base-2 exponential approximation (2^x). (Q10 input, Q16 output) */
+static OPUS_INLINE opus_val32 celt_exp2(opus_val16 x)
+{
+   int integer;
+   opus_val16 frac;
+   integer = SHR16(x,10);
+   if (integer>14)
+      return 0x7f000000;
+   else if (integer < -15)
+      return 0;
+   frac = celt_exp2_frac(x-SHL16(integer,10));
+   return VSHR32(EXTEND32(frac), -integer-2);
+}
+
+opus_val32 celt_rcp(opus_val32 x);
+
+#define celt_div(a,b) MULT32_32_Q31((opus_val32)(a),celt_rcp(b))
+
+opus_val32 frac_div32(opus_val32 a, opus_val32 b);
+
+#define M1 32767
+#define M2 -21
+#define M3 -11943
+#define M4 4936
+
+/* Atan approximation using a 4th order polynomial. Input is in Q15 format
+   and normalized by pi/4. Output is in Q15 format */
+static OPUS_INLINE opus_val16 celt_atan01(opus_val16 x)
+{
+   return MULT16_16_P15(x, ADD32(M1, MULT16_16_P15(x, ADD32(M2, MULT16_16_P15(x, ADD32(M3, MULT16_16_P15(M4, x)))))));
+}
+
+#undef M1
+#undef M2
+#undef M3
+#undef M4
+
+/* atan2() approximation valid for positive input values */
+static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x)
+{
+   if (y < x)
+   {
+      opus_val32 arg;
+      arg = celt_div(SHL32(EXTEND32(y),15),x);
+      if (arg >= 32767)
+         arg = 32767;
+      return SHR16(celt_atan01(EXTRACT16(arg)),1);
+   } else {
+      opus_val32 arg;
+      arg = celt_div(SHL32(EXTEND32(x),15),y);
+      if (arg >= 32767)
+         arg = 32767;
+      return 25736-SHR16(celt_atan01(EXTRACT16(arg)),1);
+   }
+}
+
+#endif /* OPUS_FIXED_POINT */
+#endif /* MATHOPS_H */

+ 311 - 0
drivers/opus/celt/mdct.c

@@ -0,0 +1,311 @@
+	/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2008 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* This is a simple MDCT implementation that uses a N/4 complex FFT
+   to do most of the work. It should be relatively straightforward to
+   plug in pretty much and FFT here.
+
+   This replaces the Vorbis FFT (and uses the exact same API), which
+   was a bit too messy and that was ending up duplicating code
+   (might as well use the same FFT everywhere).
+
+   The algorithm is similar to (and inspired from) Fabrice Bellard's
+   MDCT implementation in FFMPEG, but has differences in signs, ordering
+   and scaling in many places.
+*/
+
+#ifndef SKIP_CONFIG_H
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+#endif
+
+#include "mdct.h"
+#include "kiss_fft.h"
+#include "_kiss_fft_guts.h"
+#include <math.h>
+#include "os_support.h"
+#include "mathops.h"
+#include "stack_alloc.h"
+
+#ifdef CUSTOM_MODES
+
+int clt_mdct_init(celt_mdct_lookup *l,int N, int maxshift)
+{
+   int i;
+   int N4;
+   kiss_twiddle_scalar *trig;
+#if defined(OPUS_FIXED_POINT)
+   int N2=N>>1;
+#endif
+   l->n = N;
+   N4 = N>>2;
+   l->maxshift = maxshift;
+   for (i=0;i<=maxshift;i++)
+   {
+      if (i==0)
+         l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0);
+      else
+         l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0]);
+#ifndef ENABLE_TI_DSPLIB55
+      if (l->kfft[i]==NULL)
+         return 0;
+#endif
+   }
+   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar));
+   if (l->trig==NULL)
+     return 0;
+   /* We have enough points that sine isn't necessary */
+#if defined(OPUS_FIXED_POINT)
+   for (i=0;i<=N4;i++)
+      trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));
+#else
+   for (i=0;i<=N4;i++)
+      trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N);
+#endif
+   return 1;
+}
+
+void clt_mdct_clear(celt_mdct_lookup *l)
+{
+   int i;
+   for (i=0;i<=l->maxshift;i++)
+      opus_fft_free(l->kfft[i]);
+   opus_free((kiss_twiddle_scalar*)l->trig);
+}
+
+#endif /* CUSTOM_MODES */
+
+/* Forward MDCT trashes the input array */
+void clt_mdct_forward(const celt_mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
+      const opus_val16 *window, int overlap, int shift, int stride)
+{
+   int i;
+   int N, N2, N4;
+   kiss_twiddle_scalar sine;
+   VARDECL(kiss_fft_scalar, f);
+   VARDECL(kiss_fft_scalar, f2);
+   SAVE_STACK;
+   N = l->n;
+   N >>= shift;
+   N2 = N>>1;
+   N4 = N>>2;
+   ALLOC(f, N2, kiss_fft_scalar);
+   ALLOC(f2, N2, kiss_fft_scalar);
+   /* sin(x) ~= x here */
+#ifdef OPUS_FIXED_POINT
+   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
+#else
+   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
+#endif
+
+   /* Consider the input to be composed of four blocks: [a, b, c, d] */
+   /* Window, shuffle, fold */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
+      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
+      for(i=0;i<((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
+         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
+         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+      wp1 = window;
+      wp2 = window+overlap-1;
+      for(;i<N4-((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ = *xp2;
+         *yp++ = *xp1;
+         xp1+=2;
+         xp2-=2;
+      }
+      for(;i<N4;i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
+         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+   }
+   /* Pre-rotation */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar *t = &l->trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar re, im, yr, yi;
+         re = yp[0];
+         im = yp[1];
+         yr = -S_MUL(re,t[i<<shift])  -  S_MUL(im,t[(N4-i)<<shift]);
+         yi = -S_MUL(im,t[i<<shift])  +  S_MUL(re,t[(N4-i)<<shift]);
+         /* works because the cos is nearly one */
+         *yp++ = yr + S_MUL(yi,sine);
+         *yp++ = yi - S_MUL(yr,sine);
+      }
+   }
+
+   /* N/4 complex FFT, down-scales by 4/N */
+   opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
+
+   /* Post-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
+      const kiss_twiddle_scalar *t = &l->trig[0];
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);
+         yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);
+         /* works because the cos is nearly one */
+         *yp1 = yr - S_MUL(yi,sine);
+         *yp2 = yi + S_MUL(yr,sine);;
+         fp += 2;
+         yp1 += 2*stride;
+         yp2 -= 2*stride;
+      }
+   }
+   RESTORE_STACK;
+}
+
+void clt_mdct_backward(const celt_mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
+      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
+{
+   int i;
+   int N, N2, N4;
+   kiss_twiddle_scalar sine;
+   VARDECL(kiss_fft_scalar, f2);
+   SAVE_STACK;
+   N = l->n;
+   N >>= shift;
+   N2 = N>>1;
+   N4 = N>>2;
+   ALLOC(f2, N2, kiss_fft_scalar);
+   /* sin(x) ~= x here */
+#ifdef OPUS_FIXED_POINT
+   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
+#else
+   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
+#endif
+
+   /* Pre-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f2;
+      const kiss_twiddle_scalar *t = &l->trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
+         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
+         /* works because the cos is nearly one */
+         *yp++ = yr - S_MUL(yi,sine);
+         *yp++ = yi + S_MUL(yr,sine);
+         xp1+=2*stride;
+         xp2-=2*stride;
+      }
+   }
+
+   /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
+   opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
+
+   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
+      it in-place. */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
+      const kiss_twiddle_scalar *t = &l->trig[0];
+      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
+         middle pair will be computed twice. */
+      for(i=0;i<(N4+1)>>1;i++)
+      {
+         kiss_fft_scalar re, im, yr, yi;
+         kiss_twiddle_scalar t0, t1;
+         re = yp0[0];
+         im = yp0[1];
+         t0 = t[i<<shift];
+         t1 = t[(N4-i)<<shift];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) - S_MUL(im,t1);
+         yi = S_MUL(im,t0) + S_MUL(re,t1);
+         re = yp1[0];
+         im = yp1[1];
+         /* works because the cos is nearly one */
+         yp0[0] = -(yr - S_MUL(yi,sine));
+         yp1[1] = yi + S_MUL(yr,sine);
+
+         t0 = t[(N4-i-1)<<shift];
+         t1 = t[(i+1)<<shift];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) - S_MUL(im,t1);
+         yi = S_MUL(im,t0) + S_MUL(re,t1);
+         /* works because the cos is nearly one */
+         yp1[0] = -(yr - S_MUL(yi,sine));
+         yp0[1] = yi + S_MUL(yr,sine);
+         yp0 += 2;
+         yp1 -= 2;
+      }
+   }
+
+   /* Mirror on both sides for TDAC */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      const opus_val16 * OPUS_RESTRICT wp1 = window;
+      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
+
+      for(i = 0; i < overlap/2; i++)
+      {
+         kiss_fft_scalar x1, x2;
+         x1 = *xp1;
+         x2 = *yp1;
+         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
+         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
+         wp1++;
+         wp2--;
+      }
+   }
+   RESTORE_STACK;
+}

+ 70 - 0
drivers/opus/celt/mdct.h

@@ -0,0 +1,70 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2008 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* This is a simple MDCT implementation that uses a N/4 complex FFT
+   to do most of the work. It should be relatively straightforward to
+   plug in pretty much and FFT here.
+
+   This replaces the Vorbis FFT (and uses the exact same API), which
+   was a bit too messy and that was ending up duplicating code
+   (might as well use the same FFT everywhere).
+
+   The algorithm is similar to (and inspired from) Fabrice Bellard's
+   MDCT implementation in FFMPEG, but has differences in signs, ordering
+   and scaling in many places.
+*/
+
+#ifndef MDCT_H
+#define MDCT_H
+
+#include "opus_defines.h"
+#include "kiss_fft.h"
+#include "arch.h"
+
+typedef struct {
+   int n;
+   int maxshift;
+   const kiss_fft_state *kfft[4];
+   const kiss_twiddle_scalar * OPUS_RESTRICT trig;
+} celt_mdct_lookup;
+
+int clt_mdct_init(celt_mdct_lookup *l,int N, int maxshift);
+void clt_mdct_clear(celt_mdct_lookup *l);
+
+/** Compute a forward MDCT and scale by 4/N, trashes the input array */
+void clt_mdct_forward(const celt_mdct_lookup *l, kiss_fft_scalar *in,
+      kiss_fft_scalar * OPUS_RESTRICT out,
+      const opus_val16 *window, int overlap, int shift, int stride);
+
+/** Compute a backward MDCT (no scaling) and performs weighted overlap-add
+    (scales implicitly by 1/2) */
+void clt_mdct_backward(const celt_mdct_lookup *l, kiss_fft_scalar *in,
+      kiss_fft_scalar * OPUS_RESTRICT out,
+      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride);
+
+#endif

+ 48 - 0
drivers/opus/celt/mfrngcod.h

@@ -0,0 +1,48 @@
+/* Copyright (c) 2001-2008 Timothy B. Terriberry
+   Copyright (c) 2008-2009 Xiph.Org Foundation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(_mfrngcode_H)
+# define _mfrngcode_H (1)
+# include "entcode.h"
+
+/*Constants used by the entropy encoder/decoder.*/
+
+/*The number of bits to output at a time.*/
+# define EC_SYM_BITS   (8)
+/*The total number of bits in each of the state registers.*/
+# define EC_CODE_BITS  (32)
+/*The maximum symbol value.*/
+# define EC_SYM_MAX    ((1U<<EC_SYM_BITS)-1)
+/*Bits to shift by to move a symbol into the high-order position.*/
+# define EC_CODE_SHIFT (EC_CODE_BITS-EC_SYM_BITS-1)
+/*Carry bit of the high-order range symbol.*/
+# define EC_CODE_TOP   (((opus_uint32)1U)<<(EC_CODE_BITS-1))
+/*Low-order bit of the high-order range symbol.*/
+# define EC_CODE_BOT   (EC_CODE_TOP>>EC_SYM_BITS)
+/*The number of bits available for the last, partial symbol in the code field.*/
+# define EC_CODE_EXTRA ((EC_CODE_BITS-2)%EC_SYM_BITS+1)
+#endif

+ 438 - 0
drivers/opus/celt/modes.c

@@ -0,0 +1,438 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "celt.h"
+#include "opus_modes.h"
+#include "rate.h"
+#include "os_support.h"
+#include "stack_alloc.h"
+#include "quant_bands.h"
+
+static const opus_int16 eband5ms[] = {
+/*0  200 400 600 800  1k 1.2 1.4 1.6  2k 2.4 2.8 3.2  4k 4.8 5.6 6.8  8k 9.6 12k 15.6 */
+  0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
+};
+
+/* Alternate tuning (partially derived from Vorbis) */
+#define BITALLOC_SIZE 11
+/* Bit allocation table in units of 1/32 bit/sample (0.1875 dB SNR) */
+static const unsigned char band_allocation[] = {
+/*0  200 400 600 800  1k 1.2 1.4 1.6  2k 2.4 2.8 3.2  4k 4.8 5.6 6.8  8k 9.6 12k 15.6 */
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+ 90, 80, 75, 69, 63, 56, 49, 40, 34, 29, 20, 18, 10,  0,  0,  0,  0,  0,  0,  0,  0,
+110,100, 90, 84, 78, 71, 65, 58, 51, 45, 39, 32, 26, 20, 12,  0,  0,  0,  0,  0,  0,
+118,110,103, 93, 86, 80, 75, 70, 65, 59, 53, 47, 40, 31, 23, 15,  4,  0,  0,  0,  0,
+126,119,112,104, 95, 89, 83, 78, 72, 66, 60, 54, 47, 39, 32, 25, 17, 12,  1,  0,  0,
+134,127,120,114,103, 97, 91, 85, 78, 72, 66, 60, 54, 47, 41, 35, 29, 23, 16, 10,  1,
+144,137,130,124,113,107,101, 95, 88, 82, 76, 70, 64, 57, 51, 45, 39, 33, 26, 15,  1,
+152,145,138,132,123,117,111,105, 98, 92, 86, 80, 74, 67, 61, 55, 49, 43, 36, 20,  1,
+162,155,148,142,133,127,121,115,108,102, 96, 90, 84, 77, 71, 65, 59, 53, 46, 30,  1,
+172,165,158,152,143,137,131,125,118,112,106,100, 94, 87, 81, 75, 69, 63, 56, 45, 20,
+200,200,200,200,200,200,200,200,198,193,188,183,178,173,168,163,158,153,148,129,104,
+};
+
+#ifndef CUSTOM_MODES_ONLY
+ #ifdef OPUS_FIXED_POINT
+  #include "static_modes_fixed.h"
+ #else
+  #include "static_modes_float.h"
+ #endif
+#endif /* CUSTOM_MODES_ONLY */
+
+#ifndef M_PI
+#define M_PI 3.141592653
+#endif
+
+#ifdef CUSTOM_MODES
+
+/* Defining 25 critical bands for the full 0-20 kHz audio bandwidth
+   Taken from http://ccrma.stanford.edu/~jos/bbt/Bark_Frequency_Scale.html */
+#define BARK_BANDS 25
+static const opus_int16 bark_freq[BARK_BANDS+1] = {
+      0,   100,   200,   300,   400,
+    510,   630,   770,   920,  1080,
+   1270,  1480,  1720,  2000,  2320,
+   2700,  3150,  3700,  4400,  5300,
+   6400,  7700,  9500, 12000, 15500,
+  20000};
+
+static opus_int16 *compute_ebands(opus_int32 Fs, int frame_size, int res, int *nbEBands)
+{
+   opus_int16 *eBands;
+   int i, j, lin, low, high, nBark, offset=0;
+
+   /* All modes that have 2.5 ms short blocks use the same definition */
+   if (Fs == 400*(opus_int32)frame_size)
+   {
+      *nbEBands = sizeof(eband5ms)/sizeof(eband5ms[0])-1;
+      eBands = opus_alloc(sizeof(opus_int16)*(*nbEBands+1));
+      for (i=0;i<*nbEBands+1;i++)
+         eBands[i] = eband5ms[i];
+      return eBands;
+   }
+   /* Find the number of critical bands supported by our sampling rate */
+   for (nBark=1;nBark<BARK_BANDS;nBark++)
+    if (bark_freq[nBark+1]*2 >= Fs)
+       break;
+
+   /* Find where the linear part ends (i.e. where the spacing is more than min_width */
+   for (lin=0;lin<nBark;lin++)
+      if (bark_freq[lin+1]-bark_freq[lin] >= res)
+         break;
+
+   low = (bark_freq[lin]+res/2)/res;
+   high = nBark-lin;
+   *nbEBands = low+high;
+   eBands = opus_alloc(sizeof(opus_int16)*(*nbEBands+2));
+
+   if (eBands==NULL)
+      return NULL;
+
+   /* Linear spacing (min_width) */
+   for (i=0;i<low;i++)
+      eBands[i] = i;
+   if (low>0)
+      offset = eBands[low-1]*res - bark_freq[lin-1];
+   /* Spacing follows critical bands */
+   for (i=0;i<high;i++)
+   {
+      int target = bark_freq[lin+i];
+      /* Round to an even value */
+      eBands[i+low] = (target+offset/2+res)/(2*res)*2;
+      offset = eBands[i+low]*res - target;
+   }
+   /* Enforce the minimum spacing at the boundary */
+   for (i=0;i<*nbEBands;i++)
+      if (eBands[i] < i)
+         eBands[i] = i;
+   /* Round to an even value */
+   eBands[*nbEBands] = (bark_freq[nBark]+res)/(2*res)*2;
+   if (eBands[*nbEBands] > frame_size)
+      eBands[*nbEBands] = frame_size;
+   for (i=1;i<*nbEBands-1;i++)
+   {
+      if (eBands[i+1]-eBands[i] < eBands[i]-eBands[i-1])
+      {
+         eBands[i] -= (2*eBands[i]-eBands[i-1]-eBands[i+1])/2;
+      }
+   }
+   /* Remove any empty bands. */
+   for (i=j=0;i<*nbEBands;i++)
+      if(eBands[i+1]>eBands[j])
+         eBands[++j]=eBands[i+1];
+   *nbEBands=j;
+
+   for (i=1;i<*nbEBands;i++)
+   {
+      /* Every band must be smaller than the last band. */
+      celt_assert(eBands[i]-eBands[i-1]<=eBands[*nbEBands]-eBands[*nbEBands-1]);
+      /* Each band must be no larger than twice the size of the previous one. */
+      celt_assert(eBands[i+1]-eBands[i]<=2*(eBands[i]-eBands[i-1]));
+   }
+
+   return eBands;
+}
+
+static void compute_allocation_table(CELTMode *mode)
+{
+   int i, j;
+   unsigned char *allocVectors;
+   int maxBands = sizeof(eband5ms)/sizeof(eband5ms[0])-1;
+
+   mode->nbAllocVectors = BITALLOC_SIZE;
+   allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands));
+   if (allocVectors==NULL)
+      return;
+
+   /* Check for standard mode */
+   if (mode->Fs == 400*(opus_int32)mode->shortMdctSize)
+   {
+      for (i=0;i<BITALLOC_SIZE*mode->nbEBands;i++)
+         allocVectors[i] = band_allocation[i];
+      mode->allocVectors = allocVectors;
+      return;
+   }
+   /* If not the standard mode, interpolate */
+   /* Compute per-codec-band allocation from per-critical-band matrix */
+   for (i=0;i<BITALLOC_SIZE;i++)
+   {
+      for (j=0;j<mode->nbEBands;j++)
+      {
+         int k;
+         for (k=0;k<maxBands;k++)
+         {
+            if (400*(opus_int32)eband5ms[k] > mode->eBands[j]*(opus_int32)mode->Fs/mode->shortMdctSize)
+               break;
+         }
+         if (k>maxBands-1)
+            allocVectors[i*mode->nbEBands+j] = band_allocation[i*maxBands + maxBands-1];
+         else {
+            opus_int32 a0, a1;
+            a1 = mode->eBands[j]*(opus_int32)mode->Fs/mode->shortMdctSize - 400*(opus_int32)eband5ms[k-1];
+            a0 = 400*(opus_int32)eband5ms[k] - mode->eBands[j]*(opus_int32)mode->Fs/mode->shortMdctSize;
+            allocVectors[i*mode->nbEBands+j] = (a0*band_allocation[i*maxBands+k-1]
+                                             + a1*band_allocation[i*maxBands+k])/(a0+a1);
+         }
+      }
+   }
+
+   /*printf ("\n");
+   for (i=0;i<BITALLOC_SIZE;i++)
+   {
+      for (j=0;j<mode->nbEBands;j++)
+         printf ("%d ", allocVectors[i*mode->nbEBands+j]);
+      printf ("\n");
+   }
+   exit(0);*/
+
+   mode->allocVectors = allocVectors;
+}
+
+#endif /* CUSTOM_MODES */
+
+CELTMode *opus_custom_mode_create(opus_int32 Fs, int frame_size, int *error)
+{
+   int i;
+#ifdef CUSTOM_MODES
+   CELTMode *mode=NULL;
+   int res;
+   opus_val16 *window;
+   opus_int16 *logN;
+   int LM;
+   ALLOC_STACK;
+#if !defined(VAR_ARRAYS) && !defined(USE_ALLOCA)
+   if (global_stack==NULL)
+      goto failure;
+#endif
+#endif
+
+#ifndef CUSTOM_MODES_ONLY
+   for (i=0;i<TOTAL_MODES;i++)
+   {
+      int j;
+      for (j=0;j<4;j++)
+      {
+         if (Fs == static_mode_list[i]->Fs &&
+               (frame_size<<j) == static_mode_list[i]->shortMdctSize*static_mode_list[i]->nbShortMdcts)
+         {
+            if (error)
+               *error = OPUS_OK;
+            return (CELTMode*)static_mode_list[i];
+         }
+      }
+   }
+#endif /* CUSTOM_MODES_ONLY */
+
+#ifndef CUSTOM_MODES
+   if (error)
+      *error = OPUS_BAD_ARG;
+   return NULL;
+#else
+
+   /* The good thing here is that permutation of the arguments will automatically be invalid */
+
+   if (Fs < 8000 || Fs > 96000)
+   {
+      if (error)
+         *error = OPUS_BAD_ARG;
+      return NULL;
+   }
+   if (frame_size < 40 || frame_size > 1024 || frame_size%2!=0)
+   {
+      if (error)
+         *error = OPUS_BAD_ARG;
+      return NULL;
+   }
+   /* Frames of less than 1ms are not supported. */
+   if ((opus_int32)frame_size*1000 < Fs)
+   {
+      if (error)
+         *error = OPUS_BAD_ARG;
+      return NULL;
+   }
+
+   if ((opus_int32)frame_size*75 >= Fs && (frame_size%16)==0)
+   {
+     LM = 3;
+   } else if ((opus_int32)frame_size*150 >= Fs && (frame_size%8)==0)
+   {
+     LM = 2;
+   } else if ((opus_int32)frame_size*300 >= Fs && (frame_size%4)==0)
+   {
+     LM = 1;
+   } else
+   {
+     LM = 0;
+   }
+
+   /* Shorts longer than 3.3ms are not supported. */
+   if ((opus_int32)(frame_size>>LM)*300 > Fs)
+   {
+      if (error)
+         *error = OPUS_BAD_ARG;
+      return NULL;
+   }
+
+   mode = opus_alloc(sizeof(CELTMode));
+   if (mode==NULL)
+      goto failure;
+   mode->Fs = Fs;
+
+   /* Pre/de-emphasis depends on sampling rate. The "standard" pre-emphasis
+      is defined as A(z) = 1 - 0.85*z^-1 at 48 kHz. Other rates should
+      approximate that. */
+   if(Fs < 12000) /* 8 kHz */
+   {
+      mode->preemph[0] =  QCONST16(0.3500061035f, 15);
+      mode->preemph[1] = -QCONST16(0.1799926758f, 15);
+      mode->preemph[2] =  QCONST16(0.2719968125f, SIG_SHIFT); /* exact 1/preemph[3] */
+      mode->preemph[3] =  QCONST16(3.6765136719f, 13);
+   } else if(Fs < 24000) /* 16 kHz */
+   {
+      mode->preemph[0] =  QCONST16(0.6000061035f, 15);
+      mode->preemph[1] = -QCONST16(0.1799926758f, 15);
+      mode->preemph[2] =  QCONST16(0.4424998650f, SIG_SHIFT); /* exact 1/preemph[3] */
+      mode->preemph[3] =  QCONST16(2.2598876953f, 13);
+   } else if(Fs < 40000) /* 32 kHz */
+   {
+      mode->preemph[0] =  QCONST16(0.7799987793f, 15);
+      mode->preemph[1] = -QCONST16(0.1000061035f, 15);
+      mode->preemph[2] =  QCONST16(0.7499771125f, SIG_SHIFT); /* exact 1/preemph[3] */
+      mode->preemph[3] =  QCONST16(1.3333740234f, 13);
+   } else /* 48 kHz */
+   {
+      mode->preemph[0] =  QCONST16(0.8500061035f, 15);
+      mode->preemph[1] =  QCONST16(0.0f, 15);
+      mode->preemph[2] =  QCONST16(1.f, SIG_SHIFT);
+      mode->preemph[3] =  QCONST16(1.f, 13);
+   }
+
+   mode->maxLM = LM;
+   mode->nbShortMdcts = 1<<LM;
+   mode->shortMdctSize = frame_size/mode->nbShortMdcts;
+   res = (mode->Fs+mode->shortMdctSize)/(2*mode->shortMdctSize);
+
+   mode->eBands = compute_ebands(Fs, mode->shortMdctSize, res, &mode->nbEBands);
+   if (mode->eBands==NULL)
+      goto failure;
+#if !defined(SMALL_FOOTPRINT)
+   /* Make sure we don't allocate a band larger than our PVQ table.
+      208 should be enough, but let's be paranoid. */
+   if ((mode->eBands[mode->nbEBands] - mode->eBands[mode->nbEBands-1])<<LM >
+    208) {
+       goto failure;
+   }
+#endif
+
+   mode->effEBands = mode->nbEBands;
+   while (mode->eBands[mode->effEBands] > mode->shortMdctSize)
+      mode->effEBands--;
+
+   /* Overlap must be divisible by 4 */
+   mode->overlap = ((mode->shortMdctSize>>2)<<2);
+
+   compute_allocation_table(mode);
+   if (mode->allocVectors==NULL)
+      goto failure;
+
+   window = (opus_val16*)opus_alloc(mode->overlap*sizeof(opus_val16));
+   if (window==NULL)
+      goto failure;
+
+#ifndef OPUS_FIXED_POINT
+   for (i=0;i<mode->overlap;i++)
+      window[i] = Q15ONE*sin(.5*M_PI* sin(.5*M_PI*(i+.5)/mode->overlap) * sin(.5*M_PI*(i+.5)/mode->overlap));
+#else
+   for (i=0;i<mode->overlap;i++)
+      window[i] = MIN32(32767,floor(.5+32768.*sin(.5*M_PI* sin(.5*M_PI*(i+.5)/mode->overlap) * sin(.5*M_PI*(i+.5)/mode->overlap))));
+#endif
+   mode->window = window;
+
+   logN = (opus_int16*)opus_alloc(mode->nbEBands*sizeof(opus_int16));
+   if (logN==NULL)
+      goto failure;
+
+   for (i=0;i<mode->nbEBands;i++)
+      logN[i] = log2_frac(mode->eBands[i+1]-mode->eBands[i], BITRES);
+   mode->logN = logN;
+
+   compute_pulse_cache(mode, mode->maxLM);
+
+   if (clt_mdct_init(&mode->mdct, 2*mode->shortMdctSize*mode->nbShortMdcts,
+           mode->maxLM) == 0)
+      goto failure;
+
+   if (error)
+      *error = OPUS_OK;
+
+   return mode;
+failure:
+   if (error)
+      *error = OPUS_ALLOC_FAIL;
+   if (mode!=NULL)
+      opus_custom_mode_destroy(mode);
+   return NULL;
+#endif /* !CUSTOM_MODES */
+}
+
+#ifdef CUSTOM_MODES
+void opus_custom_mode_destroy(CELTMode *mode)
+{
+   if (mode == NULL)
+      return;
+#ifndef CUSTOM_MODES_ONLY
+   {
+     int i;
+     for (i=0;i<TOTAL_MODES;i++)
+     {
+        if (mode == static_mode_list[i])
+        {
+           return;
+        }
+     }
+   }
+#endif /* CUSTOM_MODES_ONLY */
+   opus_free((opus_int16*)mode->eBands);
+   opus_free((opus_int16*)mode->allocVectors);
+
+   opus_free((opus_val16*)mode->window);
+   opus_free((opus_int16*)mode->logN);
+
+   opus_free((opus_int16*)mode->cache.index);
+   opus_free((unsigned char*)mode->cache.bits);
+   opus_free((unsigned char*)mode->cache.caps);
+   clt_mdct_clear(&mode->mdct);
+
+   opus_free((CELTMode *)mode);
+}
+#endif

+ 210 - 0
drivers/opus/celt/opus_custom_demo.c

@@ -0,0 +1,210 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "opus_custom.h"
+#include "arch.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+#define MAX_PACKET 1275
+
+int main(int argc, char *argv[])
+{
+   int err;
+   char *inFile, *outFile;
+   FILE *fin, *fout;
+   OpusCustomMode *mode=NULL;
+   OpusCustomEncoder *enc;
+   OpusCustomDecoder *dec;
+   int len;
+   opus_int32 frame_size, channels, rate;
+   int bytes_per_packet;
+   unsigned char data[MAX_PACKET];
+   int complexity;
+#if !(defined (OPUS_FIXED_POINT) && !defined(CUSTOM_MODES)) && defined(RESYNTH)
+   int i;
+   double rmsd = 0;
+#endif
+   int count = 0;
+   opus_int32 skip;
+   opus_int16 *in, *out;
+   if (argc != 9 && argc != 8 && argc != 7)
+   {
+      fprintf (stderr, "Usage: test_opus_custom <rate> <channels> <frame size> "
+               " <bytes per packet> [<complexity> [packet loss rate]] "
+               "<input> <output>\n");
+      return 1;
+   }
+
+   rate = (opus_int32)atol(argv[1]);
+   channels = atoi(argv[2]);
+   frame_size = atoi(argv[3]);
+   mode = opus_custom_mode_create(rate, frame_size, NULL);
+   if (mode == NULL)
+   {
+      fprintf(stderr, "failed to create a mode\n");
+      return 1;
+   }
+
+   bytes_per_packet = atoi(argv[4]);
+   if (bytes_per_packet < 0 || bytes_per_packet > MAX_PACKET)
+   {
+      fprintf (stderr, "bytes per packet must be between 0 and %d\n",
+                        MAX_PACKET);
+      return 1;
+   }
+
+   inFile = argv[argc-2];
+   fin = fopen(inFile, "rb");
+   if (!fin)
+   {
+      fprintf (stderr, "Could not open input file %s\n", argv[argc-2]);
+      return 1;
+   }
+   outFile = argv[argc-1];
+   fout = fopen(outFile, "wb+");
+   if (!fout)
+   {
+      fprintf (stderr, "Could not open output file %s\n", argv[argc-1]);
+      fclose(fin);
+      return 1;
+   }
+
+   enc = opus_custom_encoder_create(mode, channels, &err);
+   if (err != 0)
+   {
+      fprintf(stderr, "Failed to create the encoder: %s\n", opus_strerror(err));
+      fclose(fin);
+      fclose(fout);
+      return 1;
+   }
+   dec = opus_custom_decoder_create(mode, channels, &err);
+   if (err != 0)
+   {
+      fprintf(stderr, "Failed to create the decoder: %s\n", opus_strerror(err));
+      fclose(fin);
+      fclose(fout);
+      return 1;
+   }
+   opus_custom_decoder_ctl(dec, OPUS_GET_LOOKAHEAD(&skip));
+
+   if (argc>7)
+   {
+      complexity=atoi(argv[5]);
+      opus_custom_encoder_ctl(enc,OPUS_SET_COMPLEXITY(complexity));
+   }
+
+   in = (opus_int16*)malloc(frame_size*channels*sizeof(opus_int16));
+   out = (opus_int16*)malloc(frame_size*channels*sizeof(opus_int16));
+
+   while (!feof(fin))
+   {
+      int ret;
+      err = fread(in, sizeof(short), frame_size*channels, fin);
+      if (feof(fin))
+         break;
+      len = opus_custom_encode(enc, in, frame_size, data, bytes_per_packet);
+      if (len <= 0)
+         fprintf (stderr, "opus_custom_encode() failed: %s\n", opus_strerror(len));
+
+      /* This is for simulating bit errors */
+#if 0
+      int errors = 0;
+      int eid = 0;
+      /* This simulates random bit error */
+      for (i=0;i<len*8;i++)
+      {
+         if (rand()%atoi(argv[8])==0)
+         {
+            if (i<64)
+            {
+               errors++;
+               eid = i;
+            }
+            data[i/8] ^= 1<<(7-(i%8));
+         }
+      }
+      if (errors == 1)
+         data[eid/8] ^= 1<<(7-(eid%8));
+      else if (errors%2 == 1)
+         data[rand()%8] ^= 1<<rand()%8;
+#endif
+
+#if 1 /* Set to zero to use the encoder's output instead */
+      /* This is to simulate packet loss */
+      if (argc==9 && rand()%1000<atoi(argv[argc-3]))
+      /*if (errors && (errors%2==0))*/
+         ret = opus_custom_decode(dec, NULL, len, out, frame_size);
+      else
+         ret = opus_custom_decode(dec, data, len, out, frame_size);
+      if (ret < 0)
+         fprintf(stderr, "opus_custom_decode() failed: %s\n", opus_strerror(ret));
+#else
+      for (i=0;i<ret*channels;i++)
+         out[i] = in[i];
+#endif
+#if !(defined (OPUS_FIXED_POINT) && !defined(CUSTOM_MODES)) && defined(RESYNTH)
+      for (i=0;i<ret*channels;i++)
+      {
+         rmsd += (in[i]-out[i])*1.0*(in[i]-out[i]);
+         /*out[i] -= in[i];*/
+      }
+#endif
+      count++;
+      fwrite(out+skip*channels, sizeof(short), (ret-skip)*channels, fout);
+      skip = 0;
+   }
+   PRINT_MIPS(stderr);
+
+   opus_custom_encoder_destroy(enc);
+   opus_custom_decoder_destroy(dec);
+   fclose(fin);
+   fclose(fout);
+   opus_custom_mode_destroy(mode);
+   free(in);
+   free(out);
+#if !(defined (OPUS_FIXED_POINT) && !defined(CUSTOM_MODES)) && defined(RESYNTH)
+   if (rmsd > 0)
+   {
+      rmsd = sqrt(rmsd/(1.0*frame_size*channels*count));
+      fprintf (stderr, "Error: encoder doesn't match decoder\n");
+      fprintf (stderr, "RMS mismatch is %f\n", rmsd);
+      return 1;
+   } else {
+      fprintf (stderr, "Encoder matches decoder!!\n");
+   }
+#endif
+   return 0;
+}
+

+ 83 - 0
drivers/opus/celt/opus_modes.h

@@ -0,0 +1,83 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef OPUS_MODES_H
+#define OPUS_MODES_H
+
+#include "opus_types.h"
+#include "celt.h"
+#include "arch.h"
+#include "mdct.h"
+#include "entenc.h"
+#include "entdec.h"
+
+#define MAX_PERIOD 1024
+
+#ifndef OVERLAP
+#define OVERLAP(mode) ((mode)->overlap)
+#endif
+
+#ifndef FRAMESIZE
+#define FRAMESIZE(mode) ((mode)->mdctSize)
+#endif
+
+typedef struct {
+   int size;
+   const opus_int16 *index;
+   const unsigned char *bits;
+   const unsigned char *caps;
+} PulseCache;
+
+/** Mode definition (opaque)
+ @brief Mode definition
+ */
+struct OpusCustomMode {
+   opus_int32 Fs;
+   int          overlap;
+
+   int          nbEBands;
+   int          effEBands;
+   opus_val16    preemph[4];
+   const opus_int16   *eBands;   /**< Definition for each "pseudo-critical band" */
+
+   int         maxLM;
+   int         nbShortMdcts;
+   int         shortMdctSize;
+
+   int          nbAllocVectors; /**< Number of lines in the matrix below */
+   const unsigned char   *allocVectors;   /**< Number of bits in each band for several rates */
+   const opus_int16 *logN;
+
+   const opus_val16 *window;
+   celt_mdct_lookup mdct;
+   PulseCache cache;
+};
+
+
+#endif

+ 92 - 0
drivers/opus/celt/os_support.h

@@ -0,0 +1,92 @@
+/* Copyright (C) 2007 Jean-Marc Valin
+
+   File: os_support.h
+   This is the (tiny) OS abstraction layer. Aside from math.h, this is the
+   only place where system headers are allowed.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef OS_SUPPORT_H
+#define OS_SUPPORT_H
+
+#ifdef CUSTOM_SUPPORT
+#  include "custom_support.h"
+#endif
+
+#include "opus_types.h"
+#include "opus_defines.h"
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/** Opus wrapper for malloc(). To do your own dynamic allocation, all you need to do is replace this function and opus_free */
+#ifndef OVERRIDE_OPUS_ALLOC
+static OPUS_INLINE void *opus_alloc (size_t size)
+{
+   return malloc(size);
+}
+#endif
+
+/** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */
+#ifndef OVERRIDE_OPUS_ALLOC_SCRATCH
+static OPUS_INLINE void *opus_alloc_scratch (size_t size)
+{
+   /* Scratch space doesn't need to be cleared */
+   return opus_alloc(size);
+}
+#endif
+
+/** Opus wrapper for free(). To do your own dynamic allocation, all you need to do is replace this function and opus_alloc */
+#ifndef OVERRIDE_OPUS_FREE
+static OPUS_INLINE void opus_free (void *ptr)
+{
+   free(ptr);
+}
+#endif
+
+/** Copy n bytes of memory from src to dst. The 0* term provides compile-time type checking  */
+#ifndef OVERRIDE_OPUS_COPY
+#define OPUS_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
+#endif
+
+/** Copy n bytes of memory from src to dst, allowing overlapping regions. The 0* term
+    provides compile-time type checking */
+#ifndef OVERRIDE_OPUS_MOVE
+#define OPUS_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
+#endif
+
+/** Set n elements of dst to zero, starting at address s */
+#ifndef OVERRIDE_OPUS_CLEAR
+#define OPUS_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
+#endif
+
+/*#ifdef __GNUC__
+#pragma GCC poison printf sprintf
+#pragma GCC poison malloc free realloc calloc
+#endif*/
+
+#endif /* OS_SUPPORT_H */
+

+ 537 - 0
drivers/opus/celt/pitch.c

@@ -0,0 +1,537 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/**
+   @file pitch.c
+   @brief Pitch analysis
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "pitch.h"
+#include "os_support.h"
+#include "opus_modes.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "celt_lpc.h"
+
+static void find_best_pitch(opus_val32 *xcorr, opus_val16 *y, int len,
+                            int max_pitch, int *best_pitch
+#ifdef OPUS_FIXED_POINT
+                            , int yshift, opus_val32 maxcorr
+#endif
+                            )
+{
+   int i, j;
+   opus_val32 Syy=1;
+   opus_val16 best_num[2];
+   opus_val32 best_den[2];
+#ifdef OPUS_FIXED_POINT
+   int xshift;
+
+   xshift = celt_ilog2(maxcorr)-14;
+#endif
+
+   best_num[0] = -1;
+   best_num[1] = -1;
+   best_den[0] = 0;
+   best_den[1] = 0;
+   best_pitch[0] = 0;
+   best_pitch[1] = 1;
+   for (j=0;j<len;j++)
+      Syy = ADD32(Syy, SHR32(MULT16_16(y[j],y[j]), yshift));
+   for (i=0;i<max_pitch;i++)
+   {
+      if (xcorr[i]>0)
+      {
+         opus_val16 num;
+         opus_val32 xcorr16;
+         xcorr16 = EXTRACT16(VSHR32(xcorr[i], xshift));
+#ifndef OPUS_FIXED_POINT
+         /* Considering the range of xcorr16, this should avoid both underflows
+            and overflows (inf) when squaring xcorr16 */
+         xcorr16 *= 1e-12f;
+#endif
+         num = MULT16_16_Q15(xcorr16,xcorr16);
+         if (MULT16_32_Q15(num,best_den[1]) > MULT16_32_Q15(best_num[1],Syy))
+         {
+            if (MULT16_32_Q15(num,best_den[0]) > MULT16_32_Q15(best_num[0],Syy))
+            {
+               best_num[1] = best_num[0];
+               best_den[1] = best_den[0];
+               best_pitch[1] = best_pitch[0];
+               best_num[0] = num;
+               best_den[0] = Syy;
+               best_pitch[0] = i;
+            } else {
+               best_num[1] = num;
+               best_den[1] = Syy;
+               best_pitch[1] = i;
+            }
+         }
+      }
+      Syy += SHR32(MULT16_16(y[i+len],y[i+len]),yshift) - SHR32(MULT16_16(y[i],y[i]),yshift);
+      Syy = MAX32(1, Syy);
+   }
+}
+
+static void celt_fir5(const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         opus_val16 *mem)
+{
+   int i;
+   opus_val16 num0, num1, num2, num3, num4;
+   opus_val32 mem0, mem1, mem2, mem3, mem4;
+   num0=num[0];
+   num1=num[1];
+   num2=num[2];
+   num3=num[3];
+   num4=num[4];
+   mem0=mem[0];
+   mem1=mem[1];
+   mem2=mem[2];
+   mem3=mem[3];
+   mem4=mem[4];
+   for (i=0;i<N;i++)
+   {
+      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
+      sum = MAC16_16(sum,num0,mem0);
+      sum = MAC16_16(sum,num1,mem1);
+      sum = MAC16_16(sum,num2,mem2);
+      sum = MAC16_16(sum,num3,mem3);
+      sum = MAC16_16(sum,num4,mem4);
+      mem4 = mem3;
+      mem3 = mem2;
+      mem2 = mem1;
+      mem1 = mem0;
+      mem0 = x[i];
+      y[i] = ROUND16(sum, SIG_SHIFT);
+   }
+   mem[0]=mem0;
+   mem[1]=mem1;
+   mem[2]=mem2;
+   mem[3]=mem3;
+   mem[4]=mem4;
+}
+
+
+void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
+      int len, int C, int arch)
+{
+   int i;
+   opus_val32 ac[5];
+   opus_val16 tmp=Q15ONE;
+   opus_val16 lpc[4], mem[5]={0,0,0,0,0};
+   opus_val16 lpc2[5];
+   opus_val16 c1 = QCONST16(.8f,15);
+#ifdef OPUS_FIXED_POINT
+   int shift;
+   opus_val32 maxabs = celt_maxabs32(x[0], len);
+   if (C==2)
+   {
+      opus_val32 maxabs_1 = celt_maxabs32(x[1], len);
+      maxabs = MAX32(maxabs, maxabs_1);
+   }
+   if (maxabs<1)
+      maxabs=1;
+   shift = celt_ilog2(maxabs)-10;
+   if (shift<0)
+      shift=0;
+   if (C==2)
+      shift++;
+#endif
+   for (i=1;i<len>>1;i++)
+      x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift);
+   x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift);
+   if (C==2)
+   {
+      for (i=1;i<len>>1;i++)
+         x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift);
+      x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift);
+   }
+
+   _celt_autocorr(x_lp, ac, NULL, 0,
+                  4, len>>1, arch);
+
+   /* Noise floor -40 dB */
+#ifdef OPUS_FIXED_POINT
+   ac[0] += SHR32(ac[0],13);
+#else
+   ac[0] *= 1.0001f;
+#endif
+   /* Lag windowing */
+   for (i=1;i<=4;i++)
+   {
+      /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
+#ifdef OPUS_FIXED_POINT
+      ac[i] -= MULT16_32_Q15(2*i*i, ac[i]);
+#else
+      ac[i] -= ac[i]*(.008f*i)*(.008f*i);
+#endif
+   }
+
+   _celt_lpc(lpc, ac, 4);
+   for (i=0;i<4;i++)
+   {
+      tmp = MULT16_16_Q15(QCONST16(.9f,15), tmp);
+      lpc[i] = MULT16_16_Q15(lpc[i], tmp);
+   }
+   /* Add a zero */
+   lpc2[0] = lpc[0] + QCONST16(.8f,SIG_SHIFT);
+   lpc2[1] = lpc[1] + MULT16_16_Q15(c1,lpc[0]);
+   lpc2[2] = lpc[2] + MULT16_16_Q15(c1,lpc[1]);
+   lpc2[3] = lpc[3] + MULT16_16_Q15(c1,lpc[2]);
+   lpc2[4] = MULT16_16_Q15(c1,lpc[3]);
+   celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
+}
+
+#if 0 /* This is a simple version of the pitch correlation that should work
+         well on DSPs like Blackfin and TI C5x/C6x */
+
+#ifdef OPUS_FIXED_POINT
+opus_val32
+#else
+void
+#endif
+celt_pitch_xcorr(opus_val16 *x, opus_val16 *y, opus_val32 *xcorr, int len, int max_pitch)
+{
+   int i, j;
+#ifdef OPUS_FIXED_POINT
+   opus_val32 maxcorr=1;
+#endif
+   for (i=0;i<max_pitch;i++)
+   {
+      opus_val32 sum = 0;
+      for (j=0;j<len;j++)
+         sum = MAC16_16(sum, x[j],y[i+j]);
+      xcorr[i] = sum;
+#ifdef OPUS_FIXED_POINT
+      maxcorr = MAX32(maxcorr, sum);
+#endif
+   }
+#ifdef OPUS_FIXED_POINT
+   return maxcorr;
+#endif
+}
+
+#else /* Unrolled version of the pitch correlation -- runs faster on x86 and ARM */
+
+#ifdef OPUS_FIXED_POINT
+opus_val32
+#else
+void
+#endif
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
+{
+   int i,j;
+   /*The EDSP version requires that max_pitch is at least 1, and that _x is
+      32-bit aligned.
+     Since it's hard to put asserts in assembly, put them here.*/
+   celt_assert(max_pitch>0);
+   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+#ifdef OPUS_FIXED_POINT
+   opus_val32 maxcorr=1;
+#endif
+   for (i=0;i<max_pitch-3;i+=4)
+   {
+      opus_val32 sum[4]={0,0,0,0};
+      xcorr_kernel(_x, _y+i, sum, len);
+      xcorr[i]=sum[0];
+      xcorr[i+1]=sum[1];
+      xcorr[i+2]=sum[2];
+      xcorr[i+3]=sum[3];
+#ifdef OPUS_FIXED_POINT
+      sum[0] = MAX32(sum[0], sum[1]);
+      sum[2] = MAX32(sum[2], sum[3]);
+      sum[0] = MAX32(sum[0], sum[2]);
+      maxcorr = MAX32(maxcorr, sum[0]);
+#endif
+   }
+   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
+   for (;i<max_pitch;i++)
+   {
+      opus_val32 sum = 0;
+      for (j=0;j<len;j++)
+         sum = MAC16_16(sum, _x[j],_y[i+j]);
+      xcorr[i] = sum;
+#ifdef OPUS_FIXED_POINT
+      maxcorr = MAX32(maxcorr, sum);
+#endif
+   }
+#ifdef OPUS_FIXED_POINT
+   return maxcorr;
+#endif
+}
+
+#endif
+void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
+                  int len, int max_pitch, int *pitch, int arch)
+{
+   int i, j;
+   int lag;
+   int best_pitch[2]={0,0};
+   VARDECL(opus_val16, x_lp4);
+   VARDECL(opus_val16, y_lp4);
+   VARDECL(opus_val32, xcorr);
+#ifdef OPUS_FIXED_POINT
+   opus_val32 maxcorr;
+   opus_val32 xmax, ymax;
+   int shift=0;
+#endif
+   int offset;
+
+   SAVE_STACK;
+
+   celt_assert(len>0);
+   celt_assert(max_pitch>0);
+   lag = len+max_pitch;
+
+   ALLOC(x_lp4, len>>2, opus_val16);
+   ALLOC(y_lp4, lag>>2, opus_val16);
+   ALLOC(xcorr, max_pitch>>1, opus_val32);
+
+   /* Downsample by 2 again */
+   for (j=0;j<len>>2;j++)
+      x_lp4[j] = x_lp[2*j];
+   for (j=0;j<lag>>2;j++)
+      y_lp4[j] = y[2*j];
+
+#ifdef OPUS_FIXED_POINT
+   xmax = celt_maxabs16(x_lp4, len>>2);
+   ymax = celt_maxabs16(y_lp4, lag>>2);
+   shift = celt_ilog2(MAX32(1, MAX32(xmax, ymax)))-11;
+   if (shift>0)
+   {
+      for (j=0;j<len>>2;j++)
+         x_lp4[j] = SHR16(x_lp4[j], shift);
+      for (j=0;j<lag>>2;j++)
+         y_lp4[j] = SHR16(y_lp4[j], shift);
+      /* Use double the shift for a MAC */
+      shift *= 2;
+   } else {
+      shift = 0;
+   }
+#endif
+
+   /* Coarse search with 4x decimation */
+
+#ifdef OPUS_FIXED_POINT
+   maxcorr =
+#endif
+   celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2, arch);
+
+   find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch
+#ifdef OPUS_FIXED_POINT
+                   , 0, maxcorr
+#endif
+                   );
+
+   /* Finer search with 2x decimation */
+#ifdef OPUS_FIXED_POINT
+   maxcorr=1;
+#endif
+   for (i=0;i<max_pitch>>1;i++)
+   {
+      opus_val32 sum=0;
+      xcorr[i] = 0;
+      if (abs(i-2*best_pitch[0])>2 && abs(i-2*best_pitch[1])>2)
+         continue;
+      for (j=0;j<len>>1;j++)
+         sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
+      xcorr[i] = MAX32(-1, sum);
+#ifdef OPUS_FIXED_POINT
+      maxcorr = MAX32(maxcorr, sum);
+#endif
+   }
+   find_best_pitch(xcorr, y, len>>1, max_pitch>>1, best_pitch
+#ifdef OPUS_FIXED_POINT
+                   , shift+1, maxcorr
+#endif
+                   );
+
+   /* Refine by pseudo-interpolation */
+   if (best_pitch[0]>0 && best_pitch[0]<(max_pitch>>1)-1)
+   {
+      opus_val32 a, b, c;
+      a = xcorr[best_pitch[0]-1];
+      b = xcorr[best_pitch[0]];
+      c = xcorr[best_pitch[0]+1];
+      if ((c-a) > MULT16_32_Q15(QCONST16(.7f,15),b-a))
+         offset = 1;
+      else if ((a-c) > MULT16_32_Q15(QCONST16(.7f,15),b-c))
+         offset = -1;
+      else
+         offset = 0;
+   } else {
+      offset = 0;
+   }
+   *pitch = 2*best_pitch[0]-offset;
+
+   RESTORE_STACK;
+}
+
+static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
+opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
+      int N, int *T0_, int prev_period, opus_val16 prev_gain)
+{
+   int k, i, T, T0;
+   opus_val16 g, g0;
+   opus_val16 pg;
+   opus_val32 xy,xx,yy,xy2;
+   opus_val32 xcorr[3];
+   opus_val32 best_xy, best_yy;
+   int offset;
+   int minperiod0;
+   VARDECL(opus_val32, yy_lookup);
+   SAVE_STACK;
+
+   minperiod0 = minperiod;
+   maxperiod /= 2;
+   minperiod /= 2;
+   *T0_ /= 2;
+   prev_period /= 2;
+   N /= 2;
+   x += maxperiod;
+   if (*T0_>=maxperiod)
+      *T0_=maxperiod-1;
+
+   T = T0 = *T0_;
+   ALLOC(yy_lookup, maxperiod+1, opus_val32);
+   dual_inner_prod(x, x, x-T0, N, &xx, &xy);
+   yy_lookup[0] = xx;
+   yy=xx;
+   for (i=1;i<=maxperiod;i++)
+   {
+      yy = yy+MULT16_16(x[-i],x[-i])-MULT16_16(x[N-i],x[N-i]);
+      yy_lookup[i] = MAX32(0, yy);
+   }
+   yy = yy_lookup[T0];
+   best_xy = xy;
+   best_yy = yy;
+#ifdef OPUS_FIXED_POINT
+      {
+         opus_val32 x2y2;
+         int sh, t;
+         x2y2 = 1+HALF32(MULT32_32_Q31(xx,yy));
+         sh = celt_ilog2(x2y2)>>1;
+         t = VSHR32(x2y2, 2*(sh-7));
+         g = g0 = VSHR32(MULT16_32_Q15(celt_rsqrt_norm(t), xy),sh+1);
+      }
+#else
+      g = g0 = xy/celt_sqrt(1+xx*yy);
+#endif
+   /* Look for any pitch at T/k */
+   for (k=2;k<=15;k++)
+   {
+      int T1, T1b;
+      opus_val16 g1;
+      opus_val16 cont=0;
+      opus_val16 thresh;
+      T1 = (2*T0+k)/(2*k);
+      if (T1 < minperiod)
+         break;
+      /* Look for another strong correlation at T1b */
+      if (k==2)
+      {
+         if (T1+T0>maxperiod)
+            T1b = T0;
+         else
+            T1b = T0+T1;
+      } else
+      {
+         T1b = (2*second_check[k]*T0+k)/(2*k);
+      }
+      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
+      xy += xy2;
+      yy = yy_lookup[T1] + yy_lookup[T1b];
+#ifdef OPUS_FIXED_POINT
+      {
+         opus_val32 x2y2;
+         int sh, t;
+         x2y2 = 1+MULT32_32_Q31(xx,yy);
+         sh = celt_ilog2(x2y2)>>1;
+         t = VSHR32(x2y2, 2*(sh-7));
+         g1 = VSHR32(MULT16_32_Q15(celt_rsqrt_norm(t), xy),sh+1);
+      }
+#else
+      g1 = xy/celt_sqrt(1+2.f*xx*1.f*yy);
+#endif
+      if (abs(T1-prev_period)<=1)
+         cont = prev_gain;
+      else if (abs(T1-prev_period)<=2 && 5*k*k < T0)
+         cont = HALF32(prev_gain);
+      else
+         cont = 0;
+      thresh = MAX16(QCONST16(.3f,15), MULT16_16_Q15(QCONST16(.7f,15),g0)-cont);
+      /* Bias against very high pitch (very short period) to avoid false-positives
+         due to short-term correlation */
+      if (T1<3*minperiod)
+         thresh = MAX16(QCONST16(.4f,15), MULT16_16_Q15(QCONST16(.85f,15),g0)-cont);
+      else if (T1<2*minperiod)
+         thresh = MAX16(QCONST16(.5f,15), MULT16_16_Q15(QCONST16(.9f,15),g0)-cont);
+      if (g1 > thresh)
+      {
+         best_xy = xy;
+         best_yy = yy;
+         T = T1;
+         g = g1;
+      }
+   }
+   best_xy = MAX32(0, best_xy);
+   if (best_yy <= best_xy)
+      pg = Q15ONE;
+   else
+      pg = SHR32(frac_div32(best_xy,best_yy+1),16);
+
+   for (k=0;k<3;k++)
+   {
+      int T1 = T+k-1;
+      xy = 0;
+      for (i=0;i<N;i++)
+         xy = MAC16_16(xy, x[i], x[i-T1]);
+      xcorr[k] = xy;
+   }
+   if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
+      offset = 1;
+   else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))
+      offset = -1;
+   else
+      offset = 0;
+   if (pg > g)
+      pg = g;
+   *T0_ = 2*T+offset;
+
+   if (*T0_<minperiod0)
+      *T0_=minperiod0;
+   RESTORE_STACK;
+   return pg;
+}

+ 173 - 0
drivers/opus/celt/pitch.h

@@ -0,0 +1,173 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/**
+   @file pitch.h
+   @brief Pitch analysis
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_H
+#define PITCH_H
+
+#include "opus_modes.h"
+#include "cpu_support.h"
+
+#if defined(__SSE__) && !defined(OPUS_FIXED_POINT)
+#include "x86/pitch_sse.h"
+#endif
+
+#if defined(OPUS_ARM_ASM) && defined(OPUS_FIXED_POINT)
+# include "arm/pitch_arm.h"
+#endif
+
+void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
+      int len, int C, int arch);
+
+void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
+                  int len, int max_pitch, int *pitch, int arch);
+
+opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
+      int N, int *T0, int prev_period, opus_val16 prev_gain);
+
+/* OPT: This is the kernel you really want to optimize. It gets used a lot
+   by the prefilter and by the PLC. */
+#ifndef OVERRIDE_XCORR_KERNEL
+static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
+{
+   int j;
+   opus_val16 y_0, y_1, y_2, y_3;
+   celt_assert(len>=3);
+   y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
+   y_0=*y++;
+   y_1=*y++;
+   y_2=*y++;
+   for (j=0;j<len-3;j+=4)
+   {
+      opus_val16 tmp;
+      tmp = *x++;
+      y_3=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_0);
+      sum[1] = MAC16_16(sum[1],tmp,y_1);
+      sum[2] = MAC16_16(sum[2],tmp,y_2);
+      sum[3] = MAC16_16(sum[3],tmp,y_3);
+      tmp=*x++;
+      y_0=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_1);
+      sum[1] = MAC16_16(sum[1],tmp,y_2);
+      sum[2] = MAC16_16(sum[2],tmp,y_3);
+      sum[3] = MAC16_16(sum[3],tmp,y_0);
+      tmp=*x++;
+      y_1=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_2);
+      sum[1] = MAC16_16(sum[1],tmp,y_3);
+      sum[2] = MAC16_16(sum[2],tmp,y_0);
+      sum[3] = MAC16_16(sum[3],tmp,y_1);
+      tmp=*x++;
+      y_2=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_3);
+      sum[1] = MAC16_16(sum[1],tmp,y_0);
+      sum[2] = MAC16_16(sum[2],tmp,y_1);
+      sum[3] = MAC16_16(sum[3],tmp,y_2);
+   }
+   if (j++<len)
+   {
+      opus_val16 tmp = *x++;
+      y_3=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_0);
+      sum[1] = MAC16_16(sum[1],tmp,y_1);
+      sum[2] = MAC16_16(sum[2],tmp,y_2);
+      sum[3] = MAC16_16(sum[3],tmp,y_3);
+   }
+   if (j++<len)
+   {
+      opus_val16 tmp=*x++;
+      y_0=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_1);
+      sum[1] = MAC16_16(sum[1],tmp,y_2);
+      sum[2] = MAC16_16(sum[2],tmp,y_3);
+      sum[3] = MAC16_16(sum[3],tmp,y_0);
+   }
+   if (j<len)
+   {
+      opus_val16 tmp=*x++;
+      y_1=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_2);
+      sum[1] = MAC16_16(sum[1],tmp,y_3);
+      sum[2] = MAC16_16(sum[2],tmp,y_0);
+      sum[3] = MAC16_16(sum[3],tmp,y_1);
+   }
+}
+#endif /* OVERRIDE_XCORR_KERNEL */
+
+#ifndef OVERRIDE_DUAL_INNER_PROD
+static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   opus_val32 xy01=0;
+   opus_val32 xy02=0;
+   for (i=0;i<N;i++)
+   {
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
+#endif
+
+#ifdef OPUS_FIXED_POINT
+opus_val32
+#else
+void
+#endif
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
+      opus_val32 *xcorr, int len, int max_pitch);
+
+#if !defined(OVERRIDE_PITCH_XCORR)
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD)
+extern
+#  if defined(OPUS_FIXED_POINT)
+opus_val32
+#  else
+void
+#  endif
+(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val32 *, int, int);
+
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch))
+# else
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch))
+# endif
+#endif
+
+#endif

+ 556 - 0
drivers/opus/celt/quant_bands.c

@@ -0,0 +1,556 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "quant_bands.h"
+#include "laplace.h"
+#include <math.h>
+#include "os_support.h"
+#include "arch.h"
+#include "mathops.h"
+#include "stack_alloc.h"
+#include "rate.h"
+
+#ifdef OPUS_FIXED_POINT
+/* Mean energy in each band quantized in Q4 */
+const signed char eMeans[25] = {
+      103,100, 92, 85, 81,
+       77, 72, 70, 78, 75,
+       73, 71, 78, 74, 69,
+       72, 70, 74, 76, 71,
+       60, 60, 60, 60, 60
+};
+#else
+/* Mean energy in each band quantized in Q4 and converted back to float */
+const opus_val16 eMeans[25] = {
+      6.437500f, 6.250000f, 5.750000f, 5.312500f, 5.062500f,
+      4.812500f, 4.500000f, 4.375000f, 4.875000f, 4.687500f,
+      4.562500f, 4.437500f, 4.875000f, 4.625000f, 4.312500f,
+      4.500000f, 4.375000f, 4.625000f, 4.750000f, 4.437500f,
+      3.750000f, 3.750000f, 3.750000f, 3.750000f, 3.750000f
+};
+#endif
+/* prediction coefficients: 0.9, 0.8, 0.65, 0.5 */
+#ifdef OPUS_FIXED_POINT
+static const opus_val16 pred_coef[4] = {29440, 26112, 21248, 16384};
+static const opus_val16 beta_coef[4] = {30147, 22282, 12124, 6554};
+static const opus_val16 beta_intra = 4915;
+#else
+static const opus_val16 pred_coef[4] = {29440/32768., 26112/32768., 21248/32768., 16384/32768.};
+static const opus_val16 beta_coef[4] = {30147/32768., 22282/32768., 12124/32768., 6554/32768.};
+static const opus_val16 beta_intra = 4915/32768.;
+#endif
+
+/*Parameters of the Laplace-like probability models used for the coarse energy.
+  There is one pair of parameters for each frame size, prediction type
+   (inter/intra), and band number.
+  The first number of each pair is the probability of 0, and the second is the
+   decay rate, both in Q8 precision.*/
+static const unsigned char e_prob_model[4][2][42] = {
+   /*120 sample frames.*/
+   {
+      /*Inter*/
+      {
+          72, 127,  65, 129,  66, 128,  65, 128,  64, 128,  62, 128,  64, 128,
+          64, 128,  92,  78,  92,  79,  92,  78,  90,  79, 116,  41, 115,  40,
+         114,  40, 132,  26, 132,  26, 145,  17, 161,  12, 176,  10, 177,  11
+      },
+      /*Intra*/
+      {
+          24, 179,  48, 138,  54, 135,  54, 132,  53, 134,  56, 133,  55, 132,
+          55, 132,  61, 114,  70,  96,  74,  88,  75,  88,  87,  74,  89,  66,
+          91,  67, 100,  59, 108,  50, 120,  40, 122,  37,  97,  43,  78,  50
+      }
+   },
+   /*240 sample frames.*/
+   {
+      /*Inter*/
+      {
+          83,  78,  84,  81,  88,  75,  86,  74,  87,  71,  90,  73,  93,  74,
+          93,  74, 109,  40, 114,  36, 117,  34, 117,  34, 143,  17, 145,  18,
+         146,  19, 162,  12, 165,  10, 178,   7, 189,   6, 190,   8, 177,   9
+      },
+      /*Intra*/
+      {
+          23, 178,  54, 115,  63, 102,  66,  98,  69,  99,  74,  89,  71,  91,
+          73,  91,  78,  89,  86,  80,  92,  66,  93,  64, 102,  59, 103,  60,
+         104,  60, 117,  52, 123,  44, 138,  35, 133,  31,  97,  38,  77,  45
+      }
+   },
+   /*480 sample frames.*/
+   {
+      /*Inter*/
+      {
+          61,  90,  93,  60, 105,  42, 107,  41, 110,  45, 116,  38, 113,  38,
+         112,  38, 124,  26, 132,  27, 136,  19, 140,  20, 155,  14, 159,  16,
+         158,  18, 170,  13, 177,  10, 187,   8, 192,   6, 175,   9, 159,  10
+      },
+      /*Intra*/
+      {
+          21, 178,  59, 110,  71,  86,  75,  85,  84,  83,  91,  66,  88,  73,
+          87,  72,  92,  75,  98,  72, 105,  58, 107,  54, 115,  52, 114,  55,
+         112,  56, 129,  51, 132,  40, 150,  33, 140,  29,  98,  35,  77,  42
+      }
+   },
+   /*960 sample frames.*/
+   {
+      /*Inter*/
+      {
+          42, 121,  96,  66, 108,  43, 111,  40, 117,  44, 123,  32, 120,  36,
+         119,  33, 127,  33, 134,  34, 139,  21, 147,  23, 152,  20, 158,  25,
+         154,  26, 166,  21, 173,  16, 184,  13, 184,  10, 150,  13, 139,  15
+      },
+      /*Intra*/
+      {
+          22, 178,  63, 114,  74,  82,  84,  83,  92,  82, 103,  62,  96,  72,
+          96,  67, 101,  73, 107,  72, 113,  55, 118,  52, 125,  52, 118,  52,
+         117,  55, 135,  49, 137,  39, 157,  32, 145,  29,  97,  33,  77,  40
+      }
+   }
+};
+
+static const unsigned char small_energy_icdf[3]={2,1,0};
+
+static opus_val32 loss_distortion(const opus_val16 *eBands, opus_val16 *oldEBands, int start, int end, int len, int C)
+{
+   int c, i;
+   opus_val32 dist = 0;
+   c=0; do {
+      for (i=start;i<end;i++)
+      {
+         opus_val16 d = SUB16(SHR16(eBands[i+c*len], 3), SHR16(oldEBands[i+c*len], 3));
+         dist = MAC16_16(dist, d,d);
+      }
+   } while (++c<C);
+   return MIN32(200,SHR32(dist,2*DB_SHIFT-6));
+}
+
+static int quant_coarse_energy_impl(const CELTMode *m, int start, int end,
+      const opus_val16 *eBands, opus_val16 *oldEBands,
+      opus_int32 budget, opus_int32 tell,
+      const unsigned char *prob_model, opus_val16 *error, ec_enc *enc,
+      int C, int LM, int intra, opus_val16 max_decay, int lfe)
+{
+   int i, c;
+   int badness = 0;
+   opus_val32 prev[2] = {0,0};
+   opus_val16 coef;
+   opus_val16 beta;
+
+   if (tell+3 <= budget)
+      ec_enc_bit_logp(enc, intra, 3);
+   if (intra)
+   {
+      coef = 0;
+      beta = beta_intra;
+   } else {
+      beta = beta_coef[LM];
+      coef = pred_coef[LM];
+   }
+
+   /* Encode at a fixed coarse resolution */
+   for (i=start;i<end;i++)
+   {
+      c=0;
+      do {
+         int bits_left;
+         int qi, qi0;
+         opus_val32 q;
+         opus_val16 x;
+         opus_val32 f, tmp;
+         opus_val16 oldE;
+         opus_val16 decay_bound;
+         x = eBands[i+c*m->nbEBands];
+         oldE = MAX16(-QCONST16(9.f,DB_SHIFT), oldEBands[i+c*m->nbEBands]);
+#ifdef OPUS_FIXED_POINT
+         f = SHL32(EXTEND32(x),7) - PSHR32(MULT16_16(coef,oldE), 8) - prev[c];
+         /* Rounding to nearest integer here is really important! */
+         qi = (f+QCONST32(.5f,DB_SHIFT+7))>>(DB_SHIFT+7);
+         decay_bound = EXTRACT16(MAX32(-QCONST16(28.f,DB_SHIFT),
+               SUB32((opus_val32)oldEBands[i+c*m->nbEBands],max_decay)));
+#else
+         f = x-coef*oldE-prev[c];
+         /* Rounding to nearest integer here is really important! */
+         qi = (int)floor(.5f+f);
+         decay_bound = MAX16(-QCONST16(28.f,DB_SHIFT), oldEBands[i+c*m->nbEBands]) - max_decay;
+#endif
+         /* Prevent the energy from going down too quickly (e.g. for bands
+            that have just one bin) */
+         if (qi < 0 && x < decay_bound)
+         {
+            qi += (int)SHR16(SUB16(decay_bound,x), DB_SHIFT);
+            if (qi > 0)
+               qi = 0;
+         }
+         qi0 = qi;
+         /* If we don't have enough bits to encode all the energy, just assume
+             something safe. */
+         tell = ec_tell(enc);
+         bits_left = budget-tell-3*C*(end-i);
+         if (i!=start && bits_left < 30)
+         {
+            if (bits_left < 24)
+               qi = IMIN(1, qi);
+            if (bits_left < 16)
+               qi = IMAX(-1, qi);
+         }
+         if (lfe && i>=2)
+            qi = IMIN(qi, 0);
+         if (budget-tell >= 15)
+         {
+            int pi;
+            pi = 2*IMIN(i,20);
+            ec_laplace_encode(enc, &qi,
+                  prob_model[pi]<<7, prob_model[pi+1]<<6);
+         }
+         else if(budget-tell >= 2)
+         {
+            qi = IMAX(-1, IMIN(qi, 1));
+            ec_enc_icdf(enc, 2*qi^-(qi<0), small_energy_icdf, 2);
+         }
+         else if(budget-tell >= 1)
+         {
+            qi = IMIN(0, qi);
+            ec_enc_bit_logp(enc, -qi, 1);
+         }
+         else
+            qi = -1;
+         error[i+c*m->nbEBands] = PSHR32(f,7) - SHL16(qi,DB_SHIFT);
+         badness += abs(qi0-qi);
+         q = (opus_val32)SHL32(EXTEND32(qi),DB_SHIFT);
+
+         tmp = PSHR32(MULT16_16(coef,oldE),8) + prev[c] + SHL32(q,7);
+#ifdef OPUS_FIXED_POINT
+         tmp = MAX32(-QCONST32(28.f, DB_SHIFT+7), tmp);
+#endif
+         oldEBands[i+c*m->nbEBands] = PSHR32(tmp, 7);
+         prev[c] = prev[c] + SHL32(q,7) - MULT16_16(beta,PSHR32(q,8));
+      } while (++c < C);
+   }
+   return lfe ? 0 : badness;
+}
+
+void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
+      const opus_val16 *eBands, opus_val16 *oldEBands, opus_uint32 budget,
+      opus_val16 *error, ec_enc *enc, int C, int LM, int nbAvailableBytes,
+      int force_intra, opus_val32 *delayedIntra, int two_pass, int loss_rate, int lfe)
+{
+   int intra;
+   opus_val16 max_decay;
+   VARDECL(opus_val16, oldEBands_intra);
+   VARDECL(opus_val16, error_intra);
+   ec_enc enc_start_state;
+   opus_uint32 tell;
+   int badness1=0;
+   opus_int32 intra_bias;
+   opus_val32 new_distortion;
+   SAVE_STACK;
+
+   intra = force_intra || (!two_pass && *delayedIntra>2*C*(end-start) && nbAvailableBytes > (end-start)*C);
+   intra_bias = (opus_int32)((budget**delayedIntra*loss_rate)/(C*512));
+   new_distortion = loss_distortion(eBands, oldEBands, start, effEnd, m->nbEBands, C);
+
+   tell = ec_tell(enc);
+   if (tell+3 > budget)
+      two_pass = intra = 0;
+
+   max_decay = QCONST16(16.f,DB_SHIFT);
+   if (end-start>10)
+   {
+#ifdef OPUS_FIXED_POINT
+      max_decay = MIN32(max_decay, SHL32(EXTEND32(nbAvailableBytes),DB_SHIFT-3));
+#else
+      max_decay = MIN32(max_decay, .125f*nbAvailableBytes);
+#endif
+   }
+   if (lfe)
+      max_decay=3;
+   enc_start_state = *enc;
+
+   ALLOC(oldEBands_intra, C*m->nbEBands, opus_val16);
+   ALLOC(error_intra, C*m->nbEBands, opus_val16);
+   OPUS_COPY(oldEBands_intra, oldEBands, C*m->nbEBands);
+
+   if (two_pass || intra)
+   {
+      badness1 = quant_coarse_energy_impl(m, start, end, eBands, oldEBands_intra, budget,
+            tell, e_prob_model[LM][1], error_intra, enc, C, LM, 1, max_decay, lfe);
+   }
+
+   if (!intra)
+   {
+      unsigned char *intra_buf;
+      ec_enc enc_intra_state;
+      opus_int32 tell_intra;
+      opus_uint32 nstart_bytes;
+      opus_uint32 nintra_bytes;
+      opus_uint32 save_bytes;
+      int badness2;
+      VARDECL(unsigned char, intra_bits);
+
+      tell_intra = ec_tell_frac(enc);
+
+      enc_intra_state = *enc;
+
+      nstart_bytes = ec_range_bytes(&enc_start_state);
+      nintra_bytes = ec_range_bytes(&enc_intra_state);
+      intra_buf = ec_get_buffer(&enc_intra_state) + nstart_bytes;
+      save_bytes = nintra_bytes-nstart_bytes;
+      if (save_bytes == 0)
+         save_bytes = ALLOC_NONE;
+      ALLOC(intra_bits, save_bytes, unsigned char);
+      /* Copy bits from intra bit-stream */
+      OPUS_COPY(intra_bits, intra_buf, nintra_bytes - nstart_bytes);
+
+      *enc = enc_start_state;
+
+      badness2 = quant_coarse_energy_impl(m, start, end, eBands, oldEBands, budget,
+            tell, e_prob_model[LM][intra], error, enc, C, LM, 0, max_decay, lfe);
+
+      if (two_pass && (badness1 < badness2 || (badness1 == badness2 && ((opus_int32)ec_tell_frac(enc))+intra_bias > tell_intra)))
+      {
+         *enc = enc_intra_state;
+         /* Copy intra bits to bit-stream */
+         OPUS_COPY(intra_buf, intra_bits, nintra_bytes - nstart_bytes);
+         OPUS_COPY(oldEBands, oldEBands_intra, C*m->nbEBands);
+         OPUS_COPY(error, error_intra, C*m->nbEBands);
+         intra = 1;
+      }
+   } else {
+      OPUS_COPY(oldEBands, oldEBands_intra, C*m->nbEBands);
+      OPUS_COPY(error, error_intra, C*m->nbEBands);
+   }
+
+   if (intra)
+      *delayedIntra = new_distortion;
+   else
+      *delayedIntra = ADD32(MULT16_32_Q15(MULT16_16_Q15(pred_coef[LM], pred_coef[LM]),*delayedIntra),
+            new_distortion);
+
+   RESTORE_STACK;
+}
+
+void quant_fine_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, opus_val16 *error, int *fine_quant, ec_enc *enc, int C)
+{
+   int i, c;
+
+   /* Encode finer resolution */
+   for (i=start;i<end;i++)
+   {
+      opus_int16 frac = 1<<fine_quant[i];
+      if (fine_quant[i] <= 0)
+         continue;
+      c=0;
+      do {
+         int q2;
+         opus_val16 offset;
+#ifdef OPUS_FIXED_POINT
+         /* Has to be without rounding */
+         q2 = (error[i+c*m->nbEBands]+QCONST16(.5f,DB_SHIFT))>>(DB_SHIFT-fine_quant[i]);
+#else
+         q2 = (int)floor((error[i+c*m->nbEBands]+.5f)*frac);
+#endif
+         if (q2 > frac-1)
+            q2 = frac-1;
+         if (q2<0)
+            q2 = 0;
+         ec_enc_bits(enc, q2, fine_quant[i]);
+#ifdef OPUS_FIXED_POINT
+         offset = SUB16(SHR32(SHL32(EXTEND32(q2),DB_SHIFT)+QCONST16(.5f,DB_SHIFT),fine_quant[i]),QCONST16(.5f,DB_SHIFT));
+#else
+         offset = (q2+.5f)*(1<<(14-fine_quant[i]))*(1.f/16384) - .5f;
+#endif
+         oldEBands[i+c*m->nbEBands] += offset;
+         error[i+c*m->nbEBands] -= offset;
+         /*printf ("%f ", error[i] - offset);*/
+      } while (++c < C);
+   }
+}
+
+void quant_energy_finalise(const CELTMode *m, int start, int end, opus_val16 *oldEBands, opus_val16 *error, int *fine_quant, int *fine_priority, int bits_left, ec_enc *enc, int C)
+{
+   int i, prio, c;
+
+   /* Use up the remaining bits */
+   for (prio=0;prio<2;prio++)
+   {
+      for (i=start;i<end && bits_left>=C ;i++)
+      {
+         if (fine_quant[i] >= MAX_FINE_BITS || fine_priority[i]!=prio)
+            continue;
+         c=0;
+         do {
+            int q2;
+            opus_val16 offset;
+            q2 = error[i+c*m->nbEBands]<0 ? 0 : 1;
+            ec_enc_bits(enc, q2, 1);
+#ifdef OPUS_FIXED_POINT
+            offset = SHR16(SHL16(q2,DB_SHIFT)-QCONST16(.5f,DB_SHIFT),fine_quant[i]+1);
+#else
+            offset = (q2-.5f)*(1<<(14-fine_quant[i]-1))*(1.f/16384);
+#endif
+            oldEBands[i+c*m->nbEBands] += offset;
+            bits_left--;
+         } while (++c < C);
+      }
+   }
+}
+
+void unquant_coarse_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int intra, ec_dec *dec, int C, int LM)
+{
+   const unsigned char *prob_model = e_prob_model[LM][intra];
+   int i, c;
+   opus_val32 prev[2] = {0, 0};
+   opus_val16 coef;
+   opus_val16 beta;
+   opus_int32 budget;
+   opus_int32 tell;
+
+   if (intra)
+   {
+      coef = 0;
+      beta = beta_intra;
+   } else {
+      beta = beta_coef[LM];
+      coef = pred_coef[LM];
+   }
+
+   budget = dec->storage*8;
+
+   /* Decode at a fixed coarse resolution */
+   for (i=start;i<end;i++)
+   {
+      c=0;
+      do {
+         int qi;
+         opus_val32 q;
+         opus_val32 tmp;
+         /* It would be better to express this invariant as a
+            test on C at function entry, but that isn't enough
+            to make the static analyzer happy. */
+         celt_assert(c<2);
+         tell = ec_tell(dec);
+         if(budget-tell>=15)
+         {
+            int pi;
+            pi = 2*IMIN(i,20);
+            qi = ec_laplace_decode(dec,
+                  prob_model[pi]<<7, prob_model[pi+1]<<6);
+         }
+         else if(budget-tell>=2)
+         {
+            qi = ec_dec_icdf(dec, small_energy_icdf, 2);
+            qi = (qi>>1)^-(qi&1);
+         }
+         else if(budget-tell>=1)
+         {
+            qi = -ec_dec_bit_logp(dec, 1);
+         }
+         else
+            qi = -1;
+         q = (opus_val32)SHL32(EXTEND32(qi),DB_SHIFT);
+
+         oldEBands[i+c*m->nbEBands] = MAX16(-QCONST16(9.f,DB_SHIFT), oldEBands[i+c*m->nbEBands]);
+         tmp = PSHR32(MULT16_16(coef,oldEBands[i+c*m->nbEBands]),8) + prev[c] + SHL32(q,7);
+#ifdef OPUS_FIXED_POINT
+         tmp = MAX32(-QCONST32(28.f, DB_SHIFT+7), tmp);
+#endif
+         oldEBands[i+c*m->nbEBands] = PSHR32(tmp, 7);
+         prev[c] = prev[c] + SHL32(q,7) - MULT16_16(beta,PSHR32(q,8));
+      } while (++c < C);
+   }
+}
+
+void unquant_fine_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int *fine_quant, ec_dec *dec, int C)
+{
+   int i, c;
+   /* Decode finer resolution */
+   for (i=start;i<end;i++)
+   {
+      if (fine_quant[i] <= 0)
+         continue;
+      c=0;
+      do {
+         int q2;
+         opus_val16 offset;
+         q2 = ec_dec_bits(dec, fine_quant[i]);
+#ifdef OPUS_FIXED_POINT
+         offset = SUB16(SHR32(SHL32(EXTEND32(q2),DB_SHIFT)+QCONST16(.5f,DB_SHIFT),fine_quant[i]),QCONST16(.5f,DB_SHIFT));
+#else
+         offset = (q2+.5f)*(1<<(14-fine_quant[i]))*(1.f/16384) - .5f;
+#endif
+         oldEBands[i+c*m->nbEBands] += offset;
+      } while (++c < C);
+   }
+}
+
+void unquant_energy_finalise(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int *fine_quant,  int *fine_priority, int bits_left, ec_dec *dec, int C)
+{
+   int i, prio, c;
+
+   /* Use up the remaining bits */
+   for (prio=0;prio<2;prio++)
+   {
+      for (i=start;i<end && bits_left>=C ;i++)
+      {
+         if (fine_quant[i] >= MAX_FINE_BITS || fine_priority[i]!=prio)
+            continue;
+         c=0;
+         do {
+            int q2;
+            opus_val16 offset;
+            q2 = ec_dec_bits(dec, 1);
+#ifdef OPUS_FIXED_POINT
+            offset = SHR16(SHL16(q2,DB_SHIFT)-QCONST16(.5f,DB_SHIFT),fine_quant[i]+1);
+#else
+            offset = (q2-.5f)*(1<<(14-fine_quant[i]-1))*(1.f/16384);
+#endif
+            oldEBands[i+c*m->nbEBands] += offset;
+            bits_left--;
+         } while (++c < C);
+      }
+   }
+}
+
+void amp2Log2(const CELTMode *m, int effEnd, int end,
+      celt_ener *bandE, opus_val16 *bandLogE, int C)
+{
+   int c, i;
+   c=0;
+   do {
+      for (i=0;i<effEnd;i++)
+         bandLogE[i+c*m->nbEBands] =
+               celt_log2(SHL32(bandE[i+c*m->nbEBands],2))
+               - SHL16((opus_val16)eMeans[i],6);
+      for (i=effEnd;i<end;i++)
+         bandLogE[c*m->nbEBands+i] = -QCONST16(14.f,DB_SHIFT);
+   } while (++c < C);
+}

+ 66 - 0
drivers/opus/celt/quant_bands.h

@@ -0,0 +1,66 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef QUANT_BANDS
+#define QUANT_BANDS
+
+#include "arch.h"
+#include "opus_modes.h"
+#include "entenc.h"
+#include "entdec.h"
+#include "mathops.h"
+
+#ifdef OPUS_FIXED_POINT
+extern const signed char eMeans[25];
+#else
+extern const opus_val16 eMeans[25];
+#endif
+
+void amp2Log2(const CELTMode *m, int effEnd, int end,
+      celt_ener *bandE, opus_val16 *bandLogE, int C);
+
+void log2Amp(const CELTMode *m, int start, int end,
+      celt_ener *eBands, const opus_val16 *oldEBands, int C);
+
+void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
+      const opus_val16 *eBands, opus_val16 *oldEBands, opus_uint32 budget,
+      opus_val16 *error, ec_enc *enc, int C, int LM,
+      int nbAvailableBytes, int force_intra, opus_val32 *delayedIntra,
+      int two_pass, int loss_rate, int lfe);
+
+void quant_fine_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, opus_val16 *error, int *fine_quant, ec_enc *enc, int C);
+
+void quant_energy_finalise(const CELTMode *m, int start, int end, opus_val16 *oldEBands, opus_val16 *error, int *fine_quant, int *fine_priority, int bits_left, ec_enc *enc, int C);
+
+void unquant_coarse_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int intra, ec_dec *dec, int C, int LM);
+
+void unquant_fine_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int *fine_quant, ec_dec *dec, int C);
+
+void unquant_energy_finalise(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int *fine_quant, int *fine_priority, int bits_left, ec_dec *dec, int C);
+
+#endif /* QUANT_BANDS */

+ 638 - 0
drivers/opus/celt/rate.c

@@ -0,0 +1,638 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include <math.h>
+#include "opus_modes.h"
+#include "cwrs.h"
+#include "arch.h"
+#include "os_support.h"
+
+#include "entcode.h"
+#include "rate.h"
+
+static const unsigned char LOG2_FRAC_TABLE[24]={
+   0,
+   8,13,
+  16,19,21,23,
+  24,26,27,28,29,30,31,32,
+  32,33,34,34,35,36,36,37,37
+};
+
+#ifdef CUSTOM_MODES
+
+/*Determines if V(N,K) fits in a 32-bit unsigned integer.
+  N and K are themselves limited to 15 bits.*/
+static int fits_in32(int _n, int _k)
+{
+   static const opus_int16 maxN[15] = {
+      32767, 32767, 32767, 1476, 283, 109,  60,  40,
+       29,  24,  20,  18,  16,  14,  13};
+   static const opus_int16 maxK[15] = {
+      32767, 32767, 32767, 32767, 1172, 238,  95,  53,
+       36,  27,  22,  18,  16,  15,  13};
+   if (_n>=14)
+   {
+      if (_k>=14)
+         return 0;
+      else
+         return _n <= maxN[_k];
+   } else {
+      return _k <= maxK[_n];
+   }
+}
+
+void compute_pulse_cache(CELTMode *m, int LM)
+{
+   int C;
+   int i;
+   int j;
+   int curr=0;
+   int nbEntries=0;
+   int entryN[100], entryK[100], entryI[100];
+   const opus_int16 *eBands = m->eBands;
+   PulseCache *cache = &m->cache;
+   opus_int16 *cindex;
+   unsigned char *bits;
+   unsigned char *cap;
+
+   cindex = (opus_int16 *)opus_alloc(sizeof(cache->index[0])*m->nbEBands*(LM+2));
+   cache->index = cindex;
+
+   /* Scan for all unique band sizes */
+   for (i=0;i<=LM+1;i++)
+   {
+      for (j=0;j<m->nbEBands;j++)
+      {
+         int k;
+         int N = (eBands[j+1]-eBands[j])<<i>>1;
+         cindex[i*m->nbEBands+j] = -1;
+         /* Find other bands that have the same size */
+         for (k=0;k<=i;k++)
+         {
+            int n;
+            for (n=0;n<m->nbEBands && (k!=i || n<j);n++)
+            {
+               if (N == (eBands[n+1]-eBands[n])<<k>>1)
+               {
+                  cindex[i*m->nbEBands+j] = cindex[k*m->nbEBands+n];
+                  break;
+               }
+            }
+         }
+         if (cache->index[i*m->nbEBands+j] == -1 && N!=0)
+         {
+            int K;
+            entryN[nbEntries] = N;
+            K = 0;
+            while (fits_in32(N,get_pulses(K+1)) && K<MAX_PSEUDO)
+               K++;
+            entryK[nbEntries] = K;
+            cindex[i*m->nbEBands+j] = curr;
+            entryI[nbEntries] = curr;
+
+            curr += K+1;
+            nbEntries++;
+         }
+      }
+   }
+   bits = (unsigned char *)opus_alloc(sizeof(unsigned char)*curr);
+   cache->bits = bits;
+   cache->size = curr;
+   /* Compute the cache for all unique sizes */
+   for (i=0;i<nbEntries;i++)
+   {
+      unsigned char *ptr = bits+entryI[i];
+      opus_int16 tmp[MAX_PULSES+1];
+      get_required_bits(tmp, entryN[i], get_pulses(entryK[i]), BITRES);
+      for (j=1;j<=entryK[i];j++)
+         ptr[j] = tmp[get_pulses(j)]-1;
+      ptr[0] = entryK[i];
+   }
+
+   /* Compute the maximum rate for each band at which we'll reliably use as
+       many bits as we ask for. */
+   cache->caps = cap = (unsigned char *)opus_alloc(sizeof(cache->caps[0])*(LM+1)*2*m->nbEBands);
+   for (i=0;i<=LM;i++)
+   {
+      for (C=1;C<=2;C++)
+      {
+         for (j=0;j<m->nbEBands;j++)
+         {
+            int N0;
+            int max_bits;
+            N0 = m->eBands[j+1]-m->eBands[j];
+            /* N=1 bands only have a sign bit and fine bits. */
+            if (N0<<i == 1)
+               max_bits = C*(1+MAX_FINE_BITS)<<BITRES;
+            else
+            {
+               const unsigned char *pcache;
+               opus_int32           num;
+               opus_int32           den;
+               int                  LM0;
+               int                  N;
+               int                  offset;
+               int                  ndof;
+               int                  qb;
+               int                  k;
+               LM0 = 0;
+               /* Even-sized bands bigger than N=2 can be split one more time.
+                  As of commit 44203907 all bands >1 are even, including custom modes.*/
+               if (N0 > 2)
+               {
+                  N0>>=1;
+                  LM0--;
+               }
+               /* N0=1 bands can't be split down to N<2. */
+               else if (N0 <= 1)
+               {
+                  LM0=IMIN(i,1);
+                  N0<<=LM0;
+               }
+               /* Compute the cost for the lowest-level PVQ of a fully split
+                   band. */
+               pcache = bits + cindex[(LM0+1)*m->nbEBands+j];
+               max_bits = pcache[pcache[0]]+1;
+               /* Add in the cost of coding regular splits. */
+               N = N0;
+               for(k=0;k<i-LM0;k++){
+                  max_bits <<= 1;
+                  /* Offset the number of qtheta bits by log2(N)/2
+                      + QTHETA_OFFSET compared to their "fair share" of
+                      total/N */
+                  offset = ((m->logN[j]+((LM0+k)<<BITRES))>>1)-QTHETA_OFFSET;
+                  /* The number of qtheta bits we'll allocate if the remainder
+                      is to be max_bits.
+                     The average measured cost for theta is 0.89701 times qb,
+                      approximated here as 459/512. */
+                  num=459*(opus_int32)((2*N-1)*offset+max_bits);
+                  den=((opus_int32)(2*N-1)<<9)-459;
+                  qb = IMIN((num+(den>>1))/den, 57);
+                  celt_assert(qb >= 0);
+                  max_bits += qb;
+                  N <<= 1;
+               }
+               /* Add in the cost of a stereo split, if necessary. */
+               if (C==2)
+               {
+                  max_bits <<= 1;
+                  offset = ((m->logN[j]+(i<<BITRES))>>1)-(N==2?QTHETA_OFFSET_TWOPHASE:QTHETA_OFFSET);
+                  ndof = 2*N-1-(N==2);
+                  /* The average measured cost for theta with the step PDF is
+                      0.95164 times qb, approximated here as 487/512. */
+                  num = (N==2?512:487)*(opus_int32)(max_bits+ndof*offset);
+                  den = ((opus_int32)ndof<<9)-(N==2?512:487);
+                  qb = IMIN((num+(den>>1))/den, (N==2?64:61));
+                  celt_assert(qb >= 0);
+                  max_bits += qb;
+               }
+               /* Add the fine bits we'll use. */
+               /* Compensate for the extra DoF in stereo */
+               ndof = C*N + ((C==2 && N>2) ? 1 : 0);
+               /* Offset the number of fine bits by log2(N)/2 + FINE_OFFSET
+                   compared to their "fair share" of total/N */
+               offset = ((m->logN[j] + (i<<BITRES))>>1)-FINE_OFFSET;
+               /* N=2 is the only point that doesn't match the curve */
+               if (N==2)
+                  offset += 1<<BITRES>>2;
+               /* The number of fine bits we'll allocate if the remainder is
+                   to be max_bits. */
+               num = max_bits+ndof*offset;
+               den = (ndof-1)<<BITRES;
+               qb = IMIN((num+(den>>1))/den, MAX_FINE_BITS);
+               celt_assert(qb >= 0);
+               max_bits += C*qb<<BITRES;
+            }
+            max_bits = (4*max_bits/(C*((m->eBands[j+1]-m->eBands[j])<<i)))-64;
+            celt_assert(max_bits >= 0);
+            celt_assert(max_bits < 256);
+            *cap++ = (unsigned char)max_bits;
+         }
+      }
+   }
+}
+
+#endif /* CUSTOM_MODES */
+
+#define ALLOC_STEPS 6
+
+static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end, int skip_start,
+      const int *bits1, const int *bits2, const int *thresh, const int *cap, opus_int32 total, opus_int32 *_balance,
+      int skip_rsv, int *intensity, int intensity_rsv, int *dual_stereo, int dual_stereo_rsv, int *bits,
+      int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth)
+{
+   opus_int32 psum;
+   int lo, hi;
+   int i, j;
+   int logM;
+   int stereo;
+   int codedBands=-1;
+   int alloc_floor;
+   opus_int32 left, percoeff;
+   int done;
+   opus_int32 balance;
+   SAVE_STACK;
+
+   alloc_floor = C<<BITRES;
+   stereo = C>1;
+
+   logM = LM<<BITRES;
+   lo = 0;
+   hi = 1<<ALLOC_STEPS;
+   for (i=0;i<ALLOC_STEPS;i++)
+   {
+      int mid = (lo+hi)>>1;
+      psum = 0;
+      done = 0;
+      for (j=end;j-->start;)
+      {
+         int tmp = bits1[j] + (mid*(opus_int32)bits2[j]>>ALLOC_STEPS);
+         if (tmp >= thresh[j] || done)
+         {
+            done = 1;
+            /* Don't allocate more than we can actually use */
+            psum += IMIN(tmp, cap[j]);
+         } else {
+            if (tmp >= alloc_floor)
+               psum += alloc_floor;
+         }
+      }
+      if (psum > total)
+         hi = mid;
+      else
+         lo = mid;
+   }
+   psum = 0;
+   /*printf ("interp bisection gave %d\n", lo);*/
+   done = 0;
+   for (j=end;j-->start;)
+   {
+      int tmp = bits1[j] + (lo*bits2[j]>>ALLOC_STEPS);
+      if (tmp < thresh[j] && !done)
+      {
+         if (tmp >= alloc_floor)
+            tmp = alloc_floor;
+         else
+            tmp = 0;
+      } else
+         done = 1;
+      /* Don't allocate more than we can actually use */
+      tmp = IMIN(tmp, cap[j]);
+      bits[j] = tmp;
+      psum += tmp;
+   }
+
+   /* Decide which bands to skip, working backwards from the end. */
+   for (codedBands=end;;codedBands--)
+   {
+      int band_width;
+      int band_bits;
+      int rem;
+      j = codedBands-1;
+      /* Never skip the first band, nor a band that has been boosted by
+          dynalloc.
+         In the first case, we'd be coding a bit to signal we're going to waste
+          all the other bits.
+         In the second case, we'd be coding a bit to redistribute all the bits
+          we just signaled should be cocentrated in this band. */
+      if (j<=skip_start)
+      {
+         /* Give the bit we reserved to end skipping back. */
+         total += skip_rsv;
+         break;
+      }
+      /*Figure out how many left-over bits we would be adding to this band.
+        This can include bits we've stolen back from higher, skipped bands.*/
+      left = total-psum;
+      percoeff = left/(m->eBands[codedBands]-m->eBands[start]);
+      left -= (m->eBands[codedBands]-m->eBands[start])*percoeff;
+      rem = IMAX(left-(m->eBands[j]-m->eBands[start]),0);
+      band_width = m->eBands[codedBands]-m->eBands[j];
+      band_bits = (int)(bits[j] + percoeff*band_width + rem);
+      /*Only code a skip decision if we're above the threshold for this band.
+        Otherwise it is force-skipped.
+        This ensures that we have enough bits to code the skip flag.*/
+      if (band_bits >= IMAX(thresh[j], alloc_floor+(1<<BITRES)))
+      {
+         if (encode)
+         {
+            /*This if() block is the only part of the allocation function that
+               is not a mandatory part of the bitstream: any bands we choose to
+               skip here must be explicitly signaled.*/
+            /*Choose a threshold with some hysteresis to keep bands from
+               fluctuating in and out.*/
+#ifdef FUZZING
+            if ((rand()&0x1) == 0)
+#else
+            if (codedBands<=start+2 || (band_bits > ((j<prev?7:9)*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
+#endif
+            {
+               ec_enc_bit_logp(ec, 1, 1);
+               break;
+            }
+            ec_enc_bit_logp(ec, 0, 1);
+         } else if (ec_dec_bit_logp(ec, 1)) {
+            break;
+         }
+         /*We used a bit to skip this band.*/
+         psum += 1<<BITRES;
+         band_bits -= 1<<BITRES;
+      }
+      /*Reclaim the bits originally allocated to this band.*/
+      psum -= bits[j]+intensity_rsv;
+      if (intensity_rsv > 0)
+         intensity_rsv = LOG2_FRAC_TABLE[j-start];
+      psum += intensity_rsv;
+      if (band_bits >= alloc_floor)
+      {
+         /*If we have enough for a fine energy bit per channel, use it.*/
+         psum += alloc_floor;
+         bits[j] = alloc_floor;
+      } else {
+         /*Otherwise this band gets nothing at all.*/
+         bits[j] = 0;
+      }
+   }
+
+   celt_assert(codedBands > start);
+   /* Code the intensity and dual stereo parameters. */
+   if (intensity_rsv > 0)
+   {
+      if (encode)
+      {
+         *intensity = IMIN(*intensity, codedBands);
+         ec_enc_uint(ec, *intensity-start, codedBands+1-start);
+      }
+      else
+         *intensity = start+ec_dec_uint(ec, codedBands+1-start);
+   }
+   else
+      *intensity = 0;
+   if (*intensity <= start)
+   {
+      total += dual_stereo_rsv;
+      dual_stereo_rsv = 0;
+   }
+   if (dual_stereo_rsv > 0)
+   {
+      if (encode)
+         ec_enc_bit_logp(ec, *dual_stereo, 1);
+      else
+         *dual_stereo = ec_dec_bit_logp(ec, 1);
+   }
+   else
+      *dual_stereo = 0;
+
+   /* Allocate the remaining bits */
+   left = total-psum;
+   percoeff = left/(m->eBands[codedBands]-m->eBands[start]);
+   left -= (m->eBands[codedBands]-m->eBands[start])*percoeff;
+   for (j=start;j<codedBands;j++)
+      bits[j] += ((int)percoeff*(m->eBands[j+1]-m->eBands[j]));
+   for (j=start;j<codedBands;j++)
+   {
+      int tmp = (int)IMIN(left, m->eBands[j+1]-m->eBands[j]);
+      bits[j] += tmp;
+      left -= tmp;
+   }
+   /*for (j=0;j<end;j++)printf("%d ", bits[j]);printf("\n");*/
+
+   balance = 0;
+   for (j=start;j<codedBands;j++)
+   {
+      int N0, N, den;
+      int offset;
+      int NClogN;
+      opus_int32 excess, bit;
+
+      celt_assert(bits[j] >= 0);
+      N0 = m->eBands[j+1]-m->eBands[j];
+      N=N0<<LM;
+      bit = (opus_int32)bits[j]+balance;
+
+      if (N>1)
+      {
+         excess = MAX32(bit-cap[j],0);
+         bits[j] = bit-excess;
+
+         /* Compensate for the extra DoF in stereo */
+         den=(C*N+ ((C==2 && N>2 && !*dual_stereo && j<*intensity) ? 1 : 0));
+
+         NClogN = den*(m->logN[j] + logM);
+
+         /* Offset for the number of fine bits by log2(N)/2 + FINE_OFFSET
+            compared to their "fair share" of total/N */
+         offset = (NClogN>>1)-den*FINE_OFFSET;
+
+         /* N=2 is the only point that doesn't match the curve */
+         if (N==2)
+            offset += den<<BITRES>>2;
+
+         /* Changing the offset for allocating the second and third
+             fine energy bit */
+         if (bits[j] + offset < den*2<<BITRES)
+            offset += NClogN>>2;
+         else if (bits[j] + offset < den*3<<BITRES)
+            offset += NClogN>>3;
+
+         /* Divide with rounding */
+         ebits[j] = IMAX(0, (bits[j] + offset + (den<<(BITRES-1))) / (den<<BITRES));
+
+         /* Make sure not to bust */
+         if (C*ebits[j] > (bits[j]>>BITRES))
+            ebits[j] = bits[j] >> stereo >> BITRES;
+
+         /* More than that is useless because that's about as far as PVQ can go */
+         ebits[j] = IMIN(ebits[j], MAX_FINE_BITS);
+
+         /* If we rounded down or capped this band, make it a candidate for the
+             final fine energy pass */
+         fine_priority[j] = ebits[j]*(den<<BITRES) >= bits[j]+offset;
+
+         /* Remove the allocated fine bits; the rest are assigned to PVQ */
+         bits[j] -= C*ebits[j]<<BITRES;
+
+      } else {
+         /* For N=1, all bits go to fine energy except for a single sign bit */
+         excess = MAX32(0,bit-(C<<BITRES));
+         bits[j] = bit-excess;
+         ebits[j] = 0;
+         fine_priority[j] = 1;
+      }
+
+      /* Fine energy can't take advantage of the re-balancing in
+          quant_all_bands().
+         Instead, do the re-balancing here.*/
+      if(excess > 0)
+      {
+         int extra_fine;
+         int extra_bits;
+         extra_fine = IMIN(excess>>(stereo+BITRES),MAX_FINE_BITS-ebits[j]);
+         ebits[j] += extra_fine;
+         extra_bits = extra_fine*C<<BITRES;
+         fine_priority[j] = extra_bits >= excess-balance;
+         excess -= extra_bits;
+      }
+      balance = excess;
+
+      celt_assert(bits[j] >= 0);
+      celt_assert(ebits[j] >= 0);
+   }
+   /* Save any remaining bits over the cap for the rebalancing in
+       quant_all_bands(). */
+   *_balance = balance;
+
+   /* The skipped bands use all their bits for fine energy. */
+   for (;j<end;j++)
+   {
+      ebits[j] = bits[j] >> stereo >> BITRES;
+      celt_assert(C*ebits[j]<<BITRES == bits[j]);
+      bits[j] = 0;
+      fine_priority[j] = ebits[j]<1;
+   }
+   RESTORE_STACK;
+   return codedBands;
+}
+
+int compute_allocation(const CELTMode *m, int start, int end, const int *offsets, const int *cap, int alloc_trim, int *intensity, int *dual_stereo,
+      opus_int32 total, opus_int32 *balance, int *pulses, int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth)
+{
+   int lo, hi, len, j;
+   int codedBands;
+   int skip_start;
+   int skip_rsv;
+   int intensity_rsv;
+   int dual_stereo_rsv;
+   VARDECL(int, bits1);
+   VARDECL(int, bits2);
+   VARDECL(int, thresh);
+   VARDECL(int, trim_offset);
+   SAVE_STACK;
+
+   total = IMAX(total, 0);
+   len = m->nbEBands;
+   skip_start = start;
+   /* Reserve a bit to signal the end of manually skipped bands. */
+   skip_rsv = total >= 1<<BITRES ? 1<<BITRES : 0;
+   total -= skip_rsv;
+   /* Reserve bits for the intensity and dual stereo parameters. */
+   intensity_rsv = dual_stereo_rsv = 0;
+   if (C==2)
+   {
+      intensity_rsv = LOG2_FRAC_TABLE[end-start];
+      if (intensity_rsv>total)
+         intensity_rsv = 0;
+      else
+      {
+         total -= intensity_rsv;
+         dual_stereo_rsv = total>=1<<BITRES ? 1<<BITRES : 0;
+         total -= dual_stereo_rsv;
+      }
+   }
+   ALLOC(bits1, len, int);
+   ALLOC(bits2, len, int);
+   ALLOC(thresh, len, int);
+   ALLOC(trim_offset, len, int);
+
+   for (j=start;j<end;j++)
+   {
+      /* Below this threshold, we're sure not to allocate any PVQ bits */
+      thresh[j] = IMAX((C)<<BITRES, (3*(m->eBands[j+1]-m->eBands[j])<<LM<<BITRES)>>4);
+      /* Tilt of the allocation curve */
+      trim_offset[j] = C*(m->eBands[j+1]-m->eBands[j])*(alloc_trim-5-LM)*(end-j-1)
+            *(1<<(LM+BITRES))>>6;
+      /* Giving less resolution to single-coefficient bands because they get
+         more benefit from having one coarse value per coefficient*/
+      if ((m->eBands[j+1]-m->eBands[j])<<LM==1)
+         trim_offset[j] -= C<<BITRES;
+   }
+   lo = 1;
+   hi = m->nbAllocVectors - 1;
+   do
+   {
+      int done = 0;
+      int psum = 0;
+      int mid = (lo+hi) >> 1;
+      for (j=end;j-->start;)
+      {
+         int bitsj;
+         int N = m->eBands[j+1]-m->eBands[j];
+         bitsj = C*N*m->allocVectors[mid*len+j]<<LM>>2;
+         if (bitsj > 0)
+            bitsj = IMAX(0, bitsj + trim_offset[j]);
+         bitsj += offsets[j];
+         if (bitsj >= thresh[j] || done)
+         {
+            done = 1;
+            /* Don't allocate more than we can actually use */
+            psum += IMIN(bitsj, cap[j]);
+         } else {
+            if (bitsj >= C<<BITRES)
+               psum += C<<BITRES;
+         }
+      }
+      if (psum > total)
+         hi = mid - 1;
+      else
+         lo = mid + 1;
+      /*printf ("lo = %d, hi = %d\n", lo, hi);*/
+   }
+   while (lo <= hi);
+   hi = lo--;
+   /*printf ("interp between %d and %d\n", lo, hi);*/
+   for (j=start;j<end;j++)
+   {
+      int bits1j, bits2j;
+      int N = m->eBands[j+1]-m->eBands[j];
+      bits1j = C*N*m->allocVectors[lo*len+j]<<LM>>2;
+      bits2j = hi>=m->nbAllocVectors ?
+            cap[j] : C*N*m->allocVectors[hi*len+j]<<LM>>2;
+      if (bits1j > 0)
+         bits1j = IMAX(0, bits1j + trim_offset[j]);
+      if (bits2j > 0)
+         bits2j = IMAX(0, bits2j + trim_offset[j]);
+      if (lo > 0)
+         bits1j += offsets[j];
+      bits2j += offsets[j];
+      if (offsets[j]>0)
+         skip_start = j;
+      bits2j = IMAX(0,bits2j-bits1j);
+      bits1[j] = bits1j;
+      bits2[j] = bits2j;
+   }
+   codedBands = interp_bits2pulses(m, start, end, skip_start, bits1, bits2, thresh, cap,
+         total, balance, skip_rsv, intensity, intensity_rsv, dual_stereo, dual_stereo_rsv,
+         pulses, ebits, fine_priority, C, LM, ec, encode, prev, signalBandwidth);
+   RESTORE_STACK;
+   return codedBands;
+}
+

+ 101 - 0
drivers/opus/celt/rate.h

@@ -0,0 +1,101 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef RATE_H
+#define RATE_H
+
+#define MAX_PSEUDO 40
+#define LOG_MAX_PSEUDO 6
+
+#define MAX_PULSES 128
+
+#define MAX_FINE_BITS 8
+
+#define FINE_OFFSET 21
+#define QTHETA_OFFSET 4
+#define QTHETA_OFFSET_TWOPHASE 16
+
+#include "cwrs.h"
+#include "opus_modes.h"
+
+void compute_pulse_cache(CELTMode *m, int LM);
+
+static OPUS_INLINE int get_pulses(int i)
+{
+   return i<8 ? i : (8 + (i&7)) << ((i>>3)-1);
+}
+
+static OPUS_INLINE int bits2pulses(const CELTMode *m, int band, int LM, int bits)
+{
+   int i;
+   int lo, hi;
+   const unsigned char *cache;
+
+   LM++;
+   cache = m->cache.bits + m->cache.index[LM*m->nbEBands+band];
+
+   lo = 0;
+   hi = cache[0];
+   bits--;
+   for (i=0;i<LOG_MAX_PSEUDO;i++)
+   {
+      int mid = (lo+hi+1)>>1;
+      /* OPT: Make sure this is implemented with a conditional move */
+      if ((int)cache[mid] >= bits)
+         hi = mid;
+      else
+         lo = mid;
+   }
+   if (bits- (lo == 0 ? -1 : (int)cache[lo]) <= (int)cache[hi]-bits)
+      return lo;
+   else
+      return hi;
+}
+
+static OPUS_INLINE int pulses2bits(const CELTMode *m, int band, int LM, int pulses)
+{
+   const unsigned char *cache;
+
+   LM++;
+   cache = m->cache.bits + m->cache.index[LM*m->nbEBands+band];
+   return pulses == 0 ? 0 : cache[pulses]+1;
+}
+
+/** Compute the pulse allocation, i.e. how many pulses will go in each
+  * band.
+ @param m mode
+ @param offsets Requested increase or decrease in the number of bits for
+                each band
+ @param total Number of bands
+ @param pulses Number of pulses per band (returned)
+ @return Total number of bits allocated
+*/
+int compute_allocation(const CELTMode *m, int start, int end, const int *offsets, const int *cap, int alloc_trim, int *intensity, int *dual_stero,
+      opus_int32 total, opus_int32 *balance, int *pulses, int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth);
+
+#endif

+ 182 - 0
drivers/opus/celt/stack_alloc.h

@@ -0,0 +1,182 @@
+/* Copyright (C) 2002-2003 Jean-Marc Valin
+   Copyright (C) 2007-2009 Xiph.Org Foundation */
+/**
+   @file stack_alloc.h
+   @brief Temporary memory allocation on stack
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef STACK_ALLOC_H
+#define STACK_ALLOC_H
+
+#include "opus_types.h"
+#include "opus_defines.h"
+
+#if (!defined (VAR_ARRAYS) && !defined (USE_ALLOCA) && !defined (NONTHREADSAFE_PSEUDOSTACK))
+#define VAR_ARRAYS
+#endif
+
+#ifdef USE_ALLOCA
+# ifdef WIN32
+#  include <malloc.h>
+# else
+#  ifdef OPUS_HAVE_ALLOCA_H
+#   include <alloca.h>
+#  else
+#   ifdef __linux__
+#    include <alloca.h>
+#   else
+#    include <stdlib.h>
+#   endif
+#  endif
+# endif
+#endif
+
+/**
+ * @def ALIGN(stack, size)
+ *
+ * Aligns the stack to a 'size' boundary
+ *
+ * @param stack Stack
+ * @param size  New size boundary
+ */
+
+/**
+ * @def PUSH(stack, size, type)
+ *
+ * Allocates 'size' elements of type 'type' on the stack
+ *
+ * @param stack Stack
+ * @param size  Number of elements
+ * @param type  Type of element
+ */
+
+/**
+ * @def VARDECL(var)
+ *
+ * Declare variable on stack
+ *
+ * @param var Variable to declare
+ */
+
+/**
+ * @def ALLOC(var, size, type)
+ *
+ * Allocate 'size' elements of 'type' on stack
+ *
+ * @param var  Name of variable to allocate
+ * @param size Number of elements
+ * @param type Type of element
+ */
+
+#if defined(VAR_ARRAYS)
+
+#define VARDECL(type, var)
+#define ALLOC(var, size, type) type var[size]
+#define SAVE_STACK
+#define RESTORE_STACK
+#define ALLOC_STACK
+/* C99 does not allow VLAs of size zero */
+#define ALLOC_NONE 1
+
+#elif defined(USE_ALLOCA)
+
+#define VARDECL(type, var) type *var
+
+# ifdef WIN32
+#  define ALLOC(var, size, type) var = ((type*)_alloca(sizeof(type)*(size)))
+# else
+#  define ALLOC(var, size, type) var = ((type*)alloca(sizeof(type)*(size)))
+# endif
+
+#define SAVE_STACK
+#define RESTORE_STACK
+#define ALLOC_STACK
+#define ALLOC_NONE 0
+
+#else
+
+#ifdef CELT_C
+char *global_stack=0;
+#else
+extern char *global_stack;
+#endif /* CELT_C */
+
+#ifdef ENABLE_VALGRIND
+
+#include <valgrind/memcheck.h>
+
+#ifdef CELT_C
+char *global_stack_top=0;
+#else
+extern char *global_stack_top;
+#endif /* CELT_C */
+
+#define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
+#define PUSH(stack, size, type) (VALGRIND_MAKE_MEM_NOACCESS(stack, global_stack_top-stack),ALIGN((stack),sizeof(type)/sizeof(char)),VALGRIND_MAKE_MEM_UNDEFINED(stack, ((size)*sizeof(type)/sizeof(char))),(stack)+=(2*(size)*sizeof(type)/sizeof(char)),(type*)((stack)-(2*(size)*sizeof(type)/sizeof(char))))
+#define RESTORE_STACK ((global_stack = _saved_stack),VALGRIND_MAKE_MEM_NOACCESS(global_stack, global_stack_top-global_stack))
+#define ALLOC_STACK char *_saved_stack; ((global_stack = (global_stack==0) ? ((global_stack_top=opus_alloc_scratch(GLOBAL_STACK_SIZE*2)+(GLOBAL_STACK_SIZE*2))-(GLOBAL_STACK_SIZE*2)) : global_stack),VALGRIND_MAKE_MEM_NOACCESS(global_stack, global_stack_top-global_stack)); _saved_stack = global_stack;
+
+#else
+
+#define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
+#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/sizeof(char)),(stack)+=(size)*(sizeof(type)/sizeof(char)),(type*)((stack)-(size)*(sizeof(type)/sizeof(char))))
+#define RESTORE_STACK (global_stack = _saved_stack)
+#define ALLOC_STACK char *_saved_stack; (global_stack = (global_stack==0) ? opus_alloc_scratch(GLOBAL_STACK_SIZE) : global_stack); _saved_stack = global_stack;
+
+#endif /* ENABLE_VALGRIND */
+
+#include "os_support.h"
+#define VARDECL(type, var) type *var
+#define ALLOC(var, size, type) var = PUSH(global_stack, size, type)
+#define SAVE_STACK char *_saved_stack = global_stack;
+#define ALLOC_NONE 0
+
+#endif /* VAR_ARRAYS */
+
+
+#ifdef ENABLE_VALGRIND
+
+#include <valgrind/memcheck.h>
+#define OPUS_CHECK_ARRAY(ptr, len) VALGRIND_CHECK_MEM_IS_DEFINED(ptr, len*sizeof(*ptr))
+#define OPUS_CHECK_VALUE(value) VALGRIND_CHECK_VALUE_IS_DEFINED(value)
+#define OPUS_CHECK_ARRAY_COND(ptr, len) VALGRIND_CHECK_MEM_IS_DEFINED(ptr, len*sizeof(*ptr))
+#define OPUS_CHECK_VALUE_COND(value) VALGRIND_CHECK_VALUE_IS_DEFINED(value)
+#define OPUS_PRINT_INT(value) do {fprintf(stderr, #value " = %d at %s:%d\n", value, __FILE__, __LINE__);}while(0)
+#define OPUS_FPRINTF fprintf
+
+#else
+
+static OPUS_INLINE int _opus_false(void) {return 0;}
+#define OPUS_CHECK_ARRAY(ptr, len) _opus_false()
+#define OPUS_CHECK_VALUE(value) _opus_false()
+#define OPUS_PRINT_INT(value) do{}while(0)
+#define OPUS_FPRINTF (void)
+
+#endif
+
+
+#endif /* STACK_ALLOC_H */

+ 595 - 0
drivers/opus/celt/static_modes_fixed.h

@@ -0,0 +1,595 @@
+/* The contents of this file was automatically generated by dump_modes.c
+   with arguments: 48000 960
+   It contains static definitions for some pre-defined modes. */
+#include "opus_modes.h"
+#include "rate.h"
+
+#ifndef DEF_WINDOW120
+#define DEF_WINDOW120
+static const opus_val16 window120[120] = {
+2, 20, 55, 108, 178,
+266, 372, 494, 635, 792,
+966, 1157, 1365, 1590, 1831,
+2089, 2362, 2651, 2956, 3276,
+3611, 3961, 4325, 4703, 5094,
+5499, 5916, 6346, 6788, 7241,
+7705, 8179, 8663, 9156, 9657,
+10167, 10684, 11207, 11736, 12271,
+12810, 13353, 13899, 14447, 14997,
+15547, 16098, 16648, 17197, 17744,
+18287, 18827, 19363, 19893, 20418,
+20936, 21447, 21950, 22445, 22931,
+23407, 23874, 24330, 24774, 25208,
+25629, 26039, 26435, 26819, 27190,
+27548, 27893, 28224, 28541, 28845,
+29135, 29411, 29674, 29924, 30160,
+30384, 30594, 30792, 30977, 31151,
+31313, 31463, 31602, 31731, 31849,
+31958, 32057, 32148, 32229, 32303,
+32370, 32429, 32481, 32528, 32568,
+32604, 32634, 32661, 32683, 32701,
+32717, 32729, 32740, 32748, 32754,
+32758, 32762, 32764, 32766, 32767,
+32767, 32767, 32767, 32767, 32767,
+};
+#endif
+
+#ifndef DEF_LOGN400
+#define DEF_LOGN400
+static const opus_int16 logN400[21] = {
+0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 16, 16, 16, 21, 21, 24, 29, 34, 36, };
+#endif
+
+#ifndef DEF_PULSE_CACHE50
+#define DEF_PULSE_CACHE50
+static const opus_int16 cache_index50[105] = {
+-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 41, 41, 41,
+82, 82, 123, 164, 200, 222, 0, 0, 0, 0, 0, 0, 0, 0, 41,
+41, 41, 41, 123, 123, 123, 164, 164, 240, 266, 283, 295, 41, 41, 41,
+41, 41, 41, 41, 41, 123, 123, 123, 123, 240, 240, 240, 266, 266, 305,
+318, 328, 336, 123, 123, 123, 123, 123, 123, 123, 123, 240, 240, 240, 240,
+305, 305, 305, 318, 318, 343, 351, 358, 364, 240, 240, 240, 240, 240, 240,
+240, 240, 305, 305, 305, 305, 343, 343, 343, 351, 351, 370, 376, 382, 387,
+};
+static const unsigned char cache_bits50[392] = {
+40, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 40, 15, 23, 28,
+31, 34, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50,
+51, 52, 53, 54, 55, 55, 57, 58, 59, 60, 61, 62, 63, 63, 65,
+66, 67, 68, 69, 70, 71, 71, 40, 20, 33, 41, 48, 53, 57, 61,
+64, 66, 69, 71, 73, 75, 76, 78, 80, 82, 85, 87, 89, 91, 92,
+94, 96, 98, 101, 103, 105, 107, 108, 110, 112, 114, 117, 119, 121, 123,
+124, 126, 128, 40, 23, 39, 51, 60, 67, 73, 79, 83, 87, 91, 94,
+97, 100, 102, 105, 107, 111, 115, 118, 121, 124, 126, 129, 131, 135, 139,
+142, 145, 148, 150, 153, 155, 159, 163, 166, 169, 172, 174, 177, 179, 35,
+28, 49, 65, 78, 89, 99, 107, 114, 120, 126, 132, 136, 141, 145, 149,
+153, 159, 165, 171, 176, 180, 185, 189, 192, 199, 205, 211, 216, 220, 225,
+229, 232, 239, 245, 251, 21, 33, 58, 79, 97, 112, 125, 137, 148, 157,
+166, 174, 182, 189, 195, 201, 207, 217, 227, 235, 243, 251, 17, 35, 63,
+86, 106, 123, 139, 152, 165, 177, 187, 197, 206, 214, 222, 230, 237, 250,
+25, 31, 55, 75, 91, 105, 117, 128, 138, 146, 154, 161, 168, 174, 180,
+185, 190, 200, 208, 215, 222, 229, 235, 240, 245, 255, 16, 36, 65, 89,
+110, 128, 144, 159, 173, 185, 196, 207, 217, 226, 234, 242, 250, 11, 41,
+74, 103, 128, 151, 172, 191, 209, 225, 241, 255, 9, 43, 79, 110, 138,
+163, 186, 207, 227, 246, 12, 39, 71, 99, 123, 144, 164, 182, 198, 214,
+228, 241, 253, 9, 44, 81, 113, 142, 168, 192, 214, 235, 255, 7, 49,
+90, 127, 160, 191, 220, 247, 6, 51, 95, 134, 170, 203, 234, 7, 47,
+87, 123, 155, 184, 212, 237, 6, 52, 97, 137, 174, 208, 240, 5, 57,
+106, 151, 192, 231, 5, 59, 111, 158, 202, 243, 5, 55, 103, 147, 187,
+224, 5, 60, 113, 161, 206, 248, 4, 65, 122, 175, 224, 4, 67, 127,
+182, 234, };
+static const unsigned char cache_caps50[168] = {
+224, 224, 224, 224, 224, 224, 224, 224, 160, 160, 160, 160, 185, 185, 185,
+178, 178, 168, 134, 61, 37, 224, 224, 224, 224, 224, 224, 224, 224, 240,
+240, 240, 240, 207, 207, 207, 198, 198, 183, 144, 66, 40, 160, 160, 160,
+160, 160, 160, 160, 160, 185, 185, 185, 185, 193, 193, 193, 183, 183, 172,
+138, 64, 38, 240, 240, 240, 240, 240, 240, 240, 240, 207, 207, 207, 207,
+204, 204, 204, 193, 193, 180, 143, 66, 40, 185, 185, 185, 185, 185, 185,
+185, 185, 193, 193, 193, 193, 193, 193, 193, 183, 183, 172, 138, 65, 39,
+207, 207, 207, 207, 207, 207, 207, 207, 204, 204, 204, 204, 201, 201, 201,
+188, 188, 176, 141, 66, 40, 193, 193, 193, 193, 193, 193, 193, 193, 193,
+193, 193, 193, 194, 194, 194, 184, 184, 173, 139, 65, 39, 204, 204, 204,
+204, 204, 204, 204, 204, 201, 201, 201, 201, 198, 198, 198, 187, 187, 175,
+140, 66, 40, };
+#endif
+
+#ifndef FFT_TWIDDLES48000_960
+#define FFT_TWIDDLES48000_960
+static const kiss_twiddle_cpx fft_twiddles48000_960[480] = {
+{32767, 0}, {32766, -429},
+{32757, -858}, {32743, -1287},
+{32724, -1715}, {32698, -2143},
+{32667, -2570}, {32631, -2998},
+{32588, -3425}, {32541, -3851},
+{32488, -4277}, {32429, -4701},
+{32364, -5125}, {32295, -5548},
+{32219, -5971}, {32138, -6393},
+{32051, -6813}, {31960, -7231},
+{31863, -7650}, {31760, -8067},
+{31652, -8481}, {31539, -8895},
+{31419, -9306}, {31294, -9716},
+{31165, -10126}, {31030, -10532},
+{30889, -10937}, {30743, -11340},
+{30592, -11741}, {30436, -12141},
+{30274, -12540}, {30107, -12935},
+{29936, -13328}, {29758, -13718},
+{29577, -14107}, {29390, -14493},
+{29197, -14875}, {29000, -15257},
+{28797, -15635}, {28590, -16010},
+{28379, -16384}, {28162, -16753},
+{27940, -17119}, {27714, -17484},
+{27482, -17845}, {27246, -18205},
+{27006, -18560}, {26760, -18911},
+{26510, -19260}, {26257, -19606},
+{25997, -19947}, {25734, -20286},
+{25466, -20621}, {25194, -20952},
+{24918, -21281}, {24637, -21605},
+{24353, -21926}, {24063, -22242},
+{23770, -22555}, {23473, -22865},
+{23171, -23171}, {22866, -23472},
+{22557, -23769}, {22244, -24063},
+{21927, -24352}, {21606, -24636},
+{21282, -24917}, {20954, -25194},
+{20622, -25465}, {20288, -25733},
+{19949, -25997}, {19607, -26255},
+{19261, -26509}, {18914, -26760},
+{18561, -27004}, {18205, -27246},
+{17846, -27481}, {17485, -27713},
+{17122, -27940}, {16755, -28162},
+{16385, -28378}, {16012, -28590},
+{15636, -28797}, {15258, -28999},
+{14878, -29197}, {14494, -29389},
+{14108, -29576}, {13720, -29757},
+{13329, -29934}, {12937, -30107},
+{12540, -30274}, {12142, -30435},
+{11744, -30592}, {11342, -30743},
+{10939, -30889}, {10534, -31030},
+{10127, -31164}, {9718, -31294},
+{9307, -31418}, {8895, -31537},
+{8482, -31652}, {8067, -31759},
+{7650, -31862}, {7233, -31960},
+{6815, -32051}, {6393, -32138},
+{5973, -32219}, {5549, -32294},
+{5127, -32364}, {4703, -32429},
+{4278, -32487}, {3852, -32541},
+{3426, -32588}, {2999, -32630},
+{2572, -32667}, {2144, -32698},
+{1716, -32724}, {1287, -32742},
+{860, -32757}, {430, -32766},
+{0, -32767}, {-429, -32766},
+{-858, -32757}, {-1287, -32743},
+{-1715, -32724}, {-2143, -32698},
+{-2570, -32667}, {-2998, -32631},
+{-3425, -32588}, {-3851, -32541},
+{-4277, -32488}, {-4701, -32429},
+{-5125, -32364}, {-5548, -32295},
+{-5971, -32219}, {-6393, -32138},
+{-6813, -32051}, {-7231, -31960},
+{-7650, -31863}, {-8067, -31760},
+{-8481, -31652}, {-8895, -31539},
+{-9306, -31419}, {-9716, -31294},
+{-10126, -31165}, {-10532, -31030},
+{-10937, -30889}, {-11340, -30743},
+{-11741, -30592}, {-12141, -30436},
+{-12540, -30274}, {-12935, -30107},
+{-13328, -29936}, {-13718, -29758},
+{-14107, -29577}, {-14493, -29390},
+{-14875, -29197}, {-15257, -29000},
+{-15635, -28797}, {-16010, -28590},
+{-16384, -28379}, {-16753, -28162},
+{-17119, -27940}, {-17484, -27714},
+{-17845, -27482}, {-18205, -27246},
+{-18560, -27006}, {-18911, -26760},
+{-19260, -26510}, {-19606, -26257},
+{-19947, -25997}, {-20286, -25734},
+{-20621, -25466}, {-20952, -25194},
+{-21281, -24918}, {-21605, -24637},
+{-21926, -24353}, {-22242, -24063},
+{-22555, -23770}, {-22865, -23473},
+{-23171, -23171}, {-23472, -22866},
+{-23769, -22557}, {-24063, -22244},
+{-24352, -21927}, {-24636, -21606},
+{-24917, -21282}, {-25194, -20954},
+{-25465, -20622}, {-25733, -20288},
+{-25997, -19949}, {-26255, -19607},
+{-26509, -19261}, {-26760, -18914},
+{-27004, -18561}, {-27246, -18205},
+{-27481, -17846}, {-27713, -17485},
+{-27940, -17122}, {-28162, -16755},
+{-28378, -16385}, {-28590, -16012},
+{-28797, -15636}, {-28999, -15258},
+{-29197, -14878}, {-29389, -14494},
+{-29576, -14108}, {-29757, -13720},
+{-29934, -13329}, {-30107, -12937},
+{-30274, -12540}, {-30435, -12142},
+{-30592, -11744}, {-30743, -11342},
+{-30889, -10939}, {-31030, -10534},
+{-31164, -10127}, {-31294, -9718},
+{-31418, -9307}, {-31537, -8895},
+{-31652, -8482}, {-31759, -8067},
+{-31862, -7650}, {-31960, -7233},
+{-32051, -6815}, {-32138, -6393},
+{-32219, -5973}, {-32294, -5549},
+{-32364, -5127}, {-32429, -4703},
+{-32487, -4278}, {-32541, -3852},
+{-32588, -3426}, {-32630, -2999},
+{-32667, -2572}, {-32698, -2144},
+{-32724, -1716}, {-32742, -1287},
+{-32757, -860}, {-32766, -430},
+{-32767, 0}, {-32766, 429},
+{-32757, 858}, {-32743, 1287},
+{-32724, 1715}, {-32698, 2143},
+{-32667, 2570}, {-32631, 2998},
+{-32588, 3425}, {-32541, 3851},
+{-32488, 4277}, {-32429, 4701},
+{-32364, 5125}, {-32295, 5548},
+{-32219, 5971}, {-32138, 6393},
+{-32051, 6813}, {-31960, 7231},
+{-31863, 7650}, {-31760, 8067},
+{-31652, 8481}, {-31539, 8895},
+{-31419, 9306}, {-31294, 9716},
+{-31165, 10126}, {-31030, 10532},
+{-30889, 10937}, {-30743, 11340},
+{-30592, 11741}, {-30436, 12141},
+{-30274, 12540}, {-30107, 12935},
+{-29936, 13328}, {-29758, 13718},
+{-29577, 14107}, {-29390, 14493},
+{-29197, 14875}, {-29000, 15257},
+{-28797, 15635}, {-28590, 16010},
+{-28379, 16384}, {-28162, 16753},
+{-27940, 17119}, {-27714, 17484},
+{-27482, 17845}, {-27246, 18205},
+{-27006, 18560}, {-26760, 18911},
+{-26510, 19260}, {-26257, 19606},
+{-25997, 19947}, {-25734, 20286},
+{-25466, 20621}, {-25194, 20952},
+{-24918, 21281}, {-24637, 21605},
+{-24353, 21926}, {-24063, 22242},
+{-23770, 22555}, {-23473, 22865},
+{-23171, 23171}, {-22866, 23472},
+{-22557, 23769}, {-22244, 24063},
+{-21927, 24352}, {-21606, 24636},
+{-21282, 24917}, {-20954, 25194},
+{-20622, 25465}, {-20288, 25733},
+{-19949, 25997}, {-19607, 26255},
+{-19261, 26509}, {-18914, 26760},
+{-18561, 27004}, {-18205, 27246},
+{-17846, 27481}, {-17485, 27713},
+{-17122, 27940}, {-16755, 28162},
+{-16385, 28378}, {-16012, 28590},
+{-15636, 28797}, {-15258, 28999},
+{-14878, 29197}, {-14494, 29389},
+{-14108, 29576}, {-13720, 29757},
+{-13329, 29934}, {-12937, 30107},
+{-12540, 30274}, {-12142, 30435},
+{-11744, 30592}, {-11342, 30743},
+{-10939, 30889}, {-10534, 31030},
+{-10127, 31164}, {-9718, 31294},
+{-9307, 31418}, {-8895, 31537},
+{-8482, 31652}, {-8067, 31759},
+{-7650, 31862}, {-7233, 31960},
+{-6815, 32051}, {-6393, 32138},
+{-5973, 32219}, {-5549, 32294},
+{-5127, 32364}, {-4703, 32429},
+{-4278, 32487}, {-3852, 32541},
+{-3426, 32588}, {-2999, 32630},
+{-2572, 32667}, {-2144, 32698},
+{-1716, 32724}, {-1287, 32742},
+{-860, 32757}, {-430, 32766},
+{0, 32767}, {429, 32766},
+{858, 32757}, {1287, 32743},
+{1715, 32724}, {2143, 32698},
+{2570, 32667}, {2998, 32631},
+{3425, 32588}, {3851, 32541},
+{4277, 32488}, {4701, 32429},
+{5125, 32364}, {5548, 32295},
+{5971, 32219}, {6393, 32138},
+{6813, 32051}, {7231, 31960},
+{7650, 31863}, {8067, 31760},
+{8481, 31652}, {8895, 31539},
+{9306, 31419}, {9716, 31294},
+{10126, 31165}, {10532, 31030},
+{10937, 30889}, {11340, 30743},
+{11741, 30592}, {12141, 30436},
+{12540, 30274}, {12935, 30107},
+{13328, 29936}, {13718, 29758},
+{14107, 29577}, {14493, 29390},
+{14875, 29197}, {15257, 29000},
+{15635, 28797}, {16010, 28590},
+{16384, 28379}, {16753, 28162},
+{17119, 27940}, {17484, 27714},
+{17845, 27482}, {18205, 27246},
+{18560, 27006}, {18911, 26760},
+{19260, 26510}, {19606, 26257},
+{19947, 25997}, {20286, 25734},
+{20621, 25466}, {20952, 25194},
+{21281, 24918}, {21605, 24637},
+{21926, 24353}, {22242, 24063},
+{22555, 23770}, {22865, 23473},
+{23171, 23171}, {23472, 22866},
+{23769, 22557}, {24063, 22244},
+{24352, 21927}, {24636, 21606},
+{24917, 21282}, {25194, 20954},
+{25465, 20622}, {25733, 20288},
+{25997, 19949}, {26255, 19607},
+{26509, 19261}, {26760, 18914},
+{27004, 18561}, {27246, 18205},
+{27481, 17846}, {27713, 17485},
+{27940, 17122}, {28162, 16755},
+{28378, 16385}, {28590, 16012},
+{28797, 15636}, {28999, 15258},
+{29197, 14878}, {29389, 14494},
+{29576, 14108}, {29757, 13720},
+{29934, 13329}, {30107, 12937},
+{30274, 12540}, {30435, 12142},
+{30592, 11744}, {30743, 11342},
+{30889, 10939}, {31030, 10534},
+{31164, 10127}, {31294, 9718},
+{31418, 9307}, {31537, 8895},
+{31652, 8482}, {31759, 8067},
+{31862, 7650}, {31960, 7233},
+{32051, 6815}, {32138, 6393},
+{32219, 5973}, {32294, 5549},
+{32364, 5127}, {32429, 4703},
+{32487, 4278}, {32541, 3852},
+{32588, 3426}, {32630, 2999},
+{32667, 2572}, {32698, 2144},
+{32724, 1716}, {32742, 1287},
+{32757, 860}, {32766, 430},
+};
+#ifndef FFT_BITREV480
+#define FFT_BITREV480
+static const opus_int16 fft_bitrev480[480] = {
+0, 120, 240, 360, 30, 150, 270, 390, 60, 180, 300, 420, 90, 210, 330,
+450, 15, 135, 255, 375, 45, 165, 285, 405, 75, 195, 315, 435, 105, 225,
+345, 465, 5, 125, 245, 365, 35, 155, 275, 395, 65, 185, 305, 425, 95,
+215, 335, 455, 20, 140, 260, 380, 50, 170, 290, 410, 80, 200, 320, 440,
+110, 230, 350, 470, 10, 130, 250, 370, 40, 160, 280, 400, 70, 190, 310,
+430, 100, 220, 340, 460, 25, 145, 265, 385, 55, 175, 295, 415, 85, 205,
+325, 445, 115, 235, 355, 475, 1, 121, 241, 361, 31, 151, 271, 391, 61,
+181, 301, 421, 91, 211, 331, 451, 16, 136, 256, 376, 46, 166, 286, 406,
+76, 196, 316, 436, 106, 226, 346, 466, 6, 126, 246, 366, 36, 156, 276,
+396, 66, 186, 306, 426, 96, 216, 336, 456, 21, 141, 261, 381, 51, 171,
+291, 411, 81, 201, 321, 441, 111, 231, 351, 471, 11, 131, 251, 371, 41,
+161, 281, 401, 71, 191, 311, 431, 101, 221, 341, 461, 26, 146, 266, 386,
+56, 176, 296, 416, 86, 206, 326, 446, 116, 236, 356, 476, 2, 122, 242,
+362, 32, 152, 272, 392, 62, 182, 302, 422, 92, 212, 332, 452, 17, 137,
+257, 377, 47, 167, 287, 407, 77, 197, 317, 437, 107, 227, 347, 467, 7,
+127, 247, 367, 37, 157, 277, 397, 67, 187, 307, 427, 97, 217, 337, 457,
+22, 142, 262, 382, 52, 172, 292, 412, 82, 202, 322, 442, 112, 232, 352,
+472, 12, 132, 252, 372, 42, 162, 282, 402, 72, 192, 312, 432, 102, 222,
+342, 462, 27, 147, 267, 387, 57, 177, 297, 417, 87, 207, 327, 447, 117,
+237, 357, 477, 3, 123, 243, 363, 33, 153, 273, 393, 63, 183, 303, 423,
+93, 213, 333, 453, 18, 138, 258, 378, 48, 168, 288, 408, 78, 198, 318,
+438, 108, 228, 348, 468, 8, 128, 248, 368, 38, 158, 278, 398, 68, 188,
+308, 428, 98, 218, 338, 458, 23, 143, 263, 383, 53, 173, 293, 413, 83,
+203, 323, 443, 113, 233, 353, 473, 13, 133, 253, 373, 43, 163, 283, 403,
+73, 193, 313, 433, 103, 223, 343, 463, 28, 148, 268, 388, 58, 178, 298,
+418, 88, 208, 328, 448, 118, 238, 358, 478, 4, 124, 244, 364, 34, 154,
+274, 394, 64, 184, 304, 424, 94, 214, 334, 454, 19, 139, 259, 379, 49,
+169, 289, 409, 79, 199, 319, 439, 109, 229, 349, 469, 9, 129, 249, 369,
+39, 159, 279, 399, 69, 189, 309, 429, 99, 219, 339, 459, 24, 144, 264,
+384, 54, 174, 294, 414, 84, 204, 324, 444, 114, 234, 354, 474, 14, 134,
+254, 374, 44, 164, 284, 404, 74, 194, 314, 434, 104, 224, 344, 464, 29,
+149, 269, 389, 59, 179, 299, 419, 89, 209, 329, 449, 119, 239, 359, 479,
+};
+#endif
+
+#ifndef FFT_BITREV240
+#define FFT_BITREV240
+static const opus_int16 fft_bitrev240[240] = {
+0, 60, 120, 180, 15, 75, 135, 195, 30, 90, 150, 210, 45, 105, 165,
+225, 5, 65, 125, 185, 20, 80, 140, 200, 35, 95, 155, 215, 50, 110,
+170, 230, 10, 70, 130, 190, 25, 85, 145, 205, 40, 100, 160, 220, 55,
+115, 175, 235, 1, 61, 121, 181, 16, 76, 136, 196, 31, 91, 151, 211,
+46, 106, 166, 226, 6, 66, 126, 186, 21, 81, 141, 201, 36, 96, 156,
+216, 51, 111, 171, 231, 11, 71, 131, 191, 26, 86, 146, 206, 41, 101,
+161, 221, 56, 116, 176, 236, 2, 62, 122, 182, 17, 77, 137, 197, 32,
+92, 152, 212, 47, 107, 167, 227, 7, 67, 127, 187, 22, 82, 142, 202,
+37, 97, 157, 217, 52, 112, 172, 232, 12, 72, 132, 192, 27, 87, 147,
+207, 42, 102, 162, 222, 57, 117, 177, 237, 3, 63, 123, 183, 18, 78,
+138, 198, 33, 93, 153, 213, 48, 108, 168, 228, 8, 68, 128, 188, 23,
+83, 143, 203, 38, 98, 158, 218, 53, 113, 173, 233, 13, 73, 133, 193,
+28, 88, 148, 208, 43, 103, 163, 223, 58, 118, 178, 238, 4, 64, 124,
+184, 19, 79, 139, 199, 34, 94, 154, 214, 49, 109, 169, 229, 9, 69,
+129, 189, 24, 84, 144, 204, 39, 99, 159, 219, 54, 114, 174, 234, 14,
+74, 134, 194, 29, 89, 149, 209, 44, 104, 164, 224, 59, 119, 179, 239,
+};
+#endif
+
+#ifndef FFT_BITREV120
+#define FFT_BITREV120
+static const opus_int16 fft_bitrev120[120] = {
+0, 30, 60, 90, 15, 45, 75, 105, 5, 35, 65, 95, 20, 50, 80,
+110, 10, 40, 70, 100, 25, 55, 85, 115, 1, 31, 61, 91, 16, 46,
+76, 106, 6, 36, 66, 96, 21, 51, 81, 111, 11, 41, 71, 101, 26,
+56, 86, 116, 2, 32, 62, 92, 17, 47, 77, 107, 7, 37, 67, 97,
+22, 52, 82, 112, 12, 42, 72, 102, 27, 57, 87, 117, 3, 33, 63,
+93, 18, 48, 78, 108, 8, 38, 68, 98, 23, 53, 83, 113, 13, 43,
+73, 103, 28, 58, 88, 118, 4, 34, 64, 94, 19, 49, 79, 109, 9,
+39, 69, 99, 24, 54, 84, 114, 14, 44, 74, 104, 29, 59, 89, 119,
+};
+#endif
+
+#ifndef FFT_BITREV60
+#define FFT_BITREV60
+static const opus_int16 fft_bitrev60[60] = {
+0, 15, 30, 45, 5, 20, 35, 50, 10, 25, 40, 55, 1, 16, 31,
+46, 6, 21, 36, 51, 11, 26, 41, 56, 2, 17, 32, 47, 7, 22,
+37, 52, 12, 27, 42, 57, 3, 18, 33, 48, 8, 23, 38, 53, 13,
+28, 43, 58, 4, 19, 34, 49, 9, 24, 39, 54, 14, 29, 44, 59,
+};
+#endif
+
+#ifndef FFT_STATE48000_960_0
+#define FFT_STATE48000_960_0
+static const kiss_fft_state fft_state48000_960_0 = {
+480,    /* nfft */
+-1,     /* shift */
+{4, 120, 4, 30, 2, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, }, /* factors */
+fft_bitrev480,  /* bitrev */
+fft_twiddles48000_960,  /* bitrev */
+};
+#endif
+
+#ifndef FFT_STATE48000_960_1
+#define FFT_STATE48000_960_1
+static const kiss_fft_state fft_state48000_960_1 = {
+240,    /* nfft */
+1,      /* shift */
+{4, 60, 4, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, },   /* factors */
+fft_bitrev240,  /* bitrev */
+fft_twiddles48000_960,  /* bitrev */
+};
+#endif
+
+#ifndef FFT_STATE48000_960_2
+#define FFT_STATE48000_960_2
+static const kiss_fft_state fft_state48000_960_2 = {
+120,    /* nfft */
+2,      /* shift */
+{4, 30, 2, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, },   /* factors */
+fft_bitrev120,  /* bitrev */
+fft_twiddles48000_960,  /* bitrev */
+};
+#endif
+
+#ifndef FFT_STATE48000_960_3
+#define FFT_STATE48000_960_3
+static const kiss_fft_state fft_state48000_960_3 = {
+60,     /* nfft */
+3,      /* shift */
+{4, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
+fft_bitrev60,   /* bitrev */
+fft_twiddles48000_960,  /* bitrev */
+};
+#endif
+
+#endif
+
+#ifndef MDCT_TWIDDLES960
+#define MDCT_TWIDDLES960
+static const opus_val16 mdct_twiddles960[481] = {
+32767, 32767, 32767, 32767, 32766,
+32763, 32762, 32759, 32757, 32753,
+32751, 32747, 32743, 32738, 32733,
+32729, 32724, 32717, 32711, 32705,
+32698, 32690, 32683, 32676, 32667,
+32658, 32650, 32640, 32631, 32620,
+32610, 32599, 32588, 32577, 32566,
+32554, 32541, 32528, 32515, 32502,
+32487, 32474, 32459, 32444, 32429,
+32413, 32397, 32381, 32364, 32348,
+32331, 32313, 32294, 32277, 32257,
+32239, 32219, 32200, 32180, 32159,
+32138, 32118, 32096, 32074, 32051,
+32029, 32006, 31984, 31960, 31936,
+31912, 31888, 31863, 31837, 31812,
+31786, 31760, 31734, 31707, 31679,
+31652, 31624, 31596, 31567, 31539,
+31508, 31479, 31450, 31419, 31388,
+31357, 31326, 31294, 31262, 31230,
+31198, 31164, 31131, 31097, 31063,
+31030, 30994, 30959, 30924, 30889,
+30853, 30816, 30779, 30743, 30705,
+30668, 30629, 30592, 30553, 30515,
+30475, 30435, 30396, 30356, 30315,
+30274, 30233, 30191, 30149, 30107,
+30065, 30022, 29979, 29936, 29891,
+29847, 29803, 29758, 29713, 29668,
+29622, 29577, 29529, 29483, 29436,
+29390, 29341, 29293, 29246, 29197,
+29148, 29098, 29050, 29000, 28949,
+28899, 28848, 28797, 28746, 28694,
+28642, 28590, 28537, 28485, 28432,
+28378, 28324, 28271, 28217, 28162,
+28106, 28051, 27995, 27940, 27884,
+27827, 27770, 27713, 27657, 27598,
+27540, 27481, 27423, 27365, 27305,
+27246, 27187, 27126, 27066, 27006,
+26945, 26883, 26822, 26760, 26698,
+26636, 26574, 26510, 26448, 26383,
+26320, 26257, 26191, 26127, 26062,
+25997, 25931, 25866, 25800, 25734,
+25667, 25601, 25533, 25466, 25398,
+25330, 25262, 25194, 25125, 25056,
+24987, 24917, 24848, 24778, 24707,
+24636, 24566, 24495, 24424, 24352,
+24280, 24208, 24135, 24063, 23990,
+23917, 23842, 23769, 23695, 23622,
+23546, 23472, 23398, 23322, 23246,
+23171, 23095, 23018, 22942, 22866,
+22788, 22711, 22634, 22557, 22478,
+22400, 22322, 22244, 22165, 22085,
+22006, 21927, 21846, 21766, 21687,
+21606, 21524, 21443, 21363, 21282,
+21199, 21118, 21035, 20954, 20870,
+20788, 20705, 20621, 20538, 20455,
+20371, 20286, 20202, 20118, 20034,
+19947, 19863, 19777, 19692, 19606,
+19520, 19434, 19347, 19260, 19174,
+19088, 18999, 18911, 18825, 18737,
+18648, 18560, 18472, 18384, 18294,
+18205, 18116, 18025, 17936, 17846,
+17757, 17666, 17576, 17485, 17395,
+17303, 17212, 17122, 17030, 16937,
+16846, 16755, 16662, 16569, 16477,
+16385, 16291, 16198, 16105, 16012,
+15917, 15824, 15730, 15636, 15541,
+15447, 15352, 15257, 15162, 15067,
+14973, 14875, 14781, 14685, 14589,
+14493, 14396, 14300, 14204, 14107,
+14010, 13914, 13815, 13718, 13621,
+13524, 13425, 13328, 13230, 13133,
+13033, 12935, 12836, 12738, 12638,
+12540, 12441, 12341, 12241, 12142,
+12044, 11943, 11843, 11744, 11643,
+11542, 11442, 11342, 11241, 11139,
+11039, 10939, 10836, 10736, 10635,
+10534, 10431, 10330, 10228, 10127,
+10024, 9921, 9820, 9718, 9614,
+9512, 9410, 9306, 9204, 9101,
+8998, 8895, 8791, 8689, 8585,
+8481, 8377, 8274, 8171, 8067,
+7962, 7858, 7753, 7650, 7545,
+7441, 7336, 7231, 7129, 7023,
+6917, 6813, 6709, 6604, 6498,
+6393, 6288, 6182, 6077, 5973,
+5867, 5760, 5656, 5549, 5445,
+5339, 5232, 5127, 5022, 4914,
+4809, 4703, 4596, 4490, 4384,
+4278, 4171, 4065, 3958, 3852,
+3745, 3640, 3532, 3426, 3318,
+3212, 3106, 2998, 2891, 2786,
+2679, 2570, 2465, 2358, 2251,
+2143, 2037, 1929, 1823, 1715,
+1609, 1501, 1393, 1287, 1180,
+1073, 964, 858, 751, 644,
+535, 429, 322, 214, 107,
+0, };
+#endif
+
+static const CELTMode mode48000_960_120 = {
+48000,  /* Fs */
+120,    /* overlap */
+21,     /* nbEBands */
+21,     /* effEBands */
+{27853, 0, 4096, 8192, },       /* preemph */
+eband5ms,       /* eBands */
+3,      /* maxLM */
+8,      /* nbShortMdcts */
+120,    /* shortMdctSize */
+11,     /* nbAllocVectors */
+band_allocation,        /* allocVectors */
+logN400,        /* logN */
+window120,      /* window */
+{1920, 3, {&fft_state48000_960_0, &fft_state48000_960_1, &fft_state48000_960_2, &fft_state48000_960_3, }, mdct_twiddles960},    /* mdct */
+{392, cache_index50, cache_bits50, cache_caps50},       /* cache */
+};
+
+/* List of all the available modes */
+#define TOTAL_MODES 1
+static const CELTMode * const static_mode_list[TOTAL_MODES] = {
+&mode48000_960_120,
+};

+ 599 - 0
drivers/opus/celt/static_modes_float.h

@@ -0,0 +1,599 @@
+/* The contents of this file was automatically generated by dump_modes.c
+   with arguments: 48000 960
+   It contains static definitions for some pre-defined modes. */
+#include "opus_modes.h"
+#include "rate.h"
+
+#ifndef DEF_WINDOW120
+#define DEF_WINDOW120
+static const opus_val16 window120[120] = {
+6.7286966e-05f, 0.00060551348f, 0.0016815970f, 0.0032947962f, 0.0054439943f,
+0.0081276923f, 0.011344001f, 0.015090633f, 0.019364886f, 0.024163635f,
+0.029483315f, 0.035319905f, 0.041668911f, 0.048525347f, 0.055883718f,
+0.063737999f, 0.072081616f, 0.080907428f, 0.090207705f, 0.099974111f,
+0.11019769f, 0.12086883f, 0.13197729f, 0.14351214f, 0.15546177f,
+0.16781389f, 0.18055550f, 0.19367290f, 0.20715171f, 0.22097682f,
+0.23513243f, 0.24960208f, 0.26436860f, 0.27941419f, 0.29472040f,
+0.31026818f, 0.32603788f, 0.34200931f, 0.35816177f, 0.37447407f,
+0.39092462f, 0.40749142f, 0.42415215f, 0.44088423f, 0.45766484f,
+0.47447104f, 0.49127978f, 0.50806798f, 0.52481261f, 0.54149077f,
+0.55807973f, 0.57455701f, 0.59090049f, 0.60708841f, 0.62309951f,
+0.63891306f, 0.65450896f, 0.66986776f, 0.68497077f, 0.69980010f,
+0.71433873f, 0.72857055f, 0.74248043f, 0.75605424f, 0.76927895f,
+0.78214257f, 0.79463430f, 0.80674445f, 0.81846456f, 0.82978733f,
+0.84070669f, 0.85121779f, 0.86131698f, 0.87100183f, 0.88027111f,
+0.88912479f, 0.89756398f, 0.90559094f, 0.91320904f, 0.92042270f,
+0.92723738f, 0.93365955f, 0.93969656f, 0.94535671f, 0.95064907f,
+0.95558353f, 0.96017067f, 0.96442171f, 0.96834849f, 0.97196334f,
+0.97527906f, 0.97830883f, 0.98106616f, 0.98356480f, 0.98581869f,
+0.98784191f, 0.98964856f, 0.99125274f, 0.99266849f, 0.99390969f,
+0.99499004f, 0.99592297f, 0.99672162f, 0.99739874f, 0.99796667f,
+0.99843728f, 0.99882195f, 0.99913147f, 0.99937606f, 0.99956527f,
+0.99970802f, 0.99981248f, 0.99988613f, 0.99993565f, 0.99996697f,
+0.99998518f, 0.99999457f, 0.99999859f, 0.99999982f, 1.0000000f,
+};
+#endif
+
+#ifndef DEF_LOGN400
+#define DEF_LOGN400
+static const opus_int16 logN400[21] = {
+0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 16, 16, 16, 21, 21, 24, 29, 34, 36, };
+#endif
+
+#ifndef DEF_PULSE_CACHE50
+#define DEF_PULSE_CACHE50
+static const opus_int16 cache_index50[105] = {
+-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 41, 41, 41,
+82, 82, 123, 164, 200, 222, 0, 0, 0, 0, 0, 0, 0, 0, 41,
+41, 41, 41, 123, 123, 123, 164, 164, 240, 266, 283, 295, 41, 41, 41,
+41, 41, 41, 41, 41, 123, 123, 123, 123, 240, 240, 240, 266, 266, 305,
+318, 328, 336, 123, 123, 123, 123, 123, 123, 123, 123, 240, 240, 240, 240,
+305, 305, 305, 318, 318, 343, 351, 358, 364, 240, 240, 240, 240, 240, 240,
+240, 240, 305, 305, 305, 305, 343, 343, 343, 351, 351, 370, 376, 382, 387,
+};
+static const unsigned char cache_bits50[392] = {
+40, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 40, 15, 23, 28,
+31, 34, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50,
+51, 52, 53, 54, 55, 55, 57, 58, 59, 60, 61, 62, 63, 63, 65,
+66, 67, 68, 69, 70, 71, 71, 40, 20, 33, 41, 48, 53, 57, 61,
+64, 66, 69, 71, 73, 75, 76, 78, 80, 82, 85, 87, 89, 91, 92,
+94, 96, 98, 101, 103, 105, 107, 108, 110, 112, 114, 117, 119, 121, 123,
+124, 126, 128, 40, 23, 39, 51, 60, 67, 73, 79, 83, 87, 91, 94,
+97, 100, 102, 105, 107, 111, 115, 118, 121, 124, 126, 129, 131, 135, 139,
+142, 145, 148, 150, 153, 155, 159, 163, 166, 169, 172, 174, 177, 179, 35,
+28, 49, 65, 78, 89, 99, 107, 114, 120, 126, 132, 136, 141, 145, 149,
+153, 159, 165, 171, 176, 180, 185, 189, 192, 199, 205, 211, 216, 220, 225,
+229, 232, 239, 245, 251, 21, 33, 58, 79, 97, 112, 125, 137, 148, 157,
+166, 174, 182, 189, 195, 201, 207, 217, 227, 235, 243, 251, 17, 35, 63,
+86, 106, 123, 139, 152, 165, 177, 187, 197, 206, 214, 222, 230, 237, 250,
+25, 31, 55, 75, 91, 105, 117, 128, 138, 146, 154, 161, 168, 174, 180,
+185, 190, 200, 208, 215, 222, 229, 235, 240, 245, 255, 16, 36, 65, 89,
+110, 128, 144, 159, 173, 185, 196, 207, 217, 226, 234, 242, 250, 11, 41,
+74, 103, 128, 151, 172, 191, 209, 225, 241, 255, 9, 43, 79, 110, 138,
+163, 186, 207, 227, 246, 12, 39, 71, 99, 123, 144, 164, 182, 198, 214,
+228, 241, 253, 9, 44, 81, 113, 142, 168, 192, 214, 235, 255, 7, 49,
+90, 127, 160, 191, 220, 247, 6, 51, 95, 134, 170, 203, 234, 7, 47,
+87, 123, 155, 184, 212, 237, 6, 52, 97, 137, 174, 208, 240, 5, 57,
+106, 151, 192, 231, 5, 59, 111, 158, 202, 243, 5, 55, 103, 147, 187,
+224, 5, 60, 113, 161, 206, 248, 4, 65, 122, 175, 224, 4, 67, 127,
+182, 234, };
+static const unsigned char cache_caps50[168] = {
+224, 224, 224, 224, 224, 224, 224, 224, 160, 160, 160, 160, 185, 185, 185,
+178, 178, 168, 134, 61, 37, 224, 224, 224, 224, 224, 224, 224, 224, 240,
+240, 240, 240, 207, 207, 207, 198, 198, 183, 144, 66, 40, 160, 160, 160,
+160, 160, 160, 160, 160, 185, 185, 185, 185, 193, 193, 193, 183, 183, 172,
+138, 64, 38, 240, 240, 240, 240, 240, 240, 240, 240, 207, 207, 207, 207,
+204, 204, 204, 193, 193, 180, 143, 66, 40, 185, 185, 185, 185, 185, 185,
+185, 185, 193, 193, 193, 193, 193, 193, 193, 183, 183, 172, 138, 65, 39,
+207, 207, 207, 207, 207, 207, 207, 207, 204, 204, 204, 204, 201, 201, 201,
+188, 188, 176, 141, 66, 40, 193, 193, 193, 193, 193, 193, 193, 193, 193,
+193, 193, 193, 194, 194, 194, 184, 184, 173, 139, 65, 39, 204, 204, 204,
+204, 204, 204, 204, 204, 201, 201, 201, 201, 198, 198, 198, 187, 187, 175,
+140, 66, 40, };
+#endif
+
+#ifndef FFT_TWIDDLES48000_960
+#define FFT_TWIDDLES48000_960
+static const kiss_twiddle_cpx fft_twiddles48000_960[480] = {
+{1.0000000f, -0.0000000f}, {0.99991433f, -0.013089596f},
+{0.99965732f, -0.026176948f}, {0.99922904f, -0.039259816f},
+{0.99862953f, -0.052335956f}, {0.99785892f, -0.065403129f},
+{0.99691733f, -0.078459096f}, {0.99580493f, -0.091501619f},
+{0.99452190f, -0.10452846f}, {0.99306846f, -0.11753740f},
+{0.99144486f, -0.13052619f}, {0.98965139f, -0.14349262f},
+{0.98768834f, -0.15643447f}, {0.98555606f, -0.16934950f},
+{0.98325491f, -0.18223553f}, {0.98078528f, -0.19509032f},
+{0.97814760f, -0.20791169f}, {0.97534232f, -0.22069744f},
+{0.97236992f, -0.23344536f}, {0.96923091f, -0.24615329f},
+{0.96592583f, -0.25881905f}, {0.96245524f, -0.27144045f},
+{0.95881973f, -0.28401534f}, {0.95501994f, -0.29654157f},
+{0.95105652f, -0.30901699f}, {0.94693013f, -0.32143947f},
+{0.94264149f, -0.33380686f}, {0.93819134f, -0.34611706f},
+{0.93358043f, -0.35836795f}, {0.92880955f, -0.37055744f},
+{0.92387953f, -0.38268343f}, {0.91879121f, -0.39474386f},
+{0.91354546f, -0.40673664f}, {0.90814317f, -0.41865974f},
+{0.90258528f, -0.43051110f}, {0.89687274f, -0.44228869f},
+{0.89100652f, -0.45399050f}, {0.88498764f, -0.46561452f},
+{0.87881711f, -0.47715876f}, {0.87249601f, -0.48862124f},
+{0.86602540f, -0.50000000f}, {0.85940641f, -0.51129309f},
+{0.85264016f, -0.52249856f}, {0.84572782f, -0.53361452f},
+{0.83867057f, -0.54463904f}, {0.83146961f, -0.55557023f},
+{0.82412619f, -0.56640624f}, {0.81664156f, -0.57714519f},
+{0.80901699f, -0.58778525f}, {0.80125381f, -0.59832460f},
+{0.79335334f, -0.60876143f}, {0.78531693f, -0.61909395f},
+{0.77714596f, -0.62932039f}, {0.76884183f, -0.63943900f},
+{0.76040597f, -0.64944805f}, {0.75183981f, -0.65934582f},
+{0.74314483f, -0.66913061f}, {0.73432251f, -0.67880075f},
+{0.72537437f, -0.68835458f}, {0.71630194f, -0.69779046f},
+{0.70710678f, -0.70710678f}, {0.69779046f, -0.71630194f},
+{0.68835458f, -0.72537437f}, {0.67880075f, -0.73432251f},
+{0.66913061f, -0.74314483f}, {0.65934582f, -0.75183981f},
+{0.64944805f, -0.76040597f}, {0.63943900f, -0.76884183f},
+{0.62932039f, -0.77714596f}, {0.61909395f, -0.78531693f},
+{0.60876143f, -0.79335334f}, {0.59832460f, -0.80125381f},
+{0.58778525f, -0.80901699f}, {0.57714519f, -0.81664156f},
+{0.56640624f, -0.82412619f}, {0.55557023f, -0.83146961f},
+{0.54463904f, -0.83867057f}, {0.53361452f, -0.84572782f},
+{0.52249856f, -0.85264016f}, {0.51129309f, -0.85940641f},
+{0.50000000f, -0.86602540f}, {0.48862124f, -0.87249601f},
+{0.47715876f, -0.87881711f}, {0.46561452f, -0.88498764f},
+{0.45399050f, -0.89100652f}, {0.44228869f, -0.89687274f},
+{0.43051110f, -0.90258528f}, {0.41865974f, -0.90814317f},
+{0.40673664f, -0.91354546f}, {0.39474386f, -0.91879121f},
+{0.38268343f, -0.92387953f}, {0.37055744f, -0.92880955f},
+{0.35836795f, -0.93358043f}, {0.34611706f, -0.93819134f},
+{0.33380686f, -0.94264149f}, {0.32143947f, -0.94693013f},
+{0.30901699f, -0.95105652f}, {0.29654157f, -0.95501994f},
+{0.28401534f, -0.95881973f}, {0.27144045f, -0.96245524f},
+{0.25881905f, -0.96592583f}, {0.24615329f, -0.96923091f},
+{0.23344536f, -0.97236992f}, {0.22069744f, -0.97534232f},
+{0.20791169f, -0.97814760f}, {0.19509032f, -0.98078528f},
+{0.18223553f, -0.98325491f}, {0.16934950f, -0.98555606f},
+{0.15643447f, -0.98768834f}, {0.14349262f, -0.98965139f},
+{0.13052619f, -0.99144486f}, {0.11753740f, -0.99306846f},
+{0.10452846f, -0.99452190f}, {0.091501619f, -0.99580493f},
+{0.078459096f, -0.99691733f}, {0.065403129f, -0.99785892f},
+{0.052335956f, -0.99862953f}, {0.039259816f, -0.99922904f},
+{0.026176948f, -0.99965732f}, {0.013089596f, -0.99991433f},
+{6.1230318e-17f, -1.0000000f}, {-0.013089596f, -0.99991433f},
+{-0.026176948f, -0.99965732f}, {-0.039259816f, -0.99922904f},
+{-0.052335956f, -0.99862953f}, {-0.065403129f, -0.99785892f},
+{-0.078459096f, -0.99691733f}, {-0.091501619f, -0.99580493f},
+{-0.10452846f, -0.99452190f}, {-0.11753740f, -0.99306846f},
+{-0.13052619f, -0.99144486f}, {-0.14349262f, -0.98965139f},
+{-0.15643447f, -0.98768834f}, {-0.16934950f, -0.98555606f},
+{-0.18223553f, -0.98325491f}, {-0.19509032f, -0.98078528f},
+{-0.20791169f, -0.97814760f}, {-0.22069744f, -0.97534232f},
+{-0.23344536f, -0.97236992f}, {-0.24615329f, -0.96923091f},
+{-0.25881905f, -0.96592583f}, {-0.27144045f, -0.96245524f},
+{-0.28401534f, -0.95881973f}, {-0.29654157f, -0.95501994f},
+{-0.30901699f, -0.95105652f}, {-0.32143947f, -0.94693013f},
+{-0.33380686f, -0.94264149f}, {-0.34611706f, -0.93819134f},
+{-0.35836795f, -0.93358043f}, {-0.37055744f, -0.92880955f},
+{-0.38268343f, -0.92387953f}, {-0.39474386f, -0.91879121f},
+{-0.40673664f, -0.91354546f}, {-0.41865974f, -0.90814317f},
+{-0.43051110f, -0.90258528f}, {-0.44228869f, -0.89687274f},
+{-0.45399050f, -0.89100652f}, {-0.46561452f, -0.88498764f},
+{-0.47715876f, -0.87881711f}, {-0.48862124f, -0.87249601f},
+{-0.50000000f, -0.86602540f}, {-0.51129309f, -0.85940641f},
+{-0.52249856f, -0.85264016f}, {-0.53361452f, -0.84572782f},
+{-0.54463904f, -0.83867057f}, {-0.55557023f, -0.83146961f},
+{-0.56640624f, -0.82412619f}, {-0.57714519f, -0.81664156f},
+{-0.58778525f, -0.80901699f}, {-0.59832460f, -0.80125381f},
+{-0.60876143f, -0.79335334f}, {-0.61909395f, -0.78531693f},
+{-0.62932039f, -0.77714596f}, {-0.63943900f, -0.76884183f},
+{-0.64944805f, -0.76040597f}, {-0.65934582f, -0.75183981f},
+{-0.66913061f, -0.74314483f}, {-0.67880075f, -0.73432251f},
+{-0.68835458f, -0.72537437f}, {-0.69779046f, -0.71630194f},
+{-0.70710678f, -0.70710678f}, {-0.71630194f, -0.69779046f},
+{-0.72537437f, -0.68835458f}, {-0.73432251f, -0.67880075f},
+{-0.74314483f, -0.66913061f}, {-0.75183981f, -0.65934582f},
+{-0.76040597f, -0.64944805f}, {-0.76884183f, -0.63943900f},
+{-0.77714596f, -0.62932039f}, {-0.78531693f, -0.61909395f},
+{-0.79335334f, -0.60876143f}, {-0.80125381f, -0.59832460f},
+{-0.80901699f, -0.58778525f}, {-0.81664156f, -0.57714519f},
+{-0.82412619f, -0.56640624f}, {-0.83146961f, -0.55557023f},
+{-0.83867057f, -0.54463904f}, {-0.84572782f, -0.53361452f},
+{-0.85264016f, -0.52249856f}, {-0.85940641f, -0.51129309f},
+{-0.86602540f, -0.50000000f}, {-0.87249601f, -0.48862124f},
+{-0.87881711f, -0.47715876f}, {-0.88498764f, -0.46561452f},
+{-0.89100652f, -0.45399050f}, {-0.89687274f, -0.44228869f},
+{-0.90258528f, -0.43051110f}, {-0.90814317f, -0.41865974f},
+{-0.91354546f, -0.40673664f}, {-0.91879121f, -0.39474386f},
+{-0.92387953f, -0.38268343f}, {-0.92880955f, -0.37055744f},
+{-0.93358043f, -0.35836795f}, {-0.93819134f, -0.34611706f},
+{-0.94264149f, -0.33380686f}, {-0.94693013f, -0.32143947f},
+{-0.95105652f, -0.30901699f}, {-0.95501994f, -0.29654157f},
+{-0.95881973f, -0.28401534f}, {-0.96245524f, -0.27144045f},
+{-0.96592583f, -0.25881905f}, {-0.96923091f, -0.24615329f},
+{-0.97236992f, -0.23344536f}, {-0.97534232f, -0.22069744f},
+{-0.97814760f, -0.20791169f}, {-0.98078528f, -0.19509032f},
+{-0.98325491f, -0.18223553f}, {-0.98555606f, -0.16934950f},
+{-0.98768834f, -0.15643447f}, {-0.98965139f, -0.14349262f},
+{-0.99144486f, -0.13052619f}, {-0.99306846f, -0.11753740f},
+{-0.99452190f, -0.10452846f}, {-0.99580493f, -0.091501619f},
+{-0.99691733f, -0.078459096f}, {-0.99785892f, -0.065403129f},
+{-0.99862953f, -0.052335956f}, {-0.99922904f, -0.039259816f},
+{-0.99965732f, -0.026176948f}, {-0.99991433f, -0.013089596f},
+{-1.0000000f, -1.2246064e-16f}, {-0.99991433f, 0.013089596f},
+{-0.99965732f, 0.026176948f}, {-0.99922904f, 0.039259816f},
+{-0.99862953f, 0.052335956f}, {-0.99785892f, 0.065403129f},
+{-0.99691733f, 0.078459096f}, {-0.99580493f, 0.091501619f},
+{-0.99452190f, 0.10452846f}, {-0.99306846f, 0.11753740f},
+{-0.99144486f, 0.13052619f}, {-0.98965139f, 0.14349262f},
+{-0.98768834f, 0.15643447f}, {-0.98555606f, 0.16934950f},
+{-0.98325491f, 0.18223553f}, {-0.98078528f, 0.19509032f},
+{-0.97814760f, 0.20791169f}, {-0.97534232f, 0.22069744f},
+{-0.97236992f, 0.23344536f}, {-0.96923091f, 0.24615329f},
+{-0.96592583f, 0.25881905f}, {-0.96245524f, 0.27144045f},
+{-0.95881973f, 0.28401534f}, {-0.95501994f, 0.29654157f},
+{-0.95105652f, 0.30901699f}, {-0.94693013f, 0.32143947f},
+{-0.94264149f, 0.33380686f}, {-0.93819134f, 0.34611706f},
+{-0.93358043f, 0.35836795f}, {-0.92880955f, 0.37055744f},
+{-0.92387953f, 0.38268343f}, {-0.91879121f, 0.39474386f},
+{-0.91354546f, 0.40673664f}, {-0.90814317f, 0.41865974f},
+{-0.90258528f, 0.43051110f}, {-0.89687274f, 0.44228869f},
+{-0.89100652f, 0.45399050f}, {-0.88498764f, 0.46561452f},
+{-0.87881711f, 0.47715876f}, {-0.87249601f, 0.48862124f},
+{-0.86602540f, 0.50000000f}, {-0.85940641f, 0.51129309f},
+{-0.85264016f, 0.52249856f}, {-0.84572782f, 0.53361452f},
+{-0.83867057f, 0.54463904f}, {-0.83146961f, 0.55557023f},
+{-0.82412619f, 0.56640624f}, {-0.81664156f, 0.57714519f},
+{-0.80901699f, 0.58778525f}, {-0.80125381f, 0.59832460f},
+{-0.79335334f, 0.60876143f}, {-0.78531693f, 0.61909395f},
+{-0.77714596f, 0.62932039f}, {-0.76884183f, 0.63943900f},
+{-0.76040597f, 0.64944805f}, {-0.75183981f, 0.65934582f},
+{-0.74314483f, 0.66913061f}, {-0.73432251f, 0.67880075f},
+{-0.72537437f, 0.68835458f}, {-0.71630194f, 0.69779046f},
+{-0.70710678f, 0.70710678f}, {-0.69779046f, 0.71630194f},
+{-0.68835458f, 0.72537437f}, {-0.67880075f, 0.73432251f},
+{-0.66913061f, 0.74314483f}, {-0.65934582f, 0.75183981f},
+{-0.64944805f, 0.76040597f}, {-0.63943900f, 0.76884183f},
+{-0.62932039f, 0.77714596f}, {-0.61909395f, 0.78531693f},
+{-0.60876143f, 0.79335334f}, {-0.59832460f, 0.80125381f},
+{-0.58778525f, 0.80901699f}, {-0.57714519f, 0.81664156f},
+{-0.56640624f, 0.82412619f}, {-0.55557023f, 0.83146961f},
+{-0.54463904f, 0.83867057f}, {-0.53361452f, 0.84572782f},
+{-0.52249856f, 0.85264016f}, {-0.51129309f, 0.85940641f},
+{-0.50000000f, 0.86602540f}, {-0.48862124f, 0.87249601f},
+{-0.47715876f, 0.87881711f}, {-0.46561452f, 0.88498764f},
+{-0.45399050f, 0.89100652f}, {-0.44228869f, 0.89687274f},
+{-0.43051110f, 0.90258528f}, {-0.41865974f, 0.90814317f},
+{-0.40673664f, 0.91354546f}, {-0.39474386f, 0.91879121f},
+{-0.38268343f, 0.92387953f}, {-0.37055744f, 0.92880955f},
+{-0.35836795f, 0.93358043f}, {-0.34611706f, 0.93819134f},
+{-0.33380686f, 0.94264149f}, {-0.32143947f, 0.94693013f},
+{-0.30901699f, 0.95105652f}, {-0.29654157f, 0.95501994f},
+{-0.28401534f, 0.95881973f}, {-0.27144045f, 0.96245524f},
+{-0.25881905f, 0.96592583f}, {-0.24615329f, 0.96923091f},
+{-0.23344536f, 0.97236992f}, {-0.22069744f, 0.97534232f},
+{-0.20791169f, 0.97814760f}, {-0.19509032f, 0.98078528f},
+{-0.18223553f, 0.98325491f}, {-0.16934950f, 0.98555606f},
+{-0.15643447f, 0.98768834f}, {-0.14349262f, 0.98965139f},
+{-0.13052619f, 0.99144486f}, {-0.11753740f, 0.99306846f},
+{-0.10452846f, 0.99452190f}, {-0.091501619f, 0.99580493f},
+{-0.078459096f, 0.99691733f}, {-0.065403129f, 0.99785892f},
+{-0.052335956f, 0.99862953f}, {-0.039259816f, 0.99922904f},
+{-0.026176948f, 0.99965732f}, {-0.013089596f, 0.99991433f},
+{-1.8369095e-16f, 1.0000000f}, {0.013089596f, 0.99991433f},
+{0.026176948f, 0.99965732f}, {0.039259816f, 0.99922904f},
+{0.052335956f, 0.99862953f}, {0.065403129f, 0.99785892f},
+{0.078459096f, 0.99691733f}, {0.091501619f, 0.99580493f},
+{0.10452846f, 0.99452190f}, {0.11753740f, 0.99306846f},
+{0.13052619f, 0.99144486f}, {0.14349262f, 0.98965139f},
+{0.15643447f, 0.98768834f}, {0.16934950f, 0.98555606f},
+{0.18223553f, 0.98325491f}, {0.19509032f, 0.98078528f},
+{0.20791169f, 0.97814760f}, {0.22069744f, 0.97534232f},
+{0.23344536f, 0.97236992f}, {0.24615329f, 0.96923091f},
+{0.25881905f, 0.96592583f}, {0.27144045f, 0.96245524f},
+{0.28401534f, 0.95881973f}, {0.29654157f, 0.95501994f},
+{0.30901699f, 0.95105652f}, {0.32143947f, 0.94693013f},
+{0.33380686f, 0.94264149f}, {0.34611706f, 0.93819134f},
+{0.35836795f, 0.93358043f}, {0.37055744f, 0.92880955f},
+{0.38268343f, 0.92387953f}, {0.39474386f, 0.91879121f},
+{0.40673664f, 0.91354546f}, {0.41865974f, 0.90814317f},
+{0.43051110f, 0.90258528f}, {0.44228869f, 0.89687274f},
+{0.45399050f, 0.89100652f}, {0.46561452f, 0.88498764f},
+{0.47715876f, 0.87881711f}, {0.48862124f, 0.87249601f},
+{0.50000000f, 0.86602540f}, {0.51129309f, 0.85940641f},
+{0.52249856f, 0.85264016f}, {0.53361452f, 0.84572782f},
+{0.54463904f, 0.83867057f}, {0.55557023f, 0.83146961f},
+{0.56640624f, 0.82412619f}, {0.57714519f, 0.81664156f},
+{0.58778525f, 0.80901699f}, {0.59832460f, 0.80125381f},
+{0.60876143f, 0.79335334f}, {0.61909395f, 0.78531693f},
+{0.62932039f, 0.77714596f}, {0.63943900f, 0.76884183f},
+{0.64944805f, 0.76040597f}, {0.65934582f, 0.75183981f},
+{0.66913061f, 0.74314483f}, {0.67880075f, 0.73432251f},
+{0.68835458f, 0.72537437f}, {0.69779046f, 0.71630194f},
+{0.70710678f, 0.70710678f}, {0.71630194f, 0.69779046f},
+{0.72537437f, 0.68835458f}, {0.73432251f, 0.67880075f},
+{0.74314483f, 0.66913061f}, {0.75183981f, 0.65934582f},
+{0.76040597f, 0.64944805f}, {0.76884183f, 0.63943900f},
+{0.77714596f, 0.62932039f}, {0.78531693f, 0.61909395f},
+{0.79335334f, 0.60876143f}, {0.80125381f, 0.59832460f},
+{0.80901699f, 0.58778525f}, {0.81664156f, 0.57714519f},
+{0.82412619f, 0.56640624f}, {0.83146961f, 0.55557023f},
+{0.83867057f, 0.54463904f}, {0.84572782f, 0.53361452f},
+{0.85264016f, 0.52249856f}, {0.85940641f, 0.51129309f},
+{0.86602540f, 0.50000000f}, {0.87249601f, 0.48862124f},
+{0.87881711f, 0.47715876f}, {0.88498764f, 0.46561452f},
+{0.89100652f, 0.45399050f}, {0.89687274f, 0.44228869f},
+{0.90258528f, 0.43051110f}, {0.90814317f, 0.41865974f},
+{0.91354546f, 0.40673664f}, {0.91879121f, 0.39474386f},
+{0.92387953f, 0.38268343f}, {0.92880955f, 0.37055744f},
+{0.93358043f, 0.35836795f}, {0.93819134f, 0.34611706f},
+{0.94264149f, 0.33380686f}, {0.94693013f, 0.32143947f},
+{0.95105652f, 0.30901699f}, {0.95501994f, 0.29654157f},
+{0.95881973f, 0.28401534f}, {0.96245524f, 0.27144045f},
+{0.96592583f, 0.25881905f}, {0.96923091f, 0.24615329f},
+{0.97236992f, 0.23344536f}, {0.97534232f, 0.22069744f},
+{0.97814760f, 0.20791169f}, {0.98078528f, 0.19509032f},
+{0.98325491f, 0.18223553f}, {0.98555606f, 0.16934950f},
+{0.98768834f, 0.15643447f}, {0.98965139f, 0.14349262f},
+{0.99144486f, 0.13052619f}, {0.99306846f, 0.11753740f},
+{0.99452190f, 0.10452846f}, {0.99580493f, 0.091501619f},
+{0.99691733f, 0.078459096f}, {0.99785892f, 0.065403129f},
+{0.99862953f, 0.052335956f}, {0.99922904f, 0.039259816f},
+{0.99965732f, 0.026176948f}, {0.99991433f, 0.013089596f},
+};
+#ifndef FFT_BITREV480
+#define FFT_BITREV480
+static const opus_int16 fft_bitrev480[480] = {
+0, 120, 240, 360, 30, 150, 270, 390, 60, 180, 300, 420, 90, 210, 330,
+450, 15, 135, 255, 375, 45, 165, 285, 405, 75, 195, 315, 435, 105, 225,
+345, 465, 5, 125, 245, 365, 35, 155, 275, 395, 65, 185, 305, 425, 95,
+215, 335, 455, 20, 140, 260, 380, 50, 170, 290, 410, 80, 200, 320, 440,
+110, 230, 350, 470, 10, 130, 250, 370, 40, 160, 280, 400, 70, 190, 310,
+430, 100, 220, 340, 460, 25, 145, 265, 385, 55, 175, 295, 415, 85, 205,
+325, 445, 115, 235, 355, 475, 1, 121, 241, 361, 31, 151, 271, 391, 61,
+181, 301, 421, 91, 211, 331, 451, 16, 136, 256, 376, 46, 166, 286, 406,
+76, 196, 316, 436, 106, 226, 346, 466, 6, 126, 246, 366, 36, 156, 276,
+396, 66, 186, 306, 426, 96, 216, 336, 456, 21, 141, 261, 381, 51, 171,
+291, 411, 81, 201, 321, 441, 111, 231, 351, 471, 11, 131, 251, 371, 41,
+161, 281, 401, 71, 191, 311, 431, 101, 221, 341, 461, 26, 146, 266, 386,
+56, 176, 296, 416, 86, 206, 326, 446, 116, 236, 356, 476, 2, 122, 242,
+362, 32, 152, 272, 392, 62, 182, 302, 422, 92, 212, 332, 452, 17, 137,
+257, 377, 47, 167, 287, 407, 77, 197, 317, 437, 107, 227, 347, 467, 7,
+127, 247, 367, 37, 157, 277, 397, 67, 187, 307, 427, 97, 217, 337, 457,
+22, 142, 262, 382, 52, 172, 292, 412, 82, 202, 322, 442, 112, 232, 352,
+472, 12, 132, 252, 372, 42, 162, 282, 402, 72, 192, 312, 432, 102, 222,
+342, 462, 27, 147, 267, 387, 57, 177, 297, 417, 87, 207, 327, 447, 117,
+237, 357, 477, 3, 123, 243, 363, 33, 153, 273, 393, 63, 183, 303, 423,
+93, 213, 333, 453, 18, 138, 258, 378, 48, 168, 288, 408, 78, 198, 318,
+438, 108, 228, 348, 468, 8, 128, 248, 368, 38, 158, 278, 398, 68, 188,
+308, 428, 98, 218, 338, 458, 23, 143, 263, 383, 53, 173, 293, 413, 83,
+203, 323, 443, 113, 233, 353, 473, 13, 133, 253, 373, 43, 163, 283, 403,
+73, 193, 313, 433, 103, 223, 343, 463, 28, 148, 268, 388, 58, 178, 298,
+418, 88, 208, 328, 448, 118, 238, 358, 478, 4, 124, 244, 364, 34, 154,
+274, 394, 64, 184, 304, 424, 94, 214, 334, 454, 19, 139, 259, 379, 49,
+169, 289, 409, 79, 199, 319, 439, 109, 229, 349, 469, 9, 129, 249, 369,
+39, 159, 279, 399, 69, 189, 309, 429, 99, 219, 339, 459, 24, 144, 264,
+384, 54, 174, 294, 414, 84, 204, 324, 444, 114, 234, 354, 474, 14, 134,
+254, 374, 44, 164, 284, 404, 74, 194, 314, 434, 104, 224, 344, 464, 29,
+149, 269, 389, 59, 179, 299, 419, 89, 209, 329, 449, 119, 239, 359, 479,
+};
+#endif
+
+#ifndef FFT_BITREV240
+#define FFT_BITREV240
+static const opus_int16 fft_bitrev240[240] = {
+0, 60, 120, 180, 15, 75, 135, 195, 30, 90, 150, 210, 45, 105, 165,
+225, 5, 65, 125, 185, 20, 80, 140, 200, 35, 95, 155, 215, 50, 110,
+170, 230, 10, 70, 130, 190, 25, 85, 145, 205, 40, 100, 160, 220, 55,
+115, 175, 235, 1, 61, 121, 181, 16, 76, 136, 196, 31, 91, 151, 211,
+46, 106, 166, 226, 6, 66, 126, 186, 21, 81, 141, 201, 36, 96, 156,
+216, 51, 111, 171, 231, 11, 71, 131, 191, 26, 86, 146, 206, 41, 101,
+161, 221, 56, 116, 176, 236, 2, 62, 122, 182, 17, 77, 137, 197, 32,
+92, 152, 212, 47, 107, 167, 227, 7, 67, 127, 187, 22, 82, 142, 202,
+37, 97, 157, 217, 52, 112, 172, 232, 12, 72, 132, 192, 27, 87, 147,
+207, 42, 102, 162, 222, 57, 117, 177, 237, 3, 63, 123, 183, 18, 78,
+138, 198, 33, 93, 153, 213, 48, 108, 168, 228, 8, 68, 128, 188, 23,
+83, 143, 203, 38, 98, 158, 218, 53, 113, 173, 233, 13, 73, 133, 193,
+28, 88, 148, 208, 43, 103, 163, 223, 58, 118, 178, 238, 4, 64, 124,
+184, 19, 79, 139, 199, 34, 94, 154, 214, 49, 109, 169, 229, 9, 69,
+129, 189, 24, 84, 144, 204, 39, 99, 159, 219, 54, 114, 174, 234, 14,
+74, 134, 194, 29, 89, 149, 209, 44, 104, 164, 224, 59, 119, 179, 239,
+};
+#endif
+
+#ifndef FFT_BITREV120
+#define FFT_BITREV120
+static const opus_int16 fft_bitrev120[120] = {
+0, 30, 60, 90, 15, 45, 75, 105, 5, 35, 65, 95, 20, 50, 80,
+110, 10, 40, 70, 100, 25, 55, 85, 115, 1, 31, 61, 91, 16, 46,
+76, 106, 6, 36, 66, 96, 21, 51, 81, 111, 11, 41, 71, 101, 26,
+56, 86, 116, 2, 32, 62, 92, 17, 47, 77, 107, 7, 37, 67, 97,
+22, 52, 82, 112, 12, 42, 72, 102, 27, 57, 87, 117, 3, 33, 63,
+93, 18, 48, 78, 108, 8, 38, 68, 98, 23, 53, 83, 113, 13, 43,
+73, 103, 28, 58, 88, 118, 4, 34, 64, 94, 19, 49, 79, 109, 9,
+39, 69, 99, 24, 54, 84, 114, 14, 44, 74, 104, 29, 59, 89, 119,
+};
+#endif
+
+#ifndef FFT_BITREV60
+#define FFT_BITREV60
+static const opus_int16 fft_bitrev60[60] = {
+0, 15, 30, 45, 5, 20, 35, 50, 10, 25, 40, 55, 1, 16, 31,
+46, 6, 21, 36, 51, 11, 26, 41, 56, 2, 17, 32, 47, 7, 22,
+37, 52, 12, 27, 42, 57, 3, 18, 33, 48, 8, 23, 38, 53, 13,
+28, 43, 58, 4, 19, 34, 49, 9, 24, 39, 54, 14, 29, 44, 59,
+};
+#endif
+
+#ifndef FFT_STATE48000_960_0
+#define FFT_STATE48000_960_0
+static const kiss_fft_state fft_state48000_960_0 = {
+480,    /* nfft */
+0.002083333f,   /* scale */
+-1,     /* shift */
+{4, 120, 4, 30, 2, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, }, /* factors */
+fft_bitrev480,  /* bitrev */
+fft_twiddles48000_960,  /* bitrev */
+};
+#endif
+
+#ifndef FFT_STATE48000_960_1
+#define FFT_STATE48000_960_1
+static const kiss_fft_state fft_state48000_960_1 = {
+240,    /* nfft */
+0.004166667f,   /* scale */
+1,      /* shift */
+{4, 60, 4, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, },   /* factors */
+fft_bitrev240,  /* bitrev */
+fft_twiddles48000_960,  /* bitrev */
+};
+#endif
+
+#ifndef FFT_STATE48000_960_2
+#define FFT_STATE48000_960_2
+static const kiss_fft_state fft_state48000_960_2 = {
+120,    /* nfft */
+0.008333333f,   /* scale */
+2,      /* shift */
+{4, 30, 2, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, },   /* factors */
+fft_bitrev120,  /* bitrev */
+fft_twiddles48000_960,  /* bitrev */
+};
+#endif
+
+#ifndef FFT_STATE48000_960_3
+#define FFT_STATE48000_960_3
+static const kiss_fft_state fft_state48000_960_3 = {
+60,     /* nfft */
+0.016666667f,   /* scale */
+3,      /* shift */
+{4, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
+fft_bitrev60,   /* bitrev */
+fft_twiddles48000_960,  /* bitrev */
+};
+#endif
+
+#endif
+
+#ifndef MDCT_TWIDDLES960
+#define MDCT_TWIDDLES960
+static const opus_val16 mdct_twiddles960[481] = {
+1.0000000f, 0.99999465f, 0.99997858f, 0.99995181f, 0.99991433f,
+0.99986614f, 0.99980724f, 0.99973764f, 0.99965732f, 0.99956631f,
+0.99946459f, 0.99935216f, 0.99922904f, 0.99909521f, 0.99895068f,
+0.99879546f, 0.99862953f, 0.99845292f, 0.99826561f, 0.99806761f,
+0.99785892f, 0.99763955f, 0.99740949f, 0.99716875f, 0.99691733f,
+0.99665524f, 0.99638247f, 0.99609903f, 0.99580493f, 0.99550016f,
+0.99518473f, 0.99485864f, 0.99452190f, 0.99417450f, 0.99381646f,
+0.99344778f, 0.99306846f, 0.99267850f, 0.99227791f, 0.99186670f,
+0.99144486f, 0.99101241f, 0.99056934f, 0.99011566f, 0.98965139f,
+0.98917651f, 0.98869104f, 0.98819498f, 0.98768834f, 0.98717112f,
+0.98664333f, 0.98610497f, 0.98555606f, 0.98499659f, 0.98442657f,
+0.98384600f, 0.98325491f, 0.98265328f, 0.98204113f, 0.98141846f,
+0.98078528f, 0.98014159f, 0.97948742f, 0.97882275f, 0.97814760f,
+0.97746197f, 0.97676588f, 0.97605933f, 0.97534232f, 0.97461487f,
+0.97387698f, 0.97312866f, 0.97236992f, 0.97160077f, 0.97082121f,
+0.97003125f, 0.96923091f, 0.96842019f, 0.96759909f, 0.96676764f,
+0.96592582f, 0.96507367f, 0.96421118f, 0.96333837f, 0.96245523f,
+0.96156180f, 0.96065806f, 0.95974403f, 0.95881973f, 0.95788517f,
+0.95694034f, 0.95598526f, 0.95501995f, 0.95404440f, 0.95305864f,
+0.95206267f, 0.95105651f, 0.95004016f, 0.94901364f, 0.94797697f,
+0.94693013f, 0.94587315f, 0.94480604f, 0.94372882f, 0.94264149f,
+0.94154406f, 0.94043656f, 0.93931897f, 0.93819133f, 0.93705365f,
+0.93590592f, 0.93474818f, 0.93358042f, 0.93240268f, 0.93121493f,
+0.93001722f, 0.92880955f, 0.92759193f, 0.92636438f, 0.92512690f,
+0.92387953f, 0.92262225f, 0.92135509f, 0.92007809f, 0.91879121f,
+0.91749449f, 0.91618795f, 0.91487161f, 0.91354545f, 0.91220952f,
+0.91086382f, 0.90950836f, 0.90814316f, 0.90676824f, 0.90538363f,
+0.90398929f, 0.90258528f, 0.90117161f, 0.89974828f, 0.89831532f,
+0.89687273f, 0.89542055f, 0.89395877f, 0.89248742f, 0.89100652f,
+0.88951606f, 0.88801610f, 0.88650661f, 0.88498764f, 0.88345918f,
+0.88192125f, 0.88037390f, 0.87881711f, 0.87725090f, 0.87567531f,
+0.87409035f, 0.87249599f, 0.87089232f, 0.86927933f, 0.86765699f,
+0.86602540f, 0.86438453f, 0.86273437f, 0.86107503f, 0.85940641f,
+0.85772862f, 0.85604161f, 0.85434547f, 0.85264014f, 0.85092572f,
+0.84920218f, 0.84746955f, 0.84572781f, 0.84397704f, 0.84221721f,
+0.84044838f, 0.83867056f, 0.83688375f, 0.83508799f, 0.83328325f,
+0.83146961f, 0.82964704f, 0.82781562f, 0.82597530f, 0.82412620f,
+0.82226820f, 0.82040144f, 0.81852589f, 0.81664154f, 0.81474847f,
+0.81284665f, 0.81093620f, 0.80901698f, 0.80708914f, 0.80515262f,
+0.80320752f, 0.80125378f, 0.79929149f, 0.79732067f, 0.79534125f,
+0.79335335f, 0.79135691f, 0.78935204f, 0.78733867f, 0.78531691f,
+0.78328674f, 0.78124818f, 0.77920122f, 0.77714595f, 0.77508232f,
+0.77301043f, 0.77093026f, 0.76884183f, 0.76674517f, 0.76464026f,
+0.76252720f, 0.76040593f, 0.75827656f, 0.75613907f, 0.75399349f,
+0.75183978f, 0.74967807f, 0.74750833f, 0.74533054f, 0.74314481f,
+0.74095112f, 0.73874950f, 0.73653993f, 0.73432251f, 0.73209718f,
+0.72986405f, 0.72762307f, 0.72537438f, 0.72311787f, 0.72085359f,
+0.71858162f, 0.71630192f, 0.71401459f, 0.71171956f, 0.70941701f,
+0.70710677f, 0.70478900f, 0.70246363f, 0.70013079f, 0.69779041f,
+0.69544260f, 0.69308738f, 0.69072466f, 0.68835458f, 0.68597709f,
+0.68359229f, 0.68120013f, 0.67880072f, 0.67639404f, 0.67398011f,
+0.67155892f, 0.66913059f, 0.66669509f, 0.66425240f, 0.66180265f,
+0.65934581f, 0.65688191f, 0.65441092f, 0.65193298f, 0.64944801f,
+0.64695613f, 0.64445727f, 0.64195160f, 0.63943902f, 0.63691954f,
+0.63439328f, 0.63186019f, 0.62932037f, 0.62677377f, 0.62422055f,
+0.62166055f, 0.61909394f, 0.61652065f, 0.61394081f, 0.61135435f,
+0.60876139f, 0.60616195f, 0.60355593f, 0.60094349f, 0.59832457f,
+0.59569929f, 0.59306758f, 0.59042957f, 0.58778523f, 0.58513460f,
+0.58247766f, 0.57981452f, 0.57714518f, 0.57446961f, 0.57178793f,
+0.56910013f, 0.56640624f, 0.56370623f, 0.56100023f, 0.55828818f,
+0.55557020f, 0.55284627f, 0.55011641f, 0.54738067f, 0.54463901f,
+0.54189157f, 0.53913828f, 0.53637921f, 0.53361450f, 0.53084398f,
+0.52806787f, 0.52528601f, 0.52249852f, 0.51970543f, 0.51690688f,
+0.51410279f, 0.51129310f, 0.50847793f, 0.50565732f, 0.50283139f,
+0.49999997f, 0.49716321f, 0.49432122f, 0.49147383f, 0.48862118f,
+0.48576340f, 0.48290042f, 0.48003216f, 0.47715876f, 0.47428025f,
+0.47139677f, 0.46850813f, 0.46561448f, 0.46271584f, 0.45981235f,
+0.45690383f, 0.45399042f, 0.45107214f, 0.44814915f, 0.44522124f,
+0.44228868f, 0.43935137f, 0.43640926f, 0.43346247f, 0.43051104f,
+0.42755511f, 0.42459449f, 0.42162932f, 0.41865964f, 0.41568558f,
+0.41270697f, 0.40972393f, 0.40673661f, 0.40374494f, 0.40074884f,
+0.39774844f, 0.39474390f, 0.39173501f, 0.38872193f, 0.38570469f,
+0.38268343f, 0.37965796f, 0.37662842f, 0.37359496f, 0.37055739f,
+0.36751585f, 0.36447038f, 0.36142122f, 0.35836797f, 0.35531089f,
+0.35225000f, 0.34918544f, 0.34611704f, 0.34304493f, 0.33996926f,
+0.33688983f, 0.33380680f, 0.33072019f, 0.32763015f, 0.32453650f,
+0.32143936f, 0.31833890f, 0.31523503f, 0.31212767f, 0.30901696f,
+0.30590306f, 0.30278577f, 0.29966524f, 0.29654150f, 0.29341470f,
+0.29028464f, 0.28715147f, 0.28401522f, 0.28087605f, 0.27773376f,
+0.27458861f, 0.27144052f, 0.26828940f, 0.26513541f, 0.26197859f,
+0.25881907f, 0.25565666f, 0.25249152f, 0.24932367f, 0.24615327f,
+0.24298012f, 0.23980436f, 0.23662604f, 0.23344530f, 0.23026206f,
+0.22707623f, 0.22388809f, 0.22069744f, 0.21750443f, 0.21430908f,
+0.21111156f, 0.20791165f, 0.20470953f, 0.20150520f, 0.19829884f,
+0.19509024f, 0.19187955f, 0.18866692f, 0.18545227f, 0.18223552f,
+0.17901681f, 0.17579631f, 0.17257380f, 0.16934945f, 0.16612328f,
+0.16289546f, 0.15966577f, 0.15643437f, 0.15320141f, 0.14996669f,
+0.14673037f, 0.14349260f, 0.14025329f, 0.13701235f, 0.13376995f,
+0.13052612f, 0.12728101f, 0.12403442f, 0.12078650f, 0.11753740f,
+0.11428693f, 0.11103523f, 0.10778234f, 0.10452842f, 0.10127326f,
+0.098017137f, 0.094759842f, 0.091501652f, 0.088242363f, 0.084982129f,
+0.081721103f, 0.078459084f, 0.075196224f, 0.071932560f, 0.068668243f,
+0.065403073f, 0.062137201f, 0.058870665f, 0.055603617f, 0.052335974f,
+0.049067651f, 0.045798921f, 0.042529582f, 0.039259788f, 0.035989573f,
+0.032719092f, 0.029448142f, 0.026176876f, 0.022905329f, 0.019633657f,
+0.016361655f, 0.013089478f, 0.0098171604f, 0.0065449764f, 0.0032724839f,
+-4.3711390e-08f, };
+#endif
+
+static const CELTMode mode48000_960_120 = {
+48000,  /* Fs */
+120,    /* overlap */
+21,     /* nbEBands */
+21,     /* effEBands */
+{0.85000610f, 0.0000000f, 1.0000000f, 1.0000000f, },    /* preemph */
+eband5ms,       /* eBands */
+3,      /* maxLM */
+8,      /* nbShortMdcts */
+120,    /* shortMdctSize */
+11,     /* nbAllocVectors */
+band_allocation,        /* allocVectors */
+logN400,        /* logN */
+window120,      /* window */
+{1920, 3, {&fft_state48000_960_0, &fft_state48000_960_1, &fft_state48000_960_2, &fft_state48000_960_3, }, mdct_twiddles960},    /* mdct */
+{392, cache_index50, cache_bits50, cache_caps50},       /* cache */
+};
+
+/* List of all the available modes */
+#define TOTAL_MODES 1
+static const CELTMode * const static_mode_list[TOTAL_MODES] = {
+&mode48000_960_120,
+};

+ 161 - 0
drivers/opus/celt/tests/test_unit_cwrs32.c

@@ -0,0 +1,161 @@
+/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation,
+                           Gregory Maxwell
+   Written by Jean-Marc Valin, Gregory Maxwell, and Timothy B. Terriberry */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#else
+#define TEST_CUSTOM_MODES
+#endif
+
+#define CELT_C
+#include "stack_alloc.h"
+#include "entenc.c"
+#include "entdec.c"
+#include "entcode.c"
+#include "cwrs.c"
+#include "mathops.c"
+#include "rate.h"
+
+#define NMAX (240)
+#define KMAX (128)
+
+#ifdef TEST_CUSTOM_MODES
+
+#define NDIMS (44)
+static const int pn[NDIMS]={
+   2,   3,   4,   5,   6,   7,   8,   9,  10,
+  11,  12,  13,  14,  15,  16,  18,  20,  22,
+  24,  26,  28,  30,  32,  36,  40,  44,  48,
+  52,  56,  60,  64,  72,  80,  88,  96, 104,
+ 112, 120, 128, 144, 160, 176, 192, 208
+};
+static const int pkmax[NDIMS]={
+ 128, 128, 128, 128,  88,  52,  36,  26,  22,
+  18,  16,  15,  13,  12,  12,  11,  10,   9,
+   9,   8,   8,   7,   7,   7,   7,   6,   6,
+   6,   6,   6,   5,   5,   5,   5,   5,   5,
+   4,   4,   4,   4,   4,   4,   4,   4
+};
+
+#else /* TEST_CUSTOM_MODES */
+
+#define NDIMS (22)
+static const int pn[NDIMS]={
+   2,   3,   4,   6,   8,   9,  11,  12,  16,
+  18,  22,  24,  32,  36,  44,  48,  64,  72,
+  88,  96, 144, 176
+};
+static const int pkmax[NDIMS]={
+ 128, 128, 128,  88,  36,  26,  18,  16,  12,
+  11,   9,   9,   7,   7,   6,   6,   5,   5,
+   5,   5,   4,   4
+};
+
+#endif
+
+int main(void){
+  int t;
+  int n;
+  ALLOC_STACK;
+  for(t=0;t<NDIMS;t++){
+    int pseudo;
+    n=pn[t];
+    for(pseudo=1;pseudo<41;pseudo++)
+    {
+      int k;
+#if defined(SMALL_FOOTPRINT)
+      opus_uint32 uu[KMAX+2U];
+#endif
+      opus_uint32 inc;
+      opus_uint32 nc;
+      opus_uint32 i;
+      k=get_pulses(pseudo);
+      if (k>pkmax[t])break;
+      printf("Testing CWRS with N=%i, K=%i...\n",n,k);
+#if defined(SMALL_FOOTPRINT)
+      nc=ncwrs_urow(n,k,uu);
+#else
+      nc=CELT_PVQ_V(n,k);
+#endif
+      inc=nc/20000;
+      if(inc<1)inc=1;
+      for(i=0;i<nc;i+=inc){
+#if defined(SMALL_FOOTPRINT)
+        opus_uint32 u[KMAX+2U];
+#endif
+        int         y[NMAX];
+        int         sy;
+        opus_uint32 v;
+        opus_uint32 ii;
+        int         j;
+#if defined(SMALL_FOOTPRINT)
+        memcpy(u,uu,(k+2U)*sizeof(*u));
+        cwrsi(n,k,i,y,u);
+#else
+        cwrsi(n,k,i,y);
+#endif
+        sy=0;
+        for(j=0;j<n;j++)sy+=ABS(y[j]);
+        if(sy!=k){
+          fprintf(stderr,"N=%d Pulse count mismatch in cwrsi (%d!=%d).\n",
+           n,sy,k);
+          return 99;
+        }
+        /*printf("%6u of %u:",i,nc);
+        for(j=0;j<n;j++)printf(" %+3i",y[j]);
+        printf(" ->");*/
+#if defined(SMALL_FOOTPRINT)
+        ii=icwrs(n,k,&v,y,u);
+#else
+        ii=icwrs(n,y);
+        v=CELT_PVQ_V(n,k);
+#endif
+        if(ii!=i){
+          fprintf(stderr,"Combination-index mismatch (%lu!=%lu).\n",
+           (long)ii,(long)i);
+          return 1;
+        }
+        if(v!=nc){
+          fprintf(stderr,"Combination count mismatch (%lu!=%lu).\n",
+           (long)v,(long)nc);
+          return 2;
+        }
+        /*printf(" %6u\n",i);*/
+      }
+      /*printf("\n");*/
+    }
+  }
+  return 0;
+}

+ 164 - 0
drivers/opus/celt/tests/test_unit_dft.c

@@ -0,0 +1,164 @@
+/* Copyright (c) 2008 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+
+#define CELT_C
+#include "stack_alloc.h"
+#include "kiss_fft.h"
+#include "kiss_fft.c"
+#include "mathops.c"
+#include "entcode.c"
+
+
+#ifndef M_PI
+#define M_PI 3.141592653
+#endif
+
+int ret = 0;
+
+void check(kiss_fft_cpx  * in,kiss_fft_cpx  * out,int nfft,int isinverse)
+{
+    int bin,k;
+    double errpow=0,sigpow=0, snr;
+
+    for (bin=0;bin<nfft;++bin) {
+        double ansr = 0;
+        double ansi = 0;
+        double difr;
+        double difi;
+
+        for (k=0;k<nfft;++k) {
+            double phase = -2*M_PI*bin*k/nfft;
+            double re = cos(phase);
+            double im = sin(phase);
+            if (isinverse)
+                im = -im;
+
+            if (!isinverse)
+            {
+               re /= nfft;
+               im /= nfft;
+            }
+
+            ansr += in[k].r * re - in[k].i * im;
+            ansi += in[k].r * im + in[k].i * re;
+        }
+        /*printf ("%d %d ", (int)ansr, (int)ansi);*/
+        difr = ansr - out[bin].r;
+        difi = ansi - out[bin].i;
+        errpow += difr*difr + difi*difi;
+        sigpow += ansr*ansr+ansi*ansi;
+    }
+    snr = 10*log10(sigpow/errpow);
+    printf("nfft=%d inverse=%d,snr = %f\n",nfft,isinverse,snr );
+    if (snr<60) {
+       printf( "** poor snr: %f ** \n", snr);
+       ret = 1;
+    }
+}
+
+void test1d(int nfft,int isinverse)
+{
+    size_t buflen = sizeof(kiss_fft_cpx)*nfft;
+
+    kiss_fft_cpx  * in = (kiss_fft_cpx*)malloc(buflen);
+    kiss_fft_cpx  * out= (kiss_fft_cpx*)malloc(buflen);
+    kiss_fft_state *cfg = opus_fft_alloc(nfft,0,0);
+    int k;
+
+    for (k=0;k<nfft;++k) {
+        in[k].r = (rand() % 32767) - 16384;
+        in[k].i = (rand() % 32767) - 16384;
+    }
+
+    for (k=0;k<nfft;++k) {
+       in[k].r *= 32768;
+       in[k].i *= 32768;
+    }
+
+    if (isinverse)
+    {
+       for (k=0;k<nfft;++k) {
+          in[k].r /= nfft;
+          in[k].i /= nfft;
+       }
+    }
+
+    /*for (k=0;k<nfft;++k) printf("%d %d ", in[k].r, in[k].i);printf("\n");*/
+
+    if (isinverse)
+       opus_ifft(cfg,in,out);
+    else
+       opus_fft(cfg,in,out);
+
+    /*for (k=0;k<nfft;++k) printf("%d %d ", out[k].r, out[k].i);printf("\n");*/
+
+    check(in,out,nfft,isinverse);
+
+    free(in);
+    free(out);
+    free(cfg);
+}
+
+int main(int argc,char ** argv)
+{
+    ALLOC_STACK;
+    if (argc>1) {
+        int k;
+        for (k=1;k<argc;++k) {
+            test1d(atoi(argv[k]),0);
+            test1d(atoi(argv[k]),1);
+        }
+    }else{
+        test1d(32,0);
+        test1d(32,1);
+        test1d(128,0);
+        test1d(128,1);
+        test1d(256,0);
+        test1d(256,1);
+#ifndef RADIX_TWO_ONLY
+        test1d(36,0);
+        test1d(36,1);
+        test1d(50,0);
+        test1d(50,1);
+        test1d(120,0);
+        test1d(120,1);
+#endif
+    }
+    return ret;
+}

+ 382 - 0
drivers/opus/celt/tests/test_unit_entropy.c

@@ -0,0 +1,382 @@
+/* Copyright (c) 2007-2011 Xiph.Org Foundation, Mozilla Corporation,
+                           Gregory Maxwell
+   Written by Jean-Marc Valin, Gregory Maxwell, and Timothy B. Terriberry */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include "entcode.h"
+#include "entenc.h"
+#include "entdec.h"
+#include <string.h>
+
+#include "entenc.c"
+#include "entdec.c"
+#include "entcode.c"
+
+#ifndef M_LOG2E
+# define M_LOG2E    1.4426950408889634074
+#endif
+#define DATA_SIZE 10000000
+#define DATA_SIZE2 10000
+
+int main(int _argc,char **_argv){
+  ec_enc         enc;
+  ec_dec         dec;
+  long           nbits;
+  long           nbits2;
+  double         entropy;
+  int            ft;
+  int            ftb;
+  int            sz;
+  int            i;
+  int            ret;
+  unsigned int   sym;
+  unsigned int   seed;
+  unsigned char *ptr;
+  const char    *env_seed;
+  ret=0;
+  entropy=0;
+    if (_argc > 2) {
+	fprintf(stderr, "Usage: %s [<seed>]\n", _argv[0]);
+	return 1;
+    }
+  env_seed = getenv("SEED");
+  if (_argc > 1)
+    seed = atoi(_argv[1]);
+  else if (env_seed)
+    seed = atoi(env_seed);
+  else
+    seed = time(NULL);
+  /*Testing encoding of raw bit values.*/
+  ptr = (unsigned char *)malloc(DATA_SIZE);
+  ec_enc_init(&enc,ptr, DATA_SIZE);
+  for(ft=2;ft<1024;ft++){
+    for(i=0;i<ft;i++){
+      entropy+=log(ft)*M_LOG2E;
+      ec_enc_uint(&enc,i,ft);
+    }
+  }
+  /*Testing encoding of raw bit values.*/
+  for(ftb=1;ftb<16;ftb++){
+    for(i=0;i<(1<<ftb);i++){
+      entropy+=ftb;
+      nbits=ec_tell(&enc);
+      ec_enc_bits(&enc,i,ftb);
+      nbits2=ec_tell(&enc);
+      if(nbits2-nbits!=ftb){
+        fprintf(stderr,"Used %li bits to encode %i bits directly.\n",
+         nbits2-nbits,ftb);
+        ret=-1;
+      }
+    }
+  }
+  nbits=ec_tell_frac(&enc);
+  ec_enc_done(&enc);
+  fprintf(stderr,
+   "Encoded %0.2lf bits of entropy to %0.2lf bits (%0.3lf%% wasted).\n",
+   entropy,ldexp(nbits,-3),100*(nbits-ldexp(entropy,3))/nbits);
+  fprintf(stderr,"Packed to %li bytes.\n",(long)ec_range_bytes(&enc));
+  ec_dec_init(&dec,ptr,DATA_SIZE);
+  for(ft=2;ft<1024;ft++){
+    for(i=0;i<ft;i++){
+      sym=ec_dec_uint(&dec,ft);
+      if(sym!=(unsigned)i){
+        fprintf(stderr,"Decoded %i instead of %i with ft of %i.\n",sym,i,ft);
+        ret=-1;
+      }
+    }
+  }
+  for(ftb=1;ftb<16;ftb++){
+    for(i=0;i<(1<<ftb);i++){
+      sym=ec_dec_bits(&dec,ftb);
+      if(sym!=(unsigned)i){
+        fprintf(stderr,"Decoded %i instead of %i with ftb of %i.\n",sym,i,ftb);
+        ret=-1;
+      }
+    }
+  }
+  nbits2=ec_tell_frac(&dec);
+  if(nbits!=nbits2){
+    fprintf(stderr,
+     "Reported number of bits used was %0.2lf, should be %0.2lf.\n",
+     ldexp(nbits2,-3),ldexp(nbits,-3));
+    ret=-1;
+  }
+  /*Testing an encoder bust prefers range coder data over raw bits.
+    This isn't a general guarantee, will only work for data that is buffered in
+     the encoder state and not yet stored in the user buffer, and should never
+     get used in practice.
+    It's mostly here for code coverage completeness.*/
+  /*Start with a 16-bit buffer.*/
+  ec_enc_init(&enc,ptr,2);
+  /*Write 7 raw bits.*/
+  ec_enc_bits(&enc,0x55,7);
+  /*Write 12.3 bits of range coder data.*/
+  ec_enc_uint(&enc,1,2);
+  ec_enc_uint(&enc,1,3);
+  ec_enc_uint(&enc,1,4);
+  ec_enc_uint(&enc,1,5);
+  ec_enc_uint(&enc,2,6);
+  ec_enc_uint(&enc,6,7);
+  ec_enc_done(&enc);
+  ec_dec_init(&dec,ptr,2);
+  if(!enc.error
+   /*The raw bits should have been overwritten by the range coder data.*/
+   ||ec_dec_bits(&dec,7)!=0x05
+   /*And all the range coder data should have been encoded correctly.*/
+   ||ec_dec_uint(&dec,2)!=1
+   ||ec_dec_uint(&dec,3)!=1
+   ||ec_dec_uint(&dec,4)!=1
+   ||ec_dec_uint(&dec,5)!=1
+   ||ec_dec_uint(&dec,6)!=2
+   ||ec_dec_uint(&dec,7)!=6){
+    fprintf(stderr,"Encoder bust overwrote range coder data with raw bits.\n");
+    ret=-1;
+  }
+  srand(seed);
+  fprintf(stderr,"Testing random streams... Random seed: %u (%.4X)\n", seed, rand() % 65536);
+  for(i=0;i<409600;i++){
+    unsigned *data;
+    unsigned *tell;
+    unsigned tell_bits;
+    int       j;
+    int zeros;
+    ft=rand()/((RAND_MAX>>(rand()%11U))+1U)+10;
+    sz=rand()/((RAND_MAX>>(rand()%9U))+1U);
+    data=(unsigned *)malloc(sz*sizeof(*data));
+    tell=(unsigned *)malloc((sz+1)*sizeof(*tell));
+    ec_enc_init(&enc,ptr,DATA_SIZE2);
+    zeros = rand()%13==0;
+    tell[0]=ec_tell_frac(&enc);
+    for(j=0;j<sz;j++){
+      if (zeros)
+        data[j]=0;
+      else
+        data[j]=rand()%ft;
+      ec_enc_uint(&enc,data[j],ft);
+      tell[j+1]=ec_tell_frac(&enc);
+    }
+    if (rand()%2==0)
+      while(ec_tell(&enc)%8 != 0)
+        ec_enc_uint(&enc, rand()%2, 2);
+    tell_bits = ec_tell(&enc);
+    ec_enc_done(&enc);
+    if(tell_bits!=(unsigned)ec_tell(&enc)){
+      fprintf(stderr,"ec_tell() changed after ec_enc_done(): %i instead of %i (Random seed: %u)\n",
+       ec_tell(&enc),tell_bits,seed);
+      ret=-1;
+    }
+    if ((tell_bits+7)/8 < ec_range_bytes(&enc))
+    {
+      fprintf (stderr, "ec_tell() lied, there's %i bytes instead of %d (Random seed: %u)\n",
+               ec_range_bytes(&enc), (tell_bits+7)/8,seed);
+      ret=-1;
+    }
+    ec_dec_init(&dec,ptr,DATA_SIZE2);
+    if(ec_tell_frac(&dec)!=tell[0]){
+      fprintf(stderr,
+       "Tell mismatch between encoder and decoder at symbol %i: %i instead of %i (Random seed: %u).\n",
+       0,ec_tell_frac(&dec),tell[0],seed);
+    }
+    for(j=0;j<sz;j++){
+      sym=ec_dec_uint(&dec,ft);
+      if(sym!=data[j]){
+        fprintf(stderr,
+         "Decoded %i instead of %i with ft of %i at position %i of %i (Random seed: %u).\n",
+         sym,data[j],ft,j,sz,seed);
+        ret=-1;
+      }
+      if(ec_tell_frac(&dec)!=tell[j+1]){
+        fprintf(stderr,
+         "Tell mismatch between encoder and decoder at symbol %i: %i instead of %i (Random seed: %u).\n",
+         j+1,ec_tell_frac(&dec),tell[j+1],seed);
+      }
+    }
+    free(tell);
+    free(data);
+  }
+  /*Test compatibility between multiple different encode/decode routines.*/
+  for(i=0;i<409600;i++){
+    unsigned *logp1;
+    unsigned *data;
+    unsigned *tell;
+    unsigned *enc_method;
+    int       j;
+    sz=rand()/((RAND_MAX>>(rand()%9U))+1U);
+    logp1=(unsigned *)malloc(sz*sizeof(*logp1));
+    data=(unsigned *)malloc(sz*sizeof(*data));
+    tell=(unsigned *)malloc((sz+1)*sizeof(*tell));
+    enc_method=(unsigned *)malloc(sz*sizeof(*enc_method));
+    ec_enc_init(&enc,ptr,DATA_SIZE2);
+    tell[0]=ec_tell_frac(&enc);
+    for(j=0;j<sz;j++){
+      data[j]=rand()/((RAND_MAX>>1)+1);
+      logp1[j]=(rand()%15)+1;
+      enc_method[j]=rand()/((RAND_MAX>>2)+1);
+      switch(enc_method[j]){
+        case 0:{
+          ec_encode(&enc,data[j]?(1<<logp1[j])-1:0,
+           (1<<logp1[j])-(data[j]?0:1),1<<logp1[j]);
+        }break;
+        case 1:{
+          ec_encode_bin(&enc,data[j]?(1<<logp1[j])-1:0,
+           (1<<logp1[j])-(data[j]?0:1),logp1[j]);
+        }break;
+        case 2:{
+          ec_enc_bit_logp(&enc,data[j],logp1[j]);
+        }break;
+        case 3:{
+          unsigned char icdf[2];
+          icdf[0]=1;
+          icdf[1]=0;
+          ec_enc_icdf(&enc,data[j],icdf,logp1[j]);
+        }break;
+      }
+      tell[j+1]=ec_tell_frac(&enc);
+    }
+    ec_enc_done(&enc);
+    if((ec_tell(&enc)+7U)/8U<ec_range_bytes(&enc)){
+      fprintf(stderr,"tell() lied, there's %i bytes instead of %d (Random seed: %u)\n",
+       ec_range_bytes(&enc),(ec_tell(&enc)+7)/8,seed);
+      ret=-1;
+    }
+    ec_dec_init(&dec,ptr,DATA_SIZE2);
+    if(ec_tell_frac(&dec)!=tell[0]){
+      fprintf(stderr,
+       "Tell mismatch between encoder and decoder at symbol %i: %i instead of %i (Random seed: %u).\n",
+       0,ec_tell_frac(&dec),tell[0],seed);
+    }
+    for(j=0;j<sz;j++){
+      int fs;
+      int dec_method;
+      dec_method=rand()/((RAND_MAX>>2)+1);
+      switch(dec_method){
+        case 0:{
+          fs=ec_decode(&dec,1<<logp1[j]);
+          sym=fs>=(1<<logp1[j])-1;
+          ec_dec_update(&dec,sym?(1<<logp1[j])-1:0,
+           (1<<logp1[j])-(sym?0:1),1<<logp1[j]);
+        }break;
+        case 1:{
+          fs=ec_decode_bin(&dec,logp1[j]);
+          sym=fs>=(1<<logp1[j])-1;
+          ec_dec_update(&dec,sym?(1<<logp1[j])-1:0,
+           (1<<logp1[j])-(sym?0:1),1<<logp1[j]);
+        }break;
+        case 2:{
+          sym=ec_dec_bit_logp(&dec,logp1[j]);
+        }break;
+        case 3:{
+          unsigned char icdf[2];
+          icdf[0]=1;
+          icdf[1]=0;
+          sym=ec_dec_icdf(&dec,icdf,logp1[j]);
+        }break;
+      }
+      if(sym!=data[j]){
+        fprintf(stderr,
+         "Decoded %i instead of %i with logp1 of %i at position %i of %i (Random seed: %u).\n",
+         sym,data[j],logp1[j],j,sz,seed);
+        fprintf(stderr,"Encoding method: %i, decoding method: %i\n",
+         enc_method[j],dec_method);
+        ret=-1;
+      }
+      if(ec_tell_frac(&dec)!=tell[j+1]){
+        fprintf(stderr,
+         "Tell mismatch between encoder and decoder at symbol %i: %i instead of %i (Random seed: %u).\n",
+         j+1,ec_tell_frac(&dec),tell[j+1],seed);
+      }
+    }
+    free(enc_method);
+    free(tell);
+    free(data);
+    free(logp1);
+  }
+  ec_enc_init(&enc,ptr,DATA_SIZE2);
+  ec_enc_bit_logp(&enc,0,1);
+  ec_enc_bit_logp(&enc,0,1);
+  ec_enc_bit_logp(&enc,0,1);
+  ec_enc_bit_logp(&enc,0,1);
+  ec_enc_bit_logp(&enc,0,2);
+  ec_enc_patch_initial_bits(&enc,3,2);
+  if(enc.error){
+    fprintf(stderr,"patch_initial_bits failed");
+    ret=-1;
+  }
+  ec_enc_patch_initial_bits(&enc,0,5);
+  if(!enc.error){
+    fprintf(stderr,"patch_initial_bits didn't fail when it should have");
+    ret=-1;
+  }
+  ec_enc_done(&enc);
+  if(ec_range_bytes(&enc)!=1||ptr[0]!=192){
+    fprintf(stderr,"Got %d when expecting 192 for patch_initial_bits",ptr[0]);
+    ret=-1;
+  }
+  ec_enc_init(&enc,ptr,DATA_SIZE2);
+  ec_enc_bit_logp(&enc,0,1);
+  ec_enc_bit_logp(&enc,0,1);
+  ec_enc_bit_logp(&enc,1,6);
+  ec_enc_bit_logp(&enc,0,2);
+  ec_enc_patch_initial_bits(&enc,0,2);
+  if(enc.error){
+    fprintf(stderr,"patch_initial_bits failed");
+    ret=-1;
+  }
+  ec_enc_done(&enc);
+  if(ec_range_bytes(&enc)!=2||ptr[0]!=63){
+    fprintf(stderr,"Got %d when expecting 63 for patch_initial_bits",ptr[0]);
+    ret=-1;
+  }
+  ec_enc_init(&enc,ptr,2);
+  ec_enc_bit_logp(&enc,0,2);
+  for(i=0;i<48;i++){
+    ec_enc_bits(&enc,0,1);
+  }
+  ec_enc_done(&enc);
+  if(!enc.error){
+    fprintf(stderr,"Raw bits overfill didn't fail when it should have");
+    ret=-1;
+  }
+  ec_enc_init(&enc,ptr,2);
+  for(i=0;i<17;i++){
+    ec_enc_bits(&enc,0,1);
+  }
+  ec_enc_done(&enc);
+  if(!enc.error){
+    fprintf(stderr,"17 raw bits encoded in two bytes");
+    ret=-1;
+  }
+  free(ptr);
+  return ret;
+}

+ 92 - 0
drivers/opus/celt/tests/test_unit_laplace.c

@@ -0,0 +1,92 @@
+/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation
+   Written by Jean-Marc Valin and Timothy B. Terriberry */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "laplace.h"
+#define CELT_C
+#include "stack_alloc.h"
+
+#include "entenc.c"
+#include "entdec.c"
+#include "entcode.c"
+#include "laplace.c"
+
+#define DATA_SIZE 40000
+
+int ec_laplace_get_start_freq(int decay)
+{
+   opus_uint32 ft = 32768 - LAPLACE_MINP*(2*LAPLACE_NMIN+1);
+   int fs = (ft*(16384-decay))/(16384+decay);
+   return fs+LAPLACE_MINP;
+}
+
+int main(void)
+{
+   int i;
+   int ret = 0;
+   ec_enc enc;
+   ec_dec dec;
+   unsigned char *ptr;
+   int val[10000], decay[10000];
+   ALLOC_STACK;
+   ptr = (unsigned char *)malloc(DATA_SIZE);
+   ec_enc_init(&enc,ptr,DATA_SIZE);
+
+   val[0] = 3; decay[0] = 6000;
+   val[1] = 0; decay[1] = 5800;
+   val[2] = -1; decay[2] = 5600;
+   for (i=3;i<10000;i++)
+   {
+      val[i] = rand()%15-7;
+      decay[i] = rand()%11000+5000;
+   }
+   for (i=0;i<10000;i++)
+      ec_laplace_encode(&enc, &val[i],
+            ec_laplace_get_start_freq(decay[i]), decay[i]);
+
+   ec_enc_done(&enc);
+
+   ec_dec_init(&dec,ec_get_buffer(&enc),ec_range_bytes(&enc));
+
+   for (i=0;i<10000;i++)
+   {
+      int d = ec_laplace_decode(&dec,
+            ec_laplace_get_start_freq(decay[i]), decay[i]);
+      if (d != val[i])
+      {
+         fprintf (stderr, "Got %d instead of %d\n", d, val[i]);
+         ret = 1;
+      }
+   }
+
+   return ret;
+}

+ 275 - 0
drivers/opus/celt/tests/test_unit_mathops.c

@@ -0,0 +1,275 @@
+/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation,
+                           Gregory Maxwell
+   Written by Jean-Marc Valin, Gregory Maxwell, and Timothy B. Terriberry */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#define CELT_C
+
+#include "mathops.c"
+#include "entenc.c"
+#include "entdec.c"
+#include "entcode.c"
+#include "bands.c"
+#include "quant_bands.c"
+#include "laplace.c"
+#include "vq.c"
+#include "cwrs.c"
+#include <stdio.h>
+#include <math.h>
+
+#ifdef OPUS_FIXED_POINT
+#define WORD "%d"
+#else
+#define WORD "%f"
+#endif
+
+int ret = 0;
+
+void testdiv(void)
+{
+   opus_int32 i;
+   for (i=1;i<=327670;i++)
+   {
+      double prod;
+      opus_val32 val;
+      val = celt_rcp(i);
+#ifdef OPUS_FIXED_POINT
+      prod = (1./32768./65526.)*val*i;
+#else
+      prod = val*i;
+#endif
+      if (fabs(prod-1) > .00025)
+      {
+         fprintf (stderr, "div failed: 1/%d="WORD" (product = %f)\n", i, val, prod);
+         ret = 1;
+      }
+   }
+}
+
+void testsqrt(void)
+{
+   opus_int32 i;
+   for (i=1;i<=1000000000;i++)
+   {
+      double ratio;
+      opus_val16 val;
+      val = celt_sqrt(i);
+      ratio = val/sqrt(i);
+      if (fabs(ratio - 1) > .0005 && fabs(val-sqrt(i)) > 2)
+      {
+         fprintf (stderr, "sqrt failed: sqrt(%d)="WORD" (ratio = %f)\n", i, val, ratio);
+         ret = 1;
+      }
+      i+= i>>10;
+   }
+}
+
+void testbitexactcos(void)
+{
+   int i;
+   opus_int32 min_d,max_d,last,chk;
+   chk=max_d=0;
+   last=min_d=32767;
+   for(i=64;i<=16320;i++)
+   {
+      opus_int32 d;
+      opus_int32 q=bitexact_cos(i);
+      chk ^= q*i;
+      d = last - q;
+      if (d>max_d)max_d=d;
+      if (d<min_d)min_d=d;
+      last = q;
+   }
+   if ((chk!=89408644)||(max_d!=5)||(min_d!=0)||(bitexact_cos(64)!=32767)||
+       (bitexact_cos(16320)!=200)||(bitexact_cos(8192)!=23171))
+   {
+      fprintf (stderr, "bitexact_cos failed\n");
+      ret = 1;
+   }
+}
+
+void testbitexactlog2tan(void)
+{
+   int i,fail;
+   opus_int32 min_d,max_d,last,chk;
+   fail=chk=max_d=0;
+   last=min_d=15059;
+   for(i=64;i<8193;i++)
+   {
+      opus_int32 d;
+      opus_int32 mid=bitexact_cos(i);
+      opus_int32 side=bitexact_cos(16384-i);
+      opus_int32 q=bitexact_log2tan(mid,side);
+      chk ^= q*i;
+      d = last - q;
+      if (q!=-1*bitexact_log2tan(side,mid))
+        fail = 1;
+      if (d>max_d)max_d=d;
+      if (d<min_d)min_d=d;
+      last = q;
+   }
+   if ((chk!=15821257)||(max_d!=61)||(min_d!=-2)||fail||
+       (bitexact_log2tan(32767,200)!=15059)||(bitexact_log2tan(30274,12540)!=2611)||
+       (bitexact_log2tan(23171,23171)!=0))
+   {
+      fprintf (stderr, "bitexact_log2tan failed\n");
+      ret = 1;
+   }
+}
+
+#ifndef OPUS_FIXED_POINT
+void testlog2(void)
+{
+   float x;
+   for (x=0.001;x<1677700.0;x+=(x/8.0))
+   {
+      float error = fabs((1.442695040888963387*log(x))-celt_log2(x));
+      if (error>0.0009)
+      {
+         fprintf (stderr, "celt_log2 failed: fabs((1.442695040888963387*log(x))-celt_log2(x))>0.001 (x = %f, error = %f)\n", x,error);
+         ret = 1;
+      }
+   }
+}
+
+void testexp2(void)
+{
+   float x;
+   for (x=-11.0;x<24.0;x+=0.0007)
+   {
+      float error = fabs(x-(1.442695040888963387*log(celt_exp2(x))));
+      if (error>0.0002)
+      {
+         fprintf (stderr, "celt_exp2 failed: fabs(x-(1.442695040888963387*log(celt_exp2(x))))>0.0005 (x = %f, error = %f)\n", x,error);
+         ret = 1;
+      }
+   }
+}
+
+void testexp2log2(void)
+{
+   float x;
+   for (x=-11.0;x<24.0;x+=0.0007)
+   {
+      float error = fabs(x-(celt_log2(celt_exp2(x))));
+      if (error>0.001)
+      {
+         fprintf (stderr, "celt_log2/celt_exp2 failed: fabs(x-(celt_log2(celt_exp2(x))))>0.001 (x = %f, error = %f)\n", x,error);
+         ret = 1;
+      }
+   }
+}
+#else
+void testlog2(void)
+{
+   opus_val32 x;
+   for (x=8;x<1073741824;x+=(x>>3))
+   {
+      float error = fabs((1.442695040888963387*log(x/16384.0))-celt_log2(x)/1024.0);
+      if (error>0.003)
+      {
+         fprintf (stderr, "celt_log2 failed: x = %ld, error = %f\n", (long)x,error);
+         ret = 1;
+      }
+   }
+}
+
+void testexp2(void)
+{
+   opus_val16 x;
+   for (x=-32768;x<15360;x++)
+   {
+      float error1 = fabs(x/1024.0-(1.442695040888963387*log(celt_exp2(x)/65536.0)));
+      float error2 = fabs(exp(0.6931471805599453094*x/1024.0)-celt_exp2(x)/65536.0);
+      if (error1>0.0002&&error2>0.00004)
+      {
+    	 fprintf (stderr, "celt_exp2 failed: x = "WORD", error1 = %f, error2 = %f\n", x,error1,error2);
+         ret = 1;
+      }
+   }
+}
+
+void testexp2log2(void)
+{
+   opus_val32 x;
+   for (x=8;x<65536;x+=(x>>3))
+   {
+      float error = fabs(x-0.25*celt_exp2(celt_log2(x)))/16384;
+      if (error>0.004)
+      {
+         fprintf (stderr, "celt_log2/celt_exp2 failed: fabs(x-(celt_exp2(celt_log2(x))))>0.001 (x = %ld, error = %f)\n", (long)x,error);
+         ret = 1;
+      }
+   }
+}
+
+void testilog2(void)
+{
+   opus_val32 x;
+   for (x=1;x<=268435455;x+=127)
+   {
+      opus_val32 lg;
+      opus_val32 y;
+
+      lg = celt_ilog2(x);
+      if (lg<0 || lg>=31)
+      {
+         printf("celt_ilog2 failed: 0<=celt_ilog2(x)<31 (x = %d, celt_ilog2(x) = %d)\n",x,lg);
+         ret = 1;
+      }
+      y = 1<<lg;
+
+      if (x<y || (x>>1)>=y)
+      {
+         printf("celt_ilog2 failed: 2**celt_ilog2(x)<=x<2**(celt_ilog2(x)+1) (x = %d, 2**celt_ilog2(x) = %d)\n",x,y);
+         ret = 1;
+      }
+   }
+}
+#endif
+
+int main(void)
+{
+   testbitexactcos();
+   testbitexactlog2tan();
+   testdiv();
+   testsqrt();
+   testlog2();
+   testexp2();
+   testexp2log2();
+#ifdef OPUS_FIXED_POINT
+   testilog2();
+#endif
+   return ret;
+}

+ 210 - 0
drivers/opus/celt/tests/test_unit_mdct.c

@@ -0,0 +1,210 @@
+/* Copyright (c) 2008-2011 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+
+#define CELT_C
+#include "mdct.h"
+#include "stack_alloc.h"
+
+#include "kiss_fft.c"
+#include "mdct.c"
+#include "mathops.c"
+#include "entcode.c"
+
+#ifndef M_PI
+#define M_PI 3.141592653
+#endif
+
+int ret = 0;
+void check(kiss_fft_scalar  * in,kiss_fft_scalar  * out,int nfft,int isinverse)
+{
+    int bin,k;
+    double errpow=0,sigpow=0;
+    double snr;
+    for (bin=0;bin<nfft/2;++bin) {
+        double ansr = 0;
+        double difr;
+
+        for (k=0;k<nfft;++k) {
+           double phase = 2*M_PI*(k+.5+.25*nfft)*(bin+.5)/nfft;
+           double re = cos(phase);
+
+           re /= nfft/4;
+
+           ansr += in[k] * re;
+        }
+        /*printf ("%f %f\n", ansr, out[bin]);*/
+        difr = ansr - out[bin];
+        errpow += difr*difr;
+        sigpow += ansr*ansr;
+    }
+    snr = 10*log10(sigpow/errpow);
+    printf("nfft=%d inverse=%d,snr = %f\n",nfft,isinverse,snr );
+    if (snr<60) {
+       printf( "** poor snr: %f **\n", snr);
+       ret = 1;
+    }
+}
+
+void check_inv(kiss_fft_scalar  * in,kiss_fft_scalar  * out,int nfft,int isinverse)
+{
+   int bin,k;
+   double errpow=0,sigpow=0;
+   double snr;
+   for (bin=0;bin<nfft;++bin) {
+      double ansr = 0;
+      double difr;
+
+      for (k=0;k<nfft/2;++k) {
+         double phase = 2*M_PI*(bin+.5+.25*nfft)*(k+.5)/nfft;
+         double re = cos(phase);
+
+         /*re *= 2;*/
+
+         ansr += in[k] * re;
+      }
+      /*printf ("%f %f\n", ansr, out[bin]);*/
+      difr = ansr - out[bin];
+      errpow += difr*difr;
+      sigpow += ansr*ansr;
+   }
+   snr = 10*log10(sigpow/errpow);
+   printf("nfft=%d inverse=%d,snr = %f\n",nfft,isinverse,snr );
+   if (snr<60) {
+      printf( "** poor snr: %f **\n", snr);
+      ret = 1;
+   }
+}
+
+
+void test1d(int nfft,int isinverse)
+{
+    celt_mdct_lookup cfg;
+    size_t buflen = sizeof(kiss_fft_scalar)*nfft;
+
+    kiss_fft_scalar  * in = (kiss_fft_scalar*)malloc(buflen);
+    kiss_fft_scalar  * in_copy = (kiss_fft_scalar*)malloc(buflen);
+    kiss_fft_scalar  * out= (kiss_fft_scalar*)malloc(buflen);
+    opus_val16  * window= (opus_val16*)malloc(sizeof(opus_val16)*nfft/2);
+    int k;
+
+    clt_mdct_init(&cfg, nfft, 0);
+    for (k=0;k<nfft;++k) {
+        in[k] = (rand() % 32768) - 16384;
+    }
+
+    for (k=0;k<nfft/2;++k) {
+       window[k] = Q15ONE;
+    }
+    for (k=0;k<nfft;++k) {
+       in[k] *= 32768;
+    }
+
+    if (isinverse)
+    {
+       for (k=0;k<nfft;++k) {
+          in[k] /= nfft;
+       }
+    }
+
+    for (k=0;k<nfft;++k)
+       in_copy[k] = in[k];
+    /*for (k=0;k<nfft;++k) printf("%d %d ", in[k].r, in[k].i);printf("\n");*/
+
+    if (isinverse)
+    {
+       for (k=0;k<nfft;++k)
+          out[k] = 0;
+       clt_mdct_backward(&cfg,in,out, window, nfft/2, 0, 1);
+       /* apply TDAC because clt_mdct_backward() no longer does that */
+       for (k=0;k<nfft/4;++k)
+          out[nfft-k-1] = out[nfft/2+k];
+       check_inv(in,out,nfft,isinverse);
+    } else {
+       clt_mdct_forward(&cfg,in,out,window, nfft/2, 0, 1);
+       check(in_copy,out,nfft,isinverse);
+    }
+    /*for (k=0;k<nfft;++k) printf("%d %d ", out[k].r, out[k].i);printf("\n");*/
+
+
+    free(in);
+    free(out);
+    clt_mdct_clear(&cfg);
+}
+
+int main(int argc,char ** argv)
+{
+    ALLOC_STACK;
+    if (argc>1) {
+        int k;
+        for (k=1;k<argc;++k) {
+            test1d(atoi(argv[k]),0);
+            test1d(atoi(argv[k]),1);
+        }
+    }else{
+        test1d(32,0);
+        test1d(32,1);
+        test1d(256,0);
+        test1d(256,1);
+        test1d(512,0);
+        test1d(512,1);
+        test1d(1024,0);
+        test1d(1024,1);
+        test1d(2048,0);
+        test1d(2048,1);
+#ifndef RADIX_TWO_ONLY
+        test1d(36,0);
+        test1d(36,1);
+        test1d(40,0);
+        test1d(40,1);
+        test1d(60,0);
+        test1d(60,1);
+        test1d(120,0);
+        test1d(120,1);
+        test1d(240,0);
+        test1d(240,1);
+        test1d(480,0);
+        test1d(480,1);
+        test1d(960,0);
+        test1d(960,1);
+        test1d(1920,0);
+        test1d(1920,1);
+#endif
+    }
+    return ret;
+}

+ 90 - 0
drivers/opus/celt/tests/test_unit_rotation.c

@@ -0,0 +1,90 @@
+/* Copyright (c) 2008-2011 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#define CELT_C
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "vq.c"
+#include "cwrs.c"
+#include "entcode.c"
+#include "entenc.c"
+#include "entdec.c"
+#include "mathops.c"
+#include "bands.h"
+#include <math.h>
+#define MAX_SIZE 100
+
+int ret=0;
+void test_rotation(int N, int K)
+{
+   int i;
+   double err = 0, ener = 0, snr, snr0;
+   opus_val16 x0[MAX_SIZE];
+   opus_val16 x1[MAX_SIZE];
+   for (i=0;i<N;i++)
+      x1[i] = x0[i] = rand()%32767-16384;
+   exp_rotation(x1, N, 1, 1, K, SPREAD_NORMAL);
+   for (i=0;i<N;i++)
+   {
+      err += (x0[i]-(double)x1[i])*(x0[i]-(double)x1[i]);
+      ener += x0[i]*(double)x0[i];
+   }
+   snr0 = 20*log10(ener/err);
+   err = ener = 0;
+   exp_rotation(x1, N, -1, 1, K, SPREAD_NORMAL);
+   for (i=0;i<N;i++)
+   {
+      err += (x0[i]-(double)x1[i])*(x0[i]-(double)x1[i]);
+      ener += x0[i]*(double)x0[i];
+   }
+   snr = 20*log10(ener/err);
+   printf ("SNR for size %d (%d pulses) is %f (was %f without inverse)\n", N, K, snr, snr0);
+   if (snr < 60 || snr0 > 20)
+   {
+      fprintf(stderr, "FAIL!\n");
+      ret = 1;
+   }
+}
+
+int main(void)
+{
+   ALLOC_STACK;
+   test_rotation(15, 3);
+   test_rotation(23, 5);
+   test_rotation(50, 3);
+   test_rotation(80, 1);
+   return ret;
+}

+ 50 - 0
drivers/opus/celt/tests/test_unit_types.c

@@ -0,0 +1,50 @@
+/* Copyright (c) 2008-2011 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "opus_types.h"
+#include <stdio.h>
+
+int main(void)
+{
+   opus_int16 i = 1;
+   i <<= 14;
+   if (i>>14 != 1)
+   {
+      fprintf(stderr, "opus_int16 isn't 16 bits\n");
+      return 1;
+   }
+   if (sizeof(opus_int16)*2 != sizeof(opus_int32))
+   {
+      fprintf(stderr, "16*2 != 32\n");
+      return 1;
+   }
+   return 0;
+}

+ 415 - 0
drivers/opus/celt/vq.c

@@ -0,0 +1,415 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "mathops.h"
+#include "cwrs.h"
+#include "vq.h"
+#include "arch.h"
+#include "os_support.h"
+#include "bands.h"
+#include "rate.h"
+
+static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s)
+{
+   int i;
+   celt_norm *Xptr;
+   Xptr = X;
+   for (i=0;i<len-stride;i++)
+   {
+      celt_norm x1, x2;
+      x1 = Xptr[0];
+      x2 = Xptr[stride];
+      Xptr[stride] = EXTRACT16(SHR32(MULT16_16(c,x2) + MULT16_16(s,x1), 15));
+      *Xptr++      = EXTRACT16(SHR32(MULT16_16(c,x1) - MULT16_16(s,x2), 15));
+   }
+   Xptr = &X[len-2*stride-1];
+   for (i=len-2*stride-1;i>=0;i--)
+   {
+      celt_norm x1, x2;
+      x1 = Xptr[0];
+      x2 = Xptr[stride];
+      Xptr[stride] = EXTRACT16(SHR32(MULT16_16(c,x2) + MULT16_16(s,x1), 15));
+      *Xptr--      = EXTRACT16(SHR32(MULT16_16(c,x1) - MULT16_16(s,x2), 15));
+   }
+}
+
+static void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int spread)
+{
+   static const int SPREAD_FACTOR[3]={15,10,5};
+   int i;
+   opus_val16 c, s;
+   opus_val16 gain, theta;
+   int stride2=0;
+   int factor;
+
+   if (2*K>=len || spread==SPREAD_NONE)
+      return;
+   factor = SPREAD_FACTOR[spread-1];
+
+   gain = celt_div((opus_val32)MULT16_16(Q15_ONE,len),(opus_val32)(len+factor*K));
+   theta = HALF16(MULT16_16_Q15(gain,gain));
+
+   c = celt_cos_norm(EXTEND32(theta));
+   s = celt_cos_norm(EXTEND32(SUB16(Q15ONE,theta))); /*  sin(theta) */
+
+   if (len>=8*stride)
+   {
+      stride2 = 1;
+      /* This is just a simple (equivalent) way of computing sqrt(len/stride) with rounding.
+         It's basically incrementing long as (stride2+0.5)^2 < len/stride. */
+      while ((stride2*stride2+stride2)*stride + (stride>>2) < len)
+         stride2++;
+   }
+   /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
+      extract_collapse_mask().*/
+   len /= stride;
+   for (i=0;i<stride;i++)
+   {
+      if (dir < 0)
+      {
+         if (stride2)
+            exp_rotation1(X+i*len, len, stride2, s, c);
+         exp_rotation1(X+i*len, len, 1, c, s);
+      } else {
+         exp_rotation1(X+i*len, len, 1, c, -s);
+         if (stride2)
+            exp_rotation1(X+i*len, len, stride2, s, -c);
+      }
+   }
+}
+
+/** Takes the pitch vector and the decoded residual vector, computes the gain
+    that will give ||p+g*y||=1 and mixes the residual with the pitch. */
+static void normalise_residual(int * OPUS_RESTRICT iy, celt_norm * OPUS_RESTRICT X,
+      int N, opus_val32 Ryy, opus_val16 gain)
+{
+   int i;
+#ifdef OPUS_FIXED_POINT
+   int k;
+#endif
+   opus_val32 t;
+   opus_val16 g;
+
+#ifdef OPUS_FIXED_POINT
+   k = celt_ilog2(Ryy)>>1;
+#endif
+   t = VSHR32(Ryy, 2*(k-7));
+   g = MULT16_16_P15(celt_rsqrt_norm(t),gain);
+
+   i=0;
+   do
+      X[i] = EXTRACT16(PSHR32(MULT16_16(g, iy[i]), k+1));
+   while (++i < N);
+}
+
+static unsigned extract_collapse_mask(int *iy, int N, int B)
+{
+   unsigned collapse_mask;
+   int N0;
+   int i;
+   if (B<=1)
+      return 1;
+   /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
+      exp_rotation().*/
+   N0 = N/B;
+   collapse_mask = 0;
+   i=0; do {
+      int j;
+      j=0; do {
+         collapse_mask |= (iy[i*N0+j]!=0)<<i;
+      } while (++j<N0);
+   } while (++i<B);
+   return collapse_mask;
+}
+
+unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc
+#ifdef RESYNTH
+   , opus_val16 gain
+#endif
+   )
+{
+   VARDECL(celt_norm, y);
+   VARDECL(int, iy);
+   VARDECL(opus_val16, signx);
+   int i, j;
+   opus_val16 s;
+   int pulsesLeft;
+   opus_val32 sum;
+   opus_val32 xy;
+   opus_val16 yy;
+   unsigned collapse_mask;
+   SAVE_STACK;
+
+   celt_assert2(K>0, "alg_quant() needs at least one pulse");
+   celt_assert2(N>1, "alg_quant() needs at least two dimensions");
+
+   ALLOC(y, N, celt_norm);
+   ALLOC(iy, N, int);
+   ALLOC(signx, N, opus_val16);
+
+   exp_rotation(X, N, 1, B, K, spread);
+
+   /* Get rid of the sign */
+   sum = 0;
+   j=0; do {
+      if (X[j]>0)
+         signx[j]=1;
+      else {
+         signx[j]=-1;
+         X[j]=-X[j];
+      }
+      iy[j] = 0;
+      y[j] = 0;
+   } while (++j<N);
+
+   xy = yy = 0;
+
+   pulsesLeft = K;
+
+   /* Do a pre-search by projecting on the pyramid */
+   if (K > (N>>1))
+   {
+      opus_val16 rcp;
+      j=0; do {
+         sum += X[j];
+      }  while (++j<N);
+
+      /* If X is too small, just replace it with a pulse at 0 */
+#ifdef OPUS_FIXED_POINT
+      if (sum <= K)
+#else
+      /* Prevents infinities and NaNs from causing too many pulses
+         to be allocated. 64 is an approximation of infinity here. */
+      if (!(sum > EPSILON && sum < 64))
+#endif
+      {
+         X[0] = QCONST16(1.f,14);
+         j=1; do
+            X[j]=0;
+         while (++j<N);
+         sum = QCONST16(1.f,14);
+      }
+      rcp = EXTRACT16(MULT16_32_Q16(K-1, celt_rcp(sum)));
+      j=0; do {
+#ifdef OPUS_FIXED_POINT
+         /* It's really important to round *towards zero* here */
+         iy[j] = MULT16_16_Q15(X[j],rcp);
+#else
+         iy[j] = (int)floor(rcp*X[j]);
+#endif
+         y[j] = (celt_norm)iy[j];
+         yy = MAC16_16(yy, y[j],y[j]);
+         xy = MAC16_16(xy, X[j],y[j]);
+         y[j] *= 2;
+         pulsesLeft -= iy[j];
+      }  while (++j<N);
+   }
+   celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass");
+
+   /* This should never happen, but just in case it does (e.g. on silence)
+      we fill the first bin with pulses. */
+#ifdef OPUS_FIXED_POINT_DEBUG
+   celt_assert2(pulsesLeft<=N+3, "Not enough pulses in the quick pass");
+#endif
+   if (pulsesLeft > N+3)
+   {
+      opus_val16 tmp = (opus_val16)pulsesLeft;
+      yy = MAC16_16(yy, tmp, tmp);
+      yy = MAC16_16(yy, tmp, y[0]);
+      iy[0] += pulsesLeft;
+      pulsesLeft=0;
+   }
+
+   s = 1;
+   for (i=0;i<pulsesLeft;i++)
+   {
+      int best_id;
+      opus_val32 best_num = -VERY_LARGE16;
+      opus_val16 best_den = 0;
+#ifdef OPUS_FIXED_POINT
+      int rshift;
+#endif
+#ifdef OPUS_FIXED_POINT
+      rshift = 1+celt_ilog2(K-pulsesLeft+i+1);
+#endif
+      best_id = 0;
+      /* The squared magnitude term gets added anyway, so we might as well
+         add it outside the loop */
+      yy = ADD32(yy, 1);
+      j=0;
+      do {
+         opus_val16 Rxy, Ryy;
+         /* Temporary sums of the new pulse(s) */
+         Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[j])),rshift));
+         /* We're multiplying y[j] by two so we don't have to do it here */
+         Ryy = ADD16(yy, y[j]);
+
+         /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that
+            Rxy is positive because the sign is pre-computed) */
+         Rxy = MULT16_16_Q15(Rxy,Rxy);
+         /* The idea is to check for num/den >= best_num/best_den, but that way
+            we can do it without any division */
+         /* OPT: Make sure to use conditional moves here */
+         if (MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num))
+         {
+            best_den = Ryy;
+            best_num = Rxy;
+            best_id = j;
+         }
+      } while (++j<N);
+
+      /* Updating the sums of the new pulse(s) */
+      xy = ADD32(xy, EXTEND32(X[best_id]));
+      /* We're multiplying y[j] by two so we don't have to do it here */
+      yy = ADD16(yy, y[best_id]);
+
+      /* Only now that we've made the final choice, update y/iy */
+      /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
+      y[best_id] += 2*s;
+      iy[best_id]++;
+   }
+
+   /* Put the original sign back */
+   j=0;
+   do {
+      X[j] = MULT16_16(signx[j],X[j]);
+      if (signx[j] < 0)
+         iy[j] = -iy[j];
+   } while (++j<N);
+   encode_pulses(iy, N, K, enc);
+
+#ifdef RESYNTH
+   normalise_residual(iy, X, N, yy, gain);
+   exp_rotation(X, N, -1, B, K, spread);
+#endif
+
+   collapse_mask = extract_collapse_mask(iy, N, B);
+   RESTORE_STACK;
+   return collapse_mask;
+}
+
+/** Decode pulse vector and combine the result with the pitch vector to produce
+    the final normalised signal in the current band. */
+unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
+      ec_dec *dec, opus_val16 gain)
+{
+   int i;
+   opus_val32 Ryy;
+   unsigned collapse_mask;
+   VARDECL(int, iy);
+   SAVE_STACK;
+
+   celt_assert2(K>0, "alg_unquant() needs at least one pulse");
+   celt_assert2(N>1, "alg_unquant() needs at least two dimensions");
+   ALLOC(iy, N, int);
+   decode_pulses(iy, N, K, dec);
+   Ryy = 0;
+   i=0;
+   do {
+      Ryy = MAC16_16(Ryy, iy[i], iy[i]);
+   } while (++i < N);
+   normalise_residual(iy, X, N, Ryy, gain);
+   exp_rotation(X, N, -1, B, K, spread);
+   collapse_mask = extract_collapse_mask(iy, N, B);
+   RESTORE_STACK;
+   return collapse_mask;
+}
+
+void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
+{
+   int i;
+#ifdef OPUS_FIXED_POINT
+   int k;
+#endif
+   opus_val32 E = EPSILON;
+   opus_val16 g;
+   opus_val32 t;
+   celt_norm *xptr = X;
+   for (i=0;i<N;i++)
+   {
+      E = MAC16_16(E, *xptr, *xptr);
+      xptr++;
+   }
+#ifdef OPUS_FIXED_POINT
+   k = celt_ilog2(E)>>1;
+#endif
+   t = VSHR32(E, 2*(k-7));
+   g = MULT16_16_P15(celt_rsqrt_norm(t),gain);
+
+   xptr = X;
+   for (i=0;i<N;i++)
+   {
+      *xptr = EXTRACT16(PSHR32(MULT16_16(g, *xptr), k+1));
+      xptr++;
+   }
+   /*return celt_sqrt(E);*/
+}
+
+int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N)
+{
+   int i;
+   int itheta;
+   opus_val16 mid, side;
+   opus_val32 Emid, Eside;
+
+   Emid = Eside = EPSILON;
+   if (stereo)
+   {
+      for (i=0;i<N;i++)
+      {
+         celt_norm m, s;
+         m = ADD16(SHR16(X[i],1),SHR16(Y[i],1));
+         s = SUB16(SHR16(X[i],1),SHR16(Y[i],1));
+         Emid = MAC16_16(Emid, m, m);
+         Eside = MAC16_16(Eside, s, s);
+      }
+   } else {
+      for (i=0;i<N;i++)
+      {
+         celt_norm m, s;
+         m = X[i];
+         s = Y[i];
+         Emid = MAC16_16(Emid, m, m);
+         Eside = MAC16_16(Eside, s, s);
+      }
+   }
+   mid = celt_sqrt(Emid);
+   side = celt_sqrt(Eside);
+#ifdef OPUS_FIXED_POINT
+   /* 0.63662 = 2/pi */
+   itheta = MULT16_16_Q15(QCONST16(0.63662f,15),celt_atan2p(side, mid));
+#else
+   itheta = (int)floor(.5f+16384*0.63662f*atan2(side,mid));
+#endif
+
+   return itheta;
+}

+ 70 - 0
drivers/opus/celt/vq.h

@@ -0,0 +1,70 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/**
+   @file vq.h
+   @brief Vector quantisation of the residual
+ */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef VQ_H
+#define VQ_H
+
+#include "entenc.h"
+#include "entdec.h"
+#include "opus_modes.h"
+
+/** Algebraic pulse-vector quantiser. The signal x is replaced by the sum of
+  * the pitch and a combination of pulses such that its norm is still equal
+  * to 1. This is the function that will typically require the most CPU.
+ * @param X Residual signal to quantise/encode (returns quantised version)
+ * @param N Number of samples to encode
+ * @param K Number of pulses to use
+ * @param enc Entropy encoder state
+ * @ret A mask indicating which blocks in the band received pulses
+*/
+unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B,
+      ec_enc *enc
+#ifdef RESYNTH
+      , opus_val16 gain
+#endif
+      );
+
+/** Algebraic pulse decoder
+ * @param X Decoded normalised spectrum (returned)
+ * @param N Number of samples to decode
+ * @param K Number of pulses to use
+ * @param dec Entropy decoder state
+ * @ret A mask indicating which blocks in the band received pulses
+ */
+unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
+      ec_dec *dec, opus_val16 gain);
+
+void renormalise_vector(celt_norm *X, int N, opus_val16 gain);
+
+int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N);
+
+#endif /* VQ_H */

+ 156 - 0
drivers/opus/celt/x86/pitch_sse.h

@@ -0,0 +1,156 @@
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
+/**
+   @file pitch_sse.h
+   @brief Pitch analysis
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_SSE_H
+#define PITCH_SSE_H
+
+#include <xmmintrin.h>
+#include "arch.h"
+
+#define OVERRIDE_XCORR_KERNEL
+static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+{
+   int j;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_loadu_ps(sum);
+   xsum2 = _mm_setzero_ps();
+
+   for (j = 0; j < len-3; j += 4)
+   {
+      __m128 x0 = _mm_loadu_ps(x+j);
+      __m128 yj = _mm_loadu_ps(y+j);
+      __m128 y3 = _mm_loadu_ps(y+j+3);
+
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
+                                          _mm_shuffle_ps(yj,y3,0x49)));
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
+                                          _mm_shuffle_ps(yj,y3,0x9e)));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
+   }
+   if (j < len)
+   {
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+      if (++j < len)
+      {
+         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         if (++j < len)
+         {
+            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         }
+      }
+   }
+   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
+}
+
+#define OVERRIDE_DUAL_INNER_PROD
+static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_setzero_ps();
+   xsum2 = _mm_setzero_ps();
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 y1i = _mm_loadu_ps(y01+i);
+      __m128 y2i = _mm_loadu_ps(y02+i);
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
+   }
+   /* Horizontal sum */
+   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
+   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
+   _mm_store_ss(xy1, xsum1);
+   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
+   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
+   _mm_store_ss(xy2, xsum2);
+   for (;i<N;i++)
+   {
+      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
+      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
+   }
+}
+
+#define OVERRIDE_COMB_FILTER_CONST
+static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10);
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
+   x0v = _mm_loadu_ps(&x[-T-2]);
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2];
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4);
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
+      x0v=x4v;
+      _mm_storeu_ps(y+i, yi);
+   }
+#ifdef CUSTOM_MODES
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x[i-T])
+               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+#endif
+}
+
+#endif

+ 3391 - 0
drivers/opus/http.c

@@ -0,0 +1,3391 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE libopusfile SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE libopusfile SOURCE CODE IS (C) COPYRIGHT 2012                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************/
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "internal.h"
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <string.h>
+
+/*RFCs referenced in this file:
+  RFC  761: DOD Standard Transmission Control Protocol
+  RFC 1535: A Security Problem and Proposed Correction With Widely Deployed DNS
+   Software
+  RFC 1738: Uniform Resource Locators (URL)
+  RFC 1945: Hypertext Transfer Protocol -- HTTP/1.0
+  RFC 2068: Hypertext Transfer Protocol -- HTTP/1.1
+  RFC 2145: Use and Interpretation of HTTP Version Numbers
+  RFC 2246: The TLS Protocol Version 1.0
+  RFC 2459: Internet X.509 Public Key Infrastructure Certificate and
+   Certificate Revocation List (CRL) Profile
+  RFC 2616: Hypertext Transfer Protocol -- HTTP/1.1
+  RFC 2617: HTTP Authentication: Basic and Digest Access Authentication
+  RFC 2817: Upgrading to TLS Within HTTP/1.1
+  RFC 2818: HTTP Over TLS
+  RFC 3492: Punycode: A Bootstring encoding of Unicode for Internationalized
+   Domain Names in Applications (IDNA)
+  RFC 3986: Uniform Resource Identifier (URI): Generic Syntax
+  RFC 3987: Internationalized Resource Identifiers (IRIs)
+  RFC 4343: Domain Name System (DNS) Case Insensitivity Clarification
+  RFC 5894: Internationalized Domain Names for Applications (IDNA):
+   Background, Explanation, and Rationale
+  RFC 6066: Transport Layer Security (TLS) Extensions: Extension Definitions
+  RFC 6125: Representation and Verification of Domain-Based Application Service
+   Identity within Internet Public Key Infrastructure Using X.509 (PKIX)
+   Certificates in the Context of Transport Layer Security (TLS)
+  RFC 6555: Happy Eyeballs: Success with Dual-Stack Hosts*/
+
+typedef struct OpusParsedURL   OpusParsedURL;
+typedef struct OpusStringBuf   OpusStringBuf;
+typedef struct OpusHTTPConn    OpusHTTPConn;
+typedef struct OpusHTTPStream  OpusHTTPStream;
+
+static char *op_string_range_dup(const char *_start,const char *_end){
+  size_t  len;
+  char   *ret;
+  OP_ASSERT(_start<=_end);
+  len=_end-_start;
+  /*This is to help avoid overflow elsewhere, later.*/
+  if(OP_UNLIKELY(len>=INT_MAX))return NULL;
+  ret=(char *)_ogg_malloc(sizeof(*ret)*(len+1));
+  if(OP_LIKELY(ret!=NULL)){
+    ret=(char *)memcpy(ret,_start,sizeof(*ret)*(len));
+    ret[len]='\0';
+  }
+  return ret;
+}
+
+static char *op_string_dup(const char *_s){
+  return op_string_range_dup(_s,_s+strlen(_s));
+}
+
+static char *op_string_tolower(char *_s){
+  int i;
+  for(i=0;_s[i]!='\0';i++){
+    int c;
+    c=_s[i];
+    if(c>='A'&&c<='Z')c+='a'-'A';
+    _s[i]=(char)c;
+  }
+  return _s;
+}
+
+/*URI character classes (from RFC 3986).*/
+#define OP_URL_ALPHA \
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+#define OP_URL_DIGIT       "0123456789"
+#define OP_URL_HEXDIGIT    "0123456789ABCDEFabcdef"
+/*Not a character class, but the characters allowed in <scheme>.*/
+#define OP_URL_SCHEME      OP_URL_ALPHA OP_URL_DIGIT "+-."
+#define OP_URL_GEN_DELIMS  "#/:?@[]"
+#define OP_URL_SUB_DELIMS  "!$&'()*+,;="
+#define OP_URL_RESERVED    OP_URL_GEN_DELIMS OP_URL_SUB_DELIMS
+#define OP_URL_UNRESERVED  OP_URL_ALPHA OP_URL_DIGIT "-._~"
+/*Not a character class, but the characters allowed in <pct-encoded>.*/
+#define OP_URL_PCT_ENCODED "%"
+/*Not a character class or production rule, but for convenience.*/
+#define OP_URL_PCHAR_BASE \
+ OP_URL_UNRESERVED OP_URL_PCT_ENCODED OP_URL_SUB_DELIMS
+#define OP_URL_PCHAR       OP_URL_PCHAR_BASE ":@"
+/*Not a character class, but the characters allowed in <userinfo> and
+   <IP-literal>.*/
+#define OP_URL_PCHAR_NA    OP_URL_PCHAR_BASE ":"
+/*Not a character class, but the characters allowed in <segment-nz-nc>.*/
+#define OP_URL_PCHAR_NC    OP_URL_PCHAR_BASE "@"
+/*Not a character clsss, but the characters allowed in <path>.*/
+#define OP_URL_PATH        OP_URL_PCHAR "/"
+/*Not a character class, but the characters allowed in <query> / <fragment>.*/
+#define OP_URL_QUERY_FRAG  OP_URL_PCHAR "/?"
+
+/*Check the <% HEXDIG HEXDIG> escapes of a URL for validity.
+  Return: 0 if valid, or a negative value on failure.*/
+static int op_validate_url_escapes(const char *_s){
+  int i;
+  for(i=0;_s[i];i++){
+    if(_s[i]=='%'){
+      if(OP_UNLIKELY(!isxdigit(_s[i+1]))
+       ||OP_UNLIKELY(!isxdigit(_s[i+2]))
+       /*RFC 3986 says %00 "should be rejected if the application is not
+          expecting to receive raw data within a component."*/
+       ||OP_UNLIKELY(_s[i+1]=='0'&&_s[i+2]=='0')){
+        return OP_FALSE;
+      }
+      i+=2;
+    }
+  }
+  return 0;
+}
+
+/*Convert a hex digit to its actual value.
+  _c: The hex digit to convert.
+      Presumed to be valid ('0'...'9', 'A'...'F', or 'a'...'f').
+  Return: The value of the digit, in the range [0,15].*/
+static int op_hex_value(int _c){
+  return _c>='a'?_c-'a'+10:_c>='A'?_c-'A'+10:_c-'0';
+}
+
+/*Unescape all the <% HEXDIG HEXDIG> sequences in a string in-place.
+  This does no validity checking.*/
+static char *op_unescape_url_component(char *_s){
+  int i;
+  int j;
+  for(i=j=0;_s[i];i++,j++){
+    if(_s[i]=='%'){
+      _s[i]=(char)(op_hex_value(_s[i+1])<<4|op_hex_value(_s[i+2]));
+      i+=2;
+    }
+  }
+  return _s;
+}
+
+/*Parse a file: URL.
+  This code is not meant to be fast: strspn() with large sets is likely to be
+   slow, but it is very convenient.
+  It is meant to be RFC 1738-compliant (as updated by RFC 3986).*/
+static const char *op_parse_file_url(const char *_src){
+  const char *scheme_end;
+  const char *path;
+  const char *path_end;
+  scheme_end=_src+strspn(_src,OP_URL_SCHEME);
+  if(OP_UNLIKELY(*scheme_end!=':')
+   ||scheme_end-_src!=4||op_strncasecmp(_src,"file",4)!=0){
+    /*Unsupported protocol.*/
+    return NULL;
+  }
+  /*Make sure all escape sequences are valid to simplify unescaping later.*/
+  if(OP_UNLIKELY(op_validate_url_escapes(scheme_end+1)<0))return NULL;
+  if(scheme_end[1]=='/'&&scheme_end[2]=='/'){
+    const char *host;
+    /*file: URLs can have a host!
+      Yeah, I was surprised, too, but that's what RFC 1738 says.
+      It also says, "The file URL scheme is unusual in that it does not specify
+       an Internet protocol or access method for such files; as such, its
+       utility in network protocols between hosts is limited," which is a mild
+       understatement.*/
+    host=scheme_end+3;
+    /*The empty host is what we expect.*/
+    if(OP_LIKELY(*host=='/'))path=host;
+    else{
+      const char *host_end;
+      char        host_buf[28];
+      /*RFC 1738 says localhost "is interpreted as `the machine from which the
+         URL is being interpreted,'" so let's check for it.*/
+      host_end=host+strspn(host,OP_URL_PCHAR_BASE);
+      /*No <port> allowed.
+        This also rejects IP-Literals.*/
+      if(*host_end!='/')return NULL;
+      /*An escaped "localhost" can take at most 27 characters.*/
+      if(OP_UNLIKELY(host_end-host>27))return NULL;
+      memcpy(host_buf,host,sizeof(*host_buf)*(host_end-host));
+      host_buf[host_end-host]='\0';
+      op_unescape_url_component(host_buf);
+      op_string_tolower(host_buf);
+      /*Some other host: give up.*/
+      if(OP_UNLIKELY(strcmp(host_buf,"localhost")!=0))return NULL;
+      path=host_end;
+    }
+  }
+  else path=scheme_end+1;
+  path_end=path+strspn(path,OP_URL_PATH);
+  /*This will reject a <query> or <fragment> component, too.
+    I don't know what to do with queries, but a temporal fragment would at
+     least make sense.
+    RFC 1738 pretty clearly defines a <searchpart> that's equivalent to the
+     RFC 3986 <query> component for other schemes, but not the file: scheme,
+     so I'm going to just reject it.*/
+  if(*path_end!='\0')return NULL;
+  return path;
+}
+
+#if defined(OP_ENABLE_HTTP)
+# if defined(_WIN32)
+#  include <winsock2.h>
+#  include <ws2tcpip.h>
+#  include <openssl/ssl.h>
+#  include "winerrno.h"
+
+typedef SOCKET op_sock;
+
+#  define OP_INVALID_SOCKET (INVALID_SOCKET)
+
+/*Vista and later support WSAPoll(), but we don't want to rely on that.
+  Instead we re-implement it badly using select().
+  Unfortunately, they define a conflicting struct pollfd, so we only define our
+   own if it looks like that one has not already been defined.*/
+#  if !defined(POLLIN)
+/*Equivalent to POLLIN.*/
+#   define POLLRDNORM (0x0100)
+/*Priority band data can be read.*/
+#   define POLLRDBAND (0x0200)
+/*There is data to read.*/
+#   define POLLIN     (POLLRDNORM|POLLRDBAND)
+/* There is urgent data to read.*/
+#   define POLLPRI    (0x0400)
+/*Equivalent to POLLOUT.*/
+#   define POLLWRNORM (0x0010)
+/*Writing now will not block.*/
+#   define POLLOUT    (POLLWRNORM)
+/*Priority data may be written.*/
+#   define POLLWRBAND (0x0020)
+/*Error condition (output only).*/
+#   define POLLERR    (0x0001)
+/*Hang up (output only).*/
+#   define POLLHUP    (0x0002)
+/*Invalid request: fd not open (output only).*/
+#   define POLLNVAL   (0x0004)
+
+struct pollfd{
+  /*File descriptor.*/
+  op_sock fd;
+  /*Requested events.*/
+  short   events;
+  /*Returned events.*/
+  short   revents;
+};
+#  endif
+
+/*But Winsock never defines nfds_t (it's simply hard-coded to ULONG).*/
+typedef unsigned long nfds_t;
+
+/*The usage of FD_SET() below is O(N^2).
+  This is okay because select() is limited to 64 sockets in Winsock, anyway.
+  In practice, we only ever call it with one or two sockets.*/
+static int op_poll_win32(struct pollfd *_fds,nfds_t _nfds,int _timeout){
+  struct timeval tv;
+  fd_set         ifds;
+  fd_set         ofds;
+  fd_set         efds;
+  nfds_t         i;
+  int            ret;
+  FD_ZERO(&ifds);
+  FD_ZERO(&ofds);
+  FD_ZERO(&efds);
+  for(i=0;i<_nfds;i++){
+    _fds[i].revents=0;
+    if(_fds[i].events&POLLIN)FD_SET(_fds[i].fd,&ifds);
+    if(_fds[i].events&POLLOUT)FD_SET(_fds[i].fd,&ofds);
+    FD_SET(_fds[i].fd,&efds);
+  }
+  if(_timeout>=0){
+    tv.tv_sec=_timeout/1000;
+    tv.tv_usec=(_timeout%1000)*1000;
+  }
+  ret=select(-1,&ifds,&ofds,&efds,_timeout<0?NULL:&tv);
+  if(ret>0){
+    for(i=0;i<_nfds;i++){
+      if(FD_ISSET(_fds[i].fd,&ifds))_fds[i].revents|=POLLIN;
+      if(FD_ISSET(_fds[i].fd,&ofds))_fds[i].revents|=POLLOUT;
+      /*This isn't correct: there are several different things that might have
+         happened to a fd in efds, but I don't know a good way to distinguish
+         them without more context from the caller.
+        It's okay, because we don't actually check any of these bits, we just
+         need _some_ bit set.*/
+      if(FD_ISSET(_fds[i].fd,&efds))_fds[i].revents|=POLLHUP;
+    }
+  }
+  return ret;
+}
+
+/*We define op_errno() to make it clear that it's not an l-value like normal
+   errno is.*/
+#  define op_errno() (WSAGetLastError()?WSAGetLastError()-WSABASEERR:0)
+#  define op_reset_errno() (WSASetLastError(0))
+
+/*The remaining functions don't get an op_ prefix even though they only
+   operate on sockets, because we don't use non-socket I/O here, and this
+   minimizes the changes needed to deal with Winsock.*/
+#  define close(_fd) closesocket(_fd)
+/*This relies on sizeof(u_long)==sizeof(int), which is always true on both
+   Win32 and Win64.*/
+#  define ioctl(_fd,_req,_arg) ioctlsocket(_fd,_req,(u_long *)(_arg))
+#  define getsockopt(_fd,_level,_name,_val,_len) \
+ getsockopt(_fd,_level,_name,(char *)(_val),_len)
+#  define setsockopt(_fd,_level,_name,_val,_len) \
+ setsockopt(_fd,_level,_name,(const char *)(_val),_len)
+#  define poll(_fds,_nfds,_timeout) op_poll_win32(_fds,_nfds,_timeout)
+
+#  if defined(_MSC_VER)
+typedef ptrdiff_t ssize_t;
+#  endif
+
+/*Load certificates from the built-in certificate store.*/
+int SSL_CTX_set_default_verify_paths_win32(SSL_CTX *_ssl_ctx);
+#  define SSL_CTX_set_default_verify_paths \
+ SSL_CTX_set_default_verify_paths_win32
+
+# else
+/*Normal Berkeley sockets.*/
+#  include <sys/ioctl.h>
+#  include <sys/types.h>
+#  include <sys/socket.h>
+#  include <arpa/inet.h>
+#  include <netinet/in.h>
+#  include <netinet/tcp.h>
+#  include <fcntl.h>
+#  include <netdb.h>
+#  include <poll.h>
+#  include <unistd.h>
+#  include <openssl/ssl.h>
+
+typedef int op_sock;
+
+#  define OP_INVALID_SOCKET (-1)
+
+#  define op_errno() (errno)
+#  define op_reset_errno() (errno=0)
+
+# endif
+# include <sys/timeb.h>
+# include <openssl/x509v3.h>
+
+/*The maximum number of simultaneous connections.
+  RFC 2616 says this SHOULD NOT be more than 2, but everyone on the modern web
+   ignores that (e.g., IE 8 bumped theirs up from 2 to 6, Firefox uses 15).
+  If it makes you feel better, we'll only ever actively read from one of these
+   at a time.
+  The others are kept around mainly to avoid slow-starting a new connection
+   when seeking, and time out rapidly.*/
+# define OP_NCONNS_MAX (4)
+
+/*The amount of time before we attempt to re-resolve the host.
+  This is 10 minutes, as recommended in RFC 6555 for expiring cached connection
+   results for dual-stack hosts.*/
+# define OP_RESOLVE_CACHE_TIMEOUT_MS (10*60*(opus_int32)1000)
+
+/*The number of redirections at which we give up.
+  The value here is the current default in Firefox.
+  RFC 2068 mandated a maximum of 5, but RFC 2616 relaxed that to "a client
+   SHOULD detect infinite redirection loops."
+  Fortunately, 20 is less than infinity.*/
+# define OP_REDIRECT_LIMIT (20)
+
+/*The initial size of the buffer used to read a response message (before the
+   body).*/
+# define OP_RESPONSE_SIZE_MIN (510)
+/*The maximum size of a response message (before the body).
+  Responses larger than this will be discarded.
+  I've seen a real server return 20 kB of data for a 302 Found response.
+  Increasing this beyond 32kB will cause problems on platforms with a 16-bit
+   int.*/
+# define OP_RESPONSE_SIZE_MAX (32766)
+
+/*The number of milliseconds we will allow a connection to sit idle before we
+   refuse to resurrect it.
+  Apache as of 2.2 has reduced its default timeout to 5 seconds (from 15), so
+   that's what we'll use here.*/
+# define OP_CONNECTION_IDLE_TIMEOUT_MS (5*1000)
+
+/*The number of milliseconds we will wait to send or receive data before giving
+   up.*/
+# define OP_POLL_TIMEOUT_MS (30*1000)
+
+/*We will always attempt to read ahead at least this much in preference to
+   opening a new connection.*/
+# define OP_READAHEAD_THRESH_MIN (32*(opus_int32)1024)
+
+/*The amount of data to request after a seek.
+  This is a trade-off between read throughput after a seek vs. the the ability
+   to quickly perform another seek with the same connection.*/
+# define OP_PIPELINE_CHUNK_SIZE     (32*(opus_int32)1024)
+/*Subsequent chunks are requested with larger and larger sizes until they pass
+   this threshold, after which we just ask for the rest of the resource.*/
+# define OP_PIPELINE_CHUNK_SIZE_MAX (1024*(opus_int32)1024)
+/*This is the maximum number of requests we'll make with a single connection.
+  Many servers will simply disconnect after we attempt some number of requests,
+   possibly without sending a Connection: close header, meaning we won't
+   discover it until we try to read beyond the end of the current chunk.
+  We can reconnect when that happens, but this is slow.
+  Instead, we impose a limit ourselves (set to the default for Apache
+   installations and thus likely the most common value in use).*/
+# define OP_PIPELINE_MAX_REQUESTS   (100)
+/*This should be the number of requests, starting from a chunk size of
+   OP_PIPELINE_CHUNK_SIZE and doubling each time, until we exceed
+   OP_PIPELINE_CHUNK_SIZE_MAX and just request the rest of the file.
+  We won't reuse a connection when seeking unless it has at least this many
+   requests left, to reduce the chances we'll have to open a new connection
+   while reading forward afterwards.*/
+# define OP_PIPELINE_MIN_REQUESTS   (7)
+
+/*Is this an https URL?
+  For now we can simply check the last letter of the scheme.*/
+# define OP_URL_IS_SSL(_url) ((_url)->scheme[4]=='s')
+
+/*Does this URL use the default port for its scheme?*/
+# define OP_URL_IS_DEFAULT_PORT(_url) \
+ (!OP_URL_IS_SSL(_url)&&(_url)->port==80 \
+ ||OP_URL_IS_SSL(_url)&&(_url)->port==443)
+
+struct OpusParsedURL{
+  /*Either "http" or "https".*/
+  char     *scheme;
+  /*The user name from the <userinfo> component, or NULL.*/
+  char     *user;
+  /*The password from the <userinfo> component, or NULL.*/
+  char     *pass;
+  /*The <host> component.
+    This may not be NULL.*/
+  char     *host;
+  /*The <path> and <query> components.
+    This may not be NULL.*/
+  char     *path;
+  /*The <port> component.
+    This is set to the default port if the URL did not contain one.*/
+  unsigned  port;
+};
+
+/*Parse a URL.
+  This code is not meant to be fast: strspn() with large sets is likely to be
+   slow, but it is very convenient.
+  It is meant to be RFC 3986-compliant.
+  We currently do not support IRIs (Internationalized Resource Identifiers,
+   RFC 3987).
+  Callers should translate them to URIs first.*/
+static int op_parse_url_impl(OpusParsedURL *_dst,const char *_src){
+  const char  *scheme_end;
+  const char  *authority;
+  const char  *userinfo_end;
+  const char  *user;
+  const char  *user_end;
+  const char  *pass;
+  const char  *hostport;
+  const char  *hostport_end;
+  const char  *host_end;
+  const char  *port;
+  opus_int32   port_num;
+  const char  *port_end;
+  const char  *path;
+  const char  *path_end;
+  const char  *uri_end;
+  scheme_end=_src+strspn(_src,OP_URL_SCHEME);
+  if(OP_UNLIKELY(*scheme_end!=':')
+   ||OP_UNLIKELY(scheme_end-_src<4)||OP_UNLIKELY(scheme_end-_src>5)
+   ||OP_UNLIKELY(op_strncasecmp(_src,"https",scheme_end-_src)!=0)){
+    /*Unsupported protocol.*/
+    return OP_EIMPL;
+  }
+  if(OP_UNLIKELY(scheme_end[1]!='/')||OP_UNLIKELY(scheme_end[2]!='/')){
+    /*We require an <authority> component.*/
+    return OP_EINVAL;
+  }
+  authority=scheme_end+3;
+  /*Make sure all escape sequences are valid to simplify unescaping later.*/
+  if(OP_UNLIKELY(op_validate_url_escapes(authority)<0))return OP_EINVAL;
+  /*Look for a <userinfo> component.*/
+  userinfo_end=authority+strspn(authority,OP_URL_PCHAR_NA);
+  if(*userinfo_end=='@'){
+    /*Found one.*/
+    user=authority;
+    /*Look for a password (yes, clear-text passwords are deprecated, I know,
+       but what else are people supposed to use? use SSL if you care).*/
+    user_end=authority+strspn(authority,OP_URL_PCHAR_BASE);
+    if(*user_end==':')pass=user_end+1;
+    else pass=NULL;
+    hostport=userinfo_end+1;
+  }
+  else{
+    /*We shouldn't have to initialize user_end, but gcc is too dumb to figure
+       out that user!=NULL below means we didn't take this else branch.*/
+    user=user_end=NULL;
+    pass=NULL;
+    hostport=authority;
+  }
+  /*Try to figure out where the <host> component ends.*/
+  if(hostport[0]=='['){
+    hostport++;
+    /*We have an <IP-literal>, which can contain colons.*/
+    hostport_end=host_end=hostport+strspn(hostport,OP_URL_PCHAR_NA);
+    if(OP_UNLIKELY(*hostport_end++!=']'))return OP_EINVAL;
+  }
+  /*Currently we don't support IDNA (RFC 5894), because I don't want to deal
+     with the policy about which domains should not be internationalized to
+     avoid confusing similarities.
+    Give this API Punycode (RFC 3492) domain names instead.*/
+  else hostport_end=host_end=hostport+strspn(hostport,OP_URL_PCHAR_BASE);
+  /*TODO: Validate host.*/
+  /*Is there a port number?*/
+  port_num=-1;
+  if(*hostport_end==':'){
+    int i;
+    port=hostport_end+1;
+    port_end=port+strspn(port,OP_URL_DIGIT);
+    path=port_end;
+    /*Not part of RFC 3986, but require port numbers in the range 0...65535.*/
+    if(OP_LIKELY(port_end-port>0)){
+      while(*port=='0')port++;
+      if(OP_UNLIKELY(port_end-port>5))return OP_EINVAL;
+      port_num=0;
+      for(i=0;i<port_end-port;i++)port_num=port_num*10+port[i]-'0';
+      if(OP_UNLIKELY(port_num>65535))return OP_EINVAL;
+    }
+  }
+  else path=hostport_end;
+  path_end=path+strspn(path,OP_URL_PATH);
+  /*If the path is not empty, it must begin with a '/'.*/
+  if(OP_LIKELY(path_end>path)&&OP_UNLIKELY(path[0]!='/'))return OP_EINVAL;
+  /*Consume the <query> component, if any (right now we don't split this out
+     from the <path> component).*/
+  if(*path_end=='?')path_end=path_end+strspn(path_end,OP_URL_QUERY_FRAG);
+  /*Discard the <fragment> component, if any.
+    This doesn't get sent to the server.
+    Some day we should add support for Media Fragment URIs
+     <http://www.w3.org/TR/media-frags/>.*/
+  if(*path_end=='#')uri_end=path_end+1+strspn(path_end+1,OP_URL_QUERY_FRAG);
+  else uri_end=path_end;
+  /*If there's anything left, this was not a valid URL.*/
+  if(OP_UNLIKELY(*uri_end!='\0'))return OP_EINVAL;
+  _dst->scheme=op_string_range_dup(_src,scheme_end);
+  if(OP_UNLIKELY(_dst->scheme==NULL))return OP_EFAULT;
+  op_string_tolower(_dst->scheme);
+  if(user!=NULL){
+    _dst->user=op_string_range_dup(user,user_end);
+    if(OP_UNLIKELY(_dst->user==NULL))return OP_EFAULT;
+    op_unescape_url_component(_dst->user);
+    /*Unescaping might have created a ':' in the username.
+      That's not allowed by RFC 2617's Basic Authentication Scheme.*/
+    if(OP_UNLIKELY(strchr(_dst->user,':')!=NULL))return OP_EINVAL;
+  }
+  else _dst->user=NULL;
+  if(pass!=NULL){
+    _dst->pass=op_string_range_dup(pass,userinfo_end);
+    if(OP_UNLIKELY(_dst->pass==NULL))return OP_EFAULT;
+    op_unescape_url_component(_dst->pass);
+  }
+  else _dst->pass=NULL;
+  _dst->host=op_string_range_dup(hostport,host_end);
+  if(OP_UNLIKELY(_dst->host==NULL))return OP_EFAULT;
+  if(port_num<0){
+    if(_src[4]=='s')port_num=443;
+    else port_num=80;
+  }
+  _dst->port=(unsigned)port_num;
+  /*RFC 2616 says an empty <abs-path> component is equivalent to "/", and we
+     MUST use the latter in the Request-URI.
+    Reserve space for the slash here.*/
+  if(path==path_end||path[0]=='?')path--;
+  _dst->path=op_string_range_dup(path,path_end);
+  if(OP_UNLIKELY(_dst->path==NULL))return OP_EFAULT;
+  /*And force-set it here.*/
+  _dst->path[0]='/';
+  return 0;
+}
+
+static void op_parsed_url_init(OpusParsedURL *_url){
+  memset(_url,0,sizeof(*_url));
+}
+
+static void op_parsed_url_clear(OpusParsedURL *_url){
+  _ogg_free(_url->scheme);
+  _ogg_free(_url->user);
+  _ogg_free(_url->pass);
+  _ogg_free(_url->host);
+  _ogg_free(_url->path);
+}
+
+static int op_parse_url(OpusParsedURL *_dst,const char *_src){
+  OpusParsedURL url;
+  int           ret;
+  op_parsed_url_init(&url);
+  ret=op_parse_url_impl(&url,_src);
+  if(OP_UNLIKELY(ret<0))op_parsed_url_clear(&url);
+  else *_dst=*&url;
+  return ret;
+}
+
+/*A buffer to hold growing strings.
+  The main purpose of this is to consolidate allocation checks and simplify
+   cleanup on a failed allocation.*/
+struct OpusStringBuf{
+  char *buf;
+  int   nbuf;
+  int   cbuf;
+};
+
+static void op_sb_init(OpusStringBuf *_sb){
+  _sb->buf=NULL;
+  _sb->nbuf=0;
+  _sb->cbuf=0;
+}
+
+static void op_sb_clear(OpusStringBuf *_sb){
+  _ogg_free(_sb->buf);
+}
+
+/*Make sure we have room for at least _capacity characters (plus 1 more for the
+   terminating NUL).*/
+static int op_sb_ensure_capacity(OpusStringBuf *_sb,int _capacity){
+  char *buf;
+  int   cbuf;
+  buf=_sb->buf;
+  cbuf=_sb->cbuf;
+  if(_capacity>=cbuf-1){
+    if(OP_UNLIKELY(cbuf>INT_MAX-1>>1))return OP_EFAULT;
+    if(OP_UNLIKELY(_capacity>=INT_MAX-1))return OP_EFAULT;
+    cbuf=OP_MAX(2*cbuf+1,_capacity+1);
+    buf=_ogg_realloc(buf,sizeof(*buf)*cbuf);
+    if(OP_UNLIKELY(buf==NULL))return OP_EFAULT;
+    _sb->buf=buf;
+    _sb->cbuf=cbuf;
+  }
+  return 0;
+}
+
+/*Increase the capacity of the buffer, but not to more than _max_size
+   characters (plus 1 more for the terminating NUL).*/
+static int op_sb_grow(OpusStringBuf *_sb,int _max_size){
+  char *buf;
+  int   cbuf;
+  buf=_sb->buf;
+  cbuf=_sb->cbuf;
+  OP_ASSERT(_max_size<=INT_MAX-1);
+  cbuf=cbuf<=_max_size-1>>1?2*cbuf+1:_max_size+1;
+  buf=_ogg_realloc(buf,sizeof(*buf)*cbuf);
+  if(OP_UNLIKELY(buf==NULL))return OP_EFAULT;
+  _sb->buf=buf;
+  _sb->cbuf=cbuf;
+  return 0;
+}
+
+static int op_sb_append(OpusStringBuf *_sb,const char *_s,int _len){
+  char *buf;
+  int   nbuf;
+  int   ret;
+  nbuf=_sb->nbuf;
+  if(OP_UNLIKELY(nbuf>INT_MAX-_len))return OP_EFAULT;
+  ret=op_sb_ensure_capacity(_sb,nbuf+_len);
+  if(OP_UNLIKELY(ret<0))return ret;
+  buf=_sb->buf;
+  memcpy(buf+nbuf,_s,sizeof(*buf)*_len);
+  nbuf+=_len;
+  buf[nbuf]='\0';
+  _sb->nbuf=nbuf;
+  return 0;
+}
+
+static int op_sb_append_string(OpusStringBuf *_sb,const char *_s){
+  return op_sb_append(_sb,_s,strlen(_s));
+}
+
+static int op_sb_append_port(OpusStringBuf *_sb,unsigned _port){
+  char port_buf[7];
+  OP_ASSERT(_port<=65535U);
+  sprintf(port_buf,":%u",_port);
+  return op_sb_append_string(_sb,port_buf);
+}
+
+static int op_sb_append_nonnegative_int64(OpusStringBuf *_sb,opus_int64 _i){
+  char digit;
+  int  nbuf_start;
+  int  ret;
+  OP_ASSERT(_i>=0);
+  nbuf_start=_sb->nbuf;
+  ret=0;
+  do{
+    digit='0'+_i%10;
+    ret|=op_sb_append(_sb,&digit,1);
+    _i/=10;
+  }
+  while(_i>0);
+  if(OP_LIKELY(ret>=0)){
+    char *buf;
+    int   nbuf_end;
+    buf=_sb->buf;
+    nbuf_end=_sb->nbuf-1;
+    /*We've added the digits backwards.
+      Reverse them.*/
+    while(nbuf_start<nbuf_end){
+      digit=buf[nbuf_start];
+      buf[nbuf_start]=buf[nbuf_end];
+      buf[nbuf_end]=digit;
+      nbuf_start++;
+      nbuf_end--;
+    }
+  }
+  return ret;
+}
+
+static struct addrinfo *op_resolve(const char *_host,unsigned _port){
+  struct addrinfo *addrs;
+  struct addrinfo  hints;
+  char             service[6];
+  memset(&hints,0,sizeof(hints));
+  hints.ai_socktype=SOCK_STREAM;
+#if !defined(_WIN32)
+  hints.ai_flags=AI_NUMERICSERV;
+#endif
+  OP_ASSERT(_port<=65535U);
+  sprintf(service,"%u",_port);
+  if(OP_LIKELY(!getaddrinfo(_host,service,&hints,&addrs)))return addrs;
+  return NULL;
+}
+
+static int op_sock_set_nonblocking(op_sock _fd,int _nonblocking){
+#if !defined(_WIN32)
+  int flags;
+  flags=fcntl(_fd,F_GETFL);
+  if(OP_UNLIKELY(flags<0))return flags;
+  if(_nonblocking)flags|=O_NONBLOCK;
+  else flags&=~O_NONBLOCK;
+  return fcntl(_fd,F_SETFL,flags);
+#else
+  return ioctl(_fd,FIONBIO,&_nonblocking);
+#endif
+}
+
+/*Disable/enable write coalescing if we can.
+  We always send whole requests at once and always parse the response headers
+   before sending another one, so normally write coalescing just causes added
+   delay.*/
+static void op_sock_set_tcp_nodelay(op_sock _fd,int _nodelay){
+# if defined(TCP_NODELAY)&&(defined(IPPROTO_TCP)||defined(SOL_TCP))
+#  if defined(IPPROTO_TCP)
+#   define OP_SO_LEVEL IPPROTO_TCP
+#  else
+#   define OP_SO_LEVEL SOL_TCP
+#  endif
+  /*It doesn't really matter if this call fails, but it would be interesting
+     to hit a case where it does.*/
+  OP_ALWAYS_TRUE(!setsockopt(_fd,OP_SO_LEVEL,TCP_NODELAY,
+   &_nodelay,sizeof(_nodelay)));
+# endif
+}
+
+#if defined(_WIN32)
+static void op_init_winsock(){
+  static LONG    count;
+  static WSADATA wsadata;
+  if(InterlockedIncrement(&count)==1)WSAStartup(0x0202,&wsadata);
+}
+#endif
+
+/*A single physical connection to an HTTP server.
+  We may have several of these open at once.*/
+struct OpusHTTPConn{
+  /*The current position indicator for this connection.*/
+  opus_int64    pos;
+  /*The position where the current request will end, or -1 if we're reading
+     until EOF (an unseekable stream or the initial HTTP/1.0 request).*/
+  opus_int64    end_pos;
+  /*The position where next request we've sent will start, or -1 if we haven't
+     sent the next request yet.*/
+  opus_int64    next_pos;
+  /*The end of the next request or -1 if we requested the rest of the resource.
+    This is only set to a meaningful value if next_pos is not -1.*/
+  opus_int64    next_end;
+  /*The SSL connection, if this is https.*/
+  SSL          *ssl_conn;
+  /*The next connection in either the LRU or free list.*/
+  OpusHTTPConn *next;
+  /*The last time we blocked for reading from this connection.*/
+  struct timeb  read_time;
+  /*The number of bytes we've read since the last time we blocked.*/
+  opus_int64    read_bytes;
+  /*The estimated throughput of this connection, in bytes/s.*/
+  opus_int64    read_rate;
+  /*The socket we're reading from.*/
+  op_sock       fd;
+  /*The number of remaining requests we are allowed on this connection.*/
+  int           nrequests_left;
+  /*The chunk size to use for pipelining requests.*/
+  opus_int32    chunk_size;
+};
+
+static void op_http_conn_init(OpusHTTPConn *_conn){
+  _conn->next_pos=-1;
+  _conn->ssl_conn=NULL;
+  _conn->next=NULL;
+  _conn->fd=OP_INVALID_SOCKET;
+}
+
+static void op_http_conn_clear(OpusHTTPConn *_conn){
+  if(_conn->ssl_conn!=NULL)SSL_free(_conn->ssl_conn);
+  /*SSL frees the BIO for us.*/
+  if(_conn->fd!=OP_INVALID_SOCKET)close(_conn->fd);
+}
+
+/*The global stream state.*/
+struct OpusHTTPStream{
+  /*The list of connections.*/
+  OpusHTTPConn     conns[OP_NCONNS_MAX];
+  /*The context object used as a framework for TLS/SSL functions.*/
+  SSL_CTX         *ssl_ctx;
+  /*The cached session to reuse for future connections.*/
+  SSL_SESSION     *ssl_session;
+  /*The LRU list (ordered from MRU to LRU) of currently connected
+     connections.*/
+  OpusHTTPConn    *lru_head;
+  /*The free list.*/
+  OpusHTTPConn    *free_head;
+  /*The URL to connect to.*/
+  OpusParsedURL    url;
+  /*Information about the address we connected to.*/
+  struct addrinfo  addr_info;
+  /*The address we connected to.*/
+  union{
+    struct sockaddr     s;
+    struct sockaddr_in  v4;
+    struct sockaddr_in6 v6;
+  }                addr;
+  /*The last time we re-resolved the host.*/
+  struct timeb     resolve_time;
+  /*A buffer used to build HTTP requests.*/
+  OpusStringBuf    request;
+  /*A buffer used to build proxy CONNECT requests.*/
+  OpusStringBuf    proxy_connect;
+  /*A buffer used to receive the response headers.*/
+  OpusStringBuf    response;
+  /*The Content-Length, if specified, or -1 otherwise.
+    This will always be specified for seekable streams.*/
+  opus_int64       content_length;
+  /*The position indicator used when no connection is active.*/
+  opus_int64       pos;
+  /*The host we actually connected to.*/
+  char            *connect_host;
+  /*The port we actually connected to.*/
+  unsigned         connect_port;
+  /*The connection we're currently reading from.
+    This can be -1 if no connection is active.*/
+  int              cur_conni;
+  /*Whether or not the server supports range requests.*/
+  int              seekable;
+  /*Whether or not the server supports HTTP/1.1 with persistent connections.*/
+  int              pipeline;
+  /*Whether or not we should skip certificate checks.*/
+  int              skip_certificate_check;
+  /*The offset of the tail of the request.
+    Only the offset in the Range: header appears after this, allowing us to
+     quickly edit the request to ask for a new range.*/
+  int              request_tail;
+  /*The estimated time required to open a new connection, in milliseconds.*/
+  opus_int32       connect_rate;
+};
+
+static void op_http_stream_init(OpusHTTPStream *_stream){
+  OpusHTTPConn **pnext;
+  int            ci;
+  pnext=&_stream->free_head;
+  for(ci=0;ci<OP_NCONNS_MAX;ci++){
+    op_http_conn_init(_stream->conns+ci);
+    *pnext=_stream->conns+ci;
+    pnext=&_stream->conns[ci].next;
+  }
+  _stream->ssl_ctx=NULL;
+  _stream->ssl_session=NULL;
+  _stream->lru_head=NULL;
+  op_parsed_url_init(&_stream->url);
+  op_sb_init(&_stream->request);
+  op_sb_init(&_stream->proxy_connect);
+  op_sb_init(&_stream->response);
+  _stream->connect_host=NULL;
+  _stream->seekable=0;
+}
+
+/*Close the connection and move it to the free list.
+  _stream:     The stream containing the free list.
+  _conn:       The connection to close.
+  _penxt:      The linked-list pointer currently pointing to this connection.
+  _gracefully: Whether or not to shut down cleanly.*/
+static void op_http_conn_close(OpusHTTPStream *_stream,OpusHTTPConn *_conn,
+ OpusHTTPConn **_pnext,int _gracefully){
+  /*If we don't shut down gracefully, the server MUST NOT re-use our session
+     according to RFC 2246, because it can't tell the difference between an
+     abrupt close and a truncation attack.
+    So we shut down gracefully if we can.
+    However, we will not wait if this would block (it's not worth the savings
+     from session resumption to do so).
+    Clients (that's us) MAY resume a TLS session that ended with an incomplete
+     close, according to RFC 2818, so there's no reason to make sure the server
+     shut things down gracefully.*/
+  if(_gracefully&&_conn->ssl_conn!=NULL)SSL_shutdown(_conn->ssl_conn);
+  op_http_conn_clear(_conn);
+  _conn->next_pos=-1;
+  _conn->ssl_conn=NULL;
+  _conn->fd=OP_INVALID_SOCKET;
+  OP_ASSERT(*_pnext==_conn);
+  *_pnext=_conn->next;
+  _conn->next=_stream->free_head;
+  _stream->free_head=_conn;
+}
+
+static void op_http_stream_clear(OpusHTTPStream *_stream){
+  while(_stream->lru_head!=NULL){
+    op_http_conn_close(_stream,_stream->lru_head,&_stream->lru_head,0);
+  }
+  if(_stream->ssl_session!=NULL)SSL_SESSION_free(_stream->ssl_session);
+  if(_stream->ssl_ctx!=NULL)SSL_CTX_free(_stream->ssl_ctx);
+  op_sb_clear(&_stream->response);
+  op_sb_clear(&_stream->proxy_connect);
+  op_sb_clear(&_stream->request);
+  if(_stream->connect_host!=_stream->url.host)_ogg_free(_stream->connect_host);
+  op_parsed_url_clear(&_stream->url);
+}
+
+static int op_http_conn_write_fully(OpusHTTPConn *_conn,
+ const char *_buf,int _buf_size){
+  struct pollfd  fd;
+  SSL           *ssl_conn;
+  fd.fd=_conn->fd;
+  ssl_conn=_conn->ssl_conn;
+  while(_buf_size>0){
+    int err;
+    if(ssl_conn!=NULL){
+      int ret;
+      ret=SSL_write(ssl_conn,_buf,_buf_size);
+      if(ret>0){
+        /*Wrote some data.*/
+        _buf+=ret;
+        _buf_size-=ret;
+        continue;
+      }
+      /*Connection closed.*/
+      else if(ret==0)return OP_FALSE;
+      err=SSL_get_error(ssl_conn,ret);
+      /*Yes, renegotiations can cause SSL_write() to block for reading.*/
+      if(err==SSL_ERROR_WANT_READ)fd.events=POLLIN;
+      else if(err==SSL_ERROR_WANT_WRITE)fd.events=POLLOUT;
+      else return OP_FALSE;
+    }
+    else{
+      ssize_t ret;
+      op_reset_errno();
+      ret=send(fd.fd,_buf,_buf_size,0);
+      if(ret>0){
+        _buf+=ret;
+        _buf_size-=ret;
+        continue;
+      }
+      err=op_errno();
+      if(err!=EAGAIN&&err!=EWOULDBLOCK)return OP_FALSE;
+      fd.events=POLLOUT;
+    }
+    if(poll(&fd,1,OP_POLL_TIMEOUT_MS)<=0)return OP_FALSE;
+  }
+  return 0;
+}
+
+static int op_http_conn_estimate_available(OpusHTTPConn *_conn){
+  int available;
+  int ret;
+  ret=ioctl(_conn->fd,FIONREAD,&available);
+  if(ret<0)available=0;
+  /*This requires the SSL read_ahead flag to be unset to work.
+    We ignore partial records as well as the protocol overhead for any pending
+     bytes.
+    This means we might return somewhat less than can truly be read without
+     blocking (if there's a partial record).
+    This is okay, because we're using this value to estimate network transfer
+     time, and we _have_ already received those bytes.
+    We also might return slightly more (due to protocol overhead), but that's
+     small enough that it probably doesn't matter.*/
+  if(_conn->ssl_conn!=NULL)available+=SSL_pending(_conn->ssl_conn);
+  return available;
+}
+
+static opus_int32 op_time_diff_ms(const struct timeb *_end,
+ const struct timeb *_start){
+  opus_int64 dtime;
+  dtime=_end->time-(opus_int64)_start->time;
+  OP_ASSERT(_end->millitm<1000);
+  OP_ASSERT(_start->millitm<1000);
+  if(OP_UNLIKELY(dtime>(OP_INT32_MAX-1000)/1000))return OP_INT32_MAX;
+  if(OP_UNLIKELY(dtime<(OP_INT32_MIN+1000)/1000))return OP_INT32_MIN;
+  return (opus_int32)dtime*1000+_end->millitm-_start->millitm;
+}
+
+/*Update the read rate estimate for this connection.*/
+static void op_http_conn_read_rate_update(OpusHTTPConn *_conn){
+  struct timeb read_time;
+  opus_int32   read_delta_ms;
+  opus_int64   read_delta_bytes;
+  opus_int64   read_rate;
+  read_delta_bytes=_conn->read_bytes;
+  if(read_delta_bytes<=0)return;
+  ftime(&read_time);
+  read_delta_ms=op_time_diff_ms(&read_time,&_conn->read_time);
+  read_rate=_conn->read_rate;
+  read_delta_ms=OP_MAX(read_delta_ms,1);
+  read_rate+=read_delta_bytes*1000/read_delta_ms-read_rate+4>>3;
+  *&_conn->read_time=*&read_time;
+  _conn->read_bytes=0;
+  _conn->read_rate=read_rate;
+}
+
+/*Tries to read from the given connection.
+  [out] _buf: Returns the data read.
+  _buf_size:  The size of the buffer.
+  _blocking:  Whether or not to block until some data is retrieved.
+  Return: A positive number of bytes read on success.
+          0:        The read would block, or the connection was closed.
+          OP_EREAD: There was a fatal read error.*/
+static int op_http_conn_read(OpusHTTPConn *_conn,
+ char *_buf,int _buf_size,int _blocking){
+  struct pollfd  fd;
+  SSL           *ssl_conn;
+  int            nread;
+  int            nread_unblocked;
+  fd.fd=_conn->fd;
+  ssl_conn=_conn->ssl_conn;
+  nread=nread_unblocked=0;
+  /*RFC 2818 says "client implementations MUST treat any premature closes as
+     errors and the data received as potentially truncated," so we make very
+     sure to report read errors upwards.*/
+  do{
+    int err;
+    if(ssl_conn!=NULL){
+      int ret;
+      ret=SSL_read(ssl_conn,_buf+nread,_buf_size-nread);
+      OP_ASSERT(ret<=_buf_size-nread);
+      if(ret>0){
+        /*Read some data.
+          Keep going to see if there's more.*/
+        nread+=ret;
+        nread_unblocked+=ret;
+        continue;
+      }
+      /*If we already read some data, return it right now.*/
+      if(nread>0)break;
+      err=SSL_get_error(ssl_conn,ret);
+      if(ret==0){
+        /*Connection close.
+          Check for a clean shutdown to prevent truncation attacks.
+          This check always succeeds for SSLv2, as it has no "close notify"
+           message and thus can't verify an orderly shutdown.*/
+        return err==SSL_ERROR_ZERO_RETURN?0:OP_EREAD;
+      }
+      if(err==SSL_ERROR_WANT_READ)fd.events=POLLIN;
+      /*Yes, renegotiations can cause SSL_read() to block for writing.*/
+      else if(err==SSL_ERROR_WANT_WRITE)fd.events=POLLOUT;
+      /*Some other error.*/
+      else return OP_EREAD;
+    }
+    else{
+      ssize_t ret;
+      op_reset_errno();
+      ret=recv(fd.fd,_buf+nread,_buf_size-nread,0);
+      OP_ASSERT(ret<=_buf_size-nread);
+      if(ret>0){
+        /*Read some data.
+          Keep going to see if there's more.*/
+        nread+=ret;
+        nread_unblocked+=ret;
+        continue;
+      }
+      /*If we already read some data or the connection was closed, return
+         right now.*/
+      if(ret==0||nread>0)break;
+      err=op_errno();
+      if(err!=EAGAIN&&err!=EWOULDBLOCK)return OP_EREAD;
+      fd.events=POLLIN;
+    }
+    _conn->read_bytes+=nread_unblocked;
+    op_http_conn_read_rate_update(_conn);
+    nread_unblocked=0;
+    if(!_blocking)break;
+    /*Need to wait to get any data at all.*/
+    if(poll(&fd,1,OP_POLL_TIMEOUT_MS)<=0)return OP_EREAD;
+  }
+  while(nread<_buf_size);
+  _conn->read_bytes+=nread_unblocked;
+  return nread;
+}
+
+/*Tries to look at the pending data for a connection without consuming it.
+  [out] _buf: Returns the data at which we're peeking.
+  _buf_size:  The size of the buffer.*/
+static int op_http_conn_peek(OpusHTTPConn *_conn,char *_buf,int _buf_size){
+  struct pollfd   fd;
+  SSL            *ssl_conn;
+  int             ret;
+  fd.fd=_conn->fd;
+  ssl_conn=_conn->ssl_conn;
+  for(;;){
+    int err;
+    if(ssl_conn!=NULL){
+      ret=SSL_peek(ssl_conn,_buf,_buf_size);
+      /*Either saw some data or the connection was closed.*/
+      if(ret>=0)return ret;
+      err=SSL_get_error(ssl_conn,ret);
+      if(err==SSL_ERROR_WANT_READ)fd.events=POLLIN;
+      /*Yes, renegotiations can cause SSL_peek() to block for writing.*/
+      else if(err==SSL_ERROR_WANT_WRITE)fd.events=POLLOUT;
+      else return 0;
+    }
+    else{
+      op_reset_errno();
+      ret=(int)recv(fd.fd,_buf,_buf_size,MSG_PEEK);
+      /*Either saw some data or the connection was closed.*/
+      if(ret>=0)return ret;
+      err=op_errno();
+      if(err!=EAGAIN&&err!=EWOULDBLOCK)return 0;
+      fd.events=POLLIN;
+    }
+    /*Need to wait to get any data at all.*/
+    if(poll(&fd,1,OP_POLL_TIMEOUT_MS)<=0)return 0;
+  }
+}
+
+/*When parsing response headers, RFC 2616 mandates that all lines end in CR LF.
+  However, even in the year 2012, I have seen broken servers use just a LF.
+  This is the evil that Postel's advice from RFC 761 breeds.*/
+
+/*Reads the entirety of a response to an HTTP request into the response buffer.
+  Actual parsing and validation is done later.
+  Return: The number of bytes in the response on success, OP_EREAD if the
+           connection was closed before reading any data, or another negative
+           value on any other error.*/
+static int op_http_conn_read_response(OpusHTTPConn *_conn,
+ OpusStringBuf *_response){
+  int ret;
+  _response->nbuf=0;
+  ret=op_sb_ensure_capacity(_response,OP_RESPONSE_SIZE_MIN);
+  if(OP_UNLIKELY(ret<0))return ret;
+  for(;;){
+    char *buf;
+    int   size;
+    int   capacity;
+    int   read_limit;
+    int   terminated;
+    size=_response->nbuf;
+    capacity=_response->cbuf-1;
+    if(OP_UNLIKELY(size>=capacity)){
+      ret=op_sb_grow(_response,OP_RESPONSE_SIZE_MAX);
+      if(OP_UNLIKELY(ret<0))return ret;
+      capacity=_response->cbuf-1;
+      /*The response was too large.
+        This prevents a bad server from running us out of memory.*/
+      if(OP_UNLIKELY(size>=capacity))return OP_EIMPL;
+    }
+    buf=_response->buf;
+    ret=op_http_conn_peek(_conn,buf+size,capacity-size);
+    if(OP_UNLIKELY(ret<=0))return size<=0?OP_EREAD:OP_FALSE;
+    /*We read some data.*/
+    /*Make sure the starting characters are "HTTP".
+      Otherwise we could wind up waiting forever for a response from
+       something that is not an HTTP server.*/
+    if(size<4&&op_strncasecmp(buf,"HTTP",OP_MIN(size+ret,4))!=0){
+      return OP_FALSE;
+    }
+    /*How far can we read without passing the "\r\n\r\n" terminator?*/
+    buf[size+ret]='\0';
+    terminated=0;
+    for(read_limit=OP_MAX(size-3,0);read_limit<size+ret;read_limit++){
+      /*We don't look for the leading '\r' thanks to broken servers.*/
+      if(buf[read_limit]=='\n'){
+        if(buf[read_limit+1]=='\r'&&OP_LIKELY(buf[read_limit+2]=='\n')){
+          terminated=3;
+          break;
+        }
+        /*This case is for broken servers.*/
+        else if(OP_UNLIKELY(buf[read_limit+1]=='\n')){
+          terminated=2;
+          break;
+        }
+      }
+    }
+    read_limit+=terminated;
+    OP_ASSERT(size<=read_limit);
+    OP_ASSERT(read_limit<=size+ret);
+    /*Actually consume that data.*/
+    ret=op_http_conn_read(_conn,buf+size,read_limit-size,1);
+    if(OP_UNLIKELY(ret<=0))return OP_FALSE;
+    size+=ret;
+    buf[size]='\0';
+    _response->nbuf=size;
+    /*We found the terminator and read all the data up to and including it.*/
+    if(terminated&&OP_LIKELY(size>=read_limit))return size;
+  }
+  return OP_EIMPL;
+}
+
+# define OP_HTTP_DIGIT "0123456789"
+
+/*The Reason-Phrase is not allowed to contain control characters, except
+   horizontal tab (HT: \011).*/
+# define OP_HTTP_CREASON_PHRASE \
+ "\001\002\003\004\005\006\007\010\012\013\014\015\016\017\020\021" \
+ "\022\023\024\025\026\027\030\031\032\033\034\035\036\037\177"
+
+# define OP_HTTP_CTLS \
+ "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020" \
+ "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\177"
+
+/*This also includes '\t', but we get that from OP_HTTP_CTLS.*/
+# define OP_HTTP_SEPARATORS " \"(),/:;<=>?@[\\]{}"
+
+/*TEXT can also include LWS, but that has structure, so we parse it
+   separately.*/
+# define OP_HTTP_CTOKEN OP_HTTP_CTLS OP_HTTP_SEPARATORS
+
+/*Return: The amount of linear white space (LWS) at the start of _s.*/
+static int op_http_lwsspn(const char *_s){
+  int i;
+  for(i=0;;){
+    if(_s[0]=='\r'&&_s[1]=='\n'&&(_s[2]=='\t'||_s[2]==' '))i+=3;
+    /*This case is for broken servers.*/
+    else if(_s[0]=='\n'&&(_s[1]=='\t'||_s[1]==' '))i+=2;
+    else if(_s[i]=='\t'||_s[i]==' ')i++;
+    else return i;
+  }
+}
+
+static char *op_http_parse_status_line(int *_v1_1_compat,
+ char **_status_code,char *_response){
+  char   *next;
+  char   *status_code;
+  int     v1_1_compat;
+  size_t  d;
+  /*RFC 2616 Section 6.1 does not say that the tokens in the Status-Line cannot
+     be separated by optional LWS, but since it specifically calls out where
+     spaces are to be placed and that CR and LF are not allowed except at the
+     end, I am assuming this to be true.*/
+  /*We already validated that this starts with "HTTP"*/
+  OP_ASSERT(op_strncasecmp(_response,"HTTP",4)==0);
+  next=_response+4;
+  if(OP_UNLIKELY(*next++!='/'))return NULL;
+  d=strspn(next,OP_HTTP_DIGIT);
+  /*"Leading zeros MUST be ignored by recipients."*/
+  while(*next=='0'){
+    next++;
+    OP_ASSERT(d>0);
+    d--;
+  }
+  /*We only support version 1.x*/
+  if(OP_UNLIKELY(d!=1)||OP_UNLIKELY(*next++!='1'))return NULL;
+  if(OP_UNLIKELY(*next++!='.'))return NULL;
+  d=strspn(next,OP_HTTP_DIGIT);
+  if(OP_UNLIKELY(d<=0))return NULL;
+  /*"Leading zeros MUST be ignored by recipients."*/
+  while(*next=='0'){
+    next++;
+    OP_ASSERT(d>0);
+    d--;
+  }
+  /*We don't need to parse the version number.
+    Any non-zero digit means it's greater than 1.*/
+  v1_1_compat=d>0;
+  next+=d;
+  if(OP_UNLIKELY(*next++!=' '))return NULL;
+  status_code=next;
+  d=strspn(next,OP_HTTP_DIGIT);
+  if(OP_UNLIKELY(d!=3))return NULL;
+  next+=d;
+  /*The Reason-Phrase can be empty, but the space must be here.*/
+  if(OP_UNLIKELY(*next++!=' '))return NULL;
+  next+=strcspn(next,OP_HTTP_CREASON_PHRASE);
+  /*We are not mandating this be present thanks to broken servers.*/
+  if(OP_LIKELY(*next=='\r'))next++;
+  if(OP_UNLIKELY(*next++!='\n'))return NULL;
+  if(_v1_1_compat!=NULL)*_v1_1_compat=v1_1_compat;
+  *_status_code=status_code;
+  return next;
+}
+
+/*Get the next response header.
+  [out] _header: The header token, NUL-terminated, with leading and trailing
+                  whitespace stripped, and converted to lower case (to simplify
+                  case-insensitive comparisons), or NULL if there are no more
+                  response headers.
+  [out] _cdr:    The remaining contents of the header, excluding the initial
+                  colon (':') and the terminating CRLF ("\r\n"),
+                  NUL-terminated, and with leading and trailing whitespace
+                  stripped, or NULL if there are no more response headers.
+  [inout] _s:    On input, this points to the start of the current line of the
+                  response headers.
+                 On output, it points to the start of the first line following
+                  this header, or NULL if there are no more response headers.
+  Return: 0 on success, or a negative value on failure.*/
+static int op_http_get_next_header(char **_header,char **_cdr,char **_s){
+  char   *header;
+  char   *header_end;
+  char   *cdr;
+  char   *cdr_end;
+  char   *next;
+  size_t  d;
+  next=*_s;
+  /*The second case is for broken servers.*/
+  if(next[0]=='\r'&&next[1]=='\n'||OP_UNLIKELY(next[0]=='\n')){
+    /*No more headers.*/
+    *_header=NULL;
+    *_cdr=NULL;
+    *_s=NULL;
+    return 0;
+  }
+  header=next+op_http_lwsspn(next);
+  d=strcspn(header,OP_HTTP_CTOKEN);
+  if(OP_UNLIKELY(d<=0))return OP_FALSE;
+  header_end=header+d;
+  next=header_end+op_http_lwsspn(header_end);
+  if(OP_UNLIKELY(*next++!=':'))return OP_FALSE;
+  next+=op_http_lwsspn(next);
+  cdr=next;
+  do{
+    cdr_end=next+strcspn(next,OP_HTTP_CTLS);
+    next=cdr_end+op_http_lwsspn(cdr_end);
+  }
+  while(next>cdr_end);
+  /*We are not mandating this be present thanks to broken servers.*/
+  if(OP_LIKELY(*next=='\r'))next++;
+  if(OP_UNLIKELY(*next++!='\n'))return OP_FALSE;
+  *header_end='\0';
+  *cdr_end='\0';
+  /*Field names are case-insensitive.*/
+  op_string_tolower(header);
+  *_header=header;
+  *_cdr=cdr;
+  *_s=next;
+  return 0;
+}
+
+static opus_int64 op_http_parse_nonnegative_int64(const char **_next,
+ const char *_cdr){
+  const char *next;
+  opus_int64  ret;
+  int         i;
+  next=_cdr+strspn(_cdr,OP_HTTP_DIGIT);
+  *_next=next;
+  if(OP_UNLIKELY(next<=_cdr))return OP_FALSE;
+  while(*_cdr=='0')_cdr++;
+  if(OP_UNLIKELY(next-_cdr>19))return OP_EIMPL;
+  ret=0;
+  for(i=0;i<next-_cdr;i++){
+    int digit;
+    digit=_cdr[i]-'0';
+    /*Check for overflow.*/
+    if(OP_UNLIKELY(ret>(OP_INT64_MAX-9)/10+(digit<=7)))return OP_EIMPL;
+    ret=ret*10+digit;
+  }
+  return ret;
+}
+
+static opus_int64 op_http_parse_content_length(const char *_cdr){
+  const char *next;
+  opus_int64  content_length;
+  content_length=op_http_parse_nonnegative_int64(&next,_cdr);
+  if(OP_UNLIKELY(*next!='\0'))return OP_FALSE;
+  return content_length;
+}
+
+static int op_http_parse_content_range(opus_int64 *_first,opus_int64 *_last,
+ opus_int64 *_length,const char *_cdr){
+  opus_int64 first;
+  opus_int64 last;
+  opus_int64 length;
+  size_t     d;
+  if(OP_UNLIKELY(op_strncasecmp(_cdr,"bytes",5)!=0))return OP_FALSE;
+  _cdr+=5;
+  d=op_http_lwsspn(_cdr);
+  if(OP_UNLIKELY(d<=0))return OP_FALSE;
+  _cdr+=d;
+  if(*_cdr!='*'){
+    first=op_http_parse_nonnegative_int64(&_cdr,_cdr);
+    if(OP_UNLIKELY(first<0))return (int)first;
+    _cdr+=op_http_lwsspn(_cdr);
+    if(*_cdr++!='-')return OP_FALSE;
+    _cdr+=op_http_lwsspn(_cdr);
+    last=op_http_parse_nonnegative_int64(&_cdr,_cdr);
+    if(OP_UNLIKELY(last<0))return (int)last;
+    _cdr+=op_http_lwsspn(_cdr);
+  }
+  else{
+    /*This is for a 416 response (Requested range not satisfiable).*/
+    first=last=-1;
+    _cdr++;
+  }
+  if(OP_UNLIKELY(*_cdr++!='/'))return OP_FALSE;
+  if(*_cdr!='*'){
+    length=op_http_parse_nonnegative_int64(&_cdr,_cdr);
+    if(OP_UNLIKELY(length<0))return (int)length;
+  }
+  else{
+    /*The total length is unspecified.*/
+    _cdr++;
+    length=-1;
+  }
+  if(OP_UNLIKELY(*_cdr!='\0'))return OP_FALSE;
+  if(OP_UNLIKELY(last<first))return OP_FALSE;
+  if(length>=0&&OP_UNLIKELY(last>=length))return OP_FALSE;
+  *_first=first;
+  *_last=last;
+  *_length=length;
+  return 0;
+}
+
+/*Parse the Connection response header and look for a "close" token.
+  Return: 1 if a "close" token is found, 0 if it's not found, and a negative
+           value on error.*/
+static int op_http_parse_connection(char *_cdr){
+  size_t d;
+  int    ret;
+  ret=0;
+  for(;;){
+    d=strcspn(_cdr,OP_HTTP_CTOKEN);
+    if(OP_UNLIKELY(d<=0))return OP_FALSE;
+    if(op_strncasecmp(_cdr,"close",(int)d)==0)ret=1;
+    /*We're supposed to strip and ignore any headers mentioned in the
+       Connection header if this response is from an HTTP/1.0 server (to
+       work around forwarding of hop-by-hop headers by old proxies), but the
+       only hop-by-hop header we look at is Connection itself.
+      Everything else is a well-defined end-to-end header, and going back and
+       undoing the things we did based on already-examined headers would be
+       hard (since we only scan them once, in a destructive manner).
+      Therefore we just ignore all the other tokens.*/
+    _cdr+=d;
+    d=op_http_lwsspn(_cdr);
+    if(d<=0)break;
+    _cdr+=d;
+  }
+  return OP_UNLIKELY(*_cdr!='\0')?OP_FALSE:ret;
+}
+
+typedef int (*op_ssl_step_func)(SSL *_ssl_conn);
+
+/*Try to run an SSL function to completion (blocking if necessary).*/
+static int op_do_ssl_step(SSL *_ssl_conn,op_sock _fd,op_ssl_step_func _step){
+  struct pollfd fd;
+  fd.fd=_fd;
+  for(;;){
+    int ret;
+    int err;
+    ret=(*_step)(_ssl_conn);
+    if(ret>=0)return ret;
+    err=SSL_get_error(_ssl_conn,ret);
+    if(err==SSL_ERROR_WANT_READ)fd.events=POLLIN;
+    else if(err==SSL_ERROR_WANT_WRITE)fd.events=POLLOUT;
+    else return OP_FALSE;
+    if(poll(&fd,1,OP_POLL_TIMEOUT_MS)<=0)return OP_FALSE;
+  }
+}
+
+/*Implement a BIO type that just indicates every operation should be retried.
+  We use this when initializing an SSL connection via a proxy to allow the
+   initial handshake to proceed all the way up to the first read attempt, and
+   then return.
+  This allows the TLS client hello message to be pipelined with the HTTP
+   CONNECT request.*/
+
+static int op_bio_retry_write(BIO *_b,const char *_buf,int _num){
+  (void)_buf;
+  (void)_num;
+  BIO_clear_retry_flags(_b);
+  BIO_set_retry_write(_b);
+  return -1;
+}
+
+static int op_bio_retry_read(BIO *_b,char *_buf,int _num){
+  (void)_buf;
+  (void)_num;
+  BIO_clear_retry_flags(_b);
+  BIO_set_retry_read(_b);
+  return -1;
+}
+
+static int op_bio_retry_puts(BIO *_b,const char *_str){
+  return op_bio_retry_write(_b,_str,0);
+}
+
+static long op_bio_retry_ctrl(BIO *_b,int _cmd,long _num,void *_ptr){
+  long ret;
+  (void)_b;
+  (void)_num;
+  (void)_ptr;
+  ret=0;
+  switch(_cmd){
+    case BIO_CTRL_RESET:
+    case BIO_C_RESET_READ_REQUEST:{
+      BIO_clear_retry_flags(_b);
+      /*Fall through.*/
+    }
+    case BIO_CTRL_EOF:
+    case BIO_CTRL_SET:
+    case BIO_CTRL_SET_CLOSE:
+    case BIO_CTRL_FLUSH:
+    case BIO_CTRL_DUP:{
+      ret=1;
+    }break;
+  }
+  return ret;
+}
+
+static int op_bio_retry_new(BIO *_b){
+  _b->init=1;
+  _b->num=0;
+  _b->ptr=NULL;
+  return 1;
+}
+
+static int op_bio_retry_free(BIO *_b){
+  return _b!=NULL;
+}
+
+/*This is not const because OpenSSL doesn't allow it, even though it won't
+   write to it.*/
+static BIO_METHOD op_bio_retry_method={
+  BIO_TYPE_NULL,
+  "retry",
+  op_bio_retry_write,
+  op_bio_retry_read,
+  op_bio_retry_puts,
+  NULL,
+  op_bio_retry_ctrl,
+  op_bio_retry_new,
+  op_bio_retry_free,
+  NULL
+};
+
+/*Establish a CONNECT tunnel and pipeline the start of the TLS handshake for
+   proxying https URL requests.*/
+static int op_http_conn_establish_tunnel(OpusHTTPStream *_stream,
+ OpusHTTPConn *_conn,op_sock _fd,SSL *_ssl_conn,BIO *_ssl_bio){
+  BIO  *retry_bio;
+  char *status_code;
+  char *next;
+  int   ret;
+  _conn->ssl_conn=NULL;
+  _conn->fd=_fd;
+  OP_ASSERT(_stream->proxy_connect.nbuf>0);
+  ret=op_http_conn_write_fully(_conn,
+   _stream->proxy_connect.buf,_stream->proxy_connect.nbuf);
+  if(OP_UNLIKELY(ret<0))return ret;
+  retry_bio=BIO_new(&op_bio_retry_method);
+  if(OP_UNLIKELY(retry_bio==NULL))return OP_EFAULT;
+  SSL_set_bio(_ssl_conn,retry_bio,_ssl_bio);
+  SSL_set_connect_state(_ssl_conn);
+  /*This shouldn't succeed, since we can't read yet.*/
+  OP_ALWAYS_TRUE(SSL_connect(_ssl_conn)<0);
+  SSL_set_bio(_ssl_conn,_ssl_bio,_ssl_bio);
+  /*Only now do we disable write coalescing, to allow the CONNECT
+     request and the start of the TLS handshake to be combined.*/
+  op_sock_set_tcp_nodelay(_fd,1);
+  ret=op_http_conn_read_response(_conn,&_stream->response);
+  if(OP_UNLIKELY(ret<0))return ret;
+  next=op_http_parse_status_line(NULL,&status_code,_stream->response.buf);
+  /*According to RFC 2817, "Any successful (2xx) response to a
+     CONNECT request indicates that the proxy has established a
+     connection to the requested host and port.*/
+  if(OP_UNLIKELY(next==NULL)||OP_UNLIKELY(status_code[0]!='2'))return OP_FALSE;
+  return 0;
+}
+
+/*Match a host name against a host with a possible wildcard pattern according
+   to the rules of RFC 6125 Section 6.4.3.
+  Return: 0 if the pattern doesn't match, and a non-zero value if it does.*/
+static int op_http_hostname_match(const char *_host,size_t _host_len,
+ ASN1_STRING *_pattern){
+  const char *pattern;
+  size_t      host_label_len;
+  size_t      host_suffix_len;
+  size_t      pattern_len;
+  size_t      pattern_label_len;
+  size_t      pattern_prefix_len;
+  size_t      pattern_suffix_len;
+  pattern=(const char *)ASN1_STRING_data(_pattern);
+  pattern_len=strlen(pattern);
+  /*Check the pattern for embedded NULs.*/
+  if(OP_UNLIKELY(pattern_len!=(size_t)ASN1_STRING_length(_pattern)))return 0;
+  pattern_label_len=strcspn(pattern,".");
+  OP_ASSERT(pattern_label_len<=pattern_len);
+  pattern_prefix_len=strcspn(pattern,"*");
+  if(pattern_prefix_len>=pattern_label_len){
+    /*"The client SHOULD NOT attempt to match a presented identifier in which
+       the wildcard character comprises a label other than the left-most label
+       (e.g., do not match bar.*.example.net)." [RFC 6125 Section 6.4.3]*/
+    if(pattern_prefix_len<pattern_len)return 0;
+    /*If the pattern does not contain a wildcard in the first element, do an
+       exact match.
+      Don't use the system strcasecmp here, as that uses the locale and
+       RFC 4343 makes clear that DNS's case-insensitivity only applies to
+       the ASCII range.*/
+    return _host_len==pattern_len&&op_strncasecmp(_host,pattern,_host_len)==0;
+  }
+  /*"However, the client SHOULD NOT attempt to match a presented identifier
+     where the wildcard character is embedded within an A-label or U-label of
+     an internationalized domain name." [RFC 6125 Section 6.4.3]*/
+  if(op_strncasecmp(pattern,"xn--",4)==0)return 0;
+  host_label_len=strcspn(_host,".");
+  /*Make sure the host has at least two dots, to prevent the wildcard match
+     from being ridiculously wide.
+    We should have already checked to ensure it had at least one.*/
+  if(OP_UNLIKELY(_host[host_label_len]!='.')
+   ||strchr(_host+host_label_len+1,'.')==NULL){
+    return 0;
+  }
+  OP_ASSERT(host_label_len<_host_len);
+  /*"If the wildcard character is the only character of the left-most label in
+     the presented identifier, the client SHOULD NOT compare against anything
+     but the left-most label of the reference identifier (e.g., *.example.com
+     would match foo.example.com but not bar.foo.example.com)." [RFC 6125
+     Section 6.4.3]
+    This is really confusingly worded, as we check this by actually comparing
+     the rest of the pattern for an exact match.
+    We also use the fact that the wildcard must match at least one character,
+     so the left-most label of the hostname must be at least as large as the
+     left-most label of the pattern.*/
+  if(host_label_len<pattern_label_len)return 0;
+  OP_ASSERT(pattern[pattern_prefix_len]=='*');
+  /*"The client MAY match a presented identifier in which the wildcard
+     character is not the only character of the label (e.g., baz*.example.net
+     and *baz.example.net and b*z.example.net would be taken to match
+     baz1.example.net and foobaz.example.net and buzz.example.net,
+     respectively)." [RFC 6125 Section 6.4.3]*/
+  pattern_suffix_len=pattern_len-pattern_prefix_len-1;
+  host_suffix_len=_host_len-host_label_len
+   +pattern_label_len-pattern_prefix_len-1;
+  return pattern_suffix_len==host_suffix_len
+   &&op_strncasecmp(_host,pattern,pattern_prefix_len)==0
+   &&op_strncasecmp(_host+_host_len-host_suffix_len,
+   pattern+pattern_prefix_len+1,host_suffix_len)==0;
+}
+
+/*Convert a host to a numeric address, if possible.
+  Return: A struct addrinfo containing the address, if it was numeric, and NULL
+           otherise.*/
+static struct addrinfo *op_inet_pton(const char *_host){
+  struct addrinfo *addrs;
+  struct addrinfo  hints;
+  memset(&hints,0,sizeof(hints));
+  hints.ai_socktype=SOCK_STREAM;
+  hints.ai_flags=AI_NUMERICHOST;
+  if(!getaddrinfo(_host,NULL,&hints,&addrs))return addrs;
+  return NULL;
+}
+
+/*Verify the server's hostname matches the certificate they presented using
+   the procedure from Section 6 of RFC 6125.
+  Return: 0 if the certificate doesn't match, and a non-zero value if it does.*/
+static int op_http_verify_hostname(OpusHTTPStream *_stream,SSL *_ssl_conn){
+  X509                   *peer_cert;
+  STACK_OF(GENERAL_NAME) *san_names;
+  char                   *host;
+  size_t                  host_len;
+  int                     ret;
+  host=_stream->url.host;
+  host_len=strlen(host);
+  peer_cert=SSL_get_peer_certificate(_ssl_conn);
+  /*We set VERIFY_PEER, so we shouldn't get here without a certificate.*/
+  if(OP_UNLIKELY(peer_cert==NULL))return 0;
+  ret=0;
+  OP_ASSERT(host_len<INT_MAX);
+  /*RFC 2818 says (after correcting for Eratta 1077): "If a subjectAltName
+     extension of type dNSName is present, that MUST be used as the identity.
+    Otherwise, the (most specific) Common Name field in the Subject field of
+     the certificate MUST be used.
+    Although the use of the Common Name is existing practice, it is deprecated
+     and Certification Authorities are encouraged to use the dNSName
+     instead."
+    "Matching is performed using the matching rules specified by RFC 2459.
+    If more than one identity of a given type is present in the certificate
+     (e.g., more than one dNSName name), a match in any one of the set is
+     considered acceptable.
+    Names may contain the wildcard character * which is condered to match any
+     single domain name component or component fragment.
+    E.g., *.a.com matches foo.a.com but not bar.foo.a.com.
+    f*.com matches foo.com but not bar.com."
+    "In some cases, the URI is specified as an IP address rather than a
+     hostname.
+    In this case, the iPAddress subjectAltName must be present in the
+     certificate and must exactly match the IP in the URI."*/
+  san_names=X509_get_ext_d2i(peer_cert,NID_subject_alt_name,NULL,NULL);
+  if(san_names!=NULL){
+    struct addrinfo *addr;
+    unsigned char   *ip;
+    int              ip_len;
+    int              nsan_names;
+    int              sni;
+    /*Check to see if the host was specified as a simple IP address.*/
+    addr=op_inet_pton(host);
+    ip=NULL;
+    ip_len=0;
+    if(addr!=NULL){
+      switch(addr->ai_family){
+        case AF_INET:{
+          struct sockaddr_in *s;
+          s=(struct sockaddr_in *)addr->ai_addr;
+          OP_ASSERT(addr->ai_addrlen>=sizeof(*s));
+          ip=(unsigned char *)&s->sin_addr;
+          ip_len=sizeof(s->sin_addr);
+        }break;
+        case AF_INET6:{
+          struct sockaddr_in6 *s;
+          s=(struct sockaddr_in6 *)addr->ai_addr;
+          OP_ASSERT(addr->ai_addrlen>=sizeof(*s));
+          ip=(unsigned char *)&s->sin6_addr;
+          ip_len=sizeof(s->sin6_addr);
+        }break;
+      }
+    }
+    /*We can only verify fully-qualified domain names.
+      To quote RFC 6125: "The extracted data MUST include only information that
+       can be securely parsed out of the inputs (e.g., parsing the fully
+       qualified DNS domain name out of the "host" component (or its
+       equivalent) of a URI or deriving the application service type from the
+       scheme of a URI) ..."
+      We don't have a way to check (without relying on DNS records, which might
+       be subverted) if this address is fully-qualified.
+      This is particularly problematic when using a CONNECT tunnel, as it is
+       the server that does DNS lookup, not us.
+      However, we are certain that if the hostname has no '.', it is definitely
+       not a fully-qualified domain name (with the exception of crazy TLDs that
+       actually resolve, like "uz", but I am willing to ignore those).
+      RFC 1535 says "...in any event where a '.' exists in a specified name it
+       should be assumed to be a fully qualified domain name (FQDN) and SHOULD
+       be tried as a rooted name first."
+      That doesn't give us any security guarantees, of course (a subverted DNS
+       could fail the original query and our resolver might still retry with a
+       local domain appended).
+      If we don't have a FQDN, just set the number of names to 0, so we'll fail
+       and clean up any resources we allocated.*/
+    if(ip==NULL&&strchr(host,'.')==NULL)nsan_names=0;
+    /*RFC 2459 says there MUST be at least one, but we don't depend on it.*/
+    else nsan_names=sk_GENERAL_NAME_num(san_names);
+    for(sni=0;sni<nsan_names;sni++){
+      const GENERAL_NAME *name;
+      name=sk_GENERAL_NAME_value(san_names,sni);
+      if(ip==NULL){
+        if(name->type==GEN_DNS
+         &&op_http_hostname_match(host,host_len,name->d.dNSName)){
+          ret=1;
+          break;
+        }
+      }
+      else if(name->type==GEN_IPADD){
+        unsigned char *cert_ip;
+        /*If we do have an IP address, compare it directly.
+          RFC 6125: "When the reference identity is an IP address, the identity
+           MUST be converted to the 'network byte order' octet string
+           representation.
+          For IP Version 4, as specified in RFC 791, the octet string will
+           contain exactly four octets.
+          For IP Version 6, as specified in RFC 2460, the octet string will
+           contain exactly sixteen octets.
+          This octet string is then compared against subjectAltName values of
+           type iPAddress.
+          A match occurs if the reference identity octet string and the value
+           octet strings are identical."*/
+        cert_ip=ASN1_STRING_data(name->d.iPAddress);
+        if(ip_len==ASN1_STRING_length(name->d.iPAddress)
+         &&memcmp(ip,cert_ip,ip_len)==0){
+          ret=1;
+          break;
+        }
+      }
+    }
+    sk_GENERAL_NAME_pop_free(san_names,GENERAL_NAME_free);
+    if(addr!=NULL)freeaddrinfo(addr);
+  }
+  /*Do the same FQDN check we did above.
+    We don't do this once in advance for both cases, because in the
+     subjectAltName case we might have an IPv6 address without a dot.*/
+  else if(strchr(host,'.')!=NULL){
+    int last_cn_loc;
+    int cn_loc;
+    /*If there is no subjectAltName, match against commonName.
+      RFC 6125 says that at least one significant CA is known to issue certs
+       with multiple CNs, although it SHOULD NOT.
+      It also says: "The server's identity may also be verified by comparing
+       the reference identity to the Common Name (CN) value in the last
+       Relative Distinguished Name (RDN) of the subject field of the server's
+       certificate (where "last" refers to the DER-encoded order...)."
+      So find the last one and check it.*/
+    cn_loc=-1;
+    do{
+      last_cn_loc=cn_loc;
+      cn_loc=X509_NAME_get_index_by_NID(X509_get_subject_name(peer_cert),
+       NID_commonName,last_cn_loc);
+    }
+    while(cn_loc>=0);
+    ret=last_cn_loc>=0
+     &&op_http_hostname_match(host,host_len,
+     X509_NAME_ENTRY_get_data(
+     X509_NAME_get_entry(X509_get_subject_name(peer_cert),last_cn_loc)));
+  }
+  X509_free(peer_cert);
+  return ret;
+}
+
+/*Perform the TLS handshake on a new connection.*/
+static int op_http_conn_start_tls(OpusHTTPStream *_stream,OpusHTTPConn *_conn,
+ op_sock _fd,SSL *_ssl_conn){
+  SSL_SESSION *ssl_session;
+  BIO         *ssl_bio;
+  int          skip_certificate_check;
+  int          ret;
+  ssl_bio=BIO_new_socket(_fd,BIO_NOCLOSE);
+  if(OP_LIKELY(ssl_bio==NULL))return OP_FALSE;
+# if !defined(OPENSSL_NO_TLSEXT)
+  /*Support for RFC 6066 Server Name Indication.*/
+  SSL_set_tlsext_host_name(_ssl_conn,_stream->url.host);
+# endif
+  /*Resume a previous session if available.*/
+  if(_stream->ssl_session!=NULL){
+    SSL_set_session(_ssl_conn,_stream->ssl_session);
+  }
+  /*If we're proxying, establish the CONNECT tunnel.*/
+  if(_stream->proxy_connect.nbuf>0){
+    ret=op_http_conn_establish_tunnel(_stream,_conn,
+     _fd,_ssl_conn,ssl_bio);
+    if(OP_UNLIKELY(ret<0))return ret;
+  }
+  else{
+    /*Otherwise, just use this socket directly.*/
+    op_sock_set_tcp_nodelay(_fd,1);
+    SSL_set_bio(_ssl_conn,ssl_bio,ssl_bio);
+    SSL_set_connect_state(_ssl_conn);
+  }
+  ret=op_do_ssl_step(_ssl_conn,_fd,SSL_connect);
+  if(OP_UNLIKELY(ret<=0))return OP_FALSE;
+  ssl_session=_stream->ssl_session;
+  skip_certificate_check=_stream->skip_certificate_check;
+  if(ssl_session==NULL||!skip_certificate_check){
+    ret=op_do_ssl_step(_ssl_conn,_fd,SSL_do_handshake);
+    if(OP_UNLIKELY(ret<=0))return OP_FALSE;
+    /*OpenSSL does not do hostname verification, despite the fact that we just
+       passed it the hostname above in the call to SSL_set_tlsext_host_name(),
+       because they are morons.
+      Do it for them.*/
+    if(!skip_certificate_check&&!op_http_verify_hostname(_stream,_ssl_conn)){
+      return OP_FALSE;
+    }
+    if(ssl_session==NULL){
+      /*Save the session for later resumption.*/
+      _stream->ssl_session=SSL_get1_session(_ssl_conn);
+    }
+  }
+  _conn->ssl_conn=_ssl_conn;
+  _conn->fd=_fd;
+  _conn->nrequests_left=OP_PIPELINE_MAX_REQUESTS;
+  return 0;
+}
+
+/*Try to start a connection to the next address in the given list of a given
+   type.
+  _fd:           The socket to connect with.
+  [inout] _addr: A pointer to the list of addresses.
+                 This will be advanced to the first one that matches the given
+                  address family (possibly the current one).
+  _ai_family:    The address family to connect to.
+  Return: 1        If the connection was successful.
+          0        If the connection is in progress.
+          OP_FALSE If the connection failed and there were no more addresses
+                    left to try.
+                    *_addr will be set to NULL in this case.*/
+static int op_sock_connect_next(op_sock _fd,
+ const struct addrinfo **_addr,int _ai_family){
+  const struct addrinfo *addr;
+  int                    err;
+  addr=*_addr;
+  for(;;){
+    /*Move to the next address of the requested type.*/
+    for(;addr!=NULL&&addr->ai_family!=_ai_family;addr=addr->ai_next);
+    *_addr=addr;
+    /*No more: failure.*/
+    if(addr==NULL)return OP_FALSE;
+    if(connect(_fd,addr->ai_addr,addr->ai_addrlen)>=0)return 1;
+    err=op_errno();
+    /*Winsock will set WSAEWOULDBLOCK.*/
+    if(OP_LIKELY(err==EINPROGRESS||err==EWOULDBLOCK))return 0;
+    addr=addr->ai_next;
+  }
+}
+
+/*The number of address families to try connecting to simultaneously.*/
+# define OP_NPROTOS (2)
+
+static int op_http_connect_impl(OpusHTTPStream *_stream,OpusHTTPConn *_conn,
+ const struct addrinfo *_addrs,struct timeb *_start_time){
+  const struct addrinfo *addr;
+  const struct addrinfo *addrs[OP_NPROTOS];
+  struct pollfd          fds[OP_NPROTOS];
+  int                    ai_family;
+  int                    nprotos;
+  int                    ret;
+  int                    pi;
+  int                    pj;
+  for(pi=0;pi<OP_NPROTOS;pi++)addrs[pi]=NULL;
+  /*Try connecting via both IPv4 and IPv6 simultaneously, and keep the first
+     one that succeeds.
+    Start by finding the first address from each family.
+    We order the first connection attempts in the same order the address
+     families were returned in the DNS records in accordance with RFC 6555.*/
+  for(addr=_addrs,nprotos=0;addr!=NULL&&nprotos<OP_NPROTOS;addr=addr->ai_next){
+    if(addr->ai_family==AF_INET6||addr->ai_family==AF_INET){
+      OP_ASSERT(addr->ai_addrlen<=sizeof(struct sockaddr_in6));
+      OP_ASSERT(addr->ai_addrlen<=sizeof(struct sockaddr_in));
+      /*If we've seen this address family before, skip this address for now.*/
+      for(pi=0;pi<nprotos;pi++)if(addrs[pi]->ai_family==addr->ai_family)break;
+      if(pi<nprotos)continue;
+      addrs[nprotos++]=addr;
+    }
+  }
+  /*Pop the connection off the free list and put it on the LRU list.*/
+  OP_ASSERT(_stream->free_head==_conn);
+  _stream->free_head=_conn->next;
+  _conn->next=_stream->lru_head;
+  _stream->lru_head=_conn;
+  ftime(_start_time);
+  *&_conn->read_time=*_start_time;
+  _conn->read_bytes=0;
+  _conn->read_rate=0;
+  /*Try to start a connection to each protocol.
+    RFC 6555 says it is RECOMMENDED that connection attempts be paced
+     150...250 ms apart "to balance human factors against network load", but
+     that "stateful algorithms" (that's us) "are expected to be more
+     aggressive".
+    We are definitely more aggressive: we don't pace at all.*/
+  for(pi=0;pi<nprotos;pi++){
+    ai_family=addrs[pi]->ai_family;
+    fds[pi].fd=socket(ai_family,SOCK_STREAM,addrs[pi]->ai_protocol);
+    fds[pi].events=POLLOUT;
+    if(OP_LIKELY(fds[pi].fd!=OP_INVALID_SOCKET)){
+      if(OP_LIKELY(op_sock_set_nonblocking(fds[pi].fd,1)>=0)){
+        ret=op_sock_connect_next(fds[pi].fd,addrs+pi,ai_family);
+        if(OP_UNLIKELY(ret>0)){
+          /*It succeeded right away (technically possible), so stop.*/
+          nprotos=pi+1;
+          break;
+        }
+        /*Otherwise go on to the next protocol, and skip the clean-up below.*/
+        else if(ret==0)continue;
+        /*Tried all the addresses for this protocol.*/
+      }
+      /*Clean up the socket.*/
+      close(fds[pi].fd);
+    }
+    /*Remove this protocol from the list.*/
+    memmove(addrs+pi,addrs+pi+1,sizeof(*addrs)*(nprotos-pi-1));
+    nprotos--;
+    pi--;
+  }
+  /*Wait for one of the connections to finish.*/
+  while(pi>=nprotos&&nprotos>0&&poll(fds,nprotos,OP_POLL_TIMEOUT_MS)>0){
+    for(pi=0;pi<nprotos;pi++){
+      socklen_t errlen;
+      int       err;
+      /*Still waiting...*/
+      if(!fds[pi].revents)continue;
+      errlen=sizeof(err);
+      /*Some platforms will return the pending error in &err and return 0.
+        Others will put it in errno and return -1.*/
+      ret=getsockopt(fds[pi].fd,SOL_SOCKET,SO_ERROR,&err,&errlen);
+      if(ret<0)err=op_errno();
+      /*Success!*/
+      if(err==0||err==EISCONN)break;
+      /*Move on to the next address for this protocol.*/
+      ai_family=addrs[pi]->ai_family;
+      addrs[pi]=addrs[pi]->ai_next;
+      ret=op_sock_connect_next(fds[pi].fd,addrs+pi,ai_family);
+      /*It succeeded right away, so stop.*/
+      if(ret>0)break;
+      /*Otherwise go on to the next protocol, and skip the clean-up below.*/
+      else if(ret==0)continue;
+      /*Tried all the addresses for this protocol.
+        Remove it from the list.*/
+      close(fds[pi].fd);
+      memmove(fds+pi,fds+pi+1,sizeof(*fds)*(nprotos-pi-1));
+      memmove(addrs+pi,addrs+pi+1,sizeof(*addrs)*(nprotos-pi-1));
+      nprotos--;
+      pi--;
+    }
+  }
+  /*Close all the other sockets.*/
+  for(pj=0;pj<nprotos;pj++)if(pi!=pj)close(fds[pj].fd);
+  /*If none of them succeeded, we're done.*/
+  if(pi>=nprotos)return OP_FALSE;
+  /*Save this address for future connection attempts.*/
+  if(addrs[pi]!=&_stream->addr_info){
+    memcpy(&_stream->addr_info,addrs[pi],sizeof(_stream->addr_info));
+    _stream->addr_info.ai_addr=&_stream->addr.s;
+    _stream->addr_info.ai_next=NULL;
+    memcpy(&_stream->addr,addrs[pi]->ai_addr,addrs[pi]->ai_addrlen);
+  }
+  if(OP_URL_IS_SSL(&_stream->url)){
+    SSL *ssl_conn;
+    /*Start the SSL connection.*/
+    OP_ASSERT(_stream->ssl_ctx!=NULL);
+    ssl_conn=SSL_new(_stream->ssl_ctx);
+    if(OP_LIKELY(ssl_conn!=NULL)){
+      ret=op_http_conn_start_tls(_stream,_conn,fds[pi].fd,ssl_conn);
+      if(OP_LIKELY(ret>=0))return ret;
+      SSL_free(ssl_conn);
+    }
+    close(fds[pi].fd);
+    _conn->fd=OP_INVALID_SOCKET;
+    return OP_FALSE;
+  }
+  /*Just a normal non-SSL connection.*/
+  _conn->ssl_conn=NULL;
+  _conn->fd=fds[pi].fd;
+  _conn->nrequests_left=OP_PIPELINE_MAX_REQUESTS;
+  /*Disable write coalescing.
+    We always send whole requests at once and always parse the response headers
+     before sending another one.*/
+  op_sock_set_tcp_nodelay(fds[pi].fd,1);
+  return 0;
+}
+
+static int op_http_connect(OpusHTTPStream *_stream,OpusHTTPConn *_conn,
+ const struct addrinfo *_addrs,struct timeb *_start_time){
+  struct timeb     resolve_time;
+  struct addrinfo *new_addrs;
+  int              ret;
+  /*Re-resolve the host if we need to (RFC 6555 says we MUST do so
+     occasionally).*/
+  new_addrs=NULL;
+  ftime(&resolve_time);
+  if(_addrs!=&_stream->addr_info||op_time_diff_ms(&resolve_time,
+   &_stream->resolve_time)>=OP_RESOLVE_CACHE_TIMEOUT_MS){
+    new_addrs=op_resolve(_stream->connect_host,_stream->connect_port);
+    if(OP_LIKELY(new_addrs!=NULL)){
+      _addrs=new_addrs;
+      *&_stream->resolve_time=*&resolve_time;
+    }
+    else if(OP_LIKELY(_addrs==NULL))return OP_FALSE;
+  }
+  ret=op_http_connect_impl(_stream,_conn,_addrs,_start_time);
+  if(new_addrs!=NULL)freeaddrinfo(new_addrs);
+  return ret;
+}
+
+# define OP_BASE64_LENGTH(_len) (((_len)+2)/3*4)
+
+static const char BASE64_TABLE[64]={
+  'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P',
+  'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f',
+  'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v',
+  'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/'
+};
+
+static char *op_base64_encode(char *_dst,const char *_src,int _len){
+  unsigned s0;
+  unsigned s1;
+  unsigned s2;
+  int      ngroups;
+  int      i;
+  ngroups=_len/3;
+  for(i=0;i<ngroups;i++){
+    s0=_src[3*i+0];
+    s1=_src[3*i+1];
+    s2=_src[3*i+2];
+    _dst[4*i+0]=BASE64_TABLE[s0>>2];
+    _dst[4*i+1]=BASE64_TABLE[(s0&3)<<4|s1>>4];
+    _dst[4*i+2]=BASE64_TABLE[(s1&15)<<2|s2>>6];
+    _dst[4*i+3]=BASE64_TABLE[s2&63];
+  }
+  _len-=3*i;
+  if(_len==1){
+    s0=_src[3*i+0];
+    _dst[4*i+0]=BASE64_TABLE[s0>>2];
+    _dst[4*i+1]=BASE64_TABLE[(s0&3)<<4];
+    _dst[4*i+2]='=';
+    _dst[4*i+3]='=';
+    i++;
+  }
+  else if(_len==2){
+    s0=_src[3*i+0];
+    s1=_src[3*i+1];
+    _dst[4*i+0]=BASE64_TABLE[s0>>2];
+    _dst[4*i+1]=BASE64_TABLE[(s0&3)<<4|s1>>4];
+    _dst[4*i+2]=BASE64_TABLE[(s1&15)<<2];
+    _dst[4*i+3]='=';
+    i++;
+  }
+  _dst[4*i]='\0';
+  return _dst+4*i;
+}
+
+/*Construct an HTTP authorization header using RFC 2617's Basic Authentication
+   Scheme and append it to the given string buffer.*/
+static int op_sb_append_basic_auth_header(OpusStringBuf *_sb,
+ const char *_header,const char *_user,const char *_pass){
+  int user_len;
+  int pass_len;
+  int user_pass_len;
+  int base64_len;
+  int nbuf_total;
+  int ret;
+  ret=op_sb_append_string(_sb,_header);
+  ret|=op_sb_append(_sb,": Basic ",8);
+  user_len=strlen(_user);
+  pass_len=strlen(_pass);
+  if(OP_UNLIKELY(pass_len>INT_MAX-user_len))return OP_EFAULT;
+  if(OP_UNLIKELY(user_len+pass_len>(INT_MAX>>2)*3-3))return OP_EFAULT;
+  user_pass_len=user_len+1+pass_len;
+  base64_len=OP_BASE64_LENGTH(user_pass_len);
+  /*Stick "user:pass" at the end of the buffer so we can Base64 encode it
+     in-place.*/
+  nbuf_total=_sb->nbuf;
+  if(OP_UNLIKELY(base64_len>INT_MAX-nbuf_total))return OP_EFAULT;
+  nbuf_total+=base64_len;
+  ret|=op_sb_ensure_capacity(_sb,nbuf_total);
+  if(OP_UNLIKELY(ret<0))return ret;
+  _sb->nbuf=nbuf_total-user_pass_len;
+  OP_ALWAYS_TRUE(!op_sb_append(_sb,_user,user_len));
+  OP_ALWAYS_TRUE(!op_sb_append(_sb,":",1));
+  OP_ALWAYS_TRUE(!op_sb_append(_sb,_pass,pass_len));
+  op_base64_encode(_sb->buf+nbuf_total-base64_len,
+   _sb->buf+nbuf_total-user_pass_len,user_pass_len);
+  return op_sb_append(_sb,"\r\n",2);
+}
+
+static int op_http_allow_pipelining(const char *_server){
+  /*Servers known to do bad things with pipelined requests.
+    This list is taken from Gecko's nsHttpConnection::SupportsPipelining() (in
+     netwerk/protocol/http/nsHttpConnection.cpp).*/
+  static const char *BAD_SERVERS[]={
+    "EFAServer/",
+    "Microsoft-IIS/4.",
+    "Microsoft-IIS/5.",
+    "Netscape-Enterprise/3.",
+    "Netscape-Enterprise/4.",
+    "Netscape-Enterprise/5.",
+    "Netscape-Enterprise/6.",
+    "WebLogic 3.",
+    "WebLogic 4.",
+    "WebLogic 5.",
+    "WebLogic 6.",
+    "Winstone Servlet Engine v0."
+  };
+# define NBAD_SERVERS ((int)(sizeof(BAD_SERVERS)/sizeof(*BAD_SERVERS)))
+  if(*_server>='E'&&*_server<='W'){
+    int si;
+    for(si=0;si<NBAD_SERVERS;si++){
+      if(strncmp(_server,BAD_SERVERS[si],strlen(BAD_SERVERS[si]))==0){
+        return 0;
+      }
+    }
+  }
+  return 1;
+# undef NBAD_SERVERS
+}
+
+static int op_http_stream_open(OpusHTTPStream *_stream,const char *_url,
+ int _skip_certificate_check,const char *_proxy_host,unsigned _proxy_port,
+ const char *_proxy_user,const char *_proxy_pass,OpusServerInfo *_info){
+  struct addrinfo *addrs;
+  int              nredirs;
+  int              ret;
+#if defined(_WIN32)
+  op_init_winsock();
+#endif
+  ret=op_parse_url(&_stream->url,_url);
+  if(OP_UNLIKELY(ret<0))return ret;
+  if(_proxy_host!=NULL){
+    if(OP_UNLIKELY(_proxy_port>65535U))return OP_EINVAL;
+    _stream->connect_host=op_string_dup(_proxy_host);
+    _stream->connect_port=_proxy_port;
+  }
+  else{
+    _stream->connect_host=_stream->url.host;
+    _stream->connect_port=_stream->url.port;
+  }
+  addrs=NULL;
+  for(nredirs=0;nredirs<OP_REDIRECT_LIMIT;nredirs++){
+    OpusParsedURL  next_url;
+    struct timeb   start_time;
+    struct timeb   end_time;
+    char          *next;
+    char          *status_code;
+    int            minor_version_pos;
+    int            v1_1_compat;
+    /*Initialize the SSL library if necessary.*/
+    if(OP_URL_IS_SSL(&_stream->url)&&_stream->ssl_ctx==NULL){
+      SSL_CTX *ssl_ctx;
+# if !defined(OPENSSL_NO_LOCKING)
+      /*The documentation says SSL_library_init() is not reentrant.
+        We don't want to add our own depenencies on a threading library, and it
+         appears that it's safe to call OpenSSL's locking functions before the
+         library is initialized, so that's what we'll do (really OpenSSL should
+         do this for us).
+        This doesn't guarantee that _other_ threads in the application aren't
+         calling SSL_library_init() at the same time, but there's not much we
+         can do about that.*/
+      CRYPTO_w_lock(CRYPTO_LOCK_SSL);
+# endif
+      SSL_library_init();
+      /*Needed to get SHA2 algorithms with old OpenSSL versions.*/
+      OpenSSL_add_ssl_algorithms();
+# if !defined(OPENSSL_NO_LOCKING)
+      CRYPTO_w_unlock(CRYPTO_LOCK_SSL);
+# endif
+      ssl_ctx=SSL_CTX_new(SSLv23_client_method());
+      if(ssl_ctx==NULL)return OP_EFAULT;
+      if(!_skip_certificate_check){
+        /*We don't do anything if this fails, since it just means we won't load
+           any certificates (and thus all checks will fail).
+          However, as that is probably the result of a system
+           mis-configuration, assert here to make it easier to identify.*/
+        OP_ALWAYS_TRUE(SSL_CTX_set_default_verify_paths(ssl_ctx));
+        SSL_CTX_set_verify(ssl_ctx,SSL_VERIFY_PEER,NULL);
+      }
+      _stream->ssl_ctx=ssl_ctx;
+      _stream->skip_certificate_check=_skip_certificate_check;
+      if(_proxy_host!=NULL){
+        /*We need to establish a CONNECT tunnel to handle https proxying.
+          Build the request we'll send to do so.*/
+        _stream->proxy_connect.nbuf=0;
+        ret=op_sb_append(&_stream->proxy_connect,"CONNECT ",8);
+        ret|=op_sb_append_string(&_stream->proxy_connect,_stream->url.host);
+        ret|=op_sb_append_port(&_stream->proxy_connect,_stream->url.port);
+        /*CONNECT requires at least HTTP 1.1.*/
+        ret|=op_sb_append(&_stream->proxy_connect," HTTP/1.1\r\n",11);
+        ret|=op_sb_append(&_stream->proxy_connect,"Host: ",6);
+        ret|=op_sb_append_string(&_stream->proxy_connect,_stream->url.host);
+        /*The example in RFC 2817 Section 5.2 specifies an explicit port even
+           when connecting to the default port.
+          Given that the proxy doesn't know whether we're trying to connect to
+           an http or an https URL except by the port number, this seems like a
+           good idea.*/
+        ret|=op_sb_append_port(&_stream->proxy_connect,_stream->url.port);
+        ret|=op_sb_append(&_stream->proxy_connect,"\r\n",2);
+        ret|=op_sb_append(&_stream->proxy_connect,"User-Agent: .\r\n",15);
+        if(_proxy_user!=NULL&&_proxy_pass!=NULL){
+          ret|=op_sb_append_basic_auth_header(&_stream->proxy_connect,
+           "Proxy-Authorization",_proxy_user,_proxy_pass);
+        }
+        /*For backwards compatibility.*/
+        ret|=op_sb_append(&_stream->proxy_connect,
+         "Proxy-Connection: keep-alive\r\n",30);
+        ret|=op_sb_append(&_stream->proxy_connect,"\r\n",2);
+        if(OP_UNLIKELY(ret<0))return ret;
+      }
+    }
+    /*Actually make the connection.*/
+    ret=op_http_connect(_stream,_stream->conns+0,addrs,&start_time);
+    if(OP_UNLIKELY(ret<0))return ret;
+    /*Build the request to send.*/
+    _stream->request.nbuf=0;
+    ret=op_sb_append(&_stream->request,"GET ",4);
+    ret|=op_sb_append_string(&_stream->request,
+     _proxy_host!=NULL?_url:_stream->url.path);
+    /*Send HTTP/1.0 by default for maximum compatibility (so we don't have to
+       re-try if HTTP/1.1 fails, though it shouldn't, even for a 1.0 server).
+      This means we aren't conditionally compliant with RFC 2145, because we
+       violate the requirement that "An HTTP client SHOULD send a request
+       version equal to the highest version for which the client is at least
+       conditionally compliant...".
+      According to RFC 2145, that means we can't claim any compliance with any
+       IETF HTTP specification.*/
+    ret|=op_sb_append(&_stream->request," HTTP/1.0\r\n",11);
+    /*Remember where this is so we can upgrade to HTTP/1.1 if the server
+       supports it.*/
+    minor_version_pos=_stream->request.nbuf-3;
+    ret|=op_sb_append(&_stream->request,"Host: ",6);
+    ret|=op_sb_append_string(&_stream->request,_stream->url.host);
+    if(!OP_URL_IS_DEFAULT_PORT(&_stream->url)){
+      ret|=op_sb_append_port(&_stream->request,_stream->url.port);
+    }
+    ret|=op_sb_append(&_stream->request,"\r\n",2);
+    /*User-Agents have been a bad idea, so send as little as possible.
+      RFC 2616 requires at least one token in the User-Agent, which must have
+       at least one character.*/
+    ret|=op_sb_append(&_stream->request,"User-Agent: .\r\n",15);
+    if(_proxy_host!=NULL&&!OP_URL_IS_SSL(&_stream->url)
+     &&_proxy_user!=NULL&&_proxy_pass!=NULL){
+      ret|=op_sb_append_basic_auth_header(&_stream->request,
+       "Proxy-Authorization",_proxy_user,_proxy_pass);
+    }
+    if(_stream->url.user!=NULL&&_stream->url.pass!=NULL){
+      ret|=op_sb_append_basic_auth_header(&_stream->request,
+       "Authorization",_stream->url.user,_stream->url.pass);
+    }
+    /*Always send a Referer [sic] header.
+      It's common to refuse to serve a resource unless one is present.
+      We just use the relative "/" URI to suggest we came from the same domain,
+       as this is the most common check.
+      This might violate RFC 2616's mandate that the field "MUST NOT be sent if
+       the Request-URI was obtained from a source that does not have its own
+       URI, such as input from the user keyboard," but we don't really have any
+       way to know.*/
+    /*TODO: Should we update this on redirects?*/
+    ret|=op_sb_append(&_stream->request,"Referer: /\r\n",12);
+    /*Always send a Range request header to find out if we're seekable.
+      This requires an HTTP/1.1 server to succeed, but we'll still get what we
+       want with an HTTP/1.0 server that ignores this request header.*/
+    ret|=op_sb_append(&_stream->request,"Range: bytes=0-\r\n",17);
+    /*Remember where this is so we can append offsets to it later.*/
+    _stream->request_tail=_stream->request.nbuf-4;
+    ret|=op_sb_append(&_stream->request,"\r\n",2);
+    if(OP_UNLIKELY(ret<0))return ret;
+    ret=op_http_conn_write_fully(_stream->conns+0,
+     _stream->request.buf,_stream->request.nbuf);
+    if(OP_UNLIKELY(ret<0))return ret;
+    ret=op_http_conn_read_response(_stream->conns+0,&_stream->response);
+    if(OP_UNLIKELY(ret<0))return ret;
+    ftime(&end_time);
+    next=op_http_parse_status_line(&v1_1_compat,&status_code,
+     _stream->response.buf);
+    if(OP_UNLIKELY(next==NULL))return OP_FALSE;
+    if(status_code[0]=='2'){
+      opus_int64 content_length;
+      opus_int64 range_length;
+      int        pipeline_supported;
+      int        pipeline_disabled;
+      /*We only understand 20x codes.*/
+      if(status_code[1]!='0')return OP_FALSE;
+      content_length=-1;
+      range_length=-1;
+      /*Pipelining must be explicitly enabled.*/
+      pipeline_supported=0;
+      pipeline_disabled=0;
+      for(;;){
+        char *header;
+        char *cdr;
+        ret=op_http_get_next_header(&header,&cdr,&next);
+        if(OP_UNLIKELY(ret<0))return ret;
+        if(header==NULL)break;
+        if(strcmp(header,"content-length")==0){
+          /*Two Content-Length headers?*/
+          if(OP_UNLIKELY(content_length>=0))return OP_FALSE;
+          content_length=op_http_parse_content_length(cdr);
+          if(OP_UNLIKELY(content_length<0))return (int)content_length;
+          /*Make sure the Content-Length and Content-Range headers match.*/
+          if(range_length>=0&&OP_UNLIKELY(content_length!=range_length)){
+            return OP_FALSE;
+          }
+        }
+        else if(strcmp(header,"content-range")==0){
+          opus_int64 range_first;
+          opus_int64 range_last;
+          /*Two Content-Range headers?*/
+          if(OP_UNLIKELY(range_length>=0))return OP_FALSE;
+          ret=op_http_parse_content_range(&range_first,&range_last,
+           &range_length,cdr);
+          if(OP_UNLIKELY(ret<0))return ret;
+          /*"A response with satus code 206 (Partial Content) MUST NOT
+             include a Content-Range field with a byte-range-resp-spec of
+             '*'."*/
+          if(status_code[2]=='6'
+           &&(OP_UNLIKELY(range_first<0)||OP_UNLIKELY(range_last<0))){
+            return OP_FALSE;
+          }
+          /*We asked for the entire resource.*/
+          if(range_length>=0){
+            /*Quit if we didn't get it.*/
+            if(range_last>=0&&OP_UNLIKELY(range_last!=range_length-1)){
+              return OP_FALSE;
+            }
+          }
+          /*If there was no length, use the end of the range.*/
+          else if(range_last>=0)range_length=range_last+1;
+          /*Make sure the Content-Length and Content-Range headers match.*/
+          if(content_length>=0&&OP_UNLIKELY(content_length!=range_length)){
+            return OP_FALSE;
+          }
+        }
+        else if(strcmp(header,"connection")==0){
+          /*According to RFC 2616, if an HTTP/1.1 application does not support
+             pipelining, it "MUST include the 'close' connection option in
+             every message."
+            Therefore, if we receive one in the initial response, disable
+             pipelining entirely.
+            The server still might support it (e.g., we might just have hit the
+             request limit for a temporary child process), but if it doesn't
+             and we assume it does, every time we cross a chunk boundary we'll
+             error out and reconnect, adding lots of latency.*/
+          ret=op_http_parse_connection(cdr);
+          if(OP_UNLIKELY(ret<0))return ret;
+          pipeline_disabled|=ret;
+        }
+        else if(strcmp(header,"server")==0){
+          /*If we got a Server response header, and it wasn't from a known-bad
+             server, enable pipelining, as long as it's at least HTTP/1.1.
+            According to RFC 2145, the server is supposed to respond with the
+             highest minor version number it supports unless it is known or
+             suspected that we incorrectly implement the HTTP specification.
+            So it should send back at least HTTP/1.1, despite our HTTP/1.0
+             request.*/
+          pipeline_supported=v1_1_compat;
+          if(v1_1_compat)pipeline_disabled|=!op_http_allow_pipelining(cdr);
+          if(_info!=NULL&&_info->server==NULL)_info->server=op_string_dup(cdr);
+        }
+        /*Collect station information headers if the caller requested it.
+          If there's more than one copy of a header, the first one wins.*/
+        else if(_info!=NULL){
+          if(strcmp(header,"content-type")==0){
+            if(_info->content_type==NULL){
+              _info->content_type=op_string_dup(cdr);
+            }
+          }
+          else if(header[0]=='i'&&header[1]=='c'
+           &&(header[2]=='e'||header[2]=='y')&&header[3]=='-'){
+            if(strcmp(header+4,"name")==0){
+              if(_info->name==NULL)_info->name=op_string_dup(cdr);
+            }
+            else if(strcmp(header+4,"description")==0){
+              if(_info->description==NULL)_info->description=op_string_dup(cdr);
+            }
+            else if(strcmp(header+4,"genre")==0){
+              if(_info->genre==NULL)_info->genre=op_string_dup(cdr);
+            }
+            else if(strcmp(header+4,"url")==0){
+              if(_info->url==NULL)_info->url=op_string_dup(cdr);
+            }
+            else if(strcmp(header,"icy-br")==0
+             ||strcmp(header,"ice-bitrate")==0){
+              if(_info->bitrate_kbps<0){
+                opus_int64 bitrate_kbps;
+                /*Just re-using this function to parse a random unsigned
+                   integer field.*/
+                bitrate_kbps=op_http_parse_content_length(cdr);
+                if(bitrate_kbps>=0&&bitrate_kbps<=OP_INT32_MAX){
+                  _info->bitrate_kbps=(opus_int32)bitrate_kbps;
+                }
+              }
+            }
+            else if(strcmp(header,"icy-pub")==0
+             ||strcmp(header,"ice-public")==0){
+              if(_info->is_public<0&&(cdr[0]=='0'||cdr[0]=='1')&&cdr[1]=='\0'){
+                _info->is_public=cdr[0]-'0';
+              }
+            }
+          }
+        }
+      }
+      switch(status_code[2]){
+        /*200 OK*/
+        case '0':break;
+        /*203 Non-Authoritative Information*/
+        case '3':break;
+        /*204 No Content*/
+        case '4':{
+          if(content_length>=0&&OP_UNLIKELY(content_length!=0)){
+            return OP_FALSE;
+          }
+        }break;
+        /*206 Partial Content*/
+        case '6':{
+          /*No Content-Range header.*/
+          if(OP_UNLIKELY(range_length<0))return OP_FALSE;
+          content_length=range_length;
+          /*The server supports range requests for this resource.
+            We can seek.*/
+          _stream->seekable=1;
+        }break;
+        /*201 Created: the response "SHOULD include an entity containing a list
+           of resource characteristics and location(s)," but not an Opus file.
+          202 Accepted: the response "SHOULD include an indication of request's
+           current status and either a pointer to a status monitor or some
+           estimate of when the user can expect the request to be fulfilled,"
+           but not an Opus file.
+          205 Reset Content: this "MUST NOT include an entity," meaning no Opus
+           file.
+          207...209 are not yet defined, so we don't know how to handle them.*/
+        default:return OP_FALSE;
+      }
+      _stream->content_length=content_length;
+      _stream->pipeline=pipeline_supported&&!pipeline_disabled;
+      /*Pipelining requires HTTP/1.1 persistent connections.*/
+      if(_stream->pipeline)_stream->request.buf[minor_version_pos]='1';
+      _stream->conns[0].pos=0;
+      _stream->conns[0].end_pos=_stream->seekable?content_length:-1;
+      _stream->conns[0].chunk_size=-1;
+      _stream->cur_conni=0;
+      _stream->connect_rate=op_time_diff_ms(&end_time,&start_time);
+      _stream->connect_rate=OP_MAX(_stream->connect_rate,1);
+      if(_info!=NULL)_info->is_ssl=OP_URL_IS_SSL(&_stream->url);
+      /*The URL has been successfully opened.*/
+      return 0;
+    }
+    /*Shouldn't get 1xx; 4xx and 5xx are both failures (and we don't retry).
+      Everything else is undefined.*/
+    else if(status_code[0]!='3')return OP_FALSE;
+    /*We have some form of redirect request.*/
+    /*We only understand 30x codes.*/
+    if(status_code[1]!='0')return OP_FALSE;
+    switch(status_code[2]){
+      /*300 Multiple Choices: "If the server has a preferred choice of
+         representation, it SHOULD include the specific URI for that
+         representation in the Location field," otherwise we'll fail.*/
+      case '0':
+      /*301 Moved Permanently*/
+      case '1':
+      /*302 Found*/
+      case '2':
+      /*307 Temporary Redirect*/
+      case '7':
+      /*308 Permanent Redirect (defined by draft-reschke-http-status-308-07).*/
+      case '8':break;
+      /*305 Use Proxy: "The Location field gives the URI of the proxy."
+        TODO: This shouldn't actually be that hard to do.*/
+      case '5':return OP_EIMPL;
+      /*303 See Other: "The new URI is not a substitute reference for the
+         originally requested resource."
+        304 Not Modified: "The 304 response MUST NOT contain a message-body."
+        306 (Unused)
+        309 is not yet defined, so we don't know how to handle it.*/
+      default:return OP_FALSE;
+    }
+    _url=NULL;
+    for(;;){
+      char *header;
+      char *cdr;
+      ret=op_http_get_next_header(&header,&cdr,&next);
+      if(OP_UNLIKELY(ret<0))return ret;
+      if(header==NULL)break;
+      if(strcmp(header,"location")==0&&OP_LIKELY(_url==NULL))_url=cdr;
+    }
+    if(OP_UNLIKELY(_url==NULL))return OP_FALSE;
+    ret=op_parse_url(&next_url,_url);
+    if(OP_UNLIKELY(ret<0))return ret;
+    if(_proxy_host==NULL||_stream->ssl_session!=NULL){
+      if(strcmp(_stream->url.host,next_url.host)==0
+       &&_stream->url.port==next_url.port){
+        /*Try to skip re-resolve when connecting to the same host.*/
+        addrs=&_stream->addr_info;
+      }
+      else{
+        if(_stream->ssl_session!=NULL){
+          /*Forget any cached SSL session from the last host.*/
+          SSL_SESSION_free(_stream->ssl_session);
+          _stream->ssl_session=NULL;
+        }
+      }
+    }
+    if(_proxy_host==NULL){
+      OP_ASSERT(_stream->connect_host==_stream->url.host);
+      _stream->connect_host=next_url.host;
+      _stream->connect_port=next_url.port;
+    }
+    /*Always try to skip re-resolve for proxy connections.*/
+    else addrs=&_stream->addr_info;
+    op_parsed_url_clear(&_stream->url);
+    *&_stream->url=*&next_url;
+    /*TODO: On servers/proxies that support pipelining, we might be able to
+       re-use this connection.*/
+    op_http_conn_close(_stream,_stream->conns+0,&_stream->lru_head,1);
+  }
+  /*Redirection limit reached.*/
+  return OP_FALSE;
+}
+
+static int op_http_conn_send_request(OpusHTTPStream *_stream,
+ OpusHTTPConn *_conn,opus_int64 _pos,opus_int32 _chunk_size,
+ int _try_not_to_block){
+  opus_int64 next_end;
+  int        ret;
+  /*We shouldn't have another request outstanding.*/
+  OP_ASSERT(_conn->next_pos<0);
+  /*Build the request to send.*/
+  OP_ASSERT(_stream->request.nbuf>=_stream->request_tail);
+  _stream->request.nbuf=_stream->request_tail;
+  ret=op_sb_append_nonnegative_int64(&_stream->request,_pos);
+  ret|=op_sb_append(&_stream->request,"-",1);
+  if(_chunk_size>0&&OP_ADV_OFFSET(_pos,2*_chunk_size)<_stream->content_length){
+    /*We shouldn't be pipelining requests with non-HTTP/1.1 servers.*/
+    OP_ASSERT(_stream->pipeline);
+    next_end=_pos+_chunk_size;
+    ret|=op_sb_append_nonnegative_int64(&_stream->request,next_end-1);
+    /*Use a larger chunk size for our next request.*/
+    _chunk_size<<=1;
+    /*But after a while, just request the rest of the resource.*/
+    if(_chunk_size>OP_PIPELINE_CHUNK_SIZE_MAX)_chunk_size=-1;
+  }
+  else{
+    /*Either this was a non-pipelined request or we were close enough to the
+       end to just ask for the rest.*/
+    next_end=-1;
+    _chunk_size=-1;
+  }
+  ret|=op_sb_append(&_stream->request,"\r\n\r\n",4);
+  if(OP_UNLIKELY(ret<0))return ret;
+  /*If we don't want to block, check to see if there's enough space in the send
+     queue.
+    There's still a chance we might block, even if there is enough space, but
+     it's a much slimmer one.
+    Blocking at all is pretty unlikely, as we won't have any requests queued
+     when _try_not_to_block is set, so if FIONSPACE isn't available (e.g., on
+     Linux), just skip the test.*/
+  if(_try_not_to_block){
+# if defined(FIONSPACE)
+    int available;
+    ret=ioctl(_conn->fd,FIONSPACE,&available);
+    if(ret<0||available<_stream->request.nbuf)return 1;
+# endif
+  }
+  ret=op_http_conn_write_fully(_conn,
+   _stream->request.buf,_stream->request.nbuf);
+  if(OP_UNLIKELY(ret<0))return ret;
+  _conn->next_pos=_pos;
+  _conn->next_end=next_end;
+  /*Save the chunk size to use for the next request.*/
+  _conn->chunk_size=_chunk_size;
+  _conn->nrequests_left--;
+  return ret;
+}
+
+/*Handles the response to all requests after the first one.
+  Return: 1 if the connection was closed or timed out, 0 on success, or a
+           negative value on any other error.*/
+static int op_http_conn_handle_response(OpusHTTPStream *_stream,
+ OpusHTTPConn *_conn){
+  char       *next;
+  char       *status_code;
+  opus_int64  range_length;
+  opus_int64  next_pos;
+  opus_int64  next_end;
+  int         ret;
+  ret=op_http_conn_read_response(_conn,&_stream->response);
+  /*If the server just closed the connection on us, we may have just hit a
+     connection re-use limit, so we might want to retry.*/
+  if(OP_UNLIKELY(ret<0))return ret==OP_EREAD?1:ret;
+  next=op_http_parse_status_line(NULL,&status_code,_stream->response.buf);
+  if(OP_UNLIKELY(next==NULL))return OP_FALSE;
+  /*We _need_ a 206 Partial Content response.
+    Nothing else will do.*/
+  if(strncmp(status_code,"206",3)!=0){
+    /*But on a 408 Request Timeout, we might want to re-try.*/
+    return strncmp(status_code,"408",3)==0?1:OP_FALSE;
+  }
+  next_pos=_conn->next_pos;
+  next_end=_conn->next_end;
+  range_length=-1;
+  for(;;){
+    char *header;
+    char *cdr;
+    ret=op_http_get_next_header(&header,&cdr,&next);
+    if(OP_UNLIKELY(ret<0))return ret;
+    if(header==NULL)break;
+    if(strcmp(header,"content-range")==0){
+      opus_int64 range_first;
+      opus_int64 range_last;
+      /*Two Content-Range headers?*/
+      if(OP_UNLIKELY(range_length>=0))return OP_FALSE;
+      ret=op_http_parse_content_range(&range_first,&range_last,
+       &range_length,cdr);
+      if(OP_UNLIKELY(ret<0))return ret;
+      /*"A response with satus code 206 (Partial Content) MUST NOT
+         include a Content-Range field with a byte-range-resp-spec of
+         '*'."*/
+      if(OP_UNLIKELY(range_first<0)||OP_UNLIKELY(range_last<0))return OP_FALSE;
+      /*We also don't want range_last to overflow.*/
+      if(OP_UNLIKELY(range_last>=OP_INT64_MAX))return OP_FALSE;
+      range_last++;
+      /*Quit if we didn't get the offset we asked for.*/
+      if(range_first!=next_pos)return OP_FALSE;
+      if(next_end<0){
+        /*We asked for the rest of the resource.*/
+        if(range_length>=0){
+          /*Quit if we didn't get it.*/
+          if(OP_UNLIKELY(range_last!=range_length))return OP_FALSE;
+        }
+        /*If there was no length, use the end of the range.*/
+        else range_length=range_last;
+        next_end=range_last;
+      }
+      else{
+        if(range_last!=next_end)return OP_FALSE;
+        /*If there was no length, use the larger of the content length or the
+           end of this chunk.*/
+        if(range_length<0){
+          range_length=OP_MAX(range_last,_stream->content_length);
+        }
+      }
+    }
+    else if(strcmp(header,"content-length")==0){
+      opus_int64 content_length;
+      /*Validate the Content-Length header, if present, against the request we
+         made.*/
+      content_length=op_http_parse_content_length(cdr);
+      if(OP_UNLIKELY(content_length<0))return (int)content_length;
+      if(next_end<0){
+        /*If we haven't seen the Content-Range header yet and we asked for the
+            rest of the resource, set next_end, so we can make sure they match
+            when we do find the Content-Range header.*/
+        if(OP_UNLIKELY(next_pos>OP_INT64_MAX-content_length))return OP_FALSE;
+        next_end=next_pos+content_length;
+      }
+      /*Otherwise, make sure they match now.*/
+      else if(OP_UNLIKELY(next_end-next_pos!=content_length))return OP_FALSE;
+    }
+    else if(strcmp(header,"connection")==0){
+      ret=op_http_parse_connection(cdr);
+      if(OP_UNLIKELY(ret<0))return ret;
+      /*If the server told us it was going to close the connection, don't make
+         any more requests.*/
+      if(OP_UNLIKELY(ret>0))_conn->nrequests_left=0;
+    }
+  }
+  /*No Content-Range header.*/
+  if(OP_UNLIKELY(range_length<0))return OP_FALSE;
+  /*Update the content_length if necessary.*/
+  _stream->content_length=range_length;
+  _conn->pos=next_pos;
+  _conn->end_pos=next_end;
+  _conn->next_pos=-1;
+  return 0;
+}
+
+/*Open a new connection that will start reading at byte offset _pos.
+  _pos:        The byte offset to start reading from.
+  _chunk_size: The number of bytes to ask for in the initial request, or -1 to
+                request the rest of the resource.
+               This may be more bytes than remain, in which case it will be
+                converted into a request for the rest.*/
+static int op_http_conn_open_pos(OpusHTTPStream *_stream,
+ OpusHTTPConn *_conn,opus_int64 _pos,opus_int32 _chunk_size){
+  struct timeb  start_time;
+  struct timeb  end_time;
+  opus_int32    connect_rate;
+  opus_int32    connect_time;
+  int           ret;
+  ret=op_http_connect(_stream,_conn,&_stream->addr_info,&start_time);
+  if(OP_UNLIKELY(ret<0))return ret;
+  ret=op_http_conn_send_request(_stream,_conn,_pos,_chunk_size,0);
+  if(OP_UNLIKELY(ret<0))return ret;
+  ret=op_http_conn_handle_response(_stream,_conn);
+  if(OP_UNLIKELY(ret!=0))return OP_FALSE;
+  ftime(&end_time);
+  _stream->cur_conni=_conn-_stream->conns;
+  OP_ASSERT(_stream->cur_conni>=0&&_stream->cur_conni<OP_NCONNS_MAX);
+  /*The connection has been successfully opened.
+    Update the connection time estimate.*/
+  connect_time=op_time_diff_ms(&end_time,&start_time);
+  connect_rate=_stream->connect_rate;
+  connect_rate+=OP_MAX(connect_time,1)-connect_rate+8>>4;
+  _stream->connect_rate=connect_rate;
+  return 0;
+}
+
+/*Read data from the current response body.
+  If we're pipelining and we get close to the end of this response, queue
+   another request.
+  If we've reached the end of this response body, parse the next response and
+   keep going.
+  [out] _buf: Returns the data read.
+  _buf_size:  The size of the buffer.
+  Return: A positive number of bytes read on success.
+          0:        The connection was closed.
+          OP_EREAD: There was a fatal read error.*/
+static int op_http_conn_read_body(OpusHTTPStream *_stream,
+ OpusHTTPConn *_conn,unsigned char *_buf,int _buf_size){
+  opus_int64 pos;
+  opus_int64 end_pos;
+  opus_int64 next_pos;
+  opus_int64 content_length;
+  int        nread;
+  int        pipeline;
+  int        ret;
+  /*Currently this function can only be called on the LRU head.
+    Otherwise, we'd need a _pnext pointer if we needed to close the connection,
+     and re-opening it would re-organize the lists.*/
+  OP_ASSERT(_stream->lru_head==_conn);
+  /*We should have filterd out empty reads by this point.*/
+  OP_ASSERT(_buf_size>0);
+  pos=_conn->pos;
+  end_pos=_conn->end_pos;
+  next_pos=_conn->next_pos;
+  pipeline=_stream->pipeline;
+  content_length=_stream->content_length;
+  if(end_pos>=0){
+    /*Have we reached the end of the current response body?*/
+    if(pos>=end_pos){
+      OP_ASSERT(content_length>=0);
+      /*If this was the end of the stream, we're done.
+        Also return early if a non-blocking read was requested (regardless of
+         whether we might be able to parse the next response without
+         blocking).*/
+      if(content_length<=end_pos)return 0;
+      /*Otherwise, start on the next response.*/
+      if(next_pos<0){
+        /*We haven't issued another request yet.*/
+        if(!pipeline||_conn->nrequests_left<=0){
+          /*There are two ways to get here: either the server told us it was
+             going to close the connection after the last request, or we
+             thought we were reading the whole resource, but it grew while we
+             were reading it.
+            The only way the latter could have happened is if content_length
+             changed while seeking.
+            Open a new request to read the rest.*/
+          OP_ASSERT(_stream->seekable);
+          /*Try to open a new connection to read another chunk.*/
+          op_http_conn_close(_stream,_conn,&_stream->lru_head,1);
+          /*If we're not pipelining, we should be requesting the rest.*/
+          OP_ASSERT(pipeline||_conn->chunk_size==-1);
+          ret=op_http_conn_open_pos(_stream,_conn,end_pos,_conn->chunk_size);
+          if(OP_UNLIKELY(ret<0))return OP_EREAD;
+        }
+        else{
+          /*Issue the request now (better late than never).*/
+          ret=op_http_conn_send_request(_stream,_conn,pos,_conn->chunk_size,0);
+          if(OP_UNLIKELY(ret<0))return OP_EREAD;
+          next_pos=_conn->next_pos;
+          OP_ASSERT(next_pos>=0);
+        }
+      }
+      if(next_pos>=0){
+        /*We shouldn't be trying to read past the current request body if we're
+           seeking somewhere else.*/
+        OP_ASSERT(next_pos==end_pos);
+        ret=op_http_conn_handle_response(_stream,_conn);
+        if(OP_UNLIKELY(ret<0))return OP_EREAD;
+        if(OP_UNLIKELY(ret>0)&&pipeline){
+          opus_int64 next_end;
+          next_end=_conn->next_end;
+          /*Our request timed out or the server closed the connection.
+            Try re-connecting.*/
+          op_http_conn_close(_stream,_conn,&_stream->lru_head,1);
+          /*Unless there's a bug, we should be able to convert
+             (next_pos,next_end) into valid (_pos,_chunk_size) parameters.*/
+          OP_ASSERT(next_end<0
+           ||next_end-next_pos>=0&&next_end-next_pos<=OP_INT32_MAX);
+          ret=op_http_conn_open_pos(_stream,_conn,next_pos,
+           next_end<0?-1:(opus_int32)(next_end-next_pos));
+          if(OP_UNLIKELY(ret<0))return OP_EREAD;
+        }
+        else if(OP_UNLIKELY(ret!=0))return OP_EREAD;
+      }
+      pos=_conn->pos;
+      end_pos=_conn->end_pos;
+      content_length=_stream->content_length;
+    }
+    OP_ASSERT(end_pos>pos);
+    _buf_size=OP_MIN(_buf_size,end_pos-pos);
+  }
+  nread=op_http_conn_read(_conn,(char *)_buf,_buf_size,1);
+  if(OP_UNLIKELY(nread<0))return nread;
+  pos+=nread;
+  _conn->pos=pos;
+  OP_ASSERT(end_pos<0||content_length>=0);
+  /*TODO: If nrequests_left<=0, we can't make a new request, and there will be
+     a big pause after we hit the end of the chunk while we open a new
+     connection.
+    It would be nice to be able to start that process now, but we have no way
+     to do it in the background without blocking (even if we could start it, we
+     have no guarantee the application will return control to us in a
+     sufficiently timely manner to allow us to complete it, and this is
+     uncommon enough that it's not worth using threads just for this).*/
+  if(end_pos>=0&&end_pos<content_length&&next_pos<0
+   &&pipeline&&OP_LIKELY(_conn->nrequests_left>0)){
+    opus_int64 request_thresh;
+    opus_int32 chunk_size;
+    /*Are we getting close to the end of the current response body?
+      If so, we should request more data.*/
+    request_thresh=_stream->connect_rate*_conn->read_rate>>12;
+    /*But don't commit ourselves too quickly.*/
+    chunk_size=_conn->chunk_size;
+    if(chunk_size>=0)request_thresh=OP_MIN(chunk_size>>2,request_thresh);
+    if(end_pos-pos<request_thresh){
+      ret=op_http_conn_send_request(_stream,_conn,end_pos,_conn->chunk_size,1);
+      if(OP_UNLIKELY(ret<0))return OP_EREAD;
+    }
+  }
+  return nread;
+}
+
+static int op_http_stream_read(void *_stream,
+ unsigned char *_ptr,int _buf_size){
+  OpusHTTPStream *stream;
+  ptrdiff_t       nread;
+  opus_int64      size;
+  opus_int64      pos;
+  int             ci;
+  stream=(OpusHTTPStream *)_stream;
+  /*Check for an empty read.*/
+  if(_buf_size<=0)return 0;
+  ci=stream->cur_conni;
+  /*No current connection => EOF.*/
+  if(ci<0)return 0;
+  pos=stream->conns[ci].pos;
+  size=stream->content_length;
+  /*Check for EOF.*/
+  if(size>=0){
+    if(pos>=size)return 0;
+    /*Check for a short read.*/
+    if(_buf_size>size-pos)_buf_size=(int)(size-pos);
+  }
+  nread=op_http_conn_read_body(stream,stream->conns+ci,_ptr,_buf_size);
+  if(OP_UNLIKELY(nread<=0)){
+    /*We hit an error or EOF.
+      Either way, we're done with this connection.*/
+    op_http_conn_close(stream,stream->conns+ci,&stream->lru_head,1);
+    stream->cur_conni=-1;
+    stream->pos=pos;
+  }
+  return nread;
+}
+
+/*Discard data until we reach the _target position.
+  This destroys the contents of _stream->response.buf, as we need somewhere to
+   read this data, and that is a convenient place.
+  _just_read_ahead: Whether or not this is a plain fast-forward.
+                    If 0, we need to issue a new request for a chunk at _target
+                     and discard all the data from our current request(s).
+                    Otherwise, we should be able to reach _target without
+                     issuing any new requests.
+  _target:          The stream position to which to read ahead.*/
+static int op_http_conn_read_ahead(OpusHTTPStream *_stream,
+ OpusHTTPConn *_conn,int _just_read_ahead,opus_int64 _target){
+  opus_int64 pos;
+  opus_int64 end_pos;
+  opus_int64 next_pos;
+  opus_int64 next_end;
+  ptrdiff_t  nread;
+  int        ret;
+  pos=_conn->pos;
+  end_pos=_conn->end_pos;
+  next_pos=_conn->next_pos;
+  next_end=_conn->next_end;
+  if(!_just_read_ahead){
+    /*We need to issue a new pipelined request.
+      This is the only case where we allow more than one outstanding request
+       at a time, so we need to reset next_pos (we'll restore it below if we
+       did have an outstanding request).*/
+    OP_ASSERT(_stream->pipeline);
+    _conn->next_pos=-1;
+    ret=op_http_conn_send_request(_stream,_conn,_target,
+     OP_PIPELINE_CHUNK_SIZE,0);
+    if(OP_UNLIKELY(ret<0))return ret;
+  }
+  /*We can reach the target position by reading forward in the current chunk.*/
+  if(_just_read_ahead&&(end_pos<0||_target<end_pos))end_pos=_target;
+  else if(next_pos>=0){
+    opus_int64 next_next_pos;
+    opus_int64 next_next_end;
+    /*We already have a request outstanding.
+      Finish off the current chunk.*/
+    while(pos<end_pos){
+      nread=op_http_conn_read(_conn,_stream->response.buf,
+       (int)OP_MIN(end_pos-pos,_stream->response.cbuf),1);
+      /*We failed to read ahead.*/
+      if(nread<=0)return OP_FALSE;
+      pos+=nread;
+    }
+    OP_ASSERT(pos==end_pos);
+    if(_just_read_ahead){
+      next_next_pos=next_next_end=-1;
+      end_pos=_target;
+    }
+    else{
+      OP_ASSERT(_conn->next_pos==_target);
+      next_next_pos=_target;
+      next_next_end=_conn->next_end;
+      _conn->next_pos=next_pos;
+      _conn->next_end=next_end;
+      end_pos=next_end;
+    }
+    ret=op_http_conn_handle_response(_stream,_conn);
+    if(OP_UNLIKELY(ret!=0))return OP_FALSE;
+    _conn->next_pos=next_next_pos;
+    _conn->next_end=next_next_end;
+  }
+  while(pos<end_pos){
+    nread=op_http_conn_read(_conn,_stream->response.buf,
+     (int)OP_MIN(end_pos-pos,_stream->response.cbuf),1);
+    /*We failed to read ahead.*/
+    if(nread<=0)return OP_FALSE;
+    pos+=nread;
+  }
+  OP_ASSERT(pos==end_pos);
+  if(!_just_read_ahead){
+    ret=op_http_conn_handle_response(_stream,_conn);
+    if(OP_UNLIKELY(ret!=0))return OP_FALSE;
+  }
+  else _conn->pos=end_pos;
+  OP_ASSERT(_conn->pos==_target);
+  return 0;
+}
+
+static int op_http_stream_seek(void *_stream,opus_int64 _offset,int _whence){
+  struct timeb     seek_time;
+  OpusHTTPStream  *stream;
+  OpusHTTPConn    *conn;
+  OpusHTTPConn   **pnext;
+  OpusHTTPConn    *close_conn;
+  OpusHTTPConn   **close_pnext;
+  opus_int64       content_length;
+  opus_int64       pos;
+  int              pipeline;
+  int              ci;
+  int              ret;
+  stream=(OpusHTTPStream *)_stream;
+  if(!stream->seekable)return -1;
+  content_length=stream->content_length;
+  /*If we're seekable, we should have gotten a Content-Length.*/
+  OP_ASSERT(content_length>=0);
+  ci=stream->cur_conni;
+  pos=ci<0?content_length:stream->conns[ci].pos;
+  switch(_whence){
+    case SEEK_SET:{
+      /*Check for overflow:*/
+      if(_offset<0)return -1;
+      pos=_offset;
+    }break;
+    case SEEK_CUR:{
+      /*Check for overflow:*/
+      if(_offset<-pos||_offset>OP_INT64_MAX-pos)return -1;
+      pos+=_offset;
+    }break;
+    case SEEK_END:{
+      /*Check for overflow:*/
+      if(_offset>content_length||_offset<content_length-OP_INT64_MAX)return -1;
+      pos=content_length-_offset;
+    }break;
+    default:return -1;
+  }
+  /*Mark when we deactivated the active connection.*/
+  if(ci>=0){
+    op_http_conn_read_rate_update(stream->conns+ci);
+    *&seek_time=*&stream->conns[ci].read_time;
+  }
+  else ftime(&seek_time);
+  /*If we seeked past the end of the stream, just disable the active
+     connection.*/
+  if(pos>=content_length){
+    stream->cur_conni=-1;
+    stream->pos=pos;
+    return 0;
+  }
+  /*First try to find a connection we can use without waiting.*/
+  pnext=&stream->lru_head;
+  conn=stream->lru_head;
+  while(conn!=NULL){
+    opus_int64 conn_pos;
+    opus_int64 end_pos;
+    int        available;
+    /*If this connection has been dormant too long or has made too many
+       requests, close it.
+      This is to prevent us from hitting server limits/firewall timeouts.*/
+    if(op_time_diff_ms(&seek_time,&conn->read_time)>
+     OP_CONNECTION_IDLE_TIMEOUT_MS
+     ||conn->nrequests_left<OP_PIPELINE_MIN_REQUESTS){
+      op_http_conn_close(stream,conn,pnext,1);
+      conn=*pnext;
+      continue;
+    }
+    available=op_http_conn_estimate_available(conn);
+    conn_pos=conn->pos;
+    end_pos=conn->end_pos;
+    if(conn->next_pos>=0){
+      OP_ASSERT(end_pos>=0);
+      OP_ASSERT(conn->next_pos==end_pos);
+      end_pos=conn->next_end;
+    }
+    OP_ASSERT(end_pos<0||conn_pos<=end_pos);
+    /*Can we quickly read ahead without issuing a new request or waiting for
+       any more data?
+      If we have an oustanding request, we'll over-estimate the amount of data
+       it has available (because we'll count the response headers, too), but
+       that probably doesn't matter.*/
+    if(conn_pos<=pos&&pos-conn_pos<=available&&(end_pos<0||pos<end_pos)){
+      /*Found a suitable connection to re-use.*/
+      ret=op_http_conn_read_ahead(stream,conn,1,pos);
+      if(OP_UNLIKELY(ret<0)){
+        /*The connection might have become stale, so close it and keep going.*/
+        op_http_conn_close(stream,conn,pnext,1);
+        conn=*pnext;
+        continue;
+      }
+      /*Sucessfully resurrected this connection.*/
+      *pnext=conn->next;
+      conn->next=stream->lru_head;
+      stream->lru_head=conn;
+      stream->cur_conni=conn-stream->conns;
+      return 0;
+    }
+    pnext=&conn->next;
+    conn=conn->next;
+  }
+  /*Chances are that didn't work, so now try to find one we can use by reading
+     ahead a reasonable amount and/or by issuing a new request.*/
+  close_pnext=NULL;
+  close_conn=NULL;
+  pnext=&stream->lru_head;
+  conn=stream->lru_head;
+  pipeline=stream->pipeline;
+  while(conn!=NULL){
+    opus_int64 conn_pos;
+    opus_int64 end_pos;
+    opus_int64 read_ahead_thresh;
+    int        available;
+    int        just_read_ahead;
+    /*Dividing by 2048 instead of 1000 scales this by nearly 1/2, biasing away
+       from connection re-use (and roughly compensating for the lag required to
+       reopen the TCP window of a connection that's been idle).
+      There's no overflow checking here, because it's vanishingly unlikely, and
+       all it would do is cause us to make poor decisions.*/
+    read_ahead_thresh=OP_MAX(OP_READAHEAD_THRESH_MIN,
+     stream->connect_rate*conn->read_rate>>11);
+    available=op_http_conn_estimate_available(conn);
+    conn_pos=conn->pos;
+    end_pos=conn->end_pos;
+    if(conn->next_pos>=0){
+      OP_ASSERT(end_pos>=0);
+      OP_ASSERT(conn->next_pos==end_pos);
+      end_pos=conn->next_end;
+    }
+    OP_ASSERT(end_pos<0||conn_pos<=end_pos);
+    /*Can we quickly read ahead without issuing a new request?*/
+    just_read_ahead=conn_pos<=pos&&pos-conn_pos-available<=read_ahead_thresh
+     &&(end_pos<0||pos<end_pos);
+    if(just_read_ahead||pipeline&&end_pos>=0
+     &&end_pos-conn_pos-available<=read_ahead_thresh){
+      /*Found a suitable connection to re-use.*/
+      ret=op_http_conn_read_ahead(stream,conn,just_read_ahead,pos);
+      if(OP_UNLIKELY(ret<0)){
+        /*The connection might have become stale, so close it and keep going.*/
+        op_http_conn_close(stream,conn,pnext,1);
+        conn=*pnext;
+        continue;
+      }
+      /*Sucessfully resurrected this connection.*/
+      *pnext=conn->next;
+      conn->next=stream->lru_head;
+      stream->lru_head=conn;
+      stream->cur_conni=conn-stream->conns;
+      return 0;
+    }
+    close_pnext=pnext;
+    close_conn=conn;
+    pnext=&conn->next;
+    conn=conn->next;
+  }
+  /*No suitable connections.
+    Open a new one.*/
+  if(stream->free_head==NULL){
+    /*All connections in use.
+      Expire one of them (we should have already picked which one when scanning
+       the list).*/
+    OP_ASSERT(close_conn!=NULL);
+    OP_ASSERT(close_pnext!=NULL);
+    op_http_conn_close(stream,close_conn,close_pnext,1);
+  }
+  OP_ASSERT(stream->free_head!=NULL);
+  conn=stream->free_head;
+  /*If we can pipeline, only request a chunk of data.
+    If we're seeking now, there's a good chance we will want to seek again
+     soon, and this avoids committing this connection to reading the rest of
+     the stream.
+    Particularly with SSL or proxies, issuing a new request on the same
+     connection can be substantially faster than opening a new one.
+    This also limits the amount of data the server will blast at us on this
+     connection if we later seek elsewhere and start reading from a different
+     connection.*/
+  ret=op_http_conn_open_pos(stream,conn,pos,
+   pipeline?OP_PIPELINE_CHUNK_SIZE:-1);
+  if(OP_UNLIKELY(ret<0)){
+    op_http_conn_close(stream,conn,&stream->lru_head,1);
+    return -1;
+  }
+  return 0;
+}
+
+static opus_int64 op_http_stream_tell(void *_stream){
+  OpusHTTPStream *stream;
+  int             ci;
+  stream=(OpusHTTPStream *)_stream;
+  ci=stream->cur_conni;
+  return ci<0?stream->pos:stream->conns[ci].pos;
+}
+
+static int op_http_stream_close(void *_stream){
+  OpusHTTPStream *stream;
+  stream=(OpusHTTPStream *)_stream;
+  if(OP_LIKELY(stream!=NULL)){
+    op_http_stream_clear(stream);
+    _ogg_free(stream);
+  }
+  return 0;
+}
+
+static const OpusFileCallbacks OP_HTTP_CALLBACKS={
+  op_http_stream_read,
+  op_http_stream_seek,
+  op_http_stream_tell,
+  op_http_stream_close
+};
+#endif
+
+void opus_server_info_init(OpusServerInfo *_info){
+  _info->name=NULL;
+  _info->description=NULL;
+  _info->genre=NULL;
+  _info->url=NULL;
+  _info->server=NULL;
+  _info->content_type=NULL;
+  _info->bitrate_kbps=-1;
+  _info->is_public=-1;
+  _info->is_ssl=0;
+}
+
+void opus_server_info_clear(OpusServerInfo *_info){
+  _ogg_free(_info->content_type);
+  _ogg_free(_info->server);
+  _ogg_free(_info->url);
+  _ogg_free(_info->genre);
+  _ogg_free(_info->description);
+  _ogg_free(_info->name);
+}
+
+/*The actual URL stream creation function.
+  This one isn't extensible like the application-level interface, but because
+   it isn't public, we're free to change it in the future.*/
+static void *op_url_stream_create_impl(OpusFileCallbacks *_cb,const char *_url,
+ int _skip_certificate_check,const char *_proxy_host,unsigned _proxy_port,
+ const char *_proxy_user,const char *_proxy_pass,OpusServerInfo *_info){
+  const char *path;
+  /*Check to see if this is a valid file: URL.*/
+  path=op_parse_file_url(_url);
+  if(path!=NULL){
+    char *unescaped_path;
+    void *ret;
+    unescaped_path=op_string_dup(path);
+    if(OP_UNLIKELY(unescaped_path==NULL))return NULL;
+    ret=op_fopen(_cb,op_unescape_url_component(unescaped_path),"rb");
+    _ogg_free(unescaped_path);
+    return ret;
+  }
+#if defined(OP_ENABLE_HTTP)
+  /*If not, try http/https.*/
+  else{
+    OpusHTTPStream *stream;
+    int             ret;
+    stream=(OpusHTTPStream *)_ogg_malloc(sizeof(*stream));
+    if(OP_UNLIKELY(stream==NULL))return NULL;
+    op_http_stream_init(stream);
+    ret=op_http_stream_open(stream,_url,_skip_certificate_check,
+     _proxy_host,_proxy_port,_proxy_user,_proxy_pass,_info);
+    if(OP_UNLIKELY(ret<0)){
+      op_http_stream_clear(stream);
+      _ogg_free(stream);
+      return NULL;
+    }
+    *_cb=*&OP_HTTP_CALLBACKS;
+    return stream;
+  }
+#else
+  (void)_skip_certificate_check;
+  (void)_proxy_host;
+  (void)_proxy_port;
+  (void)_proxy_user;
+  (void)_proxy_pass;
+  (void)_info;
+  return NULL;
+#endif
+}
+
+void *op_url_stream_vcreate(OpusFileCallbacks *_cb,
+ const char *_url,va_list _ap){
+  int             skip_certificate_check;
+  const char     *proxy_host;
+  opus_int32      proxy_port;
+  const char     *proxy_user;
+  const char     *proxy_pass;
+  OpusServerInfo *pinfo;
+  skip_certificate_check=0;
+  proxy_host=NULL;
+  proxy_port=8080;
+  proxy_user=NULL;
+  proxy_pass=NULL;
+  pinfo=NULL;
+  for(;;){
+    ptrdiff_t request;
+    request=va_arg(_ap,char *)-(char *)NULL;
+    /*If we hit NULL, we're done processing options.*/
+    if(!request)break;
+    switch(request){
+      case OP_SSL_SKIP_CERTIFICATE_CHECK_REQUEST:{
+        skip_certificate_check=!!va_arg(_ap,opus_int32);
+      }break;
+      case OP_HTTP_PROXY_HOST_REQUEST:{
+        proxy_host=va_arg(_ap,const char *);
+      }break;
+      case OP_HTTP_PROXY_PORT_REQUEST:{
+        proxy_port=va_arg(_ap,opus_int32);
+        if(proxy_port<0||proxy_port>(opus_int32)65535)return NULL;
+      }break;
+      case OP_HTTP_PROXY_USER_REQUEST:{
+        proxy_user=va_arg(_ap,const char *);
+      }break;
+      case OP_HTTP_PROXY_PASS_REQUEST:{
+        proxy_pass=va_arg(_ap,const char *);
+      }break;
+      case OP_GET_SERVER_INFO_REQUEST:{
+        pinfo=va_arg(_ap,OpusServerInfo *);
+      }break;
+      /*Some unknown option.*/
+      default:return NULL;
+    }
+  }
+  /*If the caller has requested server information, proxy it to a local copy to
+     simplify error handling.*/
+  if(pinfo!=NULL){
+    OpusServerInfo  info;
+    void           *ret;
+    opus_server_info_init(&info);
+    ret=op_url_stream_create_impl(_cb,_url,skip_certificate_check,
+     proxy_host,proxy_port,proxy_user,proxy_pass,&info);
+    if(ret!=NULL)*pinfo=*&info;
+    else opus_server_info_clear(&info);
+    return ret;
+  }
+  return op_url_stream_create_impl(_cb,_url,skip_certificate_check,
+   proxy_host,proxy_port,proxy_user,proxy_pass,NULL);
+}
+
+void *op_url_stream_create(OpusFileCallbacks *_cb,
+ const char *_url,...){
+  va_list  ap;
+  void    *ret;
+  va_start(ap,_url);
+  ret=op_url_stream_vcreate(_cb,_url,ap);
+  va_end(ap);
+  return ret;
+}
+
+/*Convenience routines to open/test URLs in a single step.*/
+
+OggOpusFile *op_vopen_url(const char *_url,int *_error,va_list _ap){
+  OpusFileCallbacks  cb;
+  OggOpusFile       *of;
+  void              *source;
+  source=op_url_stream_vcreate(&cb,_url,_ap);
+  if(OP_UNLIKELY(source==NULL)){
+    if(_error!=NULL)*_error=OP_EFAULT;
+    return NULL;
+  }
+  of=op_open_callbacks(source,&cb,NULL,0,_error);
+  if(OP_UNLIKELY(of==NULL))(*cb.close)(source);
+  return of;
+}
+
+OggOpusFile *op_open_url(const char *_url,int *_error,...){
+  OggOpusFile *ret;
+  va_list      ap;
+  va_start(ap,_error);
+  ret=op_vopen_url(_url,_error,ap);
+  va_end(ap);
+  return ret;
+}
+
+OggOpusFile *op_vtest_url(const char *_url,int *_error,va_list _ap){
+  OpusFileCallbacks  cb;
+  OggOpusFile       *of;
+  void              *source;
+  source=op_url_stream_vcreate(&cb,_url,_ap);
+  if(OP_UNLIKELY(source==NULL)){
+    if(_error!=NULL)*_error=OP_EFAULT;
+    return NULL;
+  }
+  of=op_test_callbacks(source,&cb,NULL,0,_error);
+  if(OP_UNLIKELY(of==NULL))(*cb.close)(source);
+  return of;
+}
+
+OggOpusFile *op_test_url(const char *_url,int *_error,...){
+  OggOpusFile *ret;
+  va_list      ap;
+  va_start(ap,_error);
+  ret=op_vtest_url(_url,_error,ap);
+  va_end(ap);
+  return ret;
+}

+ 687 - 0
drivers/opus/info.c

@@ -0,0 +1,687 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE libopusfile SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE libopusfile SOURCE CODE IS (C) COPYRIGHT 2012                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************/
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "internal.h"
+#include <limits.h>
+#include <string.h>
+
+static unsigned op_parse_uint16le(const unsigned char *_data){
+  return _data[0]|_data[1]<<8;
+}
+
+static int op_parse_int16le(const unsigned char *_data){
+  int ret;
+  ret=_data[0]|_data[1]<<8;
+  return (ret^0x8000)-0x8000;
+}
+
+static opus_uint32 op_parse_uint32le(const unsigned char *_data){
+  return _data[0]|(opus_uint32)_data[1]<<8|
+   (opus_uint32)_data[2]<<16|(opus_uint32)_data[3]<<24;
+}
+
+static opus_uint32 op_parse_uint32be(const unsigned char *_data){
+  return _data[3]|(opus_uint32)_data[2]<<8|
+   (opus_uint32)_data[1]<<16|(opus_uint32)_data[0]<<24;
+}
+
+int opus_head_parse(OpusHead *_head,const unsigned char *_data,size_t _len){
+  OpusHead head;
+  if(_len<8)return OP_ENOTFORMAT;
+  if(memcmp(_data,"OpusHead",8)!=0)return OP_ENOTFORMAT;
+  if(_len<9)return OP_EBADHEADER;
+  head.version=_data[8];
+  if(head.version>15)return OP_EVERSION;
+  if(_len<19)return OP_EBADHEADER;
+  head.channel_count=_data[9];
+  head.pre_skip=op_parse_uint16le(_data+10);
+  head.input_sample_rate=op_parse_uint32le(_data+12);
+  head.output_gain=op_parse_int16le(_data+16);
+  head.mapping_family=_data[18];
+  if(head.mapping_family==0){
+    if(head.channel_count<1||head.channel_count>2)return OP_EBADHEADER;
+    if(head.version<=1&&_len>19)return OP_EBADHEADER;
+    head.stream_count=1;
+    head.coupled_count=head.channel_count-1;
+    if(_head!=NULL){
+      _head->mapping[0]=0;
+      _head->mapping[1]=1;
+    }
+  }
+  else if(head.mapping_family==1){
+    size_t size;
+    int    ci;
+    if(head.channel_count<1||head.channel_count>8)return OP_EBADHEADER;
+    size=21+head.channel_count;
+    if(_len<size||head.version<=1&&_len>size)return OP_EBADHEADER;
+    head.stream_count=_data[19];
+    if(head.stream_count<1)return OP_EBADHEADER;
+    head.coupled_count=_data[20];
+    if(head.coupled_count>head.stream_count)return OP_EBADHEADER;
+    for(ci=0;ci<head.channel_count;ci++){
+      if(_data[21+ci]>=head.stream_count+head.coupled_count
+       &&_data[21+ci]!=255){
+        return OP_EBADHEADER;
+      }
+    }
+    if(_head!=NULL)memcpy(_head->mapping,_data+21,head.channel_count);
+  }
+  /*General purpose players should not attempt to play back content with
+     channel mapping family 255.*/
+  else if(head.mapping_family==255)return OP_EIMPL;
+  /*No other channel mapping families are currently defined.*/
+  else return OP_EBADHEADER;
+  if(_head!=NULL)memcpy(_head,&head,head.mapping-(unsigned char *)&head);
+  return 0;
+}
+
+void opus_tags_init(OpusTags *_tags){
+  memset(_tags,0,sizeof(*_tags));
+}
+
+void opus_tags_clear(OpusTags *_tags){
+  int ci;
+  for(ci=_tags->comments;ci-->0;)_ogg_free(_tags->user_comments[ci]);
+  _ogg_free(_tags->user_comments);
+  _ogg_free(_tags->comment_lengths);
+  _ogg_free(_tags->vendor);
+}
+
+/*Ensure there's room for up to _ncomments comments.*/
+static int op_tags_ensure_capacity(OpusTags *_tags,size_t _ncomments){
+  char   **user_comments;
+  int     *comment_lengths;
+  size_t   size;
+  if(OP_UNLIKELY(_ncomments>=(size_t)INT_MAX))return OP_EFAULT;
+  size=sizeof(*_tags->comment_lengths)*(_ncomments+1);
+  if(size/sizeof(*_tags->comment_lengths)!=_ncomments+1)return OP_EFAULT;
+  comment_lengths=(int *)_ogg_realloc(_tags->comment_lengths,size);
+  if(OP_UNLIKELY(comment_lengths==NULL))return OP_EFAULT;
+  comment_lengths[_ncomments]=0;
+  _tags->comment_lengths=comment_lengths;
+  size=sizeof(*_tags->user_comments)*(_ncomments+1);
+  if(size/sizeof(*_tags->user_comments)!=_ncomments+1)return OP_EFAULT;
+  user_comments=(char **)_ogg_realloc(_tags->user_comments,size);
+  if(OP_UNLIKELY(user_comments==NULL))return OP_EFAULT;
+  user_comments[_ncomments]=NULL;
+  _tags->user_comments=user_comments;
+  return 0;
+}
+
+/*Duplicate a (possibly non-NUL terminated) string with a known length.*/
+static char *op_strdup_with_len(const char *_s,size_t _len){
+  size_t  size;
+  char   *ret;
+  size=sizeof(*ret)*(_len+1);
+  if(OP_UNLIKELY(size<_len))return NULL;
+  ret=(char *)_ogg_malloc(size);
+  if(OP_LIKELY(ret!=NULL)){
+    ret=(char *)memcpy(ret,_s,sizeof(*ret)*_len);
+    ret[_len]='\0';
+  }
+  return ret;
+}
+
+/*The actual implementation of opus_tags_parse().
+  Unlike the public API, this function requires _tags to already be
+   initialized, modifies its contents before success is guaranteed, and assumes
+   the caller will clear it on error.*/
+static int opus_tags_parse_impl(OpusTags *_tags,
+ const unsigned char *_data,size_t _len){
+  opus_uint32 count;
+  size_t      len;
+  int         ncomments;
+  int         ci;
+  len=_len;
+  if(len<8)return OP_ENOTFORMAT;
+  if(memcmp(_data,"OpusTags",8)!=0)return OP_ENOTFORMAT;
+  if(len<16)return OP_EBADHEADER;
+  _data+=8;
+  len-=8;
+  count=op_parse_uint32le(_data);
+  _data+=4;
+  len-=4;
+  if(count>len)return OP_EBADHEADER;
+  if(_tags!=NULL){
+    _tags->vendor=op_strdup_with_len((char *)_data,count);
+    if(_tags->vendor==NULL)return OP_EFAULT;
+  }
+  _data+=count;
+  len-=count;
+  if(len<4)return OP_EBADHEADER;
+  count=op_parse_uint32le(_data);
+  _data+=4;
+  len-=4;
+  /*Check to make sure there's minimally sufficient data left in the packet.*/
+  if(count>len>>2)return OP_EBADHEADER;
+  /*Check for overflow (the API limits this to an int).*/
+  if(count>(opus_uint32)INT_MAX-1)return OP_EFAULT;
+  if(_tags!=NULL){
+    int ret;
+    ret=op_tags_ensure_capacity(_tags,count);
+    if(ret<0)return ret;
+  }
+  ncomments=(int)count;
+  for(ci=0;ci<ncomments;ci++){
+    /*Check to make sure there's minimally sufficient data left in the packet.*/
+    if((size_t)(ncomments-ci)>len>>2)return OP_EBADHEADER;
+    count=op_parse_uint32le(_data);
+    _data+=4;
+    len-=4;
+    if(count>len)return OP_EBADHEADER;
+    /*Check for overflow (the API limits this to an int).*/
+    if(count>(opus_uint32)INT_MAX)return OP_EFAULT;
+    if(_tags!=NULL){
+      _tags->user_comments[ci]=op_strdup_with_len((char *)_data,count);
+      if(_tags->user_comments[ci]==NULL)return OP_EFAULT;
+      _tags->comment_lengths[ci]=(int)count;
+      _tags->comments=ci+1;
+    }
+    _data+=count;
+    len-=count;
+  }
+  return 0;
+}
+
+int opus_tags_parse(OpusTags *_tags,const unsigned char *_data,size_t _len){
+  if(_tags!=NULL){
+    OpusTags tags;
+    int      ret;
+    opus_tags_init(&tags);
+    ret=opus_tags_parse_impl(&tags,_data,_len);
+    if(ret<0)opus_tags_clear(&tags);
+    else *_tags=*&tags;
+    return ret;
+  }
+  else return opus_tags_parse_impl(NULL,_data,_len);
+}
+
+/*The actual implementation of opus_tags_copy().
+  Unlike the public API, this function requires _dst to already be
+   initialized, modifies its contents before success is guaranteed, and assumes
+   the caller will clear it on error.*/
+static int opus_tags_copy_impl(OpusTags *_dst,const OpusTags *_src){
+  char *vendor;
+  int   ncomments;
+  int   ret;
+  int   ci;
+  vendor=_src->vendor;
+  _dst->vendor=op_strdup_with_len(vendor,strlen(vendor));
+  if(OP_UNLIKELY(_dst->vendor==NULL))return OP_EFAULT;
+  ncomments=_src->comments;
+  ret=op_tags_ensure_capacity(_dst,ncomments);
+  if(OP_UNLIKELY(ret<0))return ret;
+  for(ci=0;ci<ncomments;ci++){
+    int len;
+    len=_src->comment_lengths[ci];
+    OP_ASSERT(len>=0);
+    _dst->user_comments[ci]=op_strdup_with_len(_src->user_comments[ci],len);
+    if(OP_UNLIKELY(_dst->user_comments[ci]==NULL))return OP_EFAULT;
+    _dst->comment_lengths[ci]=len;
+    _dst->comments=ci+1;
+  }
+  return 0;
+}
+
+int opus_tags_copy(OpusTags *_dst,const OpusTags *_src){
+  OpusTags dst;
+  int      ret;
+  opus_tags_init(&dst);
+  ret=opus_tags_copy_impl(&dst,_src);
+  if(OP_UNLIKELY(ret<0))opus_tags_clear(&dst);
+  else *_dst=*&dst;
+  return 0;
+}
+
+int opus_tags_add(OpusTags *_tags,const char *_tag,const char *_value){
+  char *comment;
+  int   tag_len;
+  int   value_len;
+  int   ncomments;
+  int   ret;
+  ncomments=_tags->comments;
+  ret=op_tags_ensure_capacity(_tags,ncomments+1);
+  if(OP_UNLIKELY(ret<0))return ret;
+  tag_len=strlen(_tag);
+  value_len=strlen(_value);
+  /*+2 for '=' and '\0'.*/
+  _tags->comment_lengths[ncomments]=0;
+  _tags->user_comments[ncomments]=comment=
+   (char *)_ogg_malloc(sizeof(*comment)*(tag_len+value_len+2));
+  if(OP_UNLIKELY(comment==NULL))return OP_EFAULT;
+  memcpy(comment,_tag,sizeof(*comment)*tag_len);
+  comment[tag_len]='=';
+  memcpy(comment+tag_len+1,_value,sizeof(*comment)*(value_len+1));
+  _tags->comment_lengths[ncomments]=tag_len+value_len+1;
+  _tags->comments=ncomments+1;
+  return 0;
+}
+
+int opus_tags_add_comment(OpusTags *_tags,const char *_comment){
+  int comment_len;
+  int ncomments;
+  int ret;
+  ncomments=_tags->comments;
+  ret=op_tags_ensure_capacity(_tags,ncomments+1);
+  if(OP_UNLIKELY(ret<0))return ret;
+  comment_len=(int)strlen(_comment);
+  _tags->comment_lengths[ncomments]=0;
+  _tags->user_comments[ncomments]=op_strdup_with_len(_comment,comment_len);
+  if(OP_UNLIKELY(_tags->user_comments[ncomments]==NULL))return OP_EFAULT;
+  _tags->comment_lengths[ncomments]=comment_len;
+  _tags->comments=ncomments+1;
+  return 0;
+}
+
+int opus_tagcompare(const char *_tag_name,const char *_comment){
+  return opus_tagncompare(_tag_name,strlen(_tag_name),_comment);
+}
+
+int opus_tagncompare(const char *_tag_name,int _tag_len,const char *_comment){
+  int ret;
+  OP_ASSERT(_tag_len>=0);
+  ret=op_strncasecmp(_tag_name,_comment,_tag_len);
+  return ret?ret:'='-_comment[_tag_len];
+}
+
+const char *opus_tags_query(const OpusTags *_tags,const char *_tag,int _count){
+  char **user_comments;
+  int    tag_len;
+  int    found;
+  int    ncomments;
+  int    ci;
+  tag_len=strlen(_tag);
+  ncomments=_tags->comments;
+  user_comments=_tags->user_comments;
+  found=0;
+  for(ci=0;ci<ncomments;ci++){
+    if(!opus_tagncompare(_tag,tag_len,user_comments[ci])){
+      /*We return a pointer to the data, not a copy.*/
+      if(_count==found++)return user_comments[ci]+tag_len+1;
+    }
+  }
+  /*Didn't find anything.*/
+  return NULL;
+}
+
+int opus_tags_query_count(const OpusTags *_tags,const char *_tag){
+  char **user_comments;
+  int    tag_len;
+  int    found;
+  int    ncomments;
+  int    ci;
+  tag_len=strlen(_tag);
+  ncomments=_tags->comments;
+  user_comments=_tags->user_comments;
+  found=0;
+  for(ci=0;ci<ncomments;ci++){
+    if(!opus_tagncompare(_tag,tag_len,user_comments[ci]))found++;
+  }
+  return found;
+}
+
+int opus_tags_get_track_gain(const OpusTags *_tags,int *_gain_q8){
+  char **comments;
+  int    ncomments;
+  int    ci;
+  comments=_tags->user_comments;
+  ncomments=_tags->comments;
+  /*Look for the first valid R128_TRACK_GAIN tag and use that.*/
+  for(ci=0;ci<ncomments;ci++){
+    if(opus_tagncompare("R128_TRACK_GAIN",15,comments[ci])==0){
+      char       *p;
+      opus_int32  gain_q8;
+      int         negative;
+      p=comments[ci]+16;
+      negative=0;
+      if(*p=='-'){
+        negative=-1;
+        p++;
+      }
+      else if(*p=='+')p++;
+      gain_q8=0;
+      while(*p>='0'&&*p<='9'){
+        gain_q8=10*gain_q8+*p-'0';
+        if(gain_q8>32767-negative)break;
+        p++;
+      }
+      /*This didn't look like a signed 16-bit decimal integer.
+        Not a valid R128_TRACK_GAIN tag.*/
+      if(*p!='\0')continue;
+      *_gain_q8=(int)(gain_q8+negative^negative);
+      return 0;
+    }
+  }
+  return OP_FALSE;
+}
+
+static int op_is_jpeg(const unsigned char *_buf,size_t _buf_sz){
+  return _buf_sz>=11&&memcmp(_buf,"\xFF\xD8\xFF\xE0",4)==0
+   &&(_buf[4]<<8|_buf[5])>=16&&memcmp(_buf+6,"JFIF",5)==0;
+}
+
+/*Tries to extract the width, height, bits per pixel, and palette size of a
+   JPEG.
+  On failure, simply leaves its outputs unmodified.*/
+static void op_extract_jpeg_params(const unsigned char *_buf,size_t _buf_sz,
+ opus_uint32 *_width,opus_uint32 *_height,
+ opus_uint32 *_depth,opus_uint32 *_colors,int *_has_palette){
+  if(op_is_jpeg(_buf,_buf_sz)){
+    size_t offs;
+    offs=2;
+    for(;;){
+      size_t segment_len;
+      int    marker;
+      while(offs<_buf_sz&&_buf[offs]!=0xFF)offs++;
+      while(offs<_buf_sz&&_buf[offs]==0xFF)offs++;
+      marker=_buf[offs];
+      offs++;
+      /*If we hit EOI* (end of image), or another SOI* (start of image),
+         or SOS (start of scan), then stop now.*/
+      if(offs>=_buf_sz||(marker>=0xD8&&marker<=0xDA))break;
+      /*RST* (restart markers): skip (no segment length).*/
+      else if(marker>=0xD0&&marker<=0xD7)continue;
+      /*Read the length of the marker segment.*/
+      if(_buf_sz-offs<2)break;
+      segment_len=_buf[offs]<<8|_buf[offs+1];
+      if(segment_len<2||_buf_sz-offs<segment_len)break;
+      if(marker==0xC0||(marker>0xC0&&marker<0xD0&&(marker&3)!=0)){
+        /*Found a SOFn (start of frame) marker segment:*/
+        if(segment_len>=8){
+          *_height=_buf[offs+3]<<8|_buf[offs+4];
+          *_width=_buf[offs+5]<<8|_buf[offs+6];
+          *_depth=_buf[offs+2]*_buf[offs+7];
+          *_colors=0;
+          *_has_palette=0;
+        }
+        break;
+      }
+      /*Other markers: skip the whole marker segment.*/
+      offs+=segment_len;
+    }
+  }
+}
+
+static int op_is_png(const unsigned char *_buf,size_t _buf_sz){
+  return _buf_sz>=8&&memcmp(_buf,"\x89PNG\x0D\x0A\x1A\x0A",8)==0;
+}
+
+/*Tries to extract the width, height, bits per pixel, and palette size of a
+   PNG.
+  On failure, simply leaves its outputs unmodified.*/
+static void op_extract_png_params(const unsigned char *_buf,size_t _buf_sz,
+ opus_uint32 *_width,opus_uint32 *_height,
+ opus_uint32 *_depth,opus_uint32 *_colors,int *_has_palette){
+  if(op_is_png(_buf,_buf_sz)){
+    size_t offs;
+    offs=8;
+    while(_buf_sz-offs>=12){
+      ogg_uint32_t chunk_len;
+      chunk_len=op_parse_uint32be(_buf+offs);
+      if(chunk_len>_buf_sz-(offs+12))break;
+      else if(chunk_len==13&&memcmp(_buf+offs+4,"IHDR",4)==0){
+        int color_type;
+        *_width=op_parse_uint32be(_buf+offs+8);
+        *_height=op_parse_uint32be(_buf+offs+12);
+        color_type=_buf[offs+17];
+        if(color_type==3){
+          *_depth=24;
+          *_has_palette=1;
+        }
+        else{
+          int sample_depth;
+          sample_depth=_buf[offs+16];
+          if(color_type==0)*_depth=sample_depth;
+          else if(color_type==2)*_depth=sample_depth*3;
+          else if(color_type==4)*_depth=sample_depth*2;
+          else if(color_type==6)*_depth=sample_depth*4;
+          *_colors=0;
+          *_has_palette=0;
+          break;
+        }
+      }
+      else if(*_has_palette>0&&memcmp(_buf+offs+4,"PLTE",4)==0){
+        *_colors=chunk_len/3;
+        break;
+      }
+      offs+=12+chunk_len;
+    }
+  }
+}
+
+static int op_is_gif(const unsigned char *_buf,size_t _buf_sz){
+  return _buf_sz>=6&&(memcmp(_buf,"GIF87a",6)==0||memcmp(_buf,"GIF89a",6)==0);
+}
+
+/*Tries to extract the width, height, bits per pixel, and palette size of a
+   GIF.
+  On failure, simply leaves its outputs unmodified.*/
+static void op_extract_gif_params(const unsigned char *_buf,size_t _buf_sz,
+ opus_uint32 *_width,opus_uint32 *_height,
+ opus_uint32 *_depth,opus_uint32 *_colors,int *_has_palette){
+  if(op_is_gif(_buf,_buf_sz)&&_buf_sz>=14){
+    *_width=_buf[6]|_buf[7]<<8;
+    *_height=_buf[8]|_buf[9]<<8;
+    /*libFLAC hard-codes the depth to 24.*/
+    *_depth=24;
+    *_colors=1<<((_buf[10]&7)+1);
+    *_has_palette=1;
+  }
+}
+
+/*The actual implementation of opus_picture_tag_parse().
+  Unlike the public API, this function requires _pic to already be
+   initialized, modifies its contents before success is guaranteed, and assumes
+   the caller will clear it on error.*/
+static int opus_picture_tag_parse_impl(OpusPictureTag *_pic,const char *_tag,
+ unsigned char *_buf,size_t _buf_sz,size_t _base64_sz){
+  opus_int32   picture_type;
+  opus_uint32  mime_type_length;
+  char        *mime_type;
+  opus_uint32  description_length;
+  char        *description;
+  opus_uint32  width;
+  opus_uint32  height;
+  opus_uint32  depth;
+  opus_uint32  colors;
+  opus_uint32  data_length;
+  opus_uint32  file_width;
+  opus_uint32  file_height;
+  opus_uint32  file_depth;
+  opus_uint32  file_colors;
+  int          format;
+  int          has_palette;
+  int          colors_set;
+  size_t       i;
+  /*Decode the BASE64 data.*/
+  for(i=0;i<_base64_sz;i++){
+    opus_uint32 value;
+    int         j;
+    value=0;
+    for(j=0;j<4;j++){
+      unsigned c;
+      unsigned d;
+      c=(unsigned char)_tag[4*i+j];
+      if(c=='+')d=62;
+      else if(c=='/')d=63;
+      else if(c>='0'&&c<='9')d=52+c-'0';
+      else if(c>='a'&&c<='z')d=26+c-'a';
+      else if(c>='A'&&c<='Z')d=c-'A';
+      else if(c=='='&&3*i+j>_buf_sz)d=0;
+      else return OP_ENOTFORMAT;
+      value=value<<6|d;
+    }
+    _buf[3*i]=(unsigned char)(value>>16);
+    if(3*i+1<_buf_sz){
+      _buf[3*i+1]=(unsigned char)(value>>8);
+      if(3*i+2<_buf_sz)_buf[3*i+2]=(unsigned char)value;
+    }
+  }
+  i=0;
+  picture_type=op_parse_uint32be(_buf+i);
+  i+=4;
+  /*Extract the MIME type.*/
+  mime_type_length=op_parse_uint32be(_buf+i);
+  i+=4;
+  if(mime_type_length>_buf_sz-32)return OP_ENOTFORMAT;
+  mime_type=(char *)_ogg_malloc(sizeof(*_pic->mime_type)*(mime_type_length+1));
+  if(mime_type==NULL)return OP_EFAULT;
+  memcpy(mime_type,_buf+i,sizeof(*mime_type)*mime_type_length);
+  mime_type[mime_type_length]='\0';
+  _pic->mime_type=mime_type;
+  i+=mime_type_length;
+  /*Extract the description string.*/
+  description_length=op_parse_uint32be(_buf+i);
+  i+=4;
+  if(description_length>_buf_sz-mime_type_length-32)return OP_ENOTFORMAT;
+  description=
+   (char *)_ogg_malloc(sizeof(*_pic->mime_type)*(description_length+1));
+  if(description==NULL)return OP_EFAULT;
+  memcpy(description,_buf+i,sizeof(*description)*description_length);
+  description[description_length]='\0';
+  _pic->description=description;
+  i+=description_length;
+  /*Extract the remaining fields.*/
+  width=op_parse_uint32be(_buf+i);
+  i+=4;
+  height=op_parse_uint32be(_buf+i);
+  i+=4;
+  depth=op_parse_uint32be(_buf+i);
+  i+=4;
+  colors=op_parse_uint32be(_buf+i);
+  i+=4;
+  /*If one of these is set, they all must be, but colors==0 is a valid value.*/
+  colors_set=width!=0||height!=0||depth!=0||colors!=0;
+  if((width==0||height==0||depth==0)&&colors_set)return OP_ENOTFORMAT;
+  data_length=op_parse_uint32be(_buf+i);
+  i+=4;
+  if(data_length>_buf_sz-i)return OP_ENOTFORMAT;
+  /*Trim extraneous data so we don't copy it below.*/
+  _buf_sz=i+data_length;
+  /*Attempt to determine the image format.*/
+  format=OP_PIC_FORMAT_UNKNOWN;
+  if(mime_type_length==3&&strcmp(mime_type,"-->")==0){
+    format=OP_PIC_FORMAT_URL;
+    /*Picture type 1 must be a 32x32 PNG.*/
+    if(picture_type==1&&(width!=0||height!=0)&&(width!=32||height!=32)){
+      return OP_ENOTFORMAT;
+    }
+    /*Append a terminating NUL for the convenience of our callers.*/
+    _buf[_buf_sz++]='\0';
+  }
+  else{
+    if(mime_type_length==10
+     &&op_strncasecmp(mime_type,"image/jpeg",mime_type_length)==0){
+      if(op_is_jpeg(_buf+i,data_length))format=OP_PIC_FORMAT_JPEG;
+    }
+    else if(mime_type_length==9
+     &&op_strncasecmp(mime_type,"image/png",mime_type_length)==0){
+      if(op_is_png(_buf+i,data_length))format=OP_PIC_FORMAT_PNG;
+    }
+    else if(mime_type_length==9
+     &&op_strncasecmp(mime_type,"image/gif",mime_type_length)==0){
+      if(op_is_gif(_buf+i,data_length))format=OP_PIC_FORMAT_GIF;
+    }
+    else if(mime_type_length==0||(mime_type_length==6
+     &&op_strncasecmp(mime_type,"image/",mime_type_length)==0)){
+      if(op_is_jpeg(_buf+i,data_length))format=OP_PIC_FORMAT_JPEG;
+      else if(op_is_png(_buf+i,data_length))format=OP_PIC_FORMAT_PNG;
+      else if(op_is_gif(_buf+i,data_length))format=OP_PIC_FORMAT_GIF;
+    }
+    file_width=file_height=file_depth=file_colors=0;
+    has_palette=-1;
+    switch(format){
+      case OP_PIC_FORMAT_JPEG:{
+        op_extract_jpeg_params(_buf+i,data_length,
+         &file_width,&file_height,&file_depth,&file_colors,&has_palette);
+      }break;
+      case OP_PIC_FORMAT_PNG:{
+        op_extract_png_params(_buf+i,data_length,
+         &file_width,&file_height,&file_depth,&file_colors,&has_palette);
+      }break;
+      case OP_PIC_FORMAT_GIF:{
+        op_extract_gif_params(_buf+i,data_length,
+         &file_width,&file_height,&file_depth,&file_colors,&has_palette);
+      }break;
+    }
+    if(has_palette>=0){
+      /*If we successfully extracted these parameters from the image, override
+         any declared values.*/
+      width=file_width;
+      height=file_height;
+      depth=file_depth;
+      colors=file_colors;
+    }
+    /*Picture type 1 must be a 32x32 PNG.*/
+    if(picture_type==1&&(format!=OP_PIC_FORMAT_PNG||width!=32||height!=32)){
+      return OP_ENOTFORMAT;
+    }
+  }
+  /*Adjust _buf_sz instead of using data_length to capture the terminating NUL
+     for URLs.*/
+  _buf_sz-=i;
+  memmove(_buf,_buf+i,sizeof(*_buf)*_buf_sz);
+  _buf=(unsigned char *)_ogg_realloc(_buf,_buf_sz);
+  if(_buf_sz>0&&_buf==NULL)return OP_EFAULT;
+  _pic->type=picture_type;
+  _pic->width=width;
+  _pic->height=height;
+  _pic->depth=depth;
+  _pic->colors=colors;
+  _pic->data_length=data_length;
+  _pic->data=_buf;
+  _pic->format=format;
+  return 0;
+}
+
+int opus_picture_tag_parse(OpusPictureTag *_pic,const char *_tag){
+  OpusPictureTag  pic;
+  unsigned char  *buf;
+  size_t          base64_sz;
+  size_t          buf_sz;
+  size_t          tag_length;
+  int             ret;
+  if(opus_tagncompare("METADATA_BLOCK_PICTURE",22,_tag)==0)_tag+=23;
+  /*Figure out how much BASE64-encoded data we have.*/
+  tag_length=strlen(_tag);
+  if(tag_length&3)return OP_ENOTFORMAT;
+  base64_sz=tag_length>>2;
+  buf_sz=3*base64_sz;
+  if(buf_sz<32)return OP_ENOTFORMAT;
+  if(_tag[tag_length-1]=='=')buf_sz--;
+  if(_tag[tag_length-2]=='=')buf_sz--;
+  if(buf_sz<32)return OP_ENOTFORMAT;
+  /*Allocate an extra byte to allow appending a terminating NUL to URL data.*/
+  buf=(unsigned char *)_ogg_malloc(sizeof(*buf)*(buf_sz+1));
+  if(buf==NULL)return OP_EFAULT;
+  opus_picture_tag_init(&pic);
+  ret=opus_picture_tag_parse_impl(&pic,_tag,buf,buf_sz,base64_sz);
+  if(ret<0){
+    opus_picture_tag_clear(&pic);
+    _ogg_free(buf);
+  }
+  else *_pic=*&pic;
+  return ret;
+}
+
+void opus_picture_tag_init(OpusPictureTag *_pic){
+  memset(_pic,0,sizeof(*_pic));
+}
+
+void opus_picture_tag_clear(OpusPictureTag *_pic){
+  _ogg_free(_pic->description);
+  _ogg_free(_pic->mime_type);
+  _ogg_free(_pic->data);
+}

+ 42 - 0
drivers/opus/internal.c

@@ -0,0 +1,42 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE libopusfile SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE libopusfile SOURCE CODE IS (C) COPYRIGHT 2012                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************/
+#ifdef OPUS_HAVE_CONFIG_H
+#include "opus_config.h"
+#endif
+
+#include "internal.h"
+
+#if defined(OP_ENABLE_ASSERTIONS)
+void op_fatal_impl(const char *_str,const char *_file,int _line){
+  fprintf(stderr,"Fatal (internal) error in %s, line %i: %s\n",
+   _file,_line,_str);
+  abort();
+}
+#endif
+
+/*A version of strncasecmp() that is guaranteed to only ignore the case of
+   ASCII characters.*/
+int op_strncasecmp(const char *_a,const char *_b,int _n){
+  int i;
+  for(i=0;i<_n;i++){
+    int a;
+    int b;
+    int d;
+    a=_a[i];
+    b=_b[i];
+    if(a>='a'&&a<='z')a-='a'-'A';
+    if(b>='a'&&b<='z')b-='a'-'A';
+    d=a-b;
+    if(d)return d;
+  }
+  return 0;
+}

+ 249 - 0
drivers/opus/internal.h

@@ -0,0 +1,249 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE libopusfile SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE libopusfile SOURCE CODE IS (C) COPYRIGHT 2012                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************/
+#if !defined(_opusfile_internal_h)
+# define _opusfile_internal_h (1)
+
+# if !defined(_REENTRANT)
+#  define _REENTRANT
+# endif
+# if !defined(_GNU_SOURCE)
+#  define _GNU_SOURCE
+# endif
+# if !defined(_LARGEFILE_SOURCE)
+#  define _LARGEFILE_SOURCE
+# endif
+# if !defined(_LARGEFILE64_SOURCE)
+#  define _LARGEFILE64_SOURCE
+# endif
+# if !defined(_FILE_OFFSET_BITS)
+#  define _FILE_OFFSET_BITS 64
+# endif
+
+# include <stdlib.h>
+# include <opus/opusfile.h>
+
+typedef struct OggOpusLink OggOpusLink;
+
+# if defined(OPUS_FIXED_POINT)
+
+typedef opus_int16 op_sample;
+
+# else
+
+typedef float      op_sample;
+
+/*We're using this define to test for libopus 1.1 or later until libopus
+   provides a better mechanism.*/
+#  if defined(OPUS_GET_EXPERT_FRAME_DURATION_REQUEST)
+/*Enable soft clipping prevention in 16-bit decodes.*/
+#   define OP_SOFT_CLIP (1)
+#  endif
+
+# endif
+
+# if OP_GNUC_PREREQ(4,2)
+/*Disable excessive warnings about the order of operations.*/
+#  pragma GCC diagnostic ignored "-Wparentheses"
+# elif defined(_MSC_VER)
+/*Disable excessive warnings about the order of operations.*/
+#  pragma warning(disable:4554)
+/*Disable warnings about "deprecated" POSIX functions.*/
+#  pragma warning(disable:4996)
+# endif
+
+# if OP_GNUC_PREREQ(3,0)
+/*Another alternative is
+    (__builtin_constant_p(_x)?!!(_x):__builtin_expect(!!(_x),1))
+   but that evaluates _x multiple times, which may be bad.*/
+#  define OP_LIKELY(_x) (__builtin_expect(!!(_x),1))
+#  define OP_UNLIKELY(_x) (__builtin_expect(!!(_x),0))
+# else
+#  define OP_LIKELY(_x)   (!!(_x))
+#  define OP_UNLIKELY(_x) (!!(_x))
+# endif
+
+# if defined(OP_ENABLE_ASSERTIONS)
+#  if OP_GNUC_PREREQ(2,5)||__SUNPRO_C>=0x590
+__attribute__((noreturn))
+#  endif
+void op_fatal_impl(const char *_str,const char *_file,int _line);
+
+#  define OP_FATAL(_str) (op_fatal_impl(_str,__FILE__,__LINE__))
+
+#  define OP_ASSERT(_cond) \
+  do{ \
+    if(OP_UNLIKELY(!(_cond)))OP_FATAL("assertion failed: " #_cond); \
+  } \
+  while(0)
+#  define OP_ALWAYS_TRUE(_cond) OP_ASSERT(_cond)
+
+# else
+#  define OP_FATAL(_str) abort()
+#  define OP_ASSERT(_cond)
+#  define OP_ALWAYS_TRUE(_cond) ((void)(_cond))
+# endif
+
+# define OP_INT64_MAX (2*(((ogg_int64_t)1<<62)-1)|1)
+# define OP_INT64_MIN (-OP_INT64_MAX-1)
+# define OP_INT32_MAX (2*(((ogg_int32_t)1<<30)-1)|1)
+# define OP_INT32_MIN (-OP_INT32_MAX-1)
+
+# define OP_MIN(_a,_b)        ((_a)<(_b)?(_a):(_b))
+# define OP_MAX(_a,_b)        ((_a)>(_b)?(_a):(_b))
+# define OP_CLAMP(_lo,_x,_hi) (OP_MAX(_lo,OP_MIN(_x,_hi)))
+
+/*Advance a file offset by the given amount, clamping against OP_INT64_MAX.
+  This is used to advance a known offset by things like OP_CHUNK_SIZE or
+   OP_PAGE_SIZE_MAX, while making sure to avoid signed overflow.
+  It assumes that both _offset and _amount are non-negative.*/
+#define OP_ADV_OFFSET(_offset,_amount) \
+ (OP_MIN(_offset,OP_INT64_MAX-(_amount))+(_amount))
+
+/*The maximum channel count for any mapping we'll actually decode.*/
+# define OP_NCHANNELS_MAX (8)
+
+/*Initial state.*/
+# define  OP_NOTOPEN   (0)
+/*We've found the first Opus stream in the first link.*/
+# define  OP_PARTOPEN  (1)
+# define  OP_OPENED    (2)
+/*We've found the first Opus stream in the current link.*/
+# define  OP_STREAMSET (3)
+/*We've initialized the decoder for the chosen Opus stream in the current
+   link.*/
+# define  OP_INITSET   (4)
+
+/*Information cached for a single link in a chained Ogg Opus file.
+  We choose the first Opus stream encountered in each link to play back (and
+   require at least one).*/
+struct OggOpusLink{
+  /*The byte offset of the first header page in this link.*/
+  opus_int64   offset;
+  /*The byte offset of the first data page from the chosen Opus stream in this
+     link (after the headers).*/
+  opus_int64   data_offset;
+  /*The byte offset of the last page from the chosen Opus stream in this link.
+    This is used when seeking to ensure we find a page before the last one, so
+     that end-trimming calculations work properly.
+    This is only valid for seekable sources.*/
+  opus_int64   end_offset;
+  /*The granule position of the last sample.
+    This is only valid for seekable sources.*/
+  ogg_int64_t  pcm_end;
+  /*The granule position before the first sample.*/
+  ogg_int64_t  pcm_start;
+  /*The serial number.*/
+  ogg_uint32_t serialno;
+  /*The contents of the info header.*/
+  OpusHead     head;
+  /*The contents of the comment header.*/
+  OpusTags     tags;
+};
+
+struct OggOpusFile{
+  /*The callbacks used to access the data source.*/
+  OpusFileCallbacks  callbacks;
+  /*A FILE *, memory bufer, etc.*/
+  void              *source;
+  /*Whether or not we can seek with this data source.*/
+  int                seekable;
+  /*The number of links in this chained Ogg Opus file.*/
+  int                nlinks;
+  /*The cached information from each link in a chained Ogg Opus file.
+    If source isn't seekable (e.g., it's a pipe), only the current link
+     appears.*/
+  OggOpusLink       *links;
+  /*The number of serial numbers from a single link.*/
+  int                nserialnos;
+  /*The capacity of the list of serial numbers from a single link.*/
+  int                cserialnos;
+  /*Storage for the list of serial numbers from a single link.*/
+  ogg_uint32_t      *serialnos;
+  /*This is the current offset of the data processed by the ogg_sync_state.
+    After a seek, this should be set to the target offset so that we can track
+     the byte offsets of subsequent pages.
+    After a call to op_get_next_page(), this will point to the first byte after
+     that page.*/
+  opus_int64         offset;
+  /*The total size of this data source, or -1 if it's unseekable.*/
+  opus_int64         end;
+  /*Used to locate pages in the data source.*/
+  ogg_sync_state     oy;
+  /*One of OP_NOTOPEN, OP_PARTOPEN, OP_OPENED, OP_STREAMSET, OP_INITSET.*/
+  int                ready_state;
+  /*The current link being played back.*/
+  int                cur_link;
+  /*The number of decoded samples to discard from the start of decoding.*/
+  opus_int32         cur_discard_count;
+  /*The granule position of the previous packet (current packet start time).*/
+  ogg_int64_t        prev_packet_gp;
+  /*The number of bytes read since the last bitrate query, including framing.*/
+  opus_int64         bytes_tracked;
+  /*The number of samples decoded since the last bitrate query.*/
+  ogg_int64_t        samples_tracked;
+  /*Takes physical pages and welds them into a logical stream of packets.*/
+  ogg_stream_state   os;
+  /*Re-timestamped packets from a single page.
+    Buffering these relies on the undocumented libogg behavior that ogg_packet
+     pointers remain valid until the next page is submitted to the
+     ogg_stream_state they came from.*/
+  ogg_packet         op[255];
+  /*The index of the next packet to return.*/
+  int                op_pos;
+  /*The total number of packets available.*/
+  int                op_count;
+  /*Central working state for the packet-to-PCM decoder.*/
+  OpusMSDecoder     *od;
+  /*The application-provided packet decode callback.*/
+  op_decode_cb_func  decode_cb;
+  /*The application-provided packet decode callback context.*/
+  void              *decode_cb_ctx;
+  /*The stream count used to initialize the decoder.*/
+  int                od_stream_count;
+  /*The coupled stream count used to initialize the decoder.*/
+  int                od_coupled_count;
+  /*The channel count used to initialize the decoder.*/
+  int                od_channel_count;
+  /*The channel mapping used to initialize the decoder.*/
+  unsigned char      od_mapping[OP_NCHANNELS_MAX];
+  /*The buffered data for one decoded packet.*/
+  op_sample         *od_buffer;
+  /*The current position in the decoded buffer.*/
+  int                od_buffer_pos;
+  /*The number of valid samples in the decoded buffer.*/
+  int                od_buffer_size;
+  /*The type of gain offset to apply.
+    One of OP_HEADER_GAIN, OP_TRACK_GAIN, or OP_ABSOLUTE_GAIN.*/
+  int                gain_type;
+  /*The offset to apply to the gain.*/
+  opus_int32         gain_offset_q8;
+  /*Internal state for soft clipping and dithering float->short output.*/
+#if !defined(OPUS_FIXED_POINT)
+# if defined(OP_SOFT_CLIP)
+  float              clip_state[OP_NCHANNELS_MAX];
+# endif
+  float              dither_a[OP_NCHANNELS_MAX*4];
+  float              dither_b[OP_NCHANNELS_MAX*4];
+  opus_uint32        dither_seed;
+  int                dither_mute;
+  int                dither_disabled;
+  /*The number of channels represented by the internal state.
+    This gets set to 0 whenever anything that would prevent state propagation
+     occurs (switching between the float/short APIs, or between the
+     stereo/multistream APIs).*/
+  int                state_channel_count;
+#endif
+};
+
+int op_strncasecmp(const char *_a,const char *_b,int _n);
+
+#endif

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.