View source

Merge branch 'master' of github.com:okamstudio/godot

Anton Yabchinskiy 10 years ago
parent commit 3b9868d2e4
100 files changed with 12705 additions and 11580 deletions
  1. 4 0
      .gitattributes
  2. 14 1
      .gitignore
  3. 3 3
      Doxyfile
  4. 13 7
      SConstruct
  5. 1 0
      bin/tests/test_main.cpp
  6. 33 0
      core/bind/core_bind.cpp
  7. 19 0
      core/bind/core_bind.h
  8. 1 0
      core/error_list.h
  9. 9 9
      core/error_macros.h
  10. 1 2
      core/func_ref.cpp
  11. 1 0
      core/global_constants.cpp
  12. 1 2
      core/globals.cpp
  13. 140 0
      core/image.cpp
  14. 4 0
      core/image.h
  15. 359 359
      core/io/aes256.cpp
  16. 46 46
      core/io/aes256.h
  17. 2 2
      core/io/compression.cpp
  18. 9 2
      core/io/image_loader.cpp
  19. 0 1
      core/io/ioapi.h
  20. 4 4
      core/io/marshalls.cpp
  21. 1 2
      core/io/packet_peer.cpp
  22. 316 25
      core/io/resource_format_binary.cpp
  23. 9 5
      core/io/resource_format_binary.h
  24. 271 55
      core/io/resource_format_xml.cpp
  25. 18 7
      core/io/resource_format_xml.h
  26. 54 13
      core/io/resource_loader.cpp
  27. 16 6
      core/io/resource_loader.h
  28. 1 1
      core/io/resource_saver.h
  29. 8 1
      core/io/translation_loader_po.cpp
  30. 1 1
      core/io/translation_loader_po.h
  31. 1 1
      core/io/unzip.c
  32. 2 2
      core/io/zip.c
  33. 2 0
      core/io/zip.h
  34. 1 0
      core/io/zip_io.h
  35. 8 2
      core/list.h
  36. 2 2
      core/math/math_funcs.h
  37. 21 9
      core/math/vector3.h
  38. 32 32
      core/object.h
  39. 19 0
      core/object_type_db.cpp
  40. 3 1
      core/object_type_db.h
  41. 1 263
      core/os/input.cpp
  42. 4 66
      core/os/input.h
  43. 2 1
      core/os/main_loop.cpp
  44. 2 0
      core/os/main_loop.h
  45. 6 3
      core/os/os.cpp
  46. 4 2
      core/os/os.h
  47. 31 0
      core/path_db.cpp
  48. 4 1
      core/path_db.h
  49. 1 1
      core/resource.h
  50. 4 4
      core/ring_buffer.h
  51. 284 6
      core/script_debugger_remote.cpp
  52. 49 0
      core/script_debugger_remote.h
  53. 36 0
      core/script_language.h
  54. 2 2
      core/typedefs.h
  55. 27 0
      core/undo_redo.cpp
  56. 11 0
      core/undo_redo.h
  57. 35 4
      core/ustring.cpp
  58. 1 0
      core/ustring.h
  59. 10 2
      core/variant.cpp
  60. 8 10
      core/variant_call.cpp
  61. 21 0
      demos/2d/dynamic_collision_shapes/ball.gd
  62. BIN
      demos/2d/dynamic_collision_shapes/ball.png
  63. BIN
      demos/2d/dynamic_collision_shapes/ball.scn
  64. BIN
      demos/2d/dynamic_collision_shapes/box.png
  65. BIN
      demos/2d/dynamic_collision_shapes/circle.png
  66. 23 0
      demos/2d/dynamic_collision_shapes/dynamic_colobjs.gd
  67. BIN
      demos/2d/dynamic_collision_shapes/dynamic_colobjs.scn
  68. 4 0
      demos/2d/dynamic_collision_shapes/engine.cfg
  69. BIN
      demos/2d/dynamic_collision_shapes/poly.png
  70. BIN
      demos/2d/isometric/dungeon.scn
  71. BIN
      demos/2d/lights_shadows/light_shadows.scn
  72. 8 0
      demos/2d/platformer/coin.gd
  73. 174 169
      demos/2d/platformer/enemy.xml
  74. 3 3
      demos/2d/platformer/engine.cfg
  75. 157 191
      demos/2d/platformer/player.xml
  76. 4 20
      demos/2d/platformer/stage.xml
  77. 1 0
      demos/2d/platformer/tiles_demo.png.flags
  78. 2 1
      demos/3d/platformer/enemy.gd
  79. BIN
      demos/3d/platformer/enemy.scn
  80. 3 3
      demos/3d/platformer/player.gd
  81. 190 182
      doc/base/classes.xml
  82. 123 123
      doc/engine_classes.xml
  83. 5 3
      drivers/SCsub
  84. 12 1
      drivers/chibi/event_stream_chibi.cpp
  85. 1 1
      drivers/chibi/event_stream_chibi.h
  86. 2 1
      drivers/convex_decomp/b2Glue.h
  87. 2 2
      drivers/convex_decomp/b2Polygon.cpp
  88. 1 1
      drivers/convex_decomp/b2Polygon.h
  89. 9 1
      drivers/dds/texture_loader_dds.cpp
  90. 1 1
      drivers/dds/texture_loader_dds.h
  91. 2454 2454
      drivers/etc1/rg_etc1.cpp
  92. 76 76
      drivers/etc1/rg_etc1.h
  93. 4 0
      drivers/gl_context/glew.c
  94. 56 5
      drivers/gles2/rasterizer_gles2.cpp
  95. 7 0
      drivers/gles2/rasterizer_gles2.h
  96. 11 11
      drivers/gles2/shader_compiler_gles2.cpp
  97. 64 69
      drivers/mpc/audio_stream_mpc.cpp
  98. 34 13
      drivers/mpc/audio_stream_mpc.h
  99. 5814 5814
      drivers/nedmalloc/malloc.c.h
  100. 1467 1467
      drivers/nedmalloc/nedmalloc.cpp

+ 4 - 0
.gitattributes

@@ -0,0 +1,4 @@
+*.cpp eol=lf
+*.h eol=lf
+*.py eol=lf
+*.hpp eol=lf
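
These attributes force LF line endings on C/C++ and Python sources. The renormalization is also what turns the aes256.cpp and aes256.h diffs further down into full-file rewrites: the + side is byte-for-byte identical to the - side apart from CRLF endings being converted to LF. The attribute in effect can be checked with plain git (shown only as an example):

    git check-attr eol core/io/aes256.cpp
    # expected output: core/io/aes256.cpp: eol: lf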

+ 14 - 1
.gitignore

@@ -23,6 +23,9 @@ tools/editor/editor_icons.cpp
 make.bat
 log.txt
 
+# Doxygen generated documentation
+doc/doxygen/*
+
 # Javascript specific
 *.bc
 
@@ -50,6 +53,14 @@ platform/android/libs/play_licensing/gen/*
 *.d
 *.so
 *.os
+*.Plo
+*.lo
+*.Po
+
+# Libs generated files
+.deps/*
+.dirstamp
+
 
 # QT project files
 *.config
@@ -73,6 +84,8 @@ platform/android/libs/play_licensing/gen/*
 *.suo
 *.user
 *.sln.docstates
+*.sln
+*.vcxproj*
 
 # Build results
 [Dd]ebug/
@@ -282,4 +295,4 @@ cscope.in.out
 cscope.po.out
 godot.creator.*
 
-projects/
+projects/

+ 3 - 3
Doxyfile

@@ -51,14 +51,14 @@ PROJECT_BRIEF          = "Game Engine MIT"
 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
 # the logo to the output directory.
 
-PROJECT_LOGO           = E:/development/godot/logo_small.png
+PROJECT_LOGO           = ./logo_small.png
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
 # into which the generated documentation will be written. If a relative path is
 # entered, it will be relative to the location where doxygen was started. If
 # left blank the current directory will be used.
 
-OUTPUT_DIRECTORY       = E:/development/godot/doxygen
+OUTPUT_DIRECTORY       = ./doc/doxygen/
 
 # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
 # directories (in 2 levels) under the output directory of each output format and
@@ -768,7 +768,7 @@ WARN_LOGFILE           =
 # spaces.
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  = E:/development/godot
+INPUT                  = ./core/ ./main/ ./scene/
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
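
With the hard-coded E:/development paths replaced by repository-relative ones, the documentation now builds from the source root; output lands in doc/doxygen/, which the new .gitignore entry above excludes:

    doxygen Doxyfile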

+ 13 - 7
SConstruct

@@ -54,13 +54,16 @@ methods.save_active_platforms(active_platforms,active_platform_ids)
 
 custom_tools=['default']
 
+platform_arg = ARGUMENTS.get("platform", False)
+
 if (os.name=="posix"):
 	pass
 elif (os.name=="nt"):
-    if (os.getenv("VSINSTALLDIR")==None):
-	custom_tools=['mingw']
+    if (os.getenv("VSINSTALLDIR")==None or platform_arg=="android"):
+		custom_tools=['mingw']
 
 env_base=Environment(tools=custom_tools,ENV = {'PATH' : os.environ['PATH']});
+
 #env_base=Environment(tools=custom_tools);
 env_base.global_defaults=global_defaults
 env_base.android_source_modules=[]
@@ -99,6 +102,7 @@ opts.Add('p','Platform (same as platform=).',"")
 opts.Add('tools','Build Tools (Including Editor): (yes/no)','yes')
 opts.Add('gdscript','Build GDSCript support: (yes/no)','yes')
 opts.Add('vorbis','Build Ogg Vorbis Support: (yes/no)','yes')
+opts.Add('opus','Build Opus Audio Format Support: (yes/no)','yes')
 opts.Add('minizip','Build Minizip Archive Support: (yes/no)','yes')
 opts.Add('squish','Squish BC Texture Compression in editor (yes/no)','yes')
 opts.Add('theora','Theora Video (yes/no)','yes')
@@ -218,14 +222,14 @@ if selected_platform in platform_list:
 
 	env.Append(LINKFLAGS=string.split(str(LINKFLAGS)))
 
-	detect.configure(env)
-
-
 	flag_list = platform_flags[selected_platform]
 	for f in flag_list:
 		if not (f[0] in ARGUMENTS): # allow command line to override platform flags
 			env[f[0]] = f[1]
 
+	#must happen after the flags, so when flags are used by configure, stuff happens (ie, ssl on x11)
+	detect.configure(env)
+
         #env['platform_libsuffix'] = env['LIBSUFFIX']
 
 	suffix="."+selected_platform
@@ -297,6 +301,8 @@ if selected_platform in platform_list:
 
 	if (env['vorbis']=='yes'):
 		env.Append(CPPFLAGS=['-DVORBIS_ENABLED']);
+	if (env['opus']=='yes'):
+		env.Append(CPPFLAGS=['-DOPUS_ENABLED']);
 
 	if (env['theora']=='yes'):
 		env.Append(CPPFLAGS=['-DTHEORA_ENABLED']);
@@ -364,8 +370,8 @@ if selected_platform in platform_list:
 		
 		#env['MSVS_VERSION']='9.0'
 		env['MSVSBUILDCOM'] = "scons platform=" + selected_platform + " target=" + env["target"] + " bits=" + env["bits"] + " tools=yes"
-		env['MSVSREBUILDCOM'] = "scons platform=" + selected_platform + " target=" + env["target"] + " bits=" + env["bits"] + " tools=yes"
-		env['MSVSCLEANCOM'] = "scons platform=" + selected_platform + " target=" + env["target"] + " bits=" + env["bits"] + " tools=yes"
+		env['MSVSREBUILDCOM'] = "scons platform=" + selected_platform + " target=" + env["target"] + " bits=" + env["bits"] + " tools=yes vsproj=true"
+		env['MSVSCLEANCOM'] = "scons --clean platform=" + selected_platform + " target=" + env["target"] + " bits=" + env["bits"] + " tools=yes"
 			
 		debug_variants = ['Debug|Win32']+['Debug|x64']
 		release_variants = ['Release|Win32']+['Release|x64']
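
The new opus switch mirrors the existing vorbis one and defaults to yes; when enabled it adds -DOPUS_ENABLED to CPPFLAGS. The MSVS hunk also differentiates the previously identical build/rebuild/clean commands. An illustrative invocation (the platform name is an example, not part of this diff):

    scons platform=x11 opus=no   # build without the new Opus support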

+ 1 - 0
bin/tests/test_main.cpp

@@ -61,6 +61,7 @@ const char ** tests_get_names()  {
 		"gui",
 		"io",
 		"shaderlang",
+		"physics",
 		NULL
 	};
 	

+ 33 - 0
core/bind/core_bind.cpp

@@ -494,6 +494,10 @@ uint64_t _OS::get_unix_time() const {
 	return OS::get_singleton()->get_unix_time();
 };
 
+uint64_t _OS::get_system_time_msec() const {
+	return OS::get_singleton()->get_system_time_msec();
+}
+
 void _OS::delay_usec(uint32_t p_usec) const {
 
 	OS::get_singleton()->delay_usec(p_usec);
@@ -694,6 +698,17 @@ bool _OS::is_debug_build() const {
 
 }
 
+void _OS::set_screen_orientation(ScreenOrientation p_orientation) {
+
+	OS::get_singleton()->set_screen_orientation(OS::ScreenOrientation(p_orientation));
+}
+
+_OS::ScreenOrientation _OS::get_screen_orientation() const {
+
+	return ScreenOrientation(OS::get_singleton()->get_screen_orientation());
+}
+
+
 String _OS::get_system_dir(SystemDir p_dir) const {
 
 	return OS::get_singleton()->get_system_dir(OS::SystemDir(p_dir));
@@ -717,6 +732,11 @@ int _OS::find_scancode_from_string(const String& p_code) const {
 	return find_keycode(p_code);
 }
 
+void _OS::alert(const String& p_alert,const String& p_title) {
+
+	OS::get_singleton()->alert(p_alert,p_title);
+}
+
 _OS *_OS::singleton=NULL;
 
 void _OS::_bind_methods() {
@@ -752,6 +772,9 @@ void _OS::_bind_methods() {
 	ObjectTypeDB::bind_method(_MD("set_window_maximized", "enabled"),&_OS::set_window_maximized);
 	ObjectTypeDB::bind_method(_MD("is_window_maximized"),&_OS::is_window_maximized);
 
+	ObjectTypeDB::bind_method(_MD("set_screen_orientation","orientation"),&_OS::set_screen_orientation);
+	ObjectTypeDB::bind_method(_MD("get_screen_orientation"),&_OS::get_screen_orientation);
+
 
 	ObjectTypeDB::bind_method(_MD("set_iterations_per_second","iterations_per_second"),&_OS::set_iterations_per_second);
 	ObjectTypeDB::bind_method(_MD("get_iterations_per_second"),&_OS::get_iterations_per_second);
@@ -787,6 +810,7 @@ void _OS::_bind_methods() {
 	ObjectTypeDB::bind_method(_MD("get_time","utc"),&_OS::get_time,DEFVAL(false));
 	ObjectTypeDB::bind_method(_MD("get_time_zone_info"),&_OS::get_time_zone_info);
 	ObjectTypeDB::bind_method(_MD("get_unix_time"),&_OS::get_unix_time);
+	ObjectTypeDB::bind_method(_MD("get_system_time_msec"), &_OS::get_system_time_msec);
 
 	ObjectTypeDB::bind_method(_MD("set_icon"),&_OS::set_icon);
 
@@ -840,6 +864,7 @@ void _OS::_bind_methods() {
 
 	ObjectTypeDB::bind_method(_MD("set_use_file_access_save_and_swap","enabled"),&_OS::set_use_file_access_save_and_swap);
 
+	ObjectTypeDB::bind_method(_MD("alert","text","title"),&_OS::alert,DEFVAL("Alert!"));
 
 
 	BIND_CONSTANT( DAY_SUNDAY );
@@ -863,6 +888,14 @@ void _OS::_bind_methods() {
 	BIND_CONSTANT( MONTH_NOVEMBER );
 	BIND_CONSTANT( MONTH_DECEMBER );
 
+	BIND_CONSTANT( SCREEN_ORIENTATION_LANDSCAPE );
+	BIND_CONSTANT( SCREEN_ORIENTATION_PORTRAIT );
+	BIND_CONSTANT( SCREEN_ORIENTATION_REVERSE_LANDSCAPE );
+	BIND_CONSTANT( SCREEN_ORIENTATION_REVERSE_PORTRAIT );
+	BIND_CONSTANT( SCREEN_ORIENTATION_SENSOR_LANDSCAPE );
+	BIND_CONSTANT( SCREEN_ORIENTATION_SENSOR_PORTRAIT );
+	BIND_CONSTANT( SCREEN_ORIENTATION_SENSOR );
+
 	BIND_CONSTANT( SYSTEM_DIR_DESKTOP);
 	BIND_CONSTANT( SYSTEM_DIR_DCIM );
 	BIND_CONSTANT( SYSTEM_DIR_DOCUMENTS );

+ 19 - 0
core/bind/core_bind.h

@@ -208,6 +208,7 @@ public:
 	Dictionary get_time(bool utc) const;
 	Dictionary get_time_zone_info() const;
 	uint64_t get_unix_time() const;
+	uint64_t get_system_time_msec() const;
 
 	int get_static_memory_usage() const;
 	int get_static_memory_peak_usage() const;
@@ -239,11 +240,28 @@ public:
 		SYSTEM_DIR_RINGTONES,
 	};
 
+	enum ScreenOrientation {
+
+		SCREEN_ORIENTATION_LANDSCAPE,
+		SCREEN_ORIENTATION_PORTRAIT,
+		SCREEN_ORIENTATION_REVERSE_LANDSCAPE,
+		SCREEN_ORIENTATION_REVERSE_PORTRAIT,
+		SCREEN_ORIENTATION_SENSOR_LANDSCAPE,
+		SCREEN_ORIENTATION_SENSOR_PORTRAIT,
+		SCREEN_ORIENTATION_SENSOR,
+	};
+
 	String get_system_dir(SystemDir p_dir) const;
 
 
 	String get_data_dir() const;
 
+	void alert(const String& p_alert,const String& p_title="ALERT!");
+
+
+	void set_screen_orientation(ScreenOrientation p_orientation);
+	ScreenOrientation get_screen_orientation() const;
+
 	void set_time_scale(float p_scale);
 	float get_time_scale();
 
@@ -255,6 +273,7 @@ public:
 };
 
 VARIANT_ENUM_CAST(_OS::SystemDir);
+VARIANT_ENUM_CAST(_OS::ScreenOrientation);
 
 
 class _Geometry : public Object {
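
Together these hunks expose get_system_time_msec, alert and the screen-orientation API to scripts. A minimal engine-side sketch of the calls the wrappers forward to (assuming OS declares a ScreenOrientation enum matching _OS's, as the cast in _OS::set_screen_orientation implies):

    // Sketch only; each _OS method forwards to the OS singleton.
    uint64_t now_ms = OS::get_singleton()->get_system_time_msec();  // wall-clock time in msec
    OS::get_singleton()->set_screen_orientation(OS::SCREEN_ORIENTATION_SENSOR);
    OS::get_singleton()->alert("Could not save file", "Alert!");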

+ 1 - 0
core/error_list.h

@@ -54,6 +54,7 @@ enum Error {
 	ERR_FILE_CANT_READ,
 	ERR_FILE_UNRECOGNIZED, // (15)
 	ERR_FILE_CORRUPT,
+	ERR_FILE_MISSING_DEPENDENCIES,
 	ERR_FILE_EOF,
 	ERR_CANT_OPEN, ///< Can't open a resource/socket/file
 	ERR_CANT_CREATE,

+ 9 - 9
core/error_macros.h

@@ -104,7 +104,7 @@ extern bool _err_error_exists;
 
 #define ERR_FAIL_INDEX(m_index,m_size) \
 	 do {if ((m_index)<0 || (m_index)>=(m_size)) { \
-		_err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Index "_STR(m_index)" out of size ("_STR(m_size)").");	\
+        _err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Index " _STR(m_index)" out of size (" _STR(m_size)").");	\
 		return;	\
 	} else _err_error_exists=false; } while(0);	\
 
@@ -115,7 +115,7 @@ extern bool _err_error_exists;
 
 #define ERR_FAIL_INDEX_V(m_index,m_size,m_retval) \
 	 do {if ((m_index)<0 || (m_index)>=(m_size)) { \
-		_err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Index "_STR(m_index)" out of size ("_STR(m_size)").");	\
+        _err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Index " _STR(m_index)" out of size (" _STR(m_size)").");	\
 		return m_retval;	 \
 	} else _err_error_exists=false;} while (0);
 
@@ -125,14 +125,14 @@ extern bool _err_error_exists;
 
  #define ERR_FAIL_NULL(m_param) \
 	 { if ( !m_param ) {	\
-		 _err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Parameter ' "_STR(m_param)" ' is null.");	\
+         _err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Parameter ' " _STR(m_param)" ' is null.");	\
 		 return;	 \
 	 }else _err_error_exists=false; }	\
 
 
 #define ERR_FAIL_NULL_V(m_param,m_retval) \
 	{ if ( !m_param ) {	\
-		_err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Parameter ' "_STR(m_param)" ' is null.");	\
+        _err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Parameter ' " _STR(m_param)" ' is null.");	\
 		return m_retval;	 \
 	}else _err_error_exists=false; }	\
 
@@ -142,7 +142,7 @@ extern bool _err_error_exists;
 
 #define ERR_FAIL_COND(m_cond) \
 	{ if ( m_cond ) {	\
-		_err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Condition ' "_STR(m_cond)" ' is true.");	\
+        _err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Condition ' " _STR(m_cond)" ' is true.");	\
 		return;	 \
 	}else _err_error_exists=false; }	\
 
@@ -154,7 +154,7 @@ extern bool _err_error_exists;
 
 #define ERR_FAIL_COND_V(m_cond,m_retval) \
 	{ if ( m_cond ) {	\
-		_err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Condition ' "_STR(m_cond)" ' is true. returned: "_STR(m_retval));	\
+        _err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Condition ' " _STR(m_cond)" ' is true. returned: " _STR(m_retval));	\
 		return m_retval;	 \
 	}else _err_error_exists=false; }	\
 
@@ -164,7 +164,7 @@ extern bool _err_error_exists;
 
 #define ERR_CONTINUE(m_cond) \
 	{ if ( m_cond ) {	\
-		_err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Condition ' "_STR(m_cond)" ' is true. Continuing..:");	\
+        _err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Condition ' " _STR(m_cond)" ' is true. Continuing..:");	\
 		continue;\
 	} else _err_error_exists=false;}	\
 
@@ -174,7 +174,7 @@ extern bool _err_error_exists;
 
 #define ERR_BREAK(m_cond) \
 	{ if ( m_cond ) {	\
-		_err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Condition ' "_STR(m_cond)" ' is true. Breaking..:");	\
+        _err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Condition ' " _STR(m_cond)" ' is true. Breaking..:");	\
 		break;\
 	} else _err_error_exists=false;}	\
 
@@ -193,7 +193,7 @@ extern bool _err_error_exists;
 
 #define ERR_FAIL_V(m_value) \
 { \
-		_err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Method/Function Failed, returning: "__STR(m_value));	\
+        _err_print_error(FUNCTION_STR,__FILE__,__LINE__,"Method/Function Failed, returning: " __STR(m_value));	\
 		_err_error_exists=false;	\
 		return m_value;\
 } \
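
Every hunk in this file makes the same fix: C++11 parses a string literal immediately followed by an identifier as a user-defined-literal suffix, so the old "Index "_STR(m_index) spelling no longer compiles; inserting a space restores ordinary literal concatenation. A minimal illustration:

    #define _STR(m_x) #m_x
    // "Index "_STR(5)  -> C++11 error: ud-suffix _STR is undefined
    // "Index " _STR(5) -> fine: adjacent literals concatenate
    const char *msg = "Index " _STR(5) " out of size.";  // "Index 5 out of size."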

+ 1 - 2
core/func_ref.cpp

@@ -31,8 +31,7 @@ void FuncRef::_bind_methods() {
 
 	{
 		MethodInfo mi;
-		mi.name="call";
-		mi.arguments.push_back( PropertyInfo( Variant::STRING, "method"));
+		mi.name="call_func";
 		Vector<Variant> defargs;
 		for(int i=0;i<10;i++) {
 			mi.arguments.push_back( PropertyInfo( Variant::NIL, "arg"+itos(i)));
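
This renames FuncRef's bound method from call to call_func and drops the stray method argument, so script code of the form funcref(self, "my_method").call_func(arg0, arg1) no longer repeats the method name in the call (funcref() being the standard script-side constructor for FuncRef; shown only as an illustration).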

+ 1 - 0
core/global_constants.cpp

@@ -422,6 +422,7 @@ static _GlobalConstant _global_constants[]={
 	BIND_GLOBAL_CONSTANT( ERR_FILE_CANT_READ ),
 	BIND_GLOBAL_CONSTANT( ERR_FILE_UNRECOGNIZED ),
 	BIND_GLOBAL_CONSTANT( ERR_FILE_CORRUPT ),
+	BIND_GLOBAL_CONSTANT( ERR_FILE_MISSING_DEPENDENCIES),
 	BIND_GLOBAL_CONSTANT( ERR_FILE_EOF ),
 	BIND_GLOBAL_CONSTANT( ERR_CANT_OPEN ), ///< Can't open a resource/socket/file
 	BIND_GLOBAL_CONSTANT( ERR_CANT_CREATE ),

+ 1 - 2
core/globals.cpp

@@ -54,7 +54,7 @@ String Globals::localize_path(const String& p_path) const {
 	if (resource_path=="")
 		return p_path; //not initialied yet
 
-	if (p_path.begins_with("res://"))
+	if (p_path.find(":/") != -1)
 		return p_path.simplify_path();
 
 
@@ -1477,7 +1477,6 @@ Globals::Globals() {
 	custom_prop_info["render/mipmap_policy"]=PropertyInfo(Variant::INT,"render/mipmap_policy",PROPERTY_HINT_ENUM,"Allow,Allow For Po2,Disallow");
 	custom_prop_info["render/thread_model"]=PropertyInfo(Variant::INT,"render/thread_model",PROPERTY_HINT_ENUM,"Single-Unsafe,Single-Safe,Multi-Threaded");
 	custom_prop_info["physics_2d/thread_model"]=PropertyInfo(Variant::INT,"physics_2d/thread_model",PROPERTY_HINT_ENUM,"Single-Unsafe,Single-Safe,Multi-Threaded");
-	set("display/emulate_touchscreen",false);
 
 	using_datapack=false;
 }

+ 140 - 0
core/image.cpp

@@ -34,6 +34,33 @@
 #include "print_string.h"
 #include <stdio.h>
 
+
+const char* Image::format_names[Image::FORMAT_MAX]={
+	"Grayscale",
+	"Intensity",
+	"GrayscaleAlpha",
+	"RGB",
+	"RGBA",
+	"Indexed",
+	"IndexedAlpha",
+	"YUV422",
+	"YUV444",
+	"BC1",
+	"BC2",
+	"BC3",
+	"BC4",
+	"BC5",
+	"PVRTC2",
+	"PVRTC2Alpha",
+	"PVRTC4",
+	"PVRTC4Alpha",
+	"ETC",
+	"ATC",
+	"ATCAlphaExp",
+	"ATCAlphaInterp",
+
+};
+
 SavePNGFunc Image::save_png_func = NULL;
 
 void Image::_put_pixel(int p_x,int p_y, const BColor& p_color, unsigned char *p_data) {
@@ -400,6 +427,102 @@ Image::Format Image::get_format() const{
 	return format;
 }
 
+static double _bicubic_interp_kernel( double x ) {
+
+	x = ABS(x);
+
+	double bc = 0;
+
+	if ( x <= 1 )
+		bc = ( 1.5 * x - 2.5 ) * x * x + 1;
+	else if ( x < 2 )
+		bc = ( ( -0.5 * x + 2.5 ) * x - 4 ) * x + 2;
+
+
+	return bc;
+}
+
+template<int CC>
+static void _scale_cubic(const uint8_t* p_src, uint8_t* p_dst, uint32_t p_src_width, uint32_t p_src_height, uint32_t p_dst_width, uint32_t p_dst_height) {
+
+
+	// get source image size
+	int width   = p_src_width;
+	int height  = p_src_height;
+	double xfac = (double) width / p_dst_width;
+	double yfac = (double) height / p_dst_height;
+	// coordinates of source points and cooefficiens
+	double  ox, oy, dx, dy, k1, k2;
+	int     ox1, oy1, ox2, oy2;
+	// destination pixel values
+	// width and height decreased by 1
+	int ymax = height - 1;
+	int xmax = width - 1;
+	// temporary pointer
+
+	for ( int y = 0; y < p_dst_height; y++ ) {
+		// Y coordinates
+		oy  = (double) y * yfac - 0.5f;
+		oy1 = (int) oy;
+		dy  = oy - (double) oy1;
+
+		for ( int x = 0; x < p_dst_width; x++ )	{
+			// X coordinates
+			ox  = (double) x * xfac - 0.5f;
+			ox1 = (int) ox;
+			dx  = ox - (double) ox1;
+
+			// initial pixel value
+
+			uint8_t *dst=p_dst + (y*p_dst_width+x)*CC;
+
+			double color[CC];
+			for(int i=0;i<CC;i++) {
+				color[i]=0;
+			}
+
+
+
+			for ( int n = -1; n < 3; n++ ) {
+				// get Y cooefficient
+				k1 = _bicubic_interp_kernel( dy - (double) n );
+
+				oy2 = oy1 + n;
+				if ( oy2 < 0 )
+					oy2 = 0;
+				if ( oy2 > ymax )
+					oy2 = ymax;
+
+				for ( int m = -1; m < 3; m++ ) {
+					// get X cooefficient
+					k2 = k1 * _bicubic_interp_kernel( (double) m - dx );
+
+					ox2 = ox1 + m;
+					if ( ox2 < 0 )
+						ox2 = 0;
+					if ( ox2 > xmax )
+						ox2 = xmax;
+
+					// get pixel of original image
+					const uint8_t *p = p_src + (oy2 * p_src_width + ox2)*CC;
+
+					for(int i=0;i<CC;i++) {
+
+						color[i]+=p[i]*k2;
+					}
+				}
+			}
+
+			for(int i=0;i<CC;i++) {
+				dst[i]=CLAMP(Math::fast_ftoi(color[i]),0,255);
+			}
+		}
+	}
+}
+
+
+
+
 template<int CC>
 static void _scale_bilinear(const uint8_t* p_src, uint8_t* p_dst, uint32_t p_src_width, uint32_t p_src_height, uint32_t p_dst_width, uint32_t p_dst_height) {
 
@@ -559,6 +682,17 @@ void Image::resize( int p_width, int p_height, Interpolation p_interpolation ) {
 			}
 
 		} break;
+		case INTERPOLATE_CUBIC: {
+
+			switch(get_format_pixel_size(format)) {
+				case 1: _scale_cubic<1>(r_ptr,w_ptr,width,height,p_width,p_height); break;
+				case 2: _scale_cubic<2>(r_ptr,w_ptr,width,height,p_width,p_height); break;
+				case 3: _scale_cubic<3>(r_ptr,w_ptr,width,height,p_width,p_height); break;
+				case 4: _scale_cubic<4>(r_ptr,w_ptr,width,height,p_width,p_height); break;
+			}
+
+		} break;
+
 
 	}
 
@@ -2248,6 +2382,12 @@ void Image::fix_alpha_edges() {
 
 }
 
+String Image::get_format_name(Format p_format) {
+
+	ERR_FAIL_INDEX_V(p_format,FORMAT_MAX,String());
+	return format_names[p_format];
+}
+
 Image::Image(const uint8_t* p_png,int p_len) {
 
 	width=0;

+ 4 - 0
core/image.h

@@ -87,10 +87,12 @@ public:
 		FORMAT_MAX
 	};
 
+	static const char* format_names[FORMAT_MAX];
 	enum Interpolation {
 	
 		INTERPOLATE_NEAREST,
 		INTERPOLATE_BILINEAR,
+		INTERPOLATE_CUBIC,
 		/* INTERPOLATE GAUSS */
 	};
 
@@ -351,6 +353,8 @@ public:
 	Image get_rect(const Rect2& p_area) const;
 
 	static void set_compress_bc_func(void (*p_compress_func)(Image *));
+	static String get_format_name(Format p_format);
+
 	Image(const uint8_t* p_mem_png, int p_len=-1);
 	Image(const char **p_xpm);
 	~Image();

+ 359 - 359
core/io/aes256.cpp

@@ -1,359 +1,359 @@
-/*  
-*   Byte-oriented AES-256 implementation.
-*   All lookup tables replaced with 'on the fly' calculations. 
-*
-*   Copyright (c) 2007-2009 Ilya O. Levin, http://www.literatecode.com
-*   Other contributors: Hal Finney
-*
-*   Permission to use, copy, modify, and distribute this software for any
-*   purpose with or without fee is hereby granted, provided that the above
-*   copyright notice and this permission notice appear in all copies.
-*
-*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-*/
-#include "aes256.h"
-
-#define F(x)   (((x)<<1) ^ ((((x)>>7) & 1) * 0x1b))
-#define FD(x)  (((x) >> 1) ^ (((x) & 1) ? 0x8d : 0))
-
-// #define BACK_TO_TABLES
-#ifdef BACK_TO_TABLES
-
-const uint8_t sbox[256] = {
-    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
-    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
-    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
-    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
-    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
-    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
-    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
-    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
-    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
-    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
-    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
-    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
-    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
-    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
-    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
-    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
-    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
-    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
-    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
-    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
-    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
-    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
-    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
-    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
-    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
-    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
-    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
-    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
-    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
-    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
-    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
-    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
-};
-const uint8_t sboxinv[256] = {
-    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
-    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
-    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
-    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
-    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
-    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
-    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
-    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
-    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
-    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
-    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
-    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
-    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
-    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
-    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
-    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
-    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
-    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
-    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
-    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
-    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
-    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
-    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
-    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
-    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
-    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
-    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
-    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
-    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
-    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
-    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
-    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
-};
-
-#define rj_sbox(x)     sbox[(x)]
-#define rj_sbox_inv(x) sboxinv[(x)]
-
-#else /* tableless subroutines */
-
-/* -------------------------------------------------------------------------- */
-uint8_t gf_alog(uint8_t x) // calculate anti-logarithm gen 3
-{
-    uint8_t atb = 1, z;
-
-    while (x--) {z = atb; atb <<= 1; if (z & 0x80) atb^= 0x1b; atb ^= z;}
-
-    return atb;
-} /* gf_alog */
-
-/* -------------------------------------------------------------------------- */
-uint8_t gf_log(uint8_t x) // calculate logarithm gen 3
-{
-    uint8_t atb = 1, i = 0, z;
-
-    do {
-        if (atb == x) break;
-        z = atb; atb <<= 1; if (z & 0x80) atb^= 0x1b; atb ^= z;
-    } while (++i > 0);
-
-    return i;
-} /* gf_log */
-
-
-/* -------------------------------------------------------------------------- */
-uint8_t gf_mulinv(uint8_t x) // calculate multiplicative inverse
-{
-    return (x) ? gf_alog(255 - gf_log(x)) : 0;
-} /* gf_mulinv */
-
-/* -------------------------------------------------------------------------- */
-uint8_t rj_sbox(uint8_t x)
-{
-    uint8_t y, sb;
-
-    sb = y = gf_mulinv(x);
-    y = (y<<1)|(y>>7); sb ^= y;  y = (y<<1)|(y>>7); sb ^= y; 
-    y = (y<<1)|(y>>7); sb ^= y;  y = (y<<1)|(y>>7); sb ^= y;
-
-    return (sb ^ 0x63);
-} /* rj_sbox */
-
-/* -------------------------------------------------------------------------- */
-uint8_t rj_sbox_inv(uint8_t x)
-{
-    uint8_t y, sb;
-
-    y = x ^ 0x63;
-    sb = y = (y<<1)|(y>>7);
-    y = (y<<2)|(y>>6); sb ^= y; y = (y<<3)|(y>>5); sb ^= y;
-
-    return gf_mulinv(sb);
-} /* rj_sbox_inv */
-
-#endif
-
-/* -------------------------------------------------------------------------- */
-uint8_t rj_xtime(uint8_t x) 
-{
-    return (x & 0x80) ? ((x << 1) ^ 0x1b) : (x << 1);
-} /* rj_xtime */
-
-/* -------------------------------------------------------------------------- */
-void aes_subBytes(uint8_t *buf)
-{
-    register uint8_t i = 16;
-
-    while (i--) buf[i] = rj_sbox(buf[i]);
-} /* aes_subBytes */
-
-/* -------------------------------------------------------------------------- */
-void aes_subBytes_inv(uint8_t *buf)
-{
-    register uint8_t i = 16;
-
-    while (i--) buf[i] = rj_sbox_inv(buf[i]);
-} /* aes_subBytes_inv */
-
-/* -------------------------------------------------------------------------- */
-void aes_addRoundKey(uint8_t *buf, uint8_t *key)
-{
-    register uint8_t i = 16;
-
-    while (i--) buf[i] ^= key[i];
-} /* aes_addRoundKey */
-
-/* -------------------------------------------------------------------------- */
-void aes_addRoundKey_cpy(uint8_t *buf, uint8_t *key, uint8_t *cpk)
-{
-    register uint8_t i = 16;
-
-    while (i--)  buf[i] ^= (cpk[i] = key[i]), cpk[16+i] = key[16 + i];
-} /* aes_addRoundKey_cpy */
-
-
-/* -------------------------------------------------------------------------- */
-void aes_shiftRows(uint8_t *buf)
-{
-    register uint8_t i, j; /* to make it potentially parallelable :) */
-
-    i = buf[1]; buf[1] = buf[5]; buf[5] = buf[9]; buf[9] = buf[13]; buf[13] = i;
-    i = buf[10]; buf[10] = buf[2]; buf[2] = i;
-    j = buf[3]; buf[3] = buf[15]; buf[15] = buf[11]; buf[11] = buf[7]; buf[7] = j;
-    j = buf[14]; buf[14] = buf[6]; buf[6]  = j;
-
-} /* aes_shiftRows */
-
-/* -------------------------------------------------------------------------- */
-void aes_shiftRows_inv(uint8_t *buf)
-{
-    register uint8_t i, j; /* same as above :) */
-
-    i = buf[1]; buf[1] = buf[13]; buf[13] = buf[9]; buf[9] = buf[5]; buf[5] = i;
-    i = buf[2]; buf[2] = buf[10]; buf[10] = i;
-    j = buf[3]; buf[3] = buf[7]; buf[7] = buf[11]; buf[11] = buf[15]; buf[15] = j;
-    j = buf[6]; buf[6] = buf[14]; buf[14] = j;
-
-} /* aes_shiftRows_inv */
-
-/* -------------------------------------------------------------------------- */
-void aes_mixColumns(uint8_t *buf)
-{
-    register uint8_t i, a, b, c, d, e;
-
-    for (i = 0; i < 16; i += 4)
-    {
-        a = buf[i]; b = buf[i + 1]; c = buf[i + 2]; d = buf[i + 3];
-        e = a ^ b ^ c ^ d;
-        buf[i] ^= e ^ rj_xtime(a^b);   buf[i+1] ^= e ^ rj_xtime(b^c);
-        buf[i+2] ^= e ^ rj_xtime(c^d); buf[i+3] ^= e ^ rj_xtime(d^a);
-    }
-} /* aes_mixColumns */
-
-/* -------------------------------------------------------------------------- */
-void aes_mixColumns_inv(uint8_t *buf)
-{
-    register uint8_t i, a, b, c, d, e, x, y, z;
-
-    for (i = 0; i < 16; i += 4)
-    {
-        a = buf[i]; b = buf[i + 1]; c = buf[i + 2]; d = buf[i + 3];
-        e = a ^ b ^ c ^ d;
-        z = rj_xtime(e);
-        x = e ^ rj_xtime(rj_xtime(z^a^c));  y = e ^ rj_xtime(rj_xtime(z^b^d));
-        buf[i] ^= x ^ rj_xtime(a^b);   buf[i+1] ^= y ^ rj_xtime(b^c);
-        buf[i+2] ^= x ^ rj_xtime(c^d); buf[i+3] ^= y ^ rj_xtime(d^a);
-    }
-} /* aes_mixColumns_inv */
-
-/* -------------------------------------------------------------------------- */
-void aes_expandEncKey(uint8_t *k, uint8_t *rc) 
-{
-    register uint8_t i;
-
-    k[0] ^= rj_sbox(k[29]) ^ (*rc);
-    k[1] ^= rj_sbox(k[30]);
-    k[2] ^= rj_sbox(k[31]);
-    k[3] ^= rj_sbox(k[28]);
-    *rc = F( *rc);
-
-    for(i = 4; i < 16; i += 4)  k[i] ^= k[i-4],   k[i+1] ^= k[i-3],
-        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
-    k[16] ^= rj_sbox(k[12]);
-    k[17] ^= rj_sbox(k[13]);
-    k[18] ^= rj_sbox(k[14]);
-    k[19] ^= rj_sbox(k[15]);
-
-    for(i = 20; i < 32; i += 4) k[i] ^= k[i-4],   k[i+1] ^= k[i-3],
-        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
-
-} /* aes_expandEncKey */
-
-/* -------------------------------------------------------------------------- */
-void aes_expandDecKey(uint8_t *k, uint8_t *rc) 
-{
-    uint8_t i;
-
-    for(i = 28; i > 16; i -= 4) k[i+0] ^= k[i-4], k[i+1] ^= k[i-3], 
-        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
-
-    k[16] ^= rj_sbox(k[12]);
-    k[17] ^= rj_sbox(k[13]);
-    k[18] ^= rj_sbox(k[14]);
-    k[19] ^= rj_sbox(k[15]);
-
-    for(i = 12; i > 0; i -= 4)  k[i+0] ^= k[i-4], k[i+1] ^= k[i-3],
-        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
-
-    *rc = FD(*rc);
-    k[0] ^= rj_sbox(k[29]) ^ (*rc);
-    k[1] ^= rj_sbox(k[30]);
-    k[2] ^= rj_sbox(k[31]);
-    k[3] ^= rj_sbox(k[28]);
-} /* aes_expandDecKey */
-
-
-/* -------------------------------------------------------------------------- */
-void aes256_init(aes256_context *ctx, uint8_t *k)
-{
-    uint8_t rcon = 1;
-    register uint8_t i;
-
-    for (i = 0; i < sizeof(ctx->key); i++) ctx->enckey[i] = ctx->deckey[i] = k[i];
-    for (i = 8;--i;) aes_expandEncKey(ctx->deckey, &rcon);
-} /* aes256_init */
-
-/* -------------------------------------------------------------------------- */
-void aes256_done(aes256_context *ctx)
-{
-    register uint8_t i;
-
-    for (i = 0; i < sizeof(ctx->key); i++) 
-        ctx->key[i] = ctx->enckey[i] = ctx->deckey[i] = 0;
-} /* aes256_done */
-
-/* -------------------------------------------------------------------------- */
-void aes256_encrypt_ecb(aes256_context *ctx, uint8_t *buf)
-{
-    uint8_t i, rcon;
-
-    aes_addRoundKey_cpy(buf, ctx->enckey, ctx->key);
-    for(i = 1, rcon = 1; i < 14; ++i)
-    {
-        aes_subBytes(buf);
-        aes_shiftRows(buf);
-        aes_mixColumns(buf);
-        if( i & 1 ) aes_addRoundKey( buf, &ctx->key[16]);
-        else aes_expandEncKey(ctx->key, &rcon), aes_addRoundKey(buf, ctx->key);
-    }
-    aes_subBytes(buf);
-    aes_shiftRows(buf);
-    aes_expandEncKey(ctx->key, &rcon); 
-    aes_addRoundKey(buf, ctx->key);
-} /* aes256_encrypt */
-
-/* -------------------------------------------------------------------------- */
-void aes256_decrypt_ecb(aes256_context *ctx, uint8_t *buf)
-{
-    uint8_t i, rcon;
-
-    aes_addRoundKey_cpy(buf, ctx->deckey, ctx->key);
-    aes_shiftRows_inv(buf);
-    aes_subBytes_inv(buf);
-
-    for (i = 14, rcon = 0x80; --i;)
-    {
-        if( ( i & 1 ) )           
-        {
-            aes_expandDecKey(ctx->key, &rcon);
-            aes_addRoundKey(buf, &ctx->key[16]);
-        }
-        else aes_addRoundKey(buf, ctx->key);
-        aes_mixColumns_inv(buf);
-        aes_shiftRows_inv(buf);
-        aes_subBytes_inv(buf);
-    }
-    aes_addRoundKey( buf, ctx->key); 
-} /* aes256_decrypt */
+/*  
+*   Byte-oriented AES-256 implementation.
+*   All lookup tables replaced with 'on the fly' calculations. 
+*
+*   Copyright (c) 2007-2009 Ilya O. Levin, http://www.literatecode.com
+*   Other contributors: Hal Finney
+*
+*   Permission to use, copy, modify, and distribute this software for any
+*   purpose with or without fee is hereby granted, provided that the above
+*   copyright notice and this permission notice appear in all copies.
+*
+*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+#include "aes256.h"
+
+#define F(x)   (((x)<<1) ^ ((((x)>>7) & 1) * 0x1b))
+#define FD(x)  (((x) >> 1) ^ (((x) & 1) ? 0x8d : 0))
+
+// #define BACK_TO_TABLES
+#ifdef BACK_TO_TABLES
+
+const uint8_t sbox[256] = {
+    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
+    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
+    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
+    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
+    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
+    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
+    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
+    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
+    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
+    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
+    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
+    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+};
+const uint8_t sboxinv[256] = {
+    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+};
+
+#define rj_sbox(x)     sbox[(x)]
+#define rj_sbox_inv(x) sboxinv[(x)]
+
+#else /* tableless subroutines */
+
+/* -------------------------------------------------------------------------- */
+uint8_t gf_alog(uint8_t x) // calculate anti-logarithm gen 3
+{
+    uint8_t atb = 1, z;
+
+    while (x--) {z = atb; atb <<= 1; if (z & 0x80) atb^= 0x1b; atb ^= z;}
+
+    return atb;
+} /* gf_alog */
+
+/* -------------------------------------------------------------------------- */
+uint8_t gf_log(uint8_t x) // calculate logarithm gen 3
+{
+    uint8_t atb = 1, i = 0, z;
+
+    do {
+        if (atb == x) break;
+        z = atb; atb <<= 1; if (z & 0x80) atb^= 0x1b; atb ^= z;
+    } while (++i > 0);
+
+    return i;
+} /* gf_log */
+
+
+/* -------------------------------------------------------------------------- */
+uint8_t gf_mulinv(uint8_t x) // calculate multiplicative inverse
+{
+    return (x) ? gf_alog(255 - gf_log(x)) : 0;
+} /* gf_mulinv */
+
+/* -------------------------------------------------------------------------- */
+uint8_t rj_sbox(uint8_t x)
+{
+    uint8_t y, sb;
+
+    sb = y = gf_mulinv(x);
+    y = (y<<1)|(y>>7); sb ^= y;  y = (y<<1)|(y>>7); sb ^= y; 
+    y = (y<<1)|(y>>7); sb ^= y;  y = (y<<1)|(y>>7); sb ^= y;
+
+    return (sb ^ 0x63);
+} /* rj_sbox */
+
+/* -------------------------------------------------------------------------- */
+uint8_t rj_sbox_inv(uint8_t x)
+{
+    uint8_t y, sb;
+
+    y = x ^ 0x63;
+    sb = y = (y<<1)|(y>>7);
+    y = (y<<2)|(y>>6); sb ^= y; y = (y<<3)|(y>>5); sb ^= y;
+
+    return gf_mulinv(sb);
+} /* rj_sbox_inv */
+
+#endif
+
+/* -------------------------------------------------------------------------- */
+uint8_t rj_xtime(uint8_t x) 
+{
+    return (x & 0x80) ? ((x << 1) ^ 0x1b) : (x << 1);
+} /* rj_xtime */
+
+/* -------------------------------------------------------------------------- */
+void aes_subBytes(uint8_t *buf)
+{
+    register uint8_t i = 16;
+
+    while (i--) buf[i] = rj_sbox(buf[i]);
+} /* aes_subBytes */
+
+/* -------------------------------------------------------------------------- */
+void aes_subBytes_inv(uint8_t *buf)
+{
+    register uint8_t i = 16;
+
+    while (i--) buf[i] = rj_sbox_inv(buf[i]);
+} /* aes_subBytes_inv */
+
+/* -------------------------------------------------------------------------- */
+void aes_addRoundKey(uint8_t *buf, uint8_t *key)
+{
+    register uint8_t i = 16;
+
+    while (i--) buf[i] ^= key[i];
+} /* aes_addRoundKey */
+
+/* -------------------------------------------------------------------------- */
+void aes_addRoundKey_cpy(uint8_t *buf, uint8_t *key, uint8_t *cpk)
+{
+    register uint8_t i = 16;
+
+    while (i--)  buf[i] ^= (cpk[i] = key[i]), cpk[16+i] = key[16 + i];
+} /* aes_addRoundKey_cpy */
+
+
+/* -------------------------------------------------------------------------- */
+void aes_shiftRows(uint8_t *buf)
+{
+    register uint8_t i, j; /* to make it potentially parallelable :) */
+
+    i = buf[1]; buf[1] = buf[5]; buf[5] = buf[9]; buf[9] = buf[13]; buf[13] = i;
+    i = buf[10]; buf[10] = buf[2]; buf[2] = i;
+    j = buf[3]; buf[3] = buf[15]; buf[15] = buf[11]; buf[11] = buf[7]; buf[7] = j;
+    j = buf[14]; buf[14] = buf[6]; buf[6]  = j;
+
+} /* aes_shiftRows */
+
+/* -------------------------------------------------------------------------- */
+void aes_shiftRows_inv(uint8_t *buf)
+{
+    register uint8_t i, j; /* same as above :) */
+
+    i = buf[1]; buf[1] = buf[13]; buf[13] = buf[9]; buf[9] = buf[5]; buf[5] = i;
+    i = buf[2]; buf[2] = buf[10]; buf[10] = i;
+    j = buf[3]; buf[3] = buf[7]; buf[7] = buf[11]; buf[11] = buf[15]; buf[15] = j;
+    j = buf[6]; buf[6] = buf[14]; buf[14] = j;
+
+} /* aes_shiftRows_inv */
+
+/* -------------------------------------------------------------------------- */
+void aes_mixColumns(uint8_t *buf)
+{
+    register uint8_t i, a, b, c, d, e;
+
+    for (i = 0; i < 16; i += 4)
+    {
+        a = buf[i]; b = buf[i + 1]; c = buf[i + 2]; d = buf[i + 3];
+        e = a ^ b ^ c ^ d;
+        buf[i] ^= e ^ rj_xtime(a^b);   buf[i+1] ^= e ^ rj_xtime(b^c);
+        buf[i+2] ^= e ^ rj_xtime(c^d); buf[i+3] ^= e ^ rj_xtime(d^a);
+    }
+} /* aes_mixColumns */
+
+/* -------------------------------------------------------------------------- */
+void aes_mixColumns_inv(uint8_t *buf)
+{
+    register uint8_t i, a, b, c, d, e, x, y, z;
+
+    for (i = 0; i < 16; i += 4)
+    {
+        a = buf[i]; b = buf[i + 1]; c = buf[i + 2]; d = buf[i + 3];
+        e = a ^ b ^ c ^ d;
+        z = rj_xtime(e);
+        x = e ^ rj_xtime(rj_xtime(z^a^c));  y = e ^ rj_xtime(rj_xtime(z^b^d));
+        buf[i] ^= x ^ rj_xtime(a^b);   buf[i+1] ^= y ^ rj_xtime(b^c);
+        buf[i+2] ^= x ^ rj_xtime(c^d); buf[i+3] ^= y ^ rj_xtime(d^a);
+    }
+} /* aes_mixColumns_inv */
+
+/* -------------------------------------------------------------------------- */
+void aes_expandEncKey(uint8_t *k, uint8_t *rc) 
+{
+    register uint8_t i;
+
+    k[0] ^= rj_sbox(k[29]) ^ (*rc);
+    k[1] ^= rj_sbox(k[30]);
+    k[2] ^= rj_sbox(k[31]);
+    k[3] ^= rj_sbox(k[28]);
+    *rc = F( *rc);
+
+    for(i = 4; i < 16; i += 4)  k[i] ^= k[i-4],   k[i+1] ^= k[i-3],
+        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
+    k[16] ^= rj_sbox(k[12]);
+    k[17] ^= rj_sbox(k[13]);
+    k[18] ^= rj_sbox(k[14]);
+    k[19] ^= rj_sbox(k[15]);
+
+    for(i = 20; i < 32; i += 4) k[i] ^= k[i-4],   k[i+1] ^= k[i-3],
+        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
+
+} /* aes_expandEncKey */
+
+/* -------------------------------------------------------------------------- */
+void aes_expandDecKey(uint8_t *k, uint8_t *rc) 
+{
+    uint8_t i;
+
+    for(i = 28; i > 16; i -= 4) k[i+0] ^= k[i-4], k[i+1] ^= k[i-3], 
+        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
+
+    k[16] ^= rj_sbox(k[12]);
+    k[17] ^= rj_sbox(k[13]);
+    k[18] ^= rj_sbox(k[14]);
+    k[19] ^= rj_sbox(k[15]);
+
+    for(i = 12; i > 0; i -= 4)  k[i+0] ^= k[i-4], k[i+1] ^= k[i-3],
+        k[i+2] ^= k[i-2], k[i+3] ^= k[i-1];
+
+    *rc = FD(*rc);
+    k[0] ^= rj_sbox(k[29]) ^ (*rc);
+    k[1] ^= rj_sbox(k[30]);
+    k[2] ^= rj_sbox(k[31]);
+    k[3] ^= rj_sbox(k[28]);
+} /* aes_expandDecKey */
+
+
+/* -------------------------------------------------------------------------- */
+void aes256_init(aes256_context *ctx, uint8_t *k)
+{
+    uint8_t rcon = 1;
+    register uint8_t i;
+
+    for (i = 0; i < sizeof(ctx->key); i++) ctx->enckey[i] = ctx->deckey[i] = k[i];
+    for (i = 8;--i;) aes_expandEncKey(ctx->deckey, &rcon);
+} /* aes256_init */
+
+/* -------------------------------------------------------------------------- */
+void aes256_done(aes256_context *ctx)
+{
+    register uint8_t i;
+
+    for (i = 0; i < sizeof(ctx->key); i++) 
+        ctx->key[i] = ctx->enckey[i] = ctx->deckey[i] = 0;
+} /* aes256_done */
+
+/* -------------------------------------------------------------------------- */
+void aes256_encrypt_ecb(aes256_context *ctx, uint8_t *buf)
+{
+    uint8_t i, rcon;
+
+    aes_addRoundKey_cpy(buf, ctx->enckey, ctx->key);
+    for(i = 1, rcon = 1; i < 14; ++i)
+    {
+        aes_subBytes(buf);
+        aes_shiftRows(buf);
+        aes_mixColumns(buf);
+        if( i & 1 ) aes_addRoundKey( buf, &ctx->key[16]);
+        else aes_expandEncKey(ctx->key, &rcon), aes_addRoundKey(buf, ctx->key);
+    }
+    aes_subBytes(buf);
+    aes_shiftRows(buf);
+    aes_expandEncKey(ctx->key, &rcon); 
+    aes_addRoundKey(buf, ctx->key);
+} /* aes256_encrypt */
+
+/* -------------------------------------------------------------------------- */
+void aes256_decrypt_ecb(aes256_context *ctx, uint8_t *buf)
+{
+    uint8_t i, rcon;
+
+    aes_addRoundKey_cpy(buf, ctx->deckey, ctx->key);
+    aes_shiftRows_inv(buf);
+    aes_subBytes_inv(buf);
+
+    for (i = 14, rcon = 0x80; --i;)
+    {
+        if( ( i & 1 ) )           
+        {
+            aes_expandDecKey(ctx->key, &rcon);
+            aes_addRoundKey(buf, &ctx->key[16]);
+        }
+        else aes_addRoundKey(buf, ctx->key);
+        aes_mixColumns_inv(buf);
+        aes_shiftRows_inv(buf);
+        aes_subBytes_inv(buf);
+    }
+    aes_addRoundKey( buf, ctx->key); 
+} /* aes256_decrypt */

+ 46 - 46
core/io/aes256.h

@@ -1,46 +1,46 @@
-/*  
-*   Byte-oriented AES-256 implementation.
-*   All lookup tables replaced with 'on the fly' calculations. 
-*
-*   Copyright (c) 2007-2009 Ilya O. Levin, http://www.literatecode.com
-*   Other contributors: Hal Finney
-*
-*   Permission to use, copy, modify, and distribute this software for any
-*   purpose with or without fee is hereby granted, provided that the above
-*   copyright notice and this permission notice appear in all copies.
-*
-*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-*/
-
-#ifndef AES_256_H
-#define AES_256_H
-
-#include "typedefs.h"
-
-#ifdef __cplusplus
-extern "C" { 
-#endif
-
-    typedef struct {
-        uint8_t key[32]; 
-        uint8_t enckey[32]; 
-        uint8_t deckey[32];
-    } aes256_context; 
-
-
-    void aes256_init(aes256_context *, uint8_t * /* key */);
-    void aes256_done(aes256_context *);
-    void aes256_encrypt_ecb(aes256_context *, uint8_t * /* plaintext */);
-    void aes256_decrypt_ecb(aes256_context *, uint8_t * /* cipertext */);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
+/*  
+*   Byte-oriented AES-256 implementation.
+*   All lookup tables replaced with 'on the fly' calculations. 
+*
+*   Copyright (c) 2007-2009 Ilya O. Levin, http://www.literatecode.com
+*   Other contributors: Hal Finney
+*
+*   Permission to use, copy, modify, and distribute this software for any
+*   purpose with or without fee is hereby granted, provided that the above
+*   copyright notice and this permission notice appear in all copies.
+*
+*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#ifndef AES_256_H
+#define AES_256_H
+
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+    typedef struct {
+        uint8_t key[32]; 
+        uint8_t enckey[32]; 
+        uint8_t deckey[32];
+    } aes256_context; 
+
+
+    void aes256_init(aes256_context *, uint8_t * /* key */);
+    void aes256_done(aes256_context *);
+    void aes256_encrypt_ecb(aes256_context *, uint8_t * /* plaintext */);
+    void aes256_decrypt_ecb(aes256_context *, uint8_t * /* cipertext */);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

+ 2 - 2
core/io/compression.cpp

@@ -26,12 +26,12 @@
 /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
+#include "zlib.h"
+#include "os/copymem.h"
 #include "compression.h"
 
 #include "fastlz.h"
-#include "zlib.h"
 #include "zip_io.h"
-#include "os/copymem.h"
 
 int Compression::compress(uint8_t *p_dst, const uint8_t *p_src, int p_src_size,Mode p_mode) {
 

+ 9 - 2
core/io/image_loader.cpp

@@ -50,8 +50,10 @@ Error ImageLoader::load_image(String p_file,Image *p_image, FileAccess *p_custom
 	if (!f) {
 		Error err;
 		f=FileAccess::open(p_file,FileAccess::READ,&err);
-		if (!f)
+		if (!f) {
+			print_line("ERROR OPENING FILE: "+p_file);
 			return err;
+		}
 	}
 				
 	String extension = p_file.extension();
@@ -62,15 +64,20 @@ Error ImageLoader::load_image(String p_file,Image *p_image, FileAccess *p_custom
 		if (!loader[i]->recognize(extension))
 			continue;
 		Error err = loader[i]->load_image(p_image,f);
+
 		if (err!=ERR_FILE_UNRECOGNIZED) {
+
+
 			if (!p_custom)
 				memdelete(f);
+
 			return err;
 		}
 			
 		
 	}
-	
+	print_line("NO LOADER?");
+
 	if (!p_custom)
 		memdelete(f);	
 		

+ 0 - 1
core/io/ioapi.h

@@ -40,7 +40,6 @@
 #endif
 
 #include <stdio.h>
-#include <stdlib.h>
 #include "zlib.h"
 
 #if defined(USE_FILE32API)

+ 4 - 4
core/io/marshalls.cpp

@@ -36,7 +36,10 @@ Error decode_variant(Variant& r_variant,const uint8_t *p_buffer, int p_len,int *
 	const uint8_t * buf=p_buffer;
 	int len=p_len;
 
-	ERR_FAIL_COND_V(len<4,ERR_INVALID_DATA);
+	if (len<4) {
+
+		ERR_FAIL_COND_V(len<4,ERR_INVALID_DATA);
+	}
 
 
 	uint32_t type=decode_uint32(buf);
@@ -299,10 +302,8 @@ Error decode_variant(Variant& r_variant,const uint8_t *p_buffer, int p_len,int *
 				ERR_FAIL_COND_V(len<12,ERR_INVALID_DATA);
 				Vector<StringName> names;
 				Vector<StringName> subnames;
-				bool absolute;
 				StringName prop;
 
-				int i=0;
 				uint32_t namecount=strlen&=0x7FFFFFFF;
 				uint32_t subnamecount = decode_uint32(buf+4);
 				uint32_t flags = decode_uint32(buf+8);
@@ -391,7 +392,6 @@ Error decode_variant(Variant& r_variant,const uint8_t *p_buffer, int p_len,int *
 
 			ie.type=decode_uint32(&buf[0]);
 			ie.device=decode_uint32(&buf[4]);
-			uint32_t len = decode_uint32(&buf[8])-12;
 
 			if (r_len)
 				(*r_len)+=12;

+ 1 - 2
core/io/packet_peer.cpp

@@ -156,7 +156,6 @@ Error PacketPeerStream::_poll_buffer() const {
 	Error err = peer->get_partial_data(&temp_buffer[0], ring_buffer.space_left(), read);
 	if (err)
 		return err;
-
 	if (read==0)
 		return OK;
 
@@ -202,7 +201,7 @@ Error PacketPeerStream::get_packet(const uint8_t **r_buffer,int &r_buffer_size)
 	uint8_t lbuf[4];
 	ring_buffer.copy(lbuf,0,4);
 	remaining-=4;
-	uint32_t len = decode_uint32(lbuf);
+	uint32_t len = decode_uint32(lbuf);	
 	ERR_FAIL_COND_V(remaining<(int)len,ERR_UNAVAILABLE);
 
 	ring_buffer.read(lbuf,4); //get rid of first 4 bytes

+ 316 - 25
core/io/resource_format_binary.cpp

@@ -31,6 +31,7 @@
 #include "globals.h"
 #include "io/file_access_compressed.h"
 #include "io/marshalls.h"
+#include "os/dir_access.h"
 //#define print_bl(m_what) print_line(m_what)
 #define print_bl(m_what)
 
@@ -99,7 +100,9 @@ enum {
 	OBJECT_EMPTY=0,
 	OBJECT_EXTERNAL_RESOURCE=1,
 	OBJECT_INTERNAL_RESOURCE=2,
-	FORMAT_VERSION=0
+	OBJECT_EXTERNAL_RESOURCE_INDEX=3,
+	FORMAT_VERSION=1,
+	FORMAT_VERSION_CAN_RENAME_DEPS=1
 
 
 };
@@ -375,7 +378,7 @@ Error ResourceInteractiveLoaderBinary::parse_variant(Variant& r_v)  {
 				} break;
 				case OBJECT_INTERNAL_RESOURCE: {
 					uint32_t index=f->get_32();
-					String path = res_path+"::"+itos(index);
+					String path = res_path+"::"+itos(index);					
 					RES res = ResourceLoader::load(path);
 					if (res.is_null()) {
 						WARN_PRINT(String("Couldn't load resource: "+path).utf8().get_data());
@@ -384,6 +387,7 @@ Error ResourceInteractiveLoaderBinary::parse_variant(Variant& r_v)  {
 
 				} break;
 				case OBJECT_EXTERNAL_RESOURCE: {
+					//old file format, still around for compatibility
 
 					String type = get_unicode_string();
 					String path = get_unicode_string();
@@ -394,6 +398,10 @@ Error ResourceInteractiveLoaderBinary::parse_variant(Variant& r_v)  {
 
 					}
 
+					if (remaps.find(path)) {
+						path=remaps[path];
+					}
+
 					RES res=ResourceLoader::load(path,type);
 
 					if (res.is_null()) {
@@ -401,6 +409,34 @@ Error ResourceInteractiveLoaderBinary::parse_variant(Variant& r_v)  {
 					}
 					r_v=res;
 
+				} break;
+				case OBJECT_EXTERNAL_RESOURCE_INDEX: {
+					//new file format, just refers to an index in the external list
+					uint32_t erindex = f->get_32();
+
+					if (erindex>=external_resources.size()) {
+						WARN_PRINT("Broken external resource! (index out of size");
+						r_v=Variant();
+					} else {
+
+						String type = external_resources[erindex].type;
+						String path = external_resources[erindex].path;
+
+						if (path.find("://")==-1 && path.is_rel_path()) {
+							// path is relative to file being loaded, so convert to a resource path
+							path=Globals::get_singleton()->localize_path(res_path.get_base_dir().plus_file(path));
+
+						}
+
+						RES res=ResourceLoader::load(path,type);
+
+						if (res.is_null()) {
+							WARN_PRINT(String("Couldn't load resource: "+path).utf8().get_data());
+						}
+						r_v=res;
+					}
+
+
 				} break;
 				default: {
 
@@ -628,17 +664,20 @@ Error ResourceInteractiveLoaderBinary::poll(){
 
 	if (s<external_resources.size()) {
 
-		RES res = ResourceLoader::load(external_resources[s].path,external_resources[s].type);
+		String path = external_resources[s].path;
+		if (remaps.has(path)) {
+			path=remaps[path];
+		}
+		RES res = ResourceLoader::load(path,external_resources[s].type);
 		if (res.is_null()) {
 
 			if (!ResourceLoader::get_abort_on_missing_resources()) {
 
-				ResourceLoader::notify_load_error("Resource Not Found: "+external_resources[s].path);
+				ResourceLoader::notify_dependency_error(local_path,path,external_resources[s].type);
 			} else {
 
-
-				error=ERR_FILE_CORRUPT;
-				ERR_EXPLAIN("Can't load dependency: "+external_resources[s].path);
+				error=ERR_FILE_MISSING_DEPENDENCIES;
+				ERR_EXPLAIN("Can't load dependency: "+path);
 				ERR_FAIL_V(error);
 			}
 
@@ -787,6 +826,27 @@ int ResourceInteractiveLoaderBinary::get_stage_count() const {
 	return external_resources.size()+internal_resources.size();
 }
 
+
+static void save_ustring(FileAccess* f,const String& p_string) {
+
+
+	CharString utf8 = p_string.utf8();
+	f->store_32(utf8.length()+1);
+	f->store_buffer((const uint8_t*)utf8.get_data(),utf8.length()+1);
+}
+
+
+static String get_ustring(FileAccess *f) {
+
+	int len = f->get_32();
+	Vector<char> str_buf;
+	str_buf.resize(len);
+	f->get_buffer((uint8_t*)&str_buf[0],len);
+	String s;
+	s.parse_utf8(&str_buf[0]);
+	return s;
+}
+
 String ResourceInteractiveLoaderBinary::get_unicode_string() {
 
 	int len = f->get_32();
@@ -801,7 +861,7 @@ String ResourceInteractiveLoaderBinary::get_unicode_string() {
 
 
 
-void ResourceInteractiveLoaderBinary::get_dependencies(FileAccess *p_f,List<String> *p_dependencies) {
+void ResourceInteractiveLoaderBinary::get_dependencies(FileAccess *p_f,List<String> *p_dependencies,bool p_add_types) {
 
 	open(p_f);
 	if (error)
@@ -814,6 +874,10 @@ void ResourceInteractiveLoaderBinary::get_dependencies(FileAccess *p_f,List<Stri
 			dep=ResourceLoader::guess_full_filename(dep,external_resources[i].type);
 		}
 
+		if (p_add_types && external_resources[i].type!=String()) {
+			dep+="::"+external_resources[i].type;
+		}
+
 		p_dependencies->push_back(dep);
 	}
 
@@ -866,7 +930,7 @@ void ResourceInteractiveLoaderBinary::open(FileAccess *p_f) {
 	print_bl("minor: "+itos(ver_minor));
 	print_bl("format: "+itos(ver_format));
 
-	if (ver_format<FORMAT_VERSION ||  ver_major>VERSION_MAJOR) {
+	if (ver_format>FORMAT_VERSION ||  ver_major>VERSION_MAJOR) {
 
 		f->close();
 		ERR_EXPLAIN("File Format '"+itos(FORMAT_VERSION)+"."+itos(ver_major)+"."+itos(ver_minor)+"' is too new! Please upgrade to a a new engine version: "+local_path);
@@ -903,6 +967,7 @@ void ResourceInteractiveLoaderBinary::open(FileAccess *p_f) {
 	}
 
 	//see if the exporter has different set of external resources for more efficient loading
+	/*
 	String preload_depts = "deps/"+res_path.md5_text();
 	if (Globals::get_singleton()->has(preload_depts)) {
 		external_resources.clear();
@@ -913,7 +978,7 @@ void ResourceInteractiveLoaderBinary::open(FileAccess *p_f) {
 			external_resources[i].path=depts.get_name(i);
 		}
 		print_line(res_path+" - EXTERNAL RESOURCES: "+itos(external_resources.size()));
-	}
+	}*/
 
 	print_bl("ext resources: "+itos(ext_resources_size));
 	uint32_t int_resources_size=f->get_32();
@@ -973,7 +1038,7 @@ String ResourceInteractiveLoaderBinary::recognize(FileAccess *p_f) {
 	uint32_t ver_minor=f->get_32();
 	uint32_t ver_format=f->get_32();
 
-	if (ver_format<FORMAT_VERSION ||  ver_major>VERSION_MAJOR) {
+	if (ver_format>FORMAT_VERSION ||  ver_major>VERSION_MAJOR) {
 
 		f->close();
 		return "";
@@ -1000,8 +1065,10 @@ ResourceInteractiveLoaderBinary::~ResourceInteractiveLoaderBinary() {
 }
 
 
-Ref<ResourceInteractiveLoader> ResourceFormatLoaderBinary::load_interactive(const String &p_path) {
+Ref<ResourceInteractiveLoader> ResourceFormatLoaderBinary::load_interactive(const String &p_path, Error *r_error) {
 
+	if (r_error)
+		*r_error=ERR_FILE_CANT_OPEN;
 
 	Error err;
 	FileAccess *f = FileAccess::open(p_path,FileAccess::READ,&err);
@@ -1114,7 +1181,7 @@ Error ResourceFormatLoaderBinary::load_import_metadata(const String &p_path, Ref
 }
 
 
-void ResourceFormatLoaderBinary::get_dependencies(const String& p_path,List<String> *p_dependencies) {
+void ResourceFormatLoaderBinary::get_dependencies(const String& p_path,List<String> *p_dependencies,bool p_add_types) {
 
 	FileAccess *f = FileAccess::open(p_path,FileAccess::READ);
 	ERR_FAIL_COND(!f);
@@ -1123,7 +1190,217 @@ void ResourceFormatLoaderBinary::get_dependencies(const String& p_path,List<Stri
 	ria->local_path=Globals::get_singleton()->localize_path(p_path);
 	ria->res_path=ria->local_path;
 //	ria->set_local_path( Globals::get_singleton()->localize_path(p_path) );
-	ria->get_dependencies(f,p_dependencies);
+	ria->get_dependencies(f,p_dependencies,p_add_types);
+}
+
+Error ResourceFormatLoaderBinary::rename_dependencies(const String &p_path,const Map<String,String>& p_map) {
+
+
+//	Error error=OK;
+
+
+	FileAccess *f=FileAccess::open(p_path,FileAccess::READ);
+	ERR_FAIL_COND_V(!f,ERR_CANT_OPEN);
+
+	FileAccess* fw=NULL;//=FileAccess::open(p_path+".depren");
+
+	String local_path=p_path.get_base_dir();
+
+	uint8_t header[4];
+	f->get_buffer(header,4);
+	if (header[0]=='R' && header[1]=='S' && header[2]=='C' && header[3]=='C') {
+		//compressed
+		FileAccessCompressed *fac = memnew( FileAccessCompressed );
+		fac->open_after_magic(f);
+		f=fac;
+
+		FileAccessCompressed *facw = memnew( FileAccessCompressed );
+		facw->configure("RSCC");
+		Error err = facw->_open(p_path+".depren",FileAccess::WRITE);
+		if (err) {
+			memdelete(fac);
+			memdelete(facw);
+			ERR_FAIL_COND_V(err,ERR_FILE_CORRUPT);
+		}
+
+		fw=facw;
+
+
+	} else if (header[0]!='R' || header[1]!='S' || header[2]!='R' || header[3]!='C') {
+		//not normal
+
+		//error=ERR_FILE_UNRECOGNIZED;
+		memdelete(f);
+		ERR_EXPLAIN("Unrecognized binary resource file: "+local_path);
+		ERR_FAIL_V(ERR_FILE_UNRECOGNIZED);
+	} else {
+		fw = FileAccess::open(p_path+".depren",FileAccess::WRITE);
+		if (!fw) {
+			memdelete(f);
+		}
+		ERR_FAIL_COND_V(!fw,ERR_CANT_CREATE);
+	}
+
+	bool big_endian = f->get_32();
+#ifdef BIG_ENDIAN_ENABLED
+	bool endian_swap = !big_endian;
+#else
+	bool endian_swap = big_endian;
+#endif
+
+	bool use_real64 = f->get_32();
+
+	f->set_endian_swap(big_endian!=0); //read big endian if saved as big endian
+	fw->store_32(endian_swap);
+	fw->set_endian_swap(big_endian!=0);
+	fw->store_32(use_real64); //use real64
+
+	uint32_t ver_major=f->get_32();
+	uint32_t ver_minor=f->get_32();
+	uint32_t ver_format=f->get_32();
+
+	if (ver_format<FORMAT_VERSION_CAN_RENAME_DEPS) {
+
+		memdelete(f);
+		memdelete(fw);
+		DirAccess *da = DirAccess::create(DirAccess::ACCESS_FILESYSTEM);
+		da->remove(p_path+".depren");
+		memdelete(da);
+		//fall back to the old approach
+
+		WARN_PRINT(("This file is too old to refactor its dependencies in place, so it will be opened and resaved: "+p_path).utf8().get_data());
+
+		Error err;
+		f = FileAccess::open(p_path,FileAccess::READ,&err);
+		if (err!=OK) {
+			ERR_FAIL_COND_V(err!=OK,ERR_FILE_CANT_OPEN);
+		}
+
+		Ref<ResourceInteractiveLoaderBinary> ria = memnew( ResourceInteractiveLoaderBinary );
+		ria->local_path=Globals::get_singleton()->localize_path(p_path);
+		ria->res_path=ria->local_path;
+		ria->remaps=p_map;
+	//	ria->set_local_path( Globals::get_singleton()->localize_path(p_path) );
+		ria->open(f);
+
+		err = ria->poll();
+
+		while(err==OK) {
+			err=ria->poll();
+		}
+
+		ERR_FAIL_COND_V(err!=ERR_FILE_EOF,ERR_FILE_CORRUPT);
+		RES res = ria->get_resource();
+		ERR_FAIL_COND_V(!res.is_valid(),ERR_FILE_CORRUPT);
+
+		return ResourceFormatSaverBinary::singleton->save(p_path,res);
+	}
+
+	if (ver_format>FORMAT_VERSION ||  ver_major>VERSION_MAJOR) {
+
+		memdelete(f);
+		memdelete(fw);
+		ERR_EXPLAIN("File Format '"+itos(FORMAT_VERSION)+"."+itos(ver_major)+"."+itos(ver_minor)+"' is too new! Please upgrade to a new engine version: "+local_path);
+		ERR_FAIL_V(ERR_FILE_UNRECOGNIZED);
+
+	}
+
+	fw->store_32( VERSION_MAJOR ); //current version
+	fw->store_32( VERSION_MINOR );
+	fw->store_32( FORMAT_VERSION );
+
+	save_ustring(fw,get_ustring(f)); //type
+
+
+	size_t md_ofs = f->get_pos();
+	size_t importmd_ofs = f->get_64();
+	fw->store_64(0); //metadata offset
+
+	for(int i=0;i<14;i++) {
+		fw->store_32(0);
+		f->get_32();
+	}
+
+	//string table
+	uint32_t string_table_size=f->get_32();
+
+	fw->store_32(string_table_size);
+
+	for(uint32_t i=0;i<string_table_size;i++) {
+
+		String s = get_ustring(f);
+		save_ustring(fw,s);
+	}
+
+	//external resources
+	uint32_t ext_resources_size=f->get_32();
+	fw->store_32(ext_resources_size);
+	for(uint32_t i=0;i<ext_resources_size;i++) {
+
+		String type = get_ustring(f);
+		String path = get_ustring(f);
+
+		bool relative=false;
+		if (!path.begins_with("res://")) {
+			path=local_path.plus_file(path).simplify_path();
+			relative=true;
+		}
+
+
+		if (p_map.has(path)) {
+			String np=p_map[path];
+			path=np;
+		}
+
+		if (relative) {
+			//restore relative
+			path=local_path.path_to_file(path);
+		}
+
+		save_ustring(fw,type);
+		save_ustring(fw,path);
+	}
+
+	int64_t size_diff = (int64_t)fw->get_pos() - (int64_t)f->get_pos();
+
+	//internal resources
+	uint32_t int_resources_size=f->get_32();
+	fw->store_32(int_resources_size);
+
+	for(uint32_t i=0;i<int_resources_size;i++) {
+
+
+		String path=get_ustring(f);
+		uint64_t offset=f->get_64();
+		save_ustring(fw,path);
+		fw->store_64(offset+size_diff);
+	}
+
+	//rest of file
+	uint8_t b = f->get_8();
+	while(!f->eof_reached()) {
+		fw->store_8(b);
+		b = f->get_8();
+	}
+
+	bool all_ok = fw->get_error()==OK;
+
+	fw->seek(md_ofs);
+	fw->store_64(importmd_ofs+size_diff);
+
+
+	memdelete(f);
+	memdelete(fw);
+
+	if (!all_ok) {
+		return ERR_CANT_CREATE;
+	}
+
+	DirAccess *da = DirAccess::create(DirAccess::ACCESS_RESOURCES);
+	da->remove(p_path);
+	da->rename(p_path+".depren",p_path);
+	memdelete(da);
+	return OK;
 }
 
 
@@ -1433,10 +1710,8 @@ void ResourceFormatSaverBinaryInstance::write_variant(const Variant& p_property,
 			}
 
 			if (res->get_path().length() && res->get_path().find("::")==-1) {
-				f->store_32(OBJECT_EXTERNAL_RESOURCE);
-				save_unicode_string(res->get_save_type());
-				String path=relative_paths?local_path.path_to_file(res->get_path()):res->get_path();
-				save_unicode_string(path);
+				f->store_32(OBJECT_EXTERNAL_RESOURCE_INDEX);
+				f->store_32(external_resources[res]);
 			} else {
 
 				if (!resource_set.has(res)) {
@@ -1594,11 +1869,12 @@ void ResourceFormatSaverBinaryInstance::_find_resources(const Variant& p_variant
 
 			RES res = p_variant.operator RefPtr();
 
-			if (res.is_null())
+			if (res.is_null() || external_resources.has(res))
 				return;
 
 			if (!p_main && (!bundle_resources ) && res->get_path().length() && res->get_path().find("::") == -1 ) {
-				external_resources.insert(res);
+				int idx = external_resources.size();
+				external_resources[res]=idx;
 				return;
 			}
 
@@ -1842,10 +2118,18 @@ Error ResourceFormatSaverBinaryInstance::save(const String &p_path,const RES& p_
 
 	// save external resource table
 	f->store_32(external_resources.size()); //amount of external resources
-	for(Set<RES>::Element *E=external_resources.front();E;E=E->next()) {
+	Vector<RES> save_order;
+	save_order.resize(external_resources.size());
+
+	for(Map<RES,int>::Element *E=external_resources.front();E;E=E->next()) {
+		save_order[E->get()]=E->key();
+	}
 
-		save_unicode_string(E->get()->get_save_type());
-		String path = E->get()->get_path();
+	for(int i=0;i<save_order.size();i++) {
+
+		save_unicode_string(save_order[i]->get_save_type());
+		String path = save_order[i]->get_path();
+		path=relative_paths?local_path.path_to_file(path):path;
 		save_unicode_string(path);
 	}
 	// save internal resource table
@@ -1888,10 +2172,11 @@ Error ResourceFormatSaverBinaryInstance::save(const String &p_path,const RES& p_
 
 			save_unicode_string("local://"+itos(r->get_subindex()));
 			if (takeover_paths) {
-				r->set_path(p_path+"::"+itos(ofs_pos.size()),true);
+				r->set_path(p_path+"::"+itos(r->get_subindex()),true);
 			}
-		} else
+		} else {
 			save_unicode_string(r->get_path()); //actual external
+		}
 		ofs_pos.push_back(f->get_pos());
 		f->store_64(0); //offset in 64 bits
 	}
@@ -1995,3 +2280,9 @@ void ResourceFormatSaverBinary::get_recognized_extensions(const RES& p_resource,
 
 }
 
+ResourceFormatSaverBinary* ResourceFormatSaverBinary::singleton=NULL;
+
+ResourceFormatSaverBinary::ResourceFormatSaverBinary() {
+
+	singleton=this;
+}

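The new rename_dependencies never patches a resource file in place: it streams a rewritten copy to p_path+".depren", checks fw->get_error() at the end, and only then swaps the copy over the original via DirAccess. A hedged sketch of that pattern using plain stdio (the names here are illustrative, not engine API):

    #include <cstdio>
    #include <string>

    static bool patch_file_safely(const std::string &p_path,
            bool (*p_patch)(FILE *src, FILE *dst)) {

        const std::string tmp = p_path + ".depren";
        FILE *src = fopen(p_path.c_str(), "rb");
        if (!src)
            return false;
        FILE *dst = fopen(tmp.c_str(), "wb");
        if (!dst) {
            fclose(src);
            return false;
        }
        bool ok = p_patch(src, dst) && fflush(dst) == 0;
        fclose(src);
        fclose(dst);
        if (!ok) {
            remove(tmp.c_str()); // discard the partial copy; the original is untouched
            return false;
        }
        remove(p_path.c_str());
        return rename(tmp.c_str(), p_path.c_str()) == 0; // commit the patched copy
    }

Because offsets shift when external paths change length, the rewrite above also adds size_diff to every internal-resource offset and to the import-metadata offset.
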
+ 9 - 5
core/io/resource_format_binary.h

@@ -71,6 +71,7 @@ class ResourceInteractiveLoaderBinary : public ResourceInteractiveLoader {
 	String get_unicode_string();
 	void _advance_padding(uint32_t p_len);
 
+	Map<String,String> remaps;
 	Error error;
 
 	int stage;
@@ -88,9 +89,10 @@ public:
 	virtual int get_stage() const;
 	virtual int get_stage_count() const;
 
+	void set_remaps(const Map<String,String>& p_remaps) { remaps=p_remaps; }
 	void open(FileAccess *p_f);
 	String recognize(FileAccess *p_f);
-	void get_dependencies(FileAccess *p_f,List<String> *p_dependencies);
+	void get_dependencies(FileAccess *p_f, List<String> *p_dependencies, bool p_add_types);
 
 
 	ResourceInteractiveLoaderBinary();
@@ -101,13 +103,14 @@ public:
 class ResourceFormatLoaderBinary : public ResourceFormatLoader {
 public:
 
-	virtual Ref<ResourceInteractiveLoader> load_interactive(const String &p_path);
+	virtual Ref<ResourceInteractiveLoader> load_interactive(const String &p_path,Error *r_error=NULL);
 	virtual void get_recognized_extensions_for_type(const String& p_type,List<String> *p_extensions) const;
 	virtual void get_recognized_extensions(List<String> *p_extensions) const;
 	virtual bool handles_type(const String& p_type) const;
 	virtual String get_resource_type(const String &p_path) const;
-	virtual void get_dependencies(const String& p_path,List<String> *p_dependencies);
+	virtual void get_dependencies(const String& p_path, List<String> *p_dependencies, bool p_add_types=false);
 	virtual Error load_import_metadata(const String &p_path, Ref<ResourceImportMetadata>& r_var) const;
+	virtual Error rename_dependencies(const String &p_path,const Map<String,String>& p_map);
 
 
 
@@ -134,7 +137,7 @@ class ResourceFormatSaverBinaryInstance  {
 	Vector<StringName> strings;
 
 
-	Set<RES> external_resources;
+	Map<RES,int> external_resources;
 	List<RES> saved_resources;
 
 
@@ -174,11 +177,12 @@ class ResourceFormatSaverBinary : public ResourceFormatSaver  {
 
 public:
 
+	static ResourceFormatSaverBinary* singleton;
 	virtual Error save(const String &p_path,const RES& p_resource,uint32_t p_flags=0);
 	virtual bool recognize(const RES& p_resource) const;
 	virtual void get_recognized_extensions(const RES& p_resource,List<String> *p_extensions) const;
 
-
+	ResourceFormatSaverBinary();
 };
 
 

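Switching external_resources from Set<RES> to Map<RES,int> is what makes OBJECT_EXTERNAL_RESOURCE_INDEX stable: each resource is assigned the next free index the first time _find_resources sees it, and save() inverts the map into index order before writing the table. A small illustration of the scheme, with std::map and strings standing in for the engine's Map and RES:

    #include <map>
    #include <string>
    #include <vector>

    static int note(std::map<std::string, int> &external, const std::string &res) {

        std::map<std::string, int>::iterator it = external.find(res);
        if (it != external.end())
            return it->second; // already indexed; references reuse it
        int idx = (int)external.size();
        external[res] = idx;
        return idx;
    }

    int main() {

        std::map<std::string, int> external; // RES -> index in the engine
        note(external, "res://a.png"); // index 0
        note(external, "res://b.png"); // index 1
        note(external, "res://a.png"); // no-op, keeps index 0

        std::vector<std::string> save_order(external.size());
        for (std::map<std::string, int>::iterator e = external.begin(); e != external.end(); ++e)
            save_order[e->second] = e->first; // invert into index order for saving
        return 0;
    }
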
+ 271 - 55
core/io/resource_format_xml.cpp

@@ -29,10 +29,10 @@
 #include "resource_format_xml.h"
 #include "globals.h"
 #include "version.h"
+#include "os/dir_access.h"
 
 
-
-ResourceInteractiveLoaderXML::Tag* ResourceInteractiveLoaderXML::parse_tag(bool *r_exit,bool p_printerr) {
+ResourceInteractiveLoaderXML::Tag* ResourceInteractiveLoaderXML::parse_tag(bool *r_exit, bool p_printerr, List<String> *r_order) {
 
 
 	while(get_char()!='<' && !f->eof_reached()) {}
@@ -107,7 +107,11 @@ ResourceInteractiveLoaderXML::Tag* ResourceInteractiveLoaderXML::parse_tag(bool
 				if (r_value.size()) {
 
 					r_value.push_back(0);
-					tag.args[name].parse_utf8(r_value.get_data());
+					String str;
+					str.parse_utf8(r_value.get_data());
+					tag.args[name]=str;
+					if (r_order)
+						r_order->push_back(name);
 				}
 				break;
 
@@ -119,7 +123,11 @@ ResourceInteractiveLoaderXML::Tag* ResourceInteractiveLoaderXML::parse_tag(bool
 				} else if (reading_value && r_value.size()) {
 
 					r_value.push_back(0);
-					tag.args[name].parse_utf8(r_value.get_data());
+					String str;
+					str.parse_utf8(r_value.get_data());
+					tag.args[name]=str;
+					if (r_order)
+						r_order->push_back(name);
 					name="";
 					r_value.clear();
 					reading_value=false;
@@ -463,6 +471,10 @@ Error ResourceInteractiveLoaderXML::parse_property(Variant& r_v, String &r_name)
 
 			}
 
+			if (remaps.has(path)) {
+				path=remaps[path];
+			}
+
 			//take advantage of the resource loader cache. The resource is cached on it, even if
 			RES res=ResourceLoader::load(path,hint);
 
@@ -473,10 +485,31 @@ Error ResourceInteractiveLoaderXML::parse_property(Variant& r_v, String &r_name)
 			}
 
 			r_v=res.get_ref_ptr();
+		} else if (tag->args.has("external")) {
+
+			int index = tag->args["external"].to_int();
+			if (ext_resources.has(index)) {
+				String path=ext_resources[index].path;
+				String type=ext_resources[index].type;
+
+				//take advantage of the resource loader cache. The resource is cached on it, even if
+				RES res=ResourceLoader::load(path,type);
+
+				if (res.is_null()) {
+
+					WARN_PRINT(String("Couldn't load external resource: "+path).ascii().get_data());
+				}
+
+				r_v=res.get_ref_ptr();
+			} else {
+				WARN_PRINT(String("Invalid external resource index: "+itos(index)).ascii().get_data());
+
+			}
 		}
 
 
 
+
 		Error err=goto_end_of_tag();
 		if (err) {
 			ERR_EXPLAIN(local_path+":"+itos(get_current_line())+": Error closing <resource> tag.");
@@ -1364,32 +1397,6 @@ Error ResourceInteractiveLoaderXML::poll() {
 	if (error!=OK)
 		return error;
 
-	if (ext_resources.size()) {
-
-		error=ERR_FILE_CORRUPT;
-		String path=ext_resources.front()->get();
-
-		RES res = ResourceLoader::load(path);
-
-		if (res.is_null()) {
-
-			if (ResourceLoader::get_abort_on_missing_resources()) {
-				ERR_EXPLAIN(local_path+":"+itos(get_current_line())+": editor exported nonexistent resource at: "+path);
-				ERR_FAIL_V(error);
-			} else {
-				ResourceLoader::notify_load_error("Resource Not Found: "+path);
-			}
-		} else {
-
-			resource_cache.push_back(res);
-		}
-
-		error=OK;
-		ext_resources.pop_front();
-		resource_current++;
-		return error;
-	}
-
 	bool exit;
 	Tag *tag = parse_tag(&exit);
 
@@ -1413,12 +1420,13 @@ Error ResourceInteractiveLoaderXML::poll() {
 		ERR_EXPLAIN(local_path+":"+itos(get_current_line())+": <ext_resource> missing 'path' field.");
 		ERR_FAIL_COND_V(!tag->args.has("path"),ERR_FILE_CORRUPT);
 
-		String type;
+		String type="Resource";
 		if (tag->args.has("type"))
 			type=tag->args["type"];
 
 		String path = tag->args["path"];
 
+
 		ERR_EXPLAIN(local_path+":"+itos(get_current_line())+": <ext_resource> can't use a local path; this is likely a bug.");
 		ERR_FAIL_COND_V(path.begins_with("local://"),ERR_FILE_CORRUPT);
 
@@ -1427,6 +1435,9 @@ Error ResourceInteractiveLoaderXML::poll() {
 			path=Globals::get_singleton()->localize_path(local_path.get_base_dir().plus_file(path));
 		}
 
+		if (remaps.has(path)) {
+			path=remaps[path];
+		}
 
 		RES res = ResourceLoader::load(path,type);
 
@@ -1436,13 +1447,21 @@ Error ResourceInteractiveLoaderXML::poll() {
 				ERR_EXPLAIN(local_path+":"+itos(get_current_line())+": <ext_resource> referenced nonexistent resource at: "+path);
 				ERR_FAIL_V(error);
 			} else {
-				ResourceLoader::notify_load_error("Resource Not Found: "+path);
+				ResourceLoader::notify_dependency_error(local_path,path,type);
 			}
 		} else {
 
 			resource_cache.push_back(res);
 		}
 
+		if (tag->args.has("index")) {
+			ExtResource er;
+			er.path=path;
+			er.type=type;
+			ext_resources[tag->args["index"].to_int()]=er;
+		}
+
+
 		Error err = close_tag("ext_resource");
 		if (err)
 			return error;
@@ -1566,7 +1585,7 @@ int ResourceInteractiveLoaderXML::get_stage() const {
 }
 int ResourceInteractiveLoaderXML::get_stage_count() const {
 
-	return resources_total+ext_resources.size();
+	return resources_total;//+ext_resources;
 }
 
 ResourceInteractiveLoaderXML::~ResourceInteractiveLoaderXML() {
@@ -1574,7 +1593,7 @@ ResourceInteractiveLoaderXML::~ResourceInteractiveLoaderXML() {
 	memdelete(f);
 }
 
-void ResourceInteractiveLoaderXML::get_dependencies(FileAccess *f,List<String> *p_dependencies) {
+void ResourceInteractiveLoaderXML::get_dependencies(FileAccess *f,List<String> *p_dependencies,bool p_add_types) {
 
 
 	open(f);
@@ -1617,6 +1636,10 @@ void ResourceInteractiveLoaderXML::get_dependencies(FileAccess *f,List<String> *
 			path = ResourceLoader::guess_full_filename(path,type);
 		}
 
+		if (p_add_types && tag->args.has("type")) {
+			path+="::"+tag->args["type"];
+		}
+
 		p_dependencies->push_back(path);
 
 		Error err = close_tag("ext_resource");
@@ -1628,6 +1651,167 @@ void ResourceInteractiveLoaderXML::get_dependencies(FileAccess *f,List<String> *
 
 }
 
+Error ResourceInteractiveLoaderXML::rename_dependencies(FileAccess *p_f, const String &p_path,const Map<String,String>& p_map) {
+
+	open(p_f);
+	ERR_FAIL_COND_V(error!=OK,error);
+
+	//FileAccess
+
+	bool old_format=false;
+
+	FileAccess *fw = NULL;
+
+	String base_path=local_path.get_base_dir();
+
+	while(true) {
+		bool exit;
+		List<String> order;
+
+		Tag *tag = parse_tag(&exit,true,&order);
+
+		bool done=false;
+
+		if (!tag) {
+			if (fw) {
+				memdelete(fw);
+			}
+			error=ERR_FILE_CORRUPT;
+			ERR_FAIL_COND_V(!exit,error);
+			error=ERR_FILE_EOF;
+
+			return error;
+		}
+
+		if (tag->name=="ext_resource") {
+
+			if (!tag->args.has("index") || !tag->args.has("path") || !tag->args.has("type")) {
+				old_format=true;
+				break;
+			}
+
+			if (!fw) {
+
+				fw=FileAccess::open(p_path+".depren",FileAccess::WRITE);
+				fw->store_line("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"); //no escape
+				fw->store_line("<resource_file type=\""+resource_type+"\" subresource_count=\""+itos(resources_total)+"\" version=\""+itos(VERSION_MAJOR)+"."+itos(VERSION_MINOR)+"\" version_name=\""+VERSION_FULL_NAME+"\">");
+
+			}
+
+			String path = tag->args["path"];
+			String index = tag->args["index"];
+			String type = tag->args["type"];
+
+
+			bool relative=false;
+			if (!path.begins_with("res://")) {
+				path=base_path.plus_file(path).simplify_path();
+				relative=true;
+			}
+
+
+			if (p_map.has(path)) {
+				String np=p_map[path];
+				path=np;
+			}
+
+			if (relative) {
+				//restore relative
+				path=base_path.path_to_file(path);
+			}
+
+			tag->args["path"]=path;
+			tag->args["index"]=index;
+			tag->args["type"]=type;
+
+		} else {
+
+			done=true;
+		}
+
+		String tagt="\t<";
+		if (exit)
+			tagt+="/";
+		tagt+=tag->name;
+
+		for(List<String>::Element *E=order.front();E;E=E->next()) {
+			tagt+=" "+E->get()+"=\""+tag->args[E->get()]+"\"";
+		}
+		tagt+=">";
+		if (fw)
+			fw->store_line(tagt);
+		if (done)
+			break;
+		close_tag("ext_resource");
+		fw->store_line("\t</ext_resource>");
+
+	}
+
+
+	if (old_format) {
+		if (fw)
+			memdelete(fw);
+
+		DirAccess *da = DirAccess::create(DirAccess::ACCESS_FILESYSTEM);
+		da->remove(p_path+".depren");
+		memdelete(da);
+		//fall back to the old approach
+
+		WARN_PRINT(("This file is too old to refactor its dependencies in place, so it will be opened and resaved: "+p_path).utf8().get_data());
+
+		Error err;
+		FileAccess *f2 = FileAccess::open(p_path,FileAccess::READ,&err);
+		if (err!=OK) {
+			ERR_FAIL_COND_V(err!=OK,ERR_FILE_CANT_OPEN);
+		}
+
+		Ref<ResourceInteractiveLoaderXML> ria = memnew( ResourceInteractiveLoaderXML );
+		ria->local_path=Globals::get_singleton()->localize_path(p_path);
+		ria->res_path=ria->local_path;
+		ria->remaps=p_map;
+	//	ria->set_local_path( Globals::get_singleton()->localize_path(p_path) );
+		ria->open(f2);
+
+		err = ria->poll();
+
+		while(err==OK) {
+			err=ria->poll();
+		}
+
+		ERR_FAIL_COND_V(err!=ERR_FILE_EOF,ERR_FILE_CORRUPT);
+		RES res = ria->get_resource();
+		ERR_FAIL_COND_V(!res.is_valid(),ERR_FILE_CORRUPT);
+
+		return ResourceFormatSaverXML::singleton->save(p_path,res);
+	}
+
+	if (!fw) {
+
+		return OK; //no ext_resource tags were found, so there is nothing to rename
+	}
+
+	uint8_t c=f->get_8();
+	while(!f->eof_reached()) {
+		fw->store_8(c);
+		c=f->get_8();
+	}
+
+	bool all_ok = fw->get_error()==OK;
+
+	memdelete(fw);
+
+	if (!all_ok) {
+		return ERR_CANT_CREATE;
+	}
+
+	DirAccess *da = DirAccess::create(DirAccess::ACCESS_RESOURCES);
+	da->remove(p_path);
+	da->rename(p_path+".depren",p_path);
+	memdelete(da);
+
+	return OK;
+
+}
+
 
 void ResourceInteractiveLoaderXML::open(FileAccess *p_f) {
 
@@ -1686,6 +1870,7 @@ void ResourceInteractiveLoaderXML::open(FileAccess *p_f) {
 
 	}
 
+	/*
 	String preload_depts = "deps/"+local_path.md5_text();
 	if (Globals::get_singleton()->has(preload_depts)) {
 		ext_resources.clear();
@@ -1697,7 +1882,7 @@ void ResourceInteractiveLoaderXML::open(FileAccess *p_f) {
 		}
 		print_line(local_path+" - EXTERNAL RESOURCES: "+itos(ext_resources.size()));
 	}
-
+*/
 
 }
 
@@ -1730,7 +1915,10 @@ String ResourceInteractiveLoaderXML::recognize(FileAccess *p_f) {
 
 /////////////////////
 
-Ref<ResourceInteractiveLoader> ResourceFormatLoaderXML::load_interactive(const String &p_path) {
+Ref<ResourceInteractiveLoader> ResourceFormatLoaderXML::load_interactive(const String &p_path, Error *r_error) {
+
+	if (r_error)
+		*r_error=ERR_CANT_OPEN;
 
 	Error err;
 	FileAccess *f = FileAccess::open(p_path,FileAccess::READ,&err);
@@ -1816,7 +2004,7 @@ String ResourceFormatLoaderXML::get_resource_type(const String &p_path) const{
 }
 
 
-void ResourceFormatLoaderXML::get_dependencies(const String& p_path,List<String> *p_dependencies) {
+void ResourceFormatLoaderXML::get_dependencies(const String& p_path,List<String> *p_dependencies,bool p_add_types) {
 
 	FileAccess *f = FileAccess::open(p_path,FileAccess::READ);
 	if (!f) {
@@ -1828,11 +2016,27 @@ void ResourceFormatLoaderXML::get_dependencies(const String& p_path,List<String>
 	ria->local_path=Globals::get_singleton()->localize_path(p_path);
 	ria->res_path=ria->local_path;
 //	ria->set_local_path( Globals::get_singleton()->localize_path(p_path) );
-	ria->get_dependencies(f,p_dependencies);
+	ria->get_dependencies(f,p_dependencies,p_add_types);
+
 
+}
+
+Error ResourceFormatLoaderXML::rename_dependencies(const String &p_path,const Map<String,String>& p_map) {
+
+	FileAccess *f = FileAccess::open(p_path,FileAccess::READ);
+	if (!f) {
 
+		ERR_FAIL_V(ERR_CANT_OPEN);
+	}
+
+	Ref<ResourceInteractiveLoaderXML> ria = memnew( ResourceInteractiveLoaderXML );
+	ria->local_path=Globals::get_singleton()->localize_path(p_path);
+	ria->res_path=ria->local_path;
+//	ria->set_local_path( Globals::get_singleton()->localize_path(p_path) );
+	return ria->rename_dependencies(f,p_path,p_map);
 }
 
+
 /****************************************************************************************/
 /****************************************************************************************/
 /****************************************************************************************/
@@ -1852,8 +2056,8 @@ void ResourceFormatLoaderXML::get_dependencies(const String& p_path,List<String>
 void ResourceFormatSaverXMLInstance::escape(String& p_str) {
 
 	p_str=p_str.replace("&","&amp;");
-	p_str=p_str.replace("<","&gt;");
-	p_str=p_str.replace(">","&lt;");
+	p_str=p_str.replace("<","&lt;");
+	p_str=p_str.replace(">","&gt;");
 	p_str=p_str.replace("'","&apos;");
 	p_str=p_str.replace("\"","&quot;");
 	for (char i=1;i<32;i++) {
@@ -2024,20 +2228,26 @@ void ResourceFormatSaverXMLInstance::write_property(const String& p_name,const V
 				return; // don't save it
 			}
 
-			params="resource_type=\""+res->get_save_type()+"\"";
+			if (external_resources.has(res)) {
 
-			if (res->get_path().length() && res->get_path().find("::")==-1) {
-				//external resource
-				String path=relative_paths?local_path.path_to_file(res->get_path()):res->get_path();
-				escape(path);
-				params+=" path=\""+path+"\"";
+				params="external=\""+itos(external_resources[res])+"\"";
 			} else {
+				params="resource_type=\""+res->get_save_type()+"\"";
 
-				//internal resource
-				ERR_EXPLAIN("Resource was not pre cached for the resource section, bug?");
-				ERR_FAIL_COND(!resource_set.has(res));
 
-				params+=" path=\"local://"+itos(res->get_subindex())+"\"";
+				if (res->get_path().length() && res->get_path().find("::")==-1) {
+					//external resource
+					String path=relative_paths?local_path.path_to_file(res->get_path()):res->get_path();
+					escape(path);
+					params+=" path=\""+path+"\"";
+				} else {
+
+					//internal resource
+					ERR_EXPLAIN("Resource was not pre cached for the resource section, bug?");
+					ERR_FAIL_COND(!resource_set.has(res));
+
+					params+=" path=\"local://"+itos(res->get_subindex())+"\"";
+				}
 			}
 
 		} break;
@@ -2441,11 +2651,12 @@ void ResourceFormatSaverXMLInstance::_find_resources(const Variant& p_variant,bo
 
 			RES res = p_variant.operator RefPtr();
 
-			if (res.is_null())
+			if (res.is_null() || external_resources.has(res))
 				return;
 
 			if (!p_main && (!bundle_resources ) && res->get_path().length() && res->get_path().find("::") == -1 ) {
-				external_resources.push_back(res);
+				int index = external_resources.size();
+				external_resources[res]=index;
 				return;
 			}
 
@@ -2533,12 +2744,12 @@ Error ResourceFormatSaverXMLInstance::save(const String &p_path,const RES& p_res
 	enter_tag("resource_file","type=\""+p_resource->get_type()+"\" subresource_count=\""+itos(saved_resources.size()+external_resources.size())+"\" version=\""+itos(VERSION_MAJOR)+"."+itos(VERSION_MINOR)+"\" version_name=\""+VERSION_FULL_NAME+"\"");
 	write_string("\n",false);
 
-	for(List<RES>::Element *E=external_resources.front();E;E=E->next()) {
+	for(Map<RES,int>::Element *E=external_resources.front();E;E=E->next()) {
 
 		write_tabs();
-		String p = E->get()->get_path();
+		String p = E->key()->get_path();
 
-		enter_tag("ext_resource","path=\""+p+"\" type=\""+E->get()->get_save_type()+"\""); //bundled
+		enter_tag("ext_resource","path=\""+p+"\" type=\""+E->key()->get_save_type()+"\" index=\""+itos(E->get())+"\""); //bundled
 		exit_tag("ext_resource"); //bundled
 		write_string("\n",false);
 	}
@@ -2667,3 +2878,8 @@ void ResourceFormatSaverXML::get_recognized_extensions(const RES& p_resource,Lis
 	}
 
 }
+
+ResourceFormatSaverXML* ResourceFormatSaverXML::singleton=NULL;
+ResourceFormatSaverXML::ResourceFormatSaverXML() {
+	singleton=this;
+}

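The escape() hunk above fixes entities that were swapped ("<" previously became "&gt;" and ">" became "&lt;"). The corrected mapping as a standalone sketch; note that "&" must be handled first, or the later replacements would double-escape their own output:

    #include <string>

    static void replace_all(std::string &s, const std::string &from, const std::string &to) {

        for (size_t pos = s.find(from); pos != std::string::npos;
                pos = s.find(from, pos + to.size()))
            s.replace(pos, from.size(), to);
    }

    static std::string xml_escape(std::string s) {

        replace_all(s, "&", "&amp;"); // first, so the entities below survive
        replace_all(s, "<", "&lt;");
        replace_all(s, ">", "&gt;");
        replace_all(s, "'", "&apos;");
        replace_all(s, "\"", "&quot;");
        return s;
    }
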
+ 18 - 7
core/io/resource_format_xml.h

@@ -46,13 +46,21 @@ class ResourceInteractiveLoaderXML : public ResourceInteractiveLoader {
 
 		String name;
 		HashMap<String,String> args;
+
 	};
 
 	_FORCE_INLINE_ Error _parse_array_element(Vector<char> &buff,bool p_number_only,FileAccess *f,bool *end);
 
 
+	struct ExtResource {
+		String path;
+		String type;
+	};
+
 
-	List<StringName> ext_resources;
+	Map<String,String> remaps;
+
+	Map<int,ExtResource> ext_resources;
 
 	int resources_total;
 	int resource_current;
@@ -66,7 +74,7 @@ friend class ResourceFormatLoaderXML;
 	List<Tag> tag_stack;
 
 	List<RES> resource_cache;
-	Tag* parse_tag(bool* r_exit=NULL,bool p_printerr=true);
+	Tag* parse_tag(bool* r_exit=NULL,bool p_printerr=true,List<String> *r_order=NULL);
 	Error close_tag(const String& p_name);
 	_FORCE_INLINE_ void unquote(String& p_str);
 	Error goto_end_of_tag();
@@ -87,7 +95,8 @@ public:
 
 	void open(FileAccess *p_f);
 	String recognize(FileAccess *p_f);
-	void get_dependencies(FileAccess *p_f,List<String> *p_dependencies);
+	void get_dependencies(FileAccess *p_f, List<String> *p_dependencies, bool p_add_types);
+	Error rename_dependencies(FileAccess *p_f, const String &p_path,const Map<String,String>& p_map);
 
 
 	~ResourceInteractiveLoaderXML();
@@ -97,12 +106,13 @@ public:
 class ResourceFormatLoaderXML : public ResourceFormatLoader {
 public:
 
-	virtual Ref<ResourceInteractiveLoader> load_interactive(const String &p_path);
+	virtual Ref<ResourceInteractiveLoader> load_interactive(const String &p_path,Error *r_error=NULL);
 	virtual void get_recognized_extensions_for_type(const String& p_type,List<String> *p_extensions) const;
 	virtual void get_recognized_extensions(List<String> *p_extensions) const;
 	virtual bool handles_type(const String& p_type) const;
 	virtual String get_resource_type(const String &p_path) const;
-	virtual void get_dependencies(const String& p_path,List<String> *p_dependencies);
+	virtual void get_dependencies(const String& p_path, List<String> *p_dependencies, bool p_add_types=false);
+	virtual Error rename_dependencies(const String &p_path,const Map<String,String>& p_map);
 
 
 };
@@ -125,7 +135,7 @@ class ResourceFormatSaverXMLInstance  {
 	int depth;
 	Set<RES> resource_set;
 	List<RES> saved_resources;
-	List<RES> external_resources;
+	Map<RES,int> external_resources;
 
 	void enter_tag(const char* p_tag,const String& p_args=String());
 	void exit_tag(const char* p_tag);
@@ -148,11 +158,12 @@ public:
 
 class ResourceFormatSaverXML : public ResourceFormatSaver {
 public:
+	static ResourceFormatSaverXML* singleton;
 	virtual Error save(const String &p_path,const RES& p_resource,uint32_t p_flags=0);
 	virtual bool recognize(const RES& p_resource) const;
 	virtual void get_recognized_extensions(const RES& p_resource,List<String> *p_extensions) const;
 
-
+	ResourceFormatSaverXML();
 };
 
 

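Both savers now expose a self-registering singleton so that rename_dependencies can fall back to "load and resave" without routing through ResourceSaver. A minimal sketch of the idiom, assuming (as the engine does) that exactly one instance is constructed at registration time:

    class FormatSaverSketch {
    public:
        static FormatSaverSketch *singleton;
        FormatSaverSketch() { singleton = this; } // registers itself on construction
    };

    FormatSaverSketch *FormatSaverSketch::singleton = NULL;
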
+ 54 - 13
core/io/resource_loader.cpp

@@ -103,10 +103,10 @@ public:
 
 
 
-Ref<ResourceInteractiveLoader> ResourceFormatLoader::load_interactive(const String &p_path) {
+Ref<ResourceInteractiveLoader> ResourceFormatLoader::load_interactive(const String &p_path, Error *r_error) {
 
 	//either this
-	Ref<Resource> res = load(p_path);
+	Ref<Resource> res = load(p_path,p_path,r_error);
 	if (res.is_null())
 		return Ref<ResourceInteractiveLoader>();
 
@@ -115,12 +115,13 @@ Ref<ResourceInteractiveLoader> ResourceFormatLoader::load_interactive(const Stri
 	return ril;
 }
 
-RES ResourceFormatLoader::load(const String &p_path,const String& p_original_path) {
+RES ResourceFormatLoader::load(const String &p_path, const String& p_original_path, Error *r_error) {
+
 
 	String path=p_path;
 
 	//or this must be implemented
-	Ref<ResourceInteractiveLoader> ril = load_interactive(p_path);
+	Ref<ResourceInteractiveLoader> ril = load_interactive(p_path,r_error);
 	if (!ril.is_valid())
 		return RES();
 	ril->set_local_path(p_original_path);
@@ -130,9 +131,14 @@ RES ResourceFormatLoader::load(const String &p_path,const String& p_original_pat
 		Error err = ril->poll();
 
 		if (err==ERR_FILE_EOF) {
+			if (r_error)
+				*r_error=OK;
 			return ril->get_resource();
 		}
 
+		if (r_error)
+			*r_error=err;
+
 		ERR_FAIL_COND_V(err!=OK,RES());
 	}
 
@@ -140,7 +146,7 @@ RES ResourceFormatLoader::load(const String &p_path,const String& p_original_pat
 
 }
 
-void ResourceFormatLoader::get_dependencies(const String& p_path,List<String> *p_dependencies) {
+void ResourceFormatLoader::get_dependencies(const String& p_path, List<String> *p_dependencies, bool p_add_types) {
 
 	//do nothing by default
 }
@@ -149,7 +155,10 @@ void ResourceFormatLoader::get_dependencies(const String& p_path,List<String> *p
 ///////////////////////////////////
 
 
-RES ResourceLoader::load(const String &p_path,const String& p_type_hint,bool p_no_cache) {
+RES ResourceLoader::load(const String &p_path, const String& p_type_hint, bool p_no_cache, Error *r_error) {
+
+	if (r_error)
+		*r_error=ERR_CANT_OPEN;
 
 	String local_path;
 	if (p_path.is_rel_path())
@@ -183,7 +192,7 @@ RES ResourceLoader::load(const String &p_path,const String& p_type_hint,bool p_n
 		if (p_type_hint!="" && !loader[i]->handles_type(p_type_hint))
 			continue;
 		found=true;
-		RES res = loader[i]->load(remapped_path,local_path);
+		RES res = loader[i]->load(remapped_path,local_path,r_error);
 		if (res.is_null())
 			continue;
 		if (!p_no_cache)
@@ -222,14 +231,12 @@ Ref<ResourceImportMetadata> ResourceLoader::load_import_metadata(const String &p
 		local_path = Globals::get_singleton()->localize_path(p_path);
 
 	String extension=p_path.extension();
-	bool found=false;
 	Ref<ResourceImportMetadata> ret;
 
 	for (int i=0;i<loader_count;i++) {
 
 		if (!loader[i]->recognize(extension))
 			continue;
-		found=true;
 
 		Error err = loader[i]->load_import_metadata(local_path,ret);
 		if (err==OK)
@@ -289,9 +296,11 @@ String ResourceLoader::find_complete_path(const String& p_path,const String& p_t
 	return local_path;
 }
 
-Ref<ResourceInteractiveLoader> ResourceLoader::load_interactive(const String &p_path,const String& p_type_hint,bool p_no_cache) {
+Ref<ResourceInteractiveLoader> ResourceLoader::load_interactive(const String &p_path,const String& p_type_hint,bool p_no_cache,Error *r_error) {
 
 
+	if (r_error)
+		*r_error=ERR_CANT_OPEN;
 
 	String local_path;
 	if (p_path.is_rel_path())
@@ -327,7 +336,7 @@ Ref<ResourceInteractiveLoader> ResourceLoader::load_interactive(const String &p_
 		if (p_type_hint!="" && !loader[i]->handles_type(p_type_hint))
 			continue;
 		found=true;
-		Ref<ResourceInteractiveLoader> ril = loader[i]->load_interactive(remapped_path);
+		Ref<ResourceInteractiveLoader> ril = loader[i]->load_interactive(remapped_path,r_error);
 		if (ril.is_null())
 			continue;
 		if (!p_no_cache)
@@ -352,7 +361,32 @@ void ResourceLoader::add_resource_format_loader(ResourceFormatLoader *p_format_l
 	loader[loader_count++]=p_format_loader;
 }
 
-void ResourceLoader::get_dependencies(const String& p_path,List<String> *p_dependencies) {
+void ResourceLoader::get_dependencies(const String& p_path, List<String> *p_dependencies, bool p_add_types) {
+
+
+	String local_path;
+	if (p_path.is_rel_path())
+		local_path="res://"+p_path;
+	else
+		local_path = Globals::get_singleton()->localize_path(p_path);
+
+	String remapped_path = PathRemap::get_singleton()->get_remap(local_path);
+
+	String extension=remapped_path.extension();
+
+	for (int i=0;i<loader_count;i++) {
+
+		if (!loader[i]->recognize(extension))
+			continue;
+		//if (p_type_hint!="" && !loader[i]->handles_type(p_type_hint))
+		//	continue;
+
+		loader[i]->get_dependencies(remapped_path,p_dependencies,p_add_types);
+
+	}
+}
+
+Error ResourceLoader::rename_dependencies(const String &p_path,const Map<String,String>& p_map) {
 
 
 	String local_path;
@@ -372,11 +406,15 @@ void ResourceLoader::get_dependencies(const String& p_path,List<String> *p_depen
 		//if (p_type_hint!="" && !loader[i]->handles_type(p_type_hint))
 		//	continue;
 
-		loader[i]->get_dependencies(remapped_path,p_dependencies);
+		return loader[i]->rename_dependencies(p_path,p_map);
 
 	}
+
+	return OK; //no loader recognized the file, so there is nothing to rename
+
 }
 
+
 String ResourceLoader::guess_full_filename(const String &p_path,const String& p_type) {
 
 	String local_path;
@@ -414,6 +452,9 @@ String ResourceLoader::get_resource_type(const String &p_path) {
 ResourceLoadErrorNotify ResourceLoader::err_notify=NULL;
 void *ResourceLoader::err_notify_ud=NULL;
 
+DependencyErrorNotify ResourceLoader::dep_err_notify=NULL;
+void *ResourceLoader::dep_err_notify_ud=NULL;
+
 bool ResourceLoader::abort_on_missing_resource=true;
 bool ResourceLoader::timestamp_on_load=false;
 

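The new optional r_error out-parameter follows a pessimistic contract: it is pre-set to a failure code on entry and only flips to OK once a resource is actually produced, so callers that pass NULL lose nothing. A hypothetical engine-side caller (the path is just an example):

    Error err;
    RES res = ResourceLoader::load("res://level.scn", "", false, &err);
    if (res.is_null()) {
        // err now distinguishes ERR_CANT_OPEN from e.g. ERR_FILE_MISSING_DEPENDENCIES
    }
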
+ 16 - 6
core/io/resource_loader.h

@@ -57,21 +57,23 @@ public:
 class ResourceFormatLoader {
 public:
 
-	virtual Ref<ResourceInteractiveLoader> load_interactive(const String &p_path);
-	virtual RES load(const String &p_path,const String& p_original_path="");
+	virtual Ref<ResourceInteractiveLoader> load_interactive(const String &p_path,Error *r_error=NULL);
+	virtual RES load(const String &p_path,const String& p_original_path="",Error *r_error=NULL);
 	virtual void get_recognized_extensions(List<String> *p_extensions) const=0;
 	virtual void get_recognized_extensions_for_type(const String& p_type,List<String> *p_extensions) const;
 	bool recognize(const String& p_extension) const;
 	virtual bool handles_type(const String& p_type) const=0;
 	virtual String get_resource_type(const String &p_path) const=0;
-	virtual void get_dependencies(const String& p_path,List<String> *p_dependencies);
+	virtual void get_dependencies(const String& p_path,List<String> *p_dependencies,bool p_add_types=false);
 	virtual Error load_import_metadata(const String &p_path, Ref<ResourceImportMetadata>& r_var) const { return ERR_UNAVAILABLE; }
+	virtual Error rename_dependencies(const String &p_path,const Map<String,String>& p_map) { return OK; }
 
 	virtual ~ResourceFormatLoader() {}
 };
 
 
 typedef void (*ResourceLoadErrorNotify)(void *p_ud,const String& p_text);
+typedef void (*DependencyErrorNotify)(void *p_ud,const String& p_loading,const String& p_which,const String& p_type);
 
 
 class ResourceLoader {	
@@ -86,6 +88,8 @@ class ResourceLoader {
 
 	static void* err_notify_ud;
 	static ResourceLoadErrorNotify err_notify;
+	static void* dep_err_notify_ud;
+	static DependencyErrorNotify dep_err_notify;
 	static bool abort_on_missing_resource;
 
 	static String find_complete_path(const String& p_path,const String& p_type);
@@ -93,14 +97,15 @@ public:
 
 
 	
-	static Ref<ResourceInteractiveLoader> load_interactive(const String &p_path,const String& p_type_hint="",bool p_no_cache=false);
-	static RES load(const String &p_path,const String& p_type_hint="",bool p_no_cache=false);
+	static Ref<ResourceInteractiveLoader> load_interactive(const String &p_path,const String& p_type_hint="",bool p_no_cache=false,Error *r_error=NULL);
+	static RES load(const String &p_path,const String& p_type_hint="",bool p_no_cache=false,Error *r_error=NULL);
 	static Ref<ResourceImportMetadata> load_import_metadata(const String &p_path);
 
 	static void get_recognized_extensions_for_type(const String& p_type,List<String> *p_extensions);
 	static void add_resource_format_loader(ResourceFormatLoader *p_format_loader);
 	static String get_resource_type(const String &p_path);
-	static void get_dependencies(const String& p_path,List<String> *p_dependencies);
+	static void get_dependencies(const String& p_path,List<String> *p_dependencies,bool p_add_types=false);
+	static Error rename_dependencies(const String &p_path,const Map<String,String>& p_map);
 
 	static String guess_full_filename(const String &p_path,const String& p_type);
 
@@ -108,6 +113,11 @@ public:
 
 	static void notify_load_error(const String& p_err) { if (err_notify) err_notify(err_notify_ud,p_err); }
 	static void set_error_notify_func(void* p_ud,ResourceLoadErrorNotify p_err_notify) { err_notify=p_err_notify; err_notify_ud=p_ud;}
+
+	static void notify_dependency_error(const String& p_path,const String& p_dependency,const String& p_type) { if (dep_err_notify) dep_err_notify(dep_err_notify_ud,p_path,p_dependency,p_type); }
+	static void set_dependency_error_notify_func(void* p_ud,DependencyErrorNotify p_err_notify) { dep_err_notify=p_err_notify; dep_err_notify_ud=p_ud;}
+
+
 	static void set_abort_on_missing_resources(bool p_abort) { abort_on_missing_resource=p_abort; }
 	static bool get_abort_on_missing_resources() { return abort_on_missing_resource; }
 };

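The dependency-error channel mirrors the existing load-error notify pair: a host (typically the editor) installs one static callback and receives the loading file, the missing dependency, and its expected type. A hypothetical hookup; only the typedef and setter come from this header, the handler name is illustrative:

    static void _on_dep_error(void *p_ud, const String &p_loading,
            const String &p_which, const String &p_type) {

        print_line(String("while loading ")+p_loading+": missing dependency "+p_which+" ("+p_type+")");
    }

    // during initialization:
    ResourceLoader::set_dependency_error_notify_func(NULL, _on_dep_error);
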
+ 1 - 1
core/io/resource_saver.h

@@ -46,7 +46,7 @@ public:
 	virtual Error save(const String &p_path,const RES& p_resource,uint32_t p_flags=0)=0;
 	virtual bool recognize(const RES& p_resource) const=0;
 	virtual void get_recognized_extensions(const RES& p_resource,List<String> *p_extensions) const=0;
-		
+
 	virtual ~ResourceFormatSaver() {}
 };
 

+ 8 - 1
core/io/translation_loader_po.cpp

@@ -30,7 +30,10 @@
 #include "os/file_access.h"
 #include "translation.h"
 
-RES TranslationLoaderPO::load(const String &p_path,const String& p_original_path) {
+RES TranslationLoaderPO::load(const String &p_path, const String& p_original_path, Error *r_error) {
+
+	if (r_error)
+		*r_error=ERR_CANT_OPEN;
 
 	FileAccess *f=FileAccess::open(p_path,FileAccess::READ);
 	ERR_FAIL_COND_V(!f,RES());
@@ -49,6 +52,8 @@ RES TranslationLoaderPO::load(const String &p_path,const String& p_original_path
 	String msg_id;
 	String msg_str;
 	String config;
+	if (r_error)
+		*r_error=ERR_FILE_CORRUPT;
 
 	Ref<Translation> translation = Ref<Translation>( memnew( Translation ));
 	int line = 1;
@@ -174,6 +179,8 @@ RES TranslationLoaderPO::load(const String &p_path,const String& p_original_path
 		}
 	}
 
+	if (r_error)
+		*r_error=OK;
 
 	return translation;
 

+ 1 - 1
core/io/translation_loader_po.h

@@ -34,7 +34,7 @@
 class TranslationLoaderPO : public ResourceFormatLoader {
 public:
 
-	virtual RES load(const String &p_path,const String& p_original_path="");
+	virtual RES load(const String &p_path,const String& p_original_path="",Error *r_error=NULL);
 	virtual void get_recognized_extensions(List<String> *p_extensions) const;
 	virtual bool handles_type(const String& p_type) const;
 	virtual String get_resource_type(const String &p_path) const;

+ 1 - 1
core/io/unzip.c

@@ -1788,7 +1788,7 @@ extern int ZEXPORT unzReadCurrentFile  (unzFile file, voidp buf, unsigned len)
         return UNZ_PARAMERROR;
 
 
-    if ((pfile_in_zip_read_info->read_buffer == NULL))
+    if (pfile_in_zip_read_info->read_buffer==NULL)
         return UNZ_END_OF_LIST_OF_FILE;
     if (len==0)
         return 0;

+ 2 - 2
core/io/zip.c

@@ -1114,9 +1114,9 @@ extern int ZEXPORT zipOpenNewFileInZip4_64 (zipFile file, const char* filename,
     zi->ci.flag = flagBase;
     if ((level==8) || (level==9))
       zi->ci.flag |= 2;
-    if ((level==2))
+    if (level==2)
       zi->ci.flag |= 4;
-    if ((level==1))
+    if (level==1)
       zi->ci.flag |= 6;
     if (password != NULL)
       zi->ci.flag |= 1;

+ 2 - 0
core/io/zip.h

@@ -39,6 +39,8 @@
 #ifndef _zip12_H
 #define _zip12_H
 
+#include <stdlib.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif

+ 1 - 0
core/io/zip_io.h

@@ -34,6 +34,7 @@
 #include "os/file_access.h"
 #include "os/copymem.h"
 
+
 static void* zipio_open(void* data, const char* p_fname, int mode) {
 
 	FileAccess *&f = *(FileAccess**)data;

+ 8 - 2
core/list.h

@@ -518,10 +518,16 @@ public:
 
 		if (value->prev_ptr) {
 			value->prev_ptr->next_ptr = value->next_ptr;
-		};
+		}
+		else {
+			_data->first = value->next_ptr;
+		}
 		if (value->next_ptr) {
 			value->next_ptr->prev_ptr = value->prev_ptr;
-		};
+		}
+		else {
+			_data->last = value->prev_ptr;
+		}
 
 		value->next_ptr = where;
 		if (!where) {

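The two new else branches fix a real linked-list bug: when the element being moved is currently the head (no prev) or the tail (no next), the list's first/last pointers must be redirected, or they keep pointing at a node that has been relocated. The unlink invariant in isolation:

    // Generic unlink sketch; first and last are the list's head and tail
    // pointers, node is the element being detached.
    template <class T>
    void unlink(T *&first, T *&last, T *node) {

        if (node->prev_ptr)
            node->prev_ptr->next_ptr = node->next_ptr;
        else
            first = node->next_ptr; // node was the head
        if (node->next_ptr)
            node->next_ptr->prev_ptr = node->prev_ptr;
        else
            last = node->prev_ptr; // node was the tail
    }
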
+ 2 - 2
core/math/math_funcs.h

@@ -79,9 +79,9 @@ public:
 		return Math::log( p_linear ) * 8.6858896380650365530225783783321;
 	}
 
-	static inline double db2linear(double p_linear) {
+	static inline double db2linear(double p_db) {
 
-		return Math::exp( p_linear * 0.11512925464970228420089957273422 );
+		return Math::exp( p_db * 0.11512925464970228420089957273422 );
 	}
 
 	static bool is_nan(double p_val);

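The rename only fixes a misleading parameter name; the math is unchanged: linear2db(x) = 20*log10(x), computed as ln(x) * 20/ln(10) (the 8.6858896... constant), and db2linear(db) = 10^(db/20), computed as e^(db * ln(10)/20) (the 0.1151292... constant). A quick round-trip check, using <cmath> in place of the engine's Math wrappers:

    #include <cassert>
    #include <cmath>

    int main() {

        double db = std::log(0.5) * 8.6858896380650365530225783783321;  // linear2db(0.5), about -6.02 dB
        double lin = std::exp(db * 0.11512925464970228420089957273422); // db2linear back
        assert(std::fabs(lin - 0.5) < 1e-12);
        return 0;
    }
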
+ 21 - 9
core/math/vector3.h

@@ -40,11 +40,11 @@ struct Vector3 {
 	enum Axis {
 		AXIS_X,
 		AXIS_Y,
-		AXIS_Z,	
+		AXIS_Z,
 	};
 
 	union {
-	
+
 #ifdef USE_QUAD_VECTORS
 
 		struct {
@@ -52,7 +52,7 @@ struct Vector3 {
 			real_t y;
 			real_t z;
 			real_t _unused;
-		};		
+		};
 		real_t coord[4];
 #else
 
@@ -61,18 +61,18 @@ struct Vector3 {
 			real_t y;
 			real_t z;
 		};
-		
+
 		real_t coord[3];
 #endif
 	};
 
 	_FORCE_INLINE_ const real_t& operator[](int p_axis) const {
-	
+
 		return coord[p_axis];
 	}
 
 	_FORCE_INLINE_ real_t& operator[](int p_axis) {
-	
+
 		return coord[p_axis];
 	}
 
@@ -84,7 +84,7 @@ struct Vector3 {
 
         _FORCE_INLINE_ real_t length() const;
         _FORCE_INLINE_ real_t length_squared() const;
-        
+
         _FORCE_INLINE_ void normalize();
         _FORCE_INLINE_ Vector3 normalized() const;
         _FORCE_INLINE_ Vector3 inverse() const;
@@ -107,6 +107,8 @@ struct Vector3 {
         _FORCE_INLINE_ real_t dot(const Vector3& p_b) const;
 
         _FORCE_INLINE_ Vector3 abs() const;
+		_FORCE_INLINE_ Vector3 floor() const;
+		_FORCE_INLINE_ Vector3 ceil() const;
 
         _FORCE_INLINE_ real_t distance_to(const Vector3& p_b) const;
         _FORCE_INLINE_ real_t distance_squared_to(const Vector3& p_b) const;
@@ -172,7 +174,17 @@ real_t Vector3::dot(const Vector3& p_b) const {
 Vector3 Vector3::abs() const {
 
 	return Vector3( Math::abs(x), Math::abs(y), Math::abs(z) );
-}	
+}
+
+Vector3 Vector3::floor() const {
+
+	return Vector3( Math::floor(x), Math::floor(y), Math::floor(z) );
+}
+
+Vector3 Vector3::ceil() const {
+
+	return Vector3( Math::ceil(x), Math::ceil(y), Math::ceil(z) );
+}
 
 Vector3 Vector3::linear_interpolate(const Vector3& p_b,float p_t) const {
 
@@ -301,7 +313,7 @@ bool Vector3::operator<(const Vector3& p_v) const {
 			return y<p_v.y;
 	} else
 		return x<p_v.x;
-	
+
 }
 
 bool Vector3::operator<=(const Vector3& p_v) const {

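floor() and ceil() round each component independently, which is the building block for grid snapping. A typical use inside any engine-side function, with an arbitrary example cell size:

    Vector3 p(1.7, -0.2, 3.5);
    real_t cell = 2.0;
    Vector3 cell_origin = (p / cell).floor() * cell; // (0, -2, 2)
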
+ 32 - 32
core/object.h

@@ -49,7 +49,7 @@
 
 enum PropertyHint {
 	PROPERTY_HINT_NONE, ///< no hint provided.
-	PROPERTY_HINT_RANGE, ///< hint_text = "min,max,step"
+	PROPERTY_HINT_RANGE, ///< hint_text = "min,max,step[,slider]"; the slider token is optional
 	PROPERTY_HINT_EXP_RANGE, ///< hint_text = "min,max,step", exponential edit
 	PROPERTY_HINT_ENUM, ///< hint_text= "val1,val2,val3,etc"
 	PROPERTY_HINT_EXP_EASING, /// exponential easing function (Math::ease)
@@ -58,12 +58,12 @@ enum PropertyHint {
 	PROPERTY_HINT_KEY_ACCEL, ///< hint_text= "length" (as integer)
 	PROPERTY_HINT_FLAGS, ///< hint_text= "flag1,flag2,etc" (as bit flags)
 	PROPERTY_HINT_ALL_FLAGS,
-	PROPERTY_HINT_FILE, ///< a file path must be passed, hint_text (optionally) is a filter "*.png,*.wav,*.doc," 
+	PROPERTY_HINT_FILE, ///< a file path must be passed, hint_text (optionally) is a filter "*.png,*.wav,*.doc,"
 	PROPERTY_HINT_DIR, ///< a directory path must be passed
 	PROPERTY_HINT_GLOBAL_FILE, ///< a file path must be passed, hint_text (optionally) is a filter "*.png,*.wav,*.doc,"
 	PROPERTY_HINT_GLOBAL_DIR, ///< a directory path must be passed
 	PROPERTY_HINT_RESOURCE_TYPE, ///< a resource object type
-	PROPERTY_HINT_MULTILINE_TEXT, ///< used for string properties that can contain multiple lines	
+	PROPERTY_HINT_MULTILINE_TEXT, ///< used for string properties that can contain multiple lines
 	PROPERTY_HINT_COLOR_NO_ALPHA, ///< used for ignoring alpha component when editing a color
 	PROPERTY_HINT_IMAGE_COMPRESS_LOSSY,
 	PROPERTY_HINT_IMAGE_COMPRESS_LOSSLESS,
@@ -71,7 +71,7 @@ enum PropertyHint {
 };
 
 enum PropertyUsageFlags {
-	
+
 	PROPERTY_USAGE_STORAGE=1,
 	PROPERTY_USAGE_EDITOR=2,
 	PROPERTY_USAGE_NETWORK=4,
@@ -102,15 +102,15 @@ enum PropertyUsageFlags {
 #define ADD_PROPERTYINO( m_property, m_setter, m_getter, m_index ) ObjectTypeDB::add_property( get_type_static(), (m_property).added_usage(PROPERTY_USAGE_STORE_IF_NONONE), m_setter, m_getter, m_index )
 
 struct PropertyInfo {
-	
-	Variant::Type type;	
+
+	Variant::Type type;
 	String name;
 	PropertyHint hint;
-	String hint_string;	
+	String hint_string;
 	uint32_t usage;
 
 	_FORCE_INLINE_ PropertyInfo added_usage(int p_fl) const { PropertyInfo pi=*this; pi.usage|=p_fl; return pi; }
-	
+
 	PropertyInfo() { type=Variant::NIL; hint=PROPERTY_HINT_NONE; usage = PROPERTY_USAGE_DEFAULT; }
 	PropertyInfo( Variant::Type p_type, const String p_name, PropertyHint p_hint=PROPERTY_HINT_NONE, const String& p_hint_string="",uint32_t p_usage=PROPERTY_USAGE_DEFAULT) {
 		type=p_type; name=p_name; hint=p_hint; hint_string=p_hint_string; usage=p_usage;
@@ -125,23 +125,23 @@ struct PropertyInfo {
 Array convert_property_list(const List<PropertyInfo> * p_list);
 
 struct MethodInfo {
-	
+
 	String name;
 	List<PropertyInfo> arguments;
 	Vector<Variant> default_arguments;
 	PropertyInfo return_val;
 	uint32_t flags;
 	int id;
-	
+
 	inline bool  operator<(const MethodInfo& p_method) const { return id==p_method.id?(name < p_method.name):(id<p_method.id); }
-	
+
 	MethodInfo();
 	MethodInfo(const String& p_name);
 	MethodInfo(const String& p_name, const PropertyInfo& p_param1);
 	MethodInfo(const String& p_name, const PropertyInfo& p_param1,const PropertyInfo& p_param2);
 	MethodInfo(const String& p_name, const PropertyInfo& p_param1,const PropertyInfo& p_param2,const PropertyInfo& p_param3);
 	MethodInfo(const String& p_name, const PropertyInfo& p_param1,const PropertyInfo& p_param2,const PropertyInfo& p_param3,const PropertyInfo& p_param4);
-	MethodInfo(const String& p_name, const PropertyInfo& p_param1,const PropertyInfo& p_param2,const PropertyInfo& p_param3,const PropertyInfo& p_param4,const PropertyInfo& p_param5);															
+	MethodInfo(const String& p_name, const PropertyInfo& p_param1,const PropertyInfo& p_param2,const PropertyInfo& p_param3,const PropertyInfo& p_param4,const PropertyInfo& p_param5);
 	MethodInfo(Variant::Type ret);
 	MethodInfo(Variant::Type ret,const String& p_name);
 	MethodInfo(Variant::Type ret,const String& p_name, const PropertyInfo& p_param1);
@@ -158,7 +158,7 @@ struct MethodInfo {
 //return NULL;
 
 /*
-   the following is an uncomprehensible blob of hacks and workarounds to compensate for many of the fallencies in C++. As a plus, this macro pretty much alone defines the object model. 
+   the following is an incomprehensible blob of hacks and workarounds to compensate for many of the fallacies in C++. As a plus, this macro pretty much alone defines the object model.
 */
 
 #define REVERSE_GET_PROPERTY_LIST \
@@ -314,7 +314,7 @@ private:
 class ScriptInstance;
 typedef uint32_t ObjectID;
 
-class Object {		
+class Object {
 public:
 
 	enum ConnectFlags {
@@ -404,7 +404,7 @@ friend void postinitialize_handler(Object*);
 
 	void property_list_changed_notify();
 
-protected:	
+protected:
 
 	virtual bool _use_builtin_script() const { return false; }
 	virtual void _initialize_typev() { initialize_type(); }
@@ -412,14 +412,14 @@ protected:
 	virtual bool _getv(const StringName& p_name,Variant &r_property) const { return false; };
 	virtual void _get_property_listv(List<PropertyInfo> *p_list,bool p_reversed) const {};
 	virtual void _notificationv(int p_notification,bool p_reversed) {};
-	
+
 	static String _get_category() { return ""; }
 	static void _bind_methods();
 	bool _set(const StringName& p_name,const Variant &p_property) { return false; };
 	bool _get(const StringName& p_name,Variant &r_property) const { return false;  };
 	void _get_property_list(List<PropertyInfo> *p_list) const {};
 	void _notification(int p_notification) {};
-	
+
 	_FORCE_INLINE_ static void (*_get_bind_methods())() {
 		return &Object::_bind_methods;
 	}
@@ -431,13 +431,13 @@ protected:
 	}
 	_FORCE_INLINE_ void (Object::* (_get_get_property_list() const))(List<PropertyInfo> *p_list) const{
 			return &Object::_get_property_list;
-	}	
+	}
 	_FORCE_INLINE_ void (Object::* (_get_notification() const))(int){
 			return &Object::_notification;
-	}	
+	}
 	static void get_valid_parents_static(List<String> *p_parents);
 	static void _get_valid_parents_static(List<String> *p_parents);
-		
+
 
 	void cancel_delete();
 
@@ -485,7 +485,7 @@ public:
 
 	void add_change_receptor( Object *p_receptor );
 	void remove_change_receptor( Object *p_receptor );
-	
+
 	template<class T>
 	T *cast_to() {
 
@@ -500,7 +500,7 @@ public:
 			return NULL;
 #endif
 	}
-		
+
 	template<class T>
 	const T *cast_to() const {
 
@@ -517,11 +517,11 @@ public:
 	}
 
 	enum {
-		
+
 		NOTIFICATION_POSTINITIALIZE=0,
 		NOTIFICATION_PREDELETE=1
 	};
-	
+
 	/* TYPE API */
 	static void get_inheritance_list_static(List<String>* p_inheritance_list) {  p_inheritance_list->push_back("Object"); }
 
@@ -545,7 +545,7 @@ public:
 			return *_type_ptr;
 		}
 	}
-	
+
 	/* IAPI */
 //	void set(const String& p_name, const Variant& p_value);
 //	Variant get(const String& p_name) const;
@@ -554,7 +554,7 @@ public:
 	Variant get(const StringName& p_name, bool *r_valid=NULL) const;
 
 	void get_property_list(List<PropertyInfo> *p_list,bool p_reversed=false) const;
-	
+
 	bool has_method(const StringName& p_method) const;
 	void get_method_list(List<MethodInfo> *p_list) const;
 	Variant callv(const StringName& p_method,const Array& p_args);
@@ -564,14 +564,14 @@ public:
 	Variant call(const StringName& p_name, VARIANT_ARG_LIST); // C++ helper
 	void call_multilevel(const StringName& p_name, VARIANT_ARG_LIST); // C++ helper
 
-	void notification(int p_notification,bool p_reversed=false);	
+	void notification(int p_notification,bool p_reversed=false);
 
 	//used mainly by script, get and set all INCLUDING string
 	virtual Variant getvar(const Variant& p_key, bool *r_valid=NULL) const;
 	virtual void setvar(const Variant& p_key, const Variant& p_value,bool *r_valid=NULL);
 
 	/* SCRIPT */
-	
+
 	void set_script(const RefPtr& p_script);
 	RefPtr get_script() const;
 
@@ -614,14 +614,14 @@ public:
 	StringName tr(const StringName& p_message) const; //translate message (alternative)
 
 	bool _is_queued_for_deletion; // set to true by SceneTree::queue_delete()
-	bool is_queued_for_deletion() const; 
+	bool is_queued_for_deletion() const;
 
 	_FORCE_INLINE_ void set_message_translation(bool p_enable) { _can_translate=p_enable; }
 	_FORCE_INLINE_ bool can_translate_messages() const { return _can_translate; }
 
 	void clear_internal_resource_paths();
 
-	Object();	
+	Object();
 	virtual ~Object();
 
 };
@@ -649,13 +649,13 @@ class ObjectDB {
 	static HashMap<Object*,ObjectID,ObjectPtrHash> instance_checks;
 
 	static uint32_t instance_counter;
-friend class Object;	
+friend class Object;
 friend void unregister_core_types();
 
 	static void cleanup();
 	static uint32_t add_instance(Object *p_object);
 	static void remove_instance(Object *p_object);
-public:	
+public:
 
 	typedef void (*DebugFunc)(Object *p_obj);
 

+ 19 - 0
core/object_type_db.cpp

@@ -746,6 +746,25 @@ bool ObjectTypeDB::has_method(StringName p_type,StringName p_method,bool p_no_in
 
 }
 
+bool ObjectTypeDB::get_setter_and_type_for_property(const StringName& p_class, const StringName& p_prop, StringName& r_class, StringName& r_setter) {
+
+	TypeInfo *type=types.getptr(p_class);
+	TypeInfo *check=type;
+	while(check) {
+
+		if (check->property_setget.has(p_prop)) {
+			r_class=check->name;
+			r_setter=check->property_setget[p_prop].setter;
+			return true;
+		}
+
+		check=check->inherits_ptr;
+	}
+
+	return false;
+
+}
+
 #ifdef DEBUG_METHODS_ENABLED
 MethodBind* ObjectTypeDB::bind_methodfi(uint32_t p_flags, MethodBind *p_bind , const MethodDefinition &method_name, const Variant **p_defs, int p_defcount) {
 	StringName mdname=method_name.name;

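The new get_setter_and_type_for_property() walks the inheritance chain and reports which type registered the property along with the name of its setter. A minimal caller sketch; the wrapper function and its arguments are illustrative, not part of this patch:

#include "object_type_db.h"

// Sketch: resolve a property's registered setter and dispatch through it by name.
static bool set_via_declared_setter(Object *p_obj, const StringName &p_class, const StringName &p_prop, const Variant &p_value) {

	StringName owner_class, setter;
	if (!ObjectTypeDB::get_setter_and_type_for_property(p_class, p_prop, owner_class, setter))
		return false; // property is not registered anywhere in the chain
	if (setter == StringName())
		return false; // registered, but exposes no setter
	p_obj->call(setter, p_value);
	return true;
}
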
+ 3 - 1
core/object_type_db.h

@@ -475,7 +475,9 @@ public:
 	static void get_integer_constant_list(const StringName& p_type, List<String> *p_constants, bool p_no_inheritance=false);
 	static int get_integer_constant(const StringName& p_type, const StringName &p_name, bool *p_success=NULL);
 	static StringName get_category(const StringName& p_node);
-	
+
+	static bool get_setter_and_type_for_property(const StringName& p_class, const StringName& p_prop, StringName& r_class, StringName& r_setter);
+
 	static void set_type_enabled(StringName p_type,bool p_enable);
 	static bool is_type_enabled(StringName p_type);
 

+ 1 - 263
core/os/input.cpp

@@ -64,6 +64,7 @@ void Input::_bind_methods() {
 	ObjectTypeDB::bind_method(_MD("warp_mouse_pos","to"),&Input::warp_mouse_pos);
 	ObjectTypeDB::bind_method(_MD("action_press"),&Input::action_press);
 	ObjectTypeDB::bind_method(_MD("action_release"),&Input::action_release);
+	ObjectTypeDB::bind_method(_MD("set_custom_mouse_cursor","image:Texture","hotspot"),&Input::set_custom_mouse_cursor,DEFVAL(Vector2()));
 
 	BIND_CONSTANT( MOUSE_MODE_VISIBLE );
 	BIND_CONSTANT( MOUSE_MODE_HIDDEN );
@@ -104,266 +105,3 @@ Input::Input() {
 
 //////////////////////////////////////////////////////////
 
-
-void InputDefault::SpeedTrack::update(const Vector2& p_delta_p) {
-
-	uint64_t tick = OS::get_singleton()->get_ticks_usec();
-	uint32_t tdiff = tick-last_tick;
-	float delta_t = tdiff / 1000000.0;
-	last_tick=tick;
-
-
-	accum+=p_delta_p;
-	accum_t+=delta_t;
-
-	if (accum_t>max_ref_frame*10)
-		accum_t=max_ref_frame*10;
-
-	while( accum_t>=min_ref_frame ) {
-
-		float slice_t = min_ref_frame / accum_t;
-		Vector2 slice = accum*slice_t;
-		accum=accum-slice;
-		accum_t-=min_ref_frame;
-
-		speed=(slice/min_ref_frame).linear_interpolate(speed,min_ref_frame/max_ref_frame);
-	}
-
-
-
-}
-
-void InputDefault::SpeedTrack::reset() {
-	last_tick = OS::get_singleton()->get_ticks_usec();
-	speed=Vector2();
-	accum_t=0;
-}
-
-InputDefault::SpeedTrack::SpeedTrack() {
-
-	 min_ref_frame=0.1;
-	 max_ref_frame=0.3;
-	 reset();
-}
-
-bool InputDefault::is_key_pressed(int p_scancode) {
-
-	_THREAD_SAFE_METHOD_
-	return keys_pressed.has(p_scancode);
-}
-
-bool InputDefault::is_mouse_button_pressed(int p_button) {
-
-	_THREAD_SAFE_METHOD_
-	return (mouse_button_mask&(1<<p_button))!=0;
-}
-
-
-static int _combine_device(int p_value,int p_device) {
-
-	return p_value|(p_device<<20);
-}
-
-bool InputDefault::is_joy_button_pressed(int p_device, int p_button) {
-
-	_THREAD_SAFE_METHOD_
-	return joy_buttons_pressed.has(_combine_device(p_button,p_device));
-}
-
-bool InputDefault::is_action_pressed(const StringName& p_action) {
-
-	if (custom_action_press.has(p_action))
-		return true; //simpler
-
-	const List<InputEvent> *alist = InputMap::get_singleton()->get_action_list(p_action);
-	if (!alist)
-		return NULL;
-
-
-	for (const List<InputEvent>::Element *E=alist->front();E;E=E->next()) {
-
-
-		int device=E->get().device;
-
-		switch(E->get().type) {
-
-			case InputEvent::KEY: {
-
-				const InputEventKey &iek=E->get().key;
-				if ((keys_pressed.has(iek.scancode)))
-					return true;
-			} break;
-			case InputEvent::MOUSE_BUTTON: {
-
-				const InputEventMouseButton &iemb=E->get().mouse_button;
-				 if(mouse_button_mask&(1<<iemb.button_index))
-					 return true;
-			} break;
-			case InputEvent::JOYSTICK_BUTTON: {
-
-				const InputEventJoystickButton &iejb=E->get().joy_button;
-				int c = _combine_device(iejb.button_index,device);
-				if (joy_buttons_pressed.has(c))
-					return true;
-			} break;
-		}
-	}
-
-	return false;
-}
-
-float InputDefault::get_joy_axis(int p_device,int p_axis) {
-
-	_THREAD_SAFE_METHOD_
-	int c = _combine_device(p_axis,p_device);
-	if (joy_axis.has(c)) {
-		return joy_axis[c];
-	} else {
-		return 0;
-	}
-}
-
-String InputDefault::get_joy_name(int p_idx) {
-
-	_THREAD_SAFE_METHOD_
-	return joy_names[p_idx];
-};
-
-void InputDefault::joy_connection_changed(int p_idx, bool p_connected, String p_name) {
-
-	_THREAD_SAFE_METHOD_
-	joy_names[p_idx] = p_connected ? p_name : "";
-
-	emit_signal("joy_connection_changed", p_idx, p_connected);
-};
-
-Vector3 InputDefault::get_accelerometer() {
-
-	_THREAD_SAFE_METHOD_
-	return accelerometer;
-}
-
-void InputDefault::parse_input_event(const InputEvent& p_event) {
-
-	_THREAD_SAFE_METHOD_
-	switch(p_event.type) {
-
-		case InputEvent::KEY: {
-
-			if (p_event.key.echo)
-				break;
-			if (p_event.key.scancode==0)
-				break;
-
-		//	print_line(p_event);
-
-			if (p_event.key.pressed)
-				keys_pressed.insert(p_event.key.scancode);
-			else
-				keys_pressed.erase(p_event.key.scancode);
-		} break;
-		case InputEvent::MOUSE_BUTTON: {
-
-			if (p_event.mouse_button.doubleclick)
-				break;
-
-			if (p_event.mouse_button.pressed)
-				mouse_button_mask|=(1<<p_event.mouse_button.button_index);
-			else
-				mouse_button_mask&=~(1<<p_event.mouse_button.button_index);
-		} break;
-		case InputEvent::JOYSTICK_BUTTON: {
-
-			int c = _combine_device(p_event.joy_button.button_index,p_event.device);
-
-			if (p_event.joy_button.pressed)
-				joy_buttons_pressed.insert(c);
-			else
-				joy_buttons_pressed.erase(c);
-		} break;
-		case InputEvent::JOYSTICK_MOTION: {
-			set_joy_axis(p_event.device, p_event.joy_motion.axis, p_event.joy_motion.axis_value);
-		} break;
-
-	}
-
-	if (main_loop)
-		main_loop->input_event(p_event);
-
-}
-
-void InputDefault::set_joy_axis(int p_device,int p_axis,float p_value) {
-
-	_THREAD_SAFE_METHOD_
-	int c = _combine_device(p_axis,p_device);
-	joy_axis[c]=p_value;
-}
-
-void InputDefault::set_accelerometer(const Vector3& p_accel) {
-
-	_THREAD_SAFE_METHOD_
-
-	accelerometer=p_accel;
-
-}
-
-void InputDefault::set_main_loop(MainLoop *p_main_loop) {
-	main_loop=p_main_loop;
-
-}
-
-void InputDefault::set_mouse_pos(const Point2& p_posf) {
-
-	mouse_speed_track.update(p_posf-mouse_pos);
-	mouse_pos=p_posf;
-}
-
-Point2 InputDefault::get_mouse_pos() const {
-
-	return mouse_pos;
-}
-Point2 InputDefault::get_mouse_speed() const {
-
-	return mouse_speed_track.speed;
-}
-
-int InputDefault::get_mouse_button_mask() const {
-
-	return OS::get_singleton()->get_mouse_button_state();
-}
-
-void InputDefault::warp_mouse_pos(const Vector2& p_to) {
-
-	OS::get_singleton()->warp_mouse_pos(p_to);
-}
-
-
-void InputDefault::iteration(float p_step) {
-
-
-}
-
-void InputDefault::action_press(const StringName& p_action) {
-
-	if (custom_action_press.has(p_action)) {
-
-		custom_action_press[p_action]++;
-	} else {
-		custom_action_press[p_action]=1;
-	}
-}
-
-void InputDefault::action_release(const StringName& p_action){
-
-	ERR_FAIL_COND(!custom_action_press.has(p_action));
-	custom_action_press[p_action]--;
-	if (custom_action_press[p_action]==0) {
-		custom_action_press.erase(p_action);
-	}
-}
-
-InputDefault::InputDefault() {
-
-	mouse_button_mask=0;
-	main_loop=NULL;
-}

+ 4 - 66
core/os/input.h

@@ -78,77 +78,15 @@ public:
 
 	void get_argument_options(const StringName& p_function,int p_idx,List<String>*r_options) const;
 
+	virtual bool is_emulating_touchscreen() const=0;
+
+	virtual void set_custom_mouse_cursor(const RES& p_cursor,const Vector2& p_hotspot=Vector2())=0;
+	virtual void set_mouse_in_window(bool p_in_window)=0;
 
 	Input();
 };
 
 VARIANT_ENUM_CAST(Input::MouseMode);
 
-class InputDefault : public Input {
-
-	OBJ_TYPE( InputDefault, Input );
-	_THREAD_SAFE_CLASS_
-
-	int mouse_button_mask;
-	Set<int> keys_pressed;
-	Set<int> joy_buttons_pressed;
-	Map<int,float> joy_axis;
-	Map<StringName,int> custom_action_press;
-	Map<int, String> joy_names;
-	Vector3 accelerometer;
-	Vector2 mouse_pos;
-	MainLoop *main_loop;
-
-	struct SpeedTrack {
-
-		uint64_t last_tick;
-		Vector2 speed;
-		Vector2 accum;
-		float accum_t;
-		float min_ref_frame;
-		float max_ref_frame;
-
-		void update(const Vector2& p_delta_p);
-		void reset();
-		SpeedTrack();
-	};
-
-	SpeedTrack mouse_speed_track;
-
-public:
-
-	virtual bool is_key_pressed(int p_scancode);
-	virtual bool is_mouse_button_pressed(int p_button);
-	virtual bool is_joy_button_pressed(int p_device, int p_button);
-	virtual bool is_action_pressed(const StringName& p_action);
-
-	virtual float get_joy_axis(int p_device,int p_axis);
-	String get_joy_name(int p_idx);
-	void joy_connection_changed(int p_idx, bool p_connected, String p_name);
-
-	virtual Vector3 get_accelerometer();
-
-	virtual Point2 get_mouse_pos() const;
-	virtual Point2 get_mouse_speed() const;
-	virtual int get_mouse_button_mask() const;
-
-	virtual void warp_mouse_pos(const Vector2& p_to);
-
-
-	void parse_input_event(const InputEvent& p_event);
-	void set_accelerometer(const Vector3& p_accel);
-	void set_joy_axis(int p_device,int p_axis,float p_value);
-
-	void set_main_loop(MainLoop *main_loop);
-	void set_mouse_pos(const Point2& p_posf);
-
-	void action_press(const StringName& p_action);
-	void action_release(const StringName& p_action);
-
-	void iteration(float p_step);
-
-	InputDefault();
-
-};
 
 #endif // INPUT_H

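Input is now a pure interface in core: is_emulating_touchscreen(), set_custom_mouse_cursor() and set_mouse_in_window() are pure virtuals, and the InputDefault implementation moves out of core entirely. Using the newly bound cursor API from C++ might look like the sketch below; the texture path and hotspot are illustrative:

#include "os/input.h"
#include "io/resource_loader.h"

void apply_custom_cursor() {

	RES cursor = ResourceLoader::load("res://cursor.png"); // illustrative path
	if (cursor.is_valid() && Input::get_singleton())
		Input::get_singleton()->set_custom_mouse_cursor(cursor, Vector2(4, 4)); // hotspot in cursor pixels
}
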
+ 2 - 1
core/os/main_loop.cpp

@@ -45,7 +45,8 @@ void MainLoop::_bind_methods() {
 	BIND_VMETHOD( MethodInfo("_idle",PropertyInfo(Variant::REAL,"delta")) );
 	BIND_VMETHOD( MethodInfo("_finalize") );
 
-
+	BIND_CONSTANT(NOTIFICATION_WM_MOUSE_ENTER);
+	BIND_CONSTANT(NOTIFICATION_WM_MOUSE_EXIT);
 	BIND_CONSTANT(NOTIFICATION_WM_FOCUS_IN);
 	BIND_CONSTANT(NOTIFICATION_WM_FOCUS_OUT);
 	BIND_CONSTANT(NOTIFICATION_WM_QUIT_REQUEST);

+ 2 - 0
core/os/main_loop.h

@@ -47,6 +47,8 @@ protected:
 public:	
 
 	enum {
+		NOTIFICATION_WM_MOUSE_ENTER = 3,
+		NOTIFICATION_WM_MOUSE_EXIT = 4,
 		NOTIFICATION_WM_FOCUS_IN = 5,
 		NOTIFICATION_WM_FOCUS_OUT = 6,
 		NOTIFICATION_WM_QUIT_REQUEST = 7,

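NOTIFICATION_WM_MOUSE_ENTER and NOTIFICATION_WM_MOUSE_EXIT take the previously unused values 3 and 4, next to the existing focus notifications, and are exposed to scripts via BIND_CONSTANT. A MainLoop subclass could react to them roughly as follows; the subclass is hypothetical:

#include "os/main_loop.h"
#include "print_string.h"

class DemoLoop : public MainLoop { // hypothetical subclass, for illustration only

	OBJ_TYPE(DemoLoop, MainLoop);

protected:
	void _notification(int p_what) {

		if (p_what == NOTIFICATION_WM_MOUSE_ENTER)
			print_line("mouse entered the window");
		else if (p_what == NOTIFICATION_WM_MOUSE_EXIT)
			print_line("mouse left the window");
	}
};
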
+ 6 - 3
core/os/os.cpp

@@ -31,7 +31,7 @@
 #include <stdarg.h>
 #include "dir_access.h"
 #include "globals.h"
-
+#include "input.h"
 
 OS* OS::singleton=NULL;
 
@@ -50,7 +50,9 @@ uint64_t OS::get_unix_time() const {
 
 	return 0;
 };
-
+uint64_t OS::get_system_time_msec() const {
+	return 0;
+}
 void OS::debug_break() {
 
 	// something
@@ -361,7 +363,7 @@ Error OS::set_cwd(const String& p_cwd) {
 bool OS::has_touchscreen_ui_hint() const {
 
 	//return false;
-	return GLOBAL_DEF("display/emulate_touchscreen",false);
+	return Input::get_singleton() && Input::get_singleton()->is_emulating_touchscreen();
 }
 
 int OS::get_free_static_memory() const {
@@ -514,6 +516,7 @@ OS::OS() {
 	_target_fps=0;
 	_render_thread_mode=RENDER_THREAD_SAFE;
 	_time_scale=1.0;
+	_pixel_snap=false;
 	Math::seed(1234567);
 }
 

+ 4 - 2
core/os/os.h

@@ -58,6 +58,7 @@ class OS {
 	float _fps;
 	int _target_fps;
 	float _time_scale;
+	bool _pixel_snap;
 
 	char *last_error;
 
@@ -155,7 +156,7 @@ public:
 	virtual int get_screen_count() const{ return 1; }
 	virtual int get_current_screen() const { return 0; }
 	virtual void set_current_screen(int p_screen) { }
-	virtual Point2 get_screen_position(int p_screen=0)  { return Point2(); }
+	virtual Point2 get_screen_position(int p_screen=0) const { return Point2(); }
 	virtual Size2 get_screen_size(int p_screen=0) const { return get_window_size(); }
 	virtual Point2 get_window_position() const { return Vector2(); }
 	virtual void set_window_position(const Point2& p_position) {}
@@ -254,6 +255,7 @@ public:
 	virtual Time get_time(bool local=false) const=0;
 	virtual TimeZoneInfo get_time_zone_info() const=0;
 	virtual uint64_t get_unix_time() const;
+	virtual uint64_t get_system_time_msec() const;
 
 	virtual void delay_usec(uint32_t p_usec) const=0; 
 	virtual uint64_t get_ticks_usec() const=0;
@@ -392,7 +394,7 @@ public:
 	void set_time_scale(float p_scale);
 	float get_time_scale() const;
 
-
+	_FORCE_INLINE_ bool get_use_pixel_snap() const { return _pixel_snap; }
 
 	OS();	
 	virtual ~OS();

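get_system_time_msec() gets a base-class default of 0 so existing ports keep compiling; overriding it is opt-in per platform. On a POSIX target it could be implemented along these lines (OS_Unix is used only as an example target; this is a sketch, not part of the patch):

#include <sys/time.h>

uint64_t OS_Unix::get_system_time_msec() const {

	struct timeval tv;
	gettimeofday(&tv, NULL);
	// wall-clock time since the Unix epoch, in milliseconds
	return uint64_t(tv.tv_sec) * 1000 + uint64_t(tv.tv_usec) / 1000;
}
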
+ 31 - 0
core/path_db.cpp

@@ -286,6 +286,37 @@ NodePath::NodePath(const Vector<StringName>& p_path,const Vector<StringName>& p_
 	data->property=p_property;
 }
 
+
+void NodePath::simplify() {
+
+	if (!data)
+		return;
+	for(int i=0;i<data->path.size();i++) {
+		if (data->path.size()==1)
+			break;
+		if (data->path[i].operator String()==".") {
+			data->path.remove(i);
+			i--;
+		} else if (data->path[i].operator String()==".." && i>0 && data->path[i-1].operator String()!="." && data->path[i-1].operator String()!="..") {
+			//remove both
+			data->path.remove(i-1);
+			data->path.remove(i-1);
+			i-=2;
+			if (data->path.size()==0) {
+				data->path.push_back(".");
+				break;
+			}
+		}
+	}
+}
+
+NodePath NodePath::simplified() const {
+
+	NodePath np=*this;
+	np.simplify();
+	return np;
+}
+
 NodePath::NodePath(const String& p_path) {
 
 	data=NULL;

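simplify() removes "." segments and folds ".." into the preceding segment in place (degrading to a lone "." if everything cancels out), while simplified() returns a cleaned copy. A quick sketch of the effect:

#include "path_db.h"
#include "print_string.h"

void nodepath_example() {

	NodePath np("Level/./Enemies/../Player/Sprite");
	print_line(String(np.simplified())); // prints "Level/Player/Sprite"
}
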
+ 4 - 1
core/path_db.h

@@ -84,7 +84,10 @@ public:
 	bool operator==(const NodePath& p_path) const;
 	bool operator!=(const NodePath& p_path) const;
 	void operator=(const NodePath& p_path);
-	
+
+	void simplify();
+	NodePath simplified() const;
+
 	NodePath(const Vector<StringName>& p_path,bool p_absolute,const String& p_property="");	
 	NodePath(const Vector<StringName>& p_path,const Vector<StringName>& p_subpath,bool p_absolute,const String& p_property="");
 	NodePath(const NodePath& p_path);

+ 1 - 1
core/resource.h

@@ -130,7 +130,7 @@ public:
 	void set_name(const String& p_name);
 	String get_name() const;
 
-	void set_path(const String& p_path,bool p_take_over=false);
+	virtual void set_path(const String& p_path,bool p_take_over=false);
 	String get_path() const;
 
 	void set_subindex(int p_sub_index);

+ 4 - 4
core/ring_buffer.h

@@ -141,15 +141,15 @@ public:
 	inline int space_left() {
 		int left = read_pos - write_pos;
 		if (left < 0) {
-			return size() + left;
+			return size() + left - 1;
 		};
 		if (left == 0) {
-			return size();
+			return size()-1;
 		};
-		return left;
+		return left -1;
 	};
 	inline int data_left() {
-		return size() - space_left();
+		return size() - space_left() - 1;
 	};
 	
 	inline int size() {

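The space_left()/data_left() adjustments reserve one slot so that read_pos == write_pos always means "empty"; before, a writer could fill every slot, making a full buffer indistinguishable from an empty one. A standalone sketch of the corrected arithmetic (plain C++, not engine code):

#include <cstdio>

// Mirrors RingBuffer::space_left() after this change, for a buffer of p_size slots.
static int space_left(int read_pos, int write_pos, int p_size) {

	int left = read_pos - write_pos;
	if (left < 0)
		return p_size + left - 1;
	if (left == 0)
		return p_size - 1;
	return left - 1;
}

int main() {

	printf("%d\n", space_left(0, 0, 8)); // empty buffer: 7 usable slots, not 8
	printf("%d\n", space_left(0, 7, 8)); // full buffer: 0 slots left, yet read_pos != write_pos
	return 0;
}
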
+ 284 - 6
core/script_debugger_remote.cpp

@@ -31,10 +31,36 @@
 #include "io/ip.h"
 #include "globals.h"
 
+void ScriptDebuggerRemote::_send_video_memory() {
+
+	List<ResourceUsage> usage;
+	if (resource_usage_func)
+		resource_usage_func(&usage);
+
+	usage.sort();
+
+	packet_peer_stream->put_var("message:video_mem");
+	packet_peer_stream->put_var(usage.size()*4);
+
+
+	for(List<ResourceUsage>::Element *E=usage.front();E;E=E->next()) {
+
+		packet_peer_stream->put_var(E->get().path);
+		packet_peer_stream->put_var(E->get().type);
+		packet_peer_stream->put_var(E->get().format);
+		packet_peer_stream->put_var(E->get().vram);
+	}
+
+}
+
 Error ScriptDebuggerRemote::connect_to_host(const String& p_host,uint16_t p_port) {
 
 
-    IP_Address ip = IP::get_singleton()->resolve_hostname(p_host);
+    IP_Address ip;
+    if (p_host.is_valid_ip_address())
+	    ip=p_host;
+    else
+	    ip = IP::get_singleton()->resolve_hostname(p_host);
 
 
     int port = p_port;
@@ -127,7 +153,7 @@ void ScriptDebuggerRemote::debug(ScriptLanguage *p_script,bool p_can_continue) {
 			ERR_CONTINUE( cmd[0].get_type()!=Variant::STRING );
 
 			String command = cmd[0];
-			cmd.remove(0);
+
 
 
 			if (command=="get_stack_dump") {
@@ -150,7 +176,7 @@ void ScriptDebuggerRemote::debug(ScriptLanguage *p_script,bool p_can_continue) {
 
 			} else if (command=="get_stack_frame_vars") {
 
-
+				cmd.remove(0);
 				ERR_CONTINUE( cmd.size()!=1 );
 				int lv = cmd[0];
 
@@ -243,6 +269,20 @@ void ScriptDebuggerRemote::debug(ScriptLanguage *p_script,bool p_can_continue) {
 
 				if (request_scene_tree)
 					request_scene_tree(request_scene_tree_ud);
+
+			} else if (command=="request_video_mem") {
+
+				_send_video_memory();
+			} else if (command=="breakpoint") {
+
+				bool set = cmd[3];
+				if (set)
+					insert_breakpoint(cmd[2],cmd[1]);
+				else
+					remove_breakpoint(cmd[2],cmd[1]);
+
+			} else {
+				_parse_live_edit(cmd);
 			}
 
 
@@ -287,6 +327,37 @@ void ScriptDebuggerRemote::_get_output() {
 		messages.pop_front();
 		locking=false;
 	}
+
+	while (errors.size()) {
+		locking=true;
+		packet_peer_stream->put_var("error");
+		OutputError oe = errors.front()->get();
+
+		packet_peer_stream->put_var(oe.callstack.size()+2);
+
+		Array error_data;
+
+		error_data.push_back(oe.hr);
+		error_data.push_back(oe.min);
+		error_data.push_back(oe.sec);
+		error_data.push_back(oe.msec);
+		error_data.push_back(oe.source_func);
+		error_data.push_back(oe.source_file);
+		error_data.push_back(oe.source_line);
+		error_data.push_back(oe.error);
+		error_data.push_back(oe.error_descr);
+		error_data.push_back(oe.warning);
+		packet_peer_stream->put_var(error_data);
+		packet_peer_stream->put_var(oe.callstack.size());
+		for(int i=0;i<oe.callstack.size();i++) {
+			packet_peer_stream->put_var(oe.callstack[i]);
+
+		}
+
+		errors.pop_front();
+		locking=false;
+
+	}
 	mutex->unlock();
 }
 
@@ -301,6 +372,160 @@ void ScriptDebuggerRemote::line_poll() {
 }
 
 
+void ScriptDebuggerRemote::_err_handler(void* ud,const char* p_func,const char*p_file,int p_line,const char *p_err, const char * p_descr,ErrorHandlerType p_type) {
+
+	if (p_type==ERR_HANDLER_SCRIPT)
+		return; //ignore script errors, those go through debugger
+
+	ScriptDebuggerRemote *sdr = (ScriptDebuggerRemote*)ud;
+
+	OutputError oe;
+	oe.error=p_err;
+	oe.error_descr=p_descr;
+	oe.source_file=p_file;
+	oe.source_line=p_line;
+	oe.source_func=p_func;
+	oe.warning=p_type==ERR_HANDLER_WARNING;
+	uint64_t time = OS::get_singleton()->get_ticks_msec();
+	oe.hr=time/3600000;
+	oe.min=(time/60000)%60;
+	oe.sec=(time/1000)%60;
+	oe.msec=time%1000;
+	Array cstack;
+
+	Vector<ScriptLanguage::StackInfo> si;
+
+	for(int i=0;i<ScriptServer::get_language_count();i++) {
+		si=ScriptServer::get_language(i)->debug_get_current_stack_info();
+		if (si.size())
+			break;
+	}
+
+	cstack.resize(si.size()*2);
+	for(int i=0;i<si.size();i++) {
+		String path;
+		int line=0;
+		if (si[i].script.is_valid()) {
+			path=si[i].script->get_path();
+			line=si[i].line;
+		}
+		cstack[i*2+0]=path;
+		cstack[i*2+1]=line;
+	}
+
+	oe.callstack=cstack;
+
+
+	sdr->mutex->lock();
+
+	if (!sdr->locking && sdr->tcp_client->is_connected()) {
+
+		sdr->errors.push_back(oe);
+	}
+
+	sdr->mutex->unlock();
+}
+
+
+bool ScriptDebuggerRemote::_parse_live_edit(const Array& cmd) {
+
+	String cmdstr = cmd[0];
+	if (!live_edit_funcs || !cmdstr.begins_with("live_"))
+		return false;
+
+
+	//print_line(Variant(cmd).get_construct_string());
+	if (cmdstr=="live_set_root") {
+
+		if (!live_edit_funcs->root_func)
+			return true;
+		//print_line("root: "+Variant(cmd).get_construct_string());
+		live_edit_funcs->root_func(live_edit_funcs->udata,cmd[1],cmd[2]);
+
+	} else if (cmdstr=="live_node_path") {
+
+		if (!live_edit_funcs->node_path_func)
+			return true;
+		//print_line("path: "+Variant(cmd).get_construct_string());
+
+		live_edit_funcs->node_path_func(live_edit_funcs->udata,cmd[1],cmd[2]);
+
+	} else if (cmdstr=="live_res_path") {
+
+		if (!live_edit_funcs->res_path_func)
+			return true;
+		live_edit_funcs->res_path_func(live_edit_funcs->udata,cmd[1],cmd[2]);
+
+	} else if (cmdstr=="live_node_prop_res") {
+		if (!live_edit_funcs->node_set_res_func)
+			return true;
+
+		live_edit_funcs->node_set_res_func(live_edit_funcs->udata,cmd[1],cmd[2],cmd[3]);
+
+	} else if (cmdstr=="live_node_prop") {
+
+		if (!live_edit_funcs->node_set_func)
+			return true;
+		live_edit_funcs->node_set_func(live_edit_funcs->udata,cmd[1],cmd[2],cmd[3]);
+
+	} else if (cmdstr=="live_res_prop_res") {
+
+		if (!live_edit_funcs->res_set_res_func)
+			return true;
+		live_edit_funcs->res_set_res_func(live_edit_funcs->udata,cmd[1],cmd[2],cmd[3]);
+
+	} else if (cmdstr=="live_res_prop") {
+
+		if (!live_edit_funcs->res_set_func)
+			return true;
+		live_edit_funcs->res_set_func(live_edit_funcs->udata,cmd[1],cmd[2],cmd[3]);
+
+	} else if (cmdstr=="live_node_call") {
+
+		if (!live_edit_funcs->node_call_func)
+			return true;
+		live_edit_funcs->node_call_func(live_edit_funcs->udata,cmd[1],cmd[2],  cmd[3],cmd[4],cmd[5],cmd[6],cmd[7]);
+
+	} else if (cmdstr=="live_res_call") {
+
+		if (!live_edit_funcs->res_call_func)
+			return true;
+		live_edit_funcs->res_call_func(live_edit_funcs->udata,cmd[1],cmd[2],  cmd[3],cmd[4],cmd[5],cmd[6],cmd[7]);
+
+	} else if (cmdstr=="live_create_node") {
+
+		live_edit_funcs->tree_create_node_func(live_edit_funcs->udata,cmd[1],cmd[2],cmd[3]);
+
+	} else if (cmdstr=="live_instance_node") {
+
+		live_edit_funcs->tree_instance_node_func(live_edit_funcs->udata,cmd[1],cmd[2],cmd[3]);
+
+	} else if (cmdstr=="live_remove_node") {
+
+		live_edit_funcs->tree_remove_node_func(live_edit_funcs->udata,cmd[1]);
+
+	} else if (cmdstr=="live_remove_and_keep_node") {
+
+		live_edit_funcs->tree_remove_and_keep_node_func(live_edit_funcs->udata,cmd[1],cmd[2]);
+	} else if (cmdstr=="live_restore_node") {
+
+		live_edit_funcs->tree_restore_node_func(live_edit_funcs->udata,cmd[1],cmd[2],cmd[3]);
+
+	} else if (cmdstr=="live_duplicate_node") {
+
+		live_edit_funcs->tree_duplicate_node_func(live_edit_funcs->udata,cmd[1],cmd[2]);
+	} else if (cmdstr=="live_reparent_node") {
+
+		live_edit_funcs->tree_reparent_node_func(live_edit_funcs->udata,cmd[1],cmd[2],cmd[3],cmd[4]);
+
+	} else {
+
+		return false;
+	}
+
+	return true;
+}
+
 void ScriptDebuggerRemote::_poll_events() {
 
 	while(packet_peer_stream->get_available_packet_count()>0) {
@@ -321,7 +546,7 @@ void ScriptDebuggerRemote::_poll_events() {
 		ERR_CONTINUE( cmd[0].get_type()!=Variant::STRING );
 
 		String command = cmd[0];
-		cmd.remove(0);
+		//cmd.remove(0);
 
 		if (command=="break") {
 
@@ -331,6 +556,18 @@ void ScriptDebuggerRemote::_poll_events() {
 
 			if (request_scene_tree)
 				request_scene_tree(request_scene_tree_ud);
+		} else if (command=="request_video_mem") {
+
+			_send_video_memory();
+		} else if (command=="breakpoint") {
+
+			bool set = cmd[3];
+			if (set)
+				insert_breakpoint(cmd[2],cmd[1]);
+			else
+				remove_breakpoint(cmd[2],cmd[1]);
+		} else {
+			_parse_live_edit(cmd);
 		}
 
 	}
@@ -394,10 +631,35 @@ void ScriptDebuggerRemote::_print_handler(void *p_this,const String& p_string) {
 
 	ScriptDebuggerRemote *sdr = (ScriptDebuggerRemote*)p_this;
 
+	uint64_t ticks = OS::get_singleton()->get_ticks_usec()/1000;
+	sdr->msec_count+=ticks-sdr->last_msec;
+	sdr->last_msec=ticks;
+
+	if (sdr->msec_count>1000) {
+		sdr->char_count=0;
+		sdr->msec_count=0;
+	}
+
+	String s = p_string;
+	int allowed_chars = MIN(MAX(sdr->max_cps - sdr->char_count,0), s.length());
+
+	if (allowed_chars==0)
+		return;
+
+	if (allowed_chars<s.length()) {
+		s=s.substr(0,allowed_chars);
+	}
+
+	sdr->char_count+=allowed_chars;
+
+	if (sdr->char_count>=sdr->max_cps) {
+		s+="\n[output overflow, print less text!]\n";
+	}
+
 	sdr->mutex->lock();
 	if (!sdr->locking && sdr->tcp_client->is_connected()) {
 
-		sdr->output_strings .push_back(p_string);
+		sdr->output_strings.push_back(s);
 	}
 	sdr->mutex->unlock();
 }
@@ -413,6 +675,13 @@ void ScriptDebuggerRemote::set_request_scene_tree_message_func(RequestSceneTreeM
 	request_scene_tree_ud=p_udata;
 }
 
+void ScriptDebuggerRemote::set_live_edit_funcs(LiveEditFuncs *p_funcs) {
+
+	live_edit_funcs=p_funcs;
+}
+
+ScriptDebuggerRemote::ResourceUsageFunc ScriptDebuggerRemote::resource_usage_func=NULL;
+
 ScriptDebuggerRemote::ScriptDebuggerRemote() {
 
 	tcp_client  = StreamPeerTCP::create_ref();
@@ -429,13 +698,22 @@ ScriptDebuggerRemote::ScriptDebuggerRemote() {
 	last_perf_time=0;
 	poll_every=0;
 	request_scene_tree=NULL;
+	live_edit_funcs=NULL;
+	max_cps = GLOBAL_DEF("debug/max_remote_stdout_chars_per_second",2048);
+	char_count=0;
+	msec_count=0;
+	last_msec=0;
+
+	eh.errfunc=_err_handler;
+	eh.userdata=this;
+	add_error_handler(&eh);
 
 }
 
 ScriptDebuggerRemote::~ScriptDebuggerRemote() {
 
 	remove_print_handler(&phl);
+	remove_error_handler(&eh);
 	memdelete(mutex);
 
-
 }

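Besides error forwarding and the live-edit command dispatch, _print_handler now rate-limits remote stdout to max_cps characters per second (GLOBAL_DEF("debug/max_remote_stdout_chars_per_second", 2048)), resetting the budget once a second. The heart of the throttle as a standalone sketch (plain C++, not engine code):

#include <algorithm>
#include <string>

// Per-second character budget, mirroring the logic added to _print_handler.
static std::string throttle(const std::string &p_text, int &r_char_count, int p_max_cps) {

	int allowed = std::min(std::max(p_max_cps - r_char_count, 0), (int)p_text.length());
	if (allowed == 0)
		return ""; // budget exhausted: drop the output entirely
	std::string s = p_text.substr(0, allowed);
	r_char_count += allowed;
	if (r_char_count >= p_max_cps)
		s += "\n[output overflow, print less text!]\n";
	return s;
}
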
+ 49 - 0
core/script_debugger_remote.h

@@ -34,6 +34,7 @@
 #include "io/stream_peer_tcp.h"
 #include "io/packet_peer.h"
 #include "list.h"
+
 class ScriptDebuggerRemote : public ScriptDebugger {
 
 	struct Message {
@@ -42,6 +43,8 @@ class ScriptDebuggerRemote : public ScriptDebugger {
 		Array data;
 	};
 
+
+
 	Ref<StreamPeerTCP> tcp_client;
 	Ref<PacketPeerStream> packet_peer_stream;
 
@@ -49,8 +52,31 @@ class ScriptDebuggerRemote : public ScriptDebugger {
 	Object *performance;
 	bool requested_quit;
 	Mutex *mutex;
+
+	struct OutputError {
+
+		int hr;
+		int min;
+		int sec;
+		int msec;
+		String source_file;
+		String source_func;
+		int source_line;
+		String error;
+		String error_descr;
+		bool warning;
+		Array callstack;
+
+	};
+
 	List<String> output_strings;
 	List<Message> messages;
+	List<OutputError> errors;
+
+	int max_cps;
+	int char_count;
+	uint64_t last_msec;
+	uint64_t msec_count;
 
 	bool locking; //hack to avoid a deadloop
 	static void _print_handler(void *p_this,const String& p_string);
@@ -62,12 +88,34 @@ class ScriptDebuggerRemote : public ScriptDebugger {
 	uint32_t poll_every;
 
 
+	bool _parse_live_edit(const Array &p_command);
+
 	RequestSceneTreeMessageFunc request_scene_tree;
 	void *request_scene_tree_ud;
 
+	void _send_video_memory();
+	LiveEditFuncs *live_edit_funcs;
+
+	ErrorHandlerList eh;
+	static void _err_handler(void*,const char*,const char*,int p_line,const char *, const char *,ErrorHandlerType p_type);
+
 
 public:
 
+	struct ResourceUsage {
+
+		String path;
+		String format;
+		String type;
+		RID id;
+		int vram;
+		bool operator<(const ResourceUsage& p_img) const { return vram==p_img.vram ? id<p_img.id : vram > p_img.vram; }
+	};
+
+	typedef void (*ResourceUsageFunc)(List<ResourceUsage>*);
+
+	static ResourceUsageFunc resource_usage_func;
+
 	Error connect_to_host(const String& p_host,uint16_t p_port);
 	virtual void debug(ScriptLanguage *p_script,bool p_can_continue=true);
 	virtual void idle_poll();
@@ -79,6 +127,7 @@ public:
 	virtual void send_message(const String& p_message, const Array& p_args);
 
 	virtual void set_request_scene_tree_message_func(RequestSceneTreeMessageFunc p_func, void *p_udata);
+	virtual void set_live_edit_funcs(LiveEditFuncs *p_funcs);
 
 	ScriptDebuggerRemote();
 	~ScriptDebuggerRemote();

+ 36 - 0
core/script_language.h

@@ -176,6 +176,13 @@ public:
 	virtual void debug_get_globals(List<String> *p_locals, List<Variant> *p_values, int p_max_subitems=-1,int p_max_depth=-1)=0;
 	virtual String debug_parse_stack_level_expression(int p_level,const String& p_expression,int p_max_subitems=-1,int p_max_depth=-1)=0;
 
+	struct StackInfo {
+		Ref<Script> script;
+		int line;
+	};
+
+	virtual Vector<StackInfo> debug_get_current_stack_info() { return Vector<StackInfo>(); }
+
 	/* LOADER FUNCTIONS */
 
 	virtual void get_recognized_extensions(List<String> *p_extensions) const=0;
@@ -238,6 +245,32 @@ public:
 
 	typedef void (*RequestSceneTreeMessageFunc)(void *);
 
+	struct LiveEditFuncs {
+
+		void *udata;
+		void (*node_path_func)(void *,const NodePath &p_path,int p_id);
+		void (*res_path_func)(void *,const String &p_path,int p_id);
+
+		void (*node_set_func)(void *,int p_id,const StringName& p_prop,const Variant& p_value);
+		void (*node_set_res_func)(void *,int p_id,const StringName& p_prop,const String& p_value);
+		void (*node_call_func)(void *,int p_id,const StringName& p_method,VARIANT_ARG_DECLARE);
+		void (*res_set_func)(void *,int p_id,const StringName& p_prop,const Variant& p_value);
+		void (*res_set_res_func)(void *,int p_id,const StringName& p_prop,const String& p_value);
+		void (*res_call_func)(void *,int p_id,const StringName& p_method,VARIANT_ARG_DECLARE);
+		void (*root_func)(void*, const NodePath& p_scene_path,const String& p_scene_from);
+
+		void (*tree_create_node_func)(void*,const NodePath& p_parent,const String& p_type,const String& p_name);
+		void (*tree_instance_node_func)(void*,const NodePath& p_parent,const String& p_path,const String& p_name);
+		void (*tree_remove_node_func)(void*,const NodePath& p_at);
+		void (*tree_remove_and_keep_node_func)(void*,const NodePath& p_at,ObjectID p_keep_id);
+		void (*tree_restore_node_func)(void*,ObjectID p_id,const NodePath& p_at,int p_at_pos);
+		void (*tree_duplicate_node_func)(void*,const NodePath& p_at,const String& p_new_name);
+		void (*tree_reparent_node_func)(void*,const NodePath& p_at,const NodePath& p_new_place,const String& p_new_name,int p_at_pos);
+
+	};
+
+
+
 	_FORCE_INLINE_ static ScriptDebugger * get_singleton() { return singleton; }
 	void set_lines_left(int p_left);
 	int get_lines_left() const;
@@ -252,10 +285,12 @@ public:
 	bool is_breakpoint_line(int p_line) const;
 	void clear_breakpoints();
 
+
 	virtual void debug(ScriptLanguage *p_script,bool p_can_continue=true)=0;
 	virtual void idle_poll();
 	virtual void line_poll();
 
+
 	void set_break_language(ScriptLanguage *p_lang);
 	ScriptLanguage* get_break_language() const;
 
@@ -265,6 +300,7 @@ public:
 	virtual void request_quit() {}
 
 	virtual void set_request_scene_tree_message_func(RequestSceneTreeMessageFunc p_func, void *p_udata) {}
+	virtual void set_live_edit_funcs(LiveEditFuncs *p_funcs) {}
 
 	ScriptDebugger();
 	virtual ~ScriptDebugger() {singleton=NULL;}

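debug_get_current_stack_info() ships with a non-abstract default (an empty Vector), so only languages that actually track a call stack need to override it; the remote debugger uses it to attach script/line pairs to forwarded errors. A hypothetical language could implement it like this (current_script and current_line are assumed members, not real API):

Vector<ScriptLanguage::StackInfo> MyScriptLanguage::debug_get_current_stack_info() {

	Vector<StackInfo> stack;
	StackInfo frame;
	frame.script = current_script; // Ref<Script> of the executing frame (assumed member)
	frame.line = current_line;     // currently executing line (assumed member)
	stack.push_back(frame);
	return stack;
}
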
+ 2 - 2
core/typedefs.h

@@ -41,8 +41,8 @@
 #define _MKSTR(m_x) _STR(m_x)
 #endif
 // have to include version.h for this to work, include it in the .cpp not the .h
-#define VERSION_MKSTRING _MKSTR(VERSION_MAJOR)"."_MKSTR(VERSION_MINOR)"."_MKSTR(VERSION_STATUS)"."_MKSTR(VERSION_REVISION)
-#define VERSION_FULL_NAME _MKSTR(VERSION_NAME)" v"VERSION_MKSTRING
+#define VERSION_MKSTRING _MKSTR(VERSION_MAJOR)"." _MKSTR(VERSION_MINOR)"." _MKSTR(VERSION_STATUS)"." _MKSTR(VERSION_REVISION)
+#define VERSION_FULL_NAME _MKSTR(VERSION_NAME)" v" VERSION_MKSTRING
 
 
 #ifndef _ALWAYS_INLINE_

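The added spaces are a C++11 compatibility fix: a string literal immediately followed by an identifier is parsed as a user-defined literal, so "."_MKSTR(VERSION_MINOR) would be looked up as a literal operator ""_MKSTR and fail to compile, while "." _MKSTR(VERSION_MINOR) still concatenates adjacent string literals as before. A minimal illustration (MINOR is a stand-in macro):

#define _STR(m_x) #m_x
#define _MKSTR(m_x) _STR(m_x)
#define MINOR 2

//const char *bad = "1" "."_MKSTR(MINOR); // C++11: no literal operator ""_MKSTR found
const char *ok = "1" "." _MKSTR(MINOR);   // fine: adjacent literals concatenate to "1.2"
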
+ 27 - 0
core/undo_redo.cpp

@@ -244,7 +244,12 @@ void UndoRedo::_process_operation_list(List<Operation>::Element *E) {
 				Resource* res = obj->cast_to<Resource>();
 				if (res)
 					res->set_edited(true);
+
 #endif
+
+				if (method_callback) {
+					method_callback(method_callback_ud,obj,op.name,VARIANT_ARGS_FROM_ARRAY(op.args));
+				}
 			} break;
 			case Operation::TYPE_PROPERTY: {
 
@@ -254,6 +259,9 @@ void UndoRedo::_process_operation_list(List<Operation>::Element *E) {
 				if (res)
 					res->set_edited(true);
 #endif
+				if (property_callback) {
+					property_callback(prop_callback_ud,obj,op.name,op.args[0]);
+				}
 			} break;
 			case Operation::TYPE_REFERENCE: {
 				//do nothing
@@ -325,6 +333,19 @@ void UndoRedo::set_commit_notify_callback(CommitNotifyCallback p_callback,void*
 	callback_ud=p_ud;
 }
 
+void UndoRedo::set_method_notify_callback(MethodNotifyCallback p_method_callback,void* p_ud) {
+
+	method_callback=p_method_callback;
+	method_callback_ud=p_ud;
+}
+
+void UndoRedo::set_property_notify_callback(PropertyNotifyCallback p_property_callback,void* p_ud){
+
+	property_callback=p_property_callback;
+	prop_callback_ud=p_ud;
+}
+
+
 UndoRedo::UndoRedo() {
 
 	version=1;
@@ -334,6 +355,12 @@ UndoRedo::UndoRedo() {
 	merging=true;
 	callback=NULL;
 	callback_ud=NULL;
+
+	method_callback_ud=NULL;
+	prop_callback_ud=NULL;
+	method_callback=NULL;
+	property_callback=NULL;
+
 }
 
 UndoRedo::~UndoRedo() {

+ 11 - 0
core/undo_redo.h

@@ -45,6 +45,9 @@ public:
 	Variant _add_do_method(const Variant** p_args, int p_argcount, Variant::CallError& r_error);
 	Variant _add_undo_method(const Variant** p_args, int p_argcount, Variant::CallError& r_error);
 
+	typedef void (*MethodNotifyCallback)(void *p_ud,Object*p_base,const StringName& p_name,VARIANT_ARG_DECLARE);
+	typedef void (*PropertyNotifyCallback)(void *p_ud,Object*p_base,const StringName& p_property,const Variant& p_value);
+
 private:
 	struct Operation {
 
@@ -83,6 +86,11 @@ private:
 
 	CommitNotifyCallback callback;
 	void* callback_ud;
+	void* method_callback_ud;
+	void* prop_callback_ud;
+
+	MethodNotifyCallback method_callback;
+	PropertyNotifyCallback property_callback;
 
 
 protected:
@@ -113,6 +121,9 @@ public:
 
 	void set_commit_notify_callback(CommitNotifyCallback p_callback,void* p_ud);
 
+	void set_method_notify_callback(MethodNotifyCallback p_method_callback,void* p_ud);
+	void set_property_notify_callback(PropertyNotifyCallback p_property_callback,void* p_ud);
+
 	UndoRedo();
 	~UndoRedo();
 };

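The method and property notify callbacks fire on every do/undo replay, which is what lets an observer (the live-edit session, for instance) mirror each operation as it happens. Registering them might look like this; the handlers are hypothetical:

#include "undo_redo.h"
#include "print_string.h"

static void on_method(void *p_ud, Object *p_base, const StringName &p_name, VARIANT_ARG_DECLARE) {

	print_line("replayed call: " + String(p_name));
}

static void on_property(void *p_ud, Object *p_base, const StringName &p_prop, const Variant &p_value) {

	print_line("replayed set: " + String(p_prop) + "=" + String(p_value));
}

void hook_undo_redo(UndoRedo &p_ur) {

	p_ur.set_method_notify_callback(on_method, NULL);
	p_ur.set_property_notify_callback(on_property, NULL);
}
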
+ 35 - 4
core/ustring.cpp

@@ -3048,6 +3048,37 @@ bool String::is_valid_identifier() const {
 
 //kind of poor; should be rewritten properly
 
+String String::word_wrap(int p_chars_per_line) const {
+
+	int from=0;
+	int last_space=-1; // -1: no break candidate seen yet, forcing a hard split on overlong words
+	String ret;
+	for(int i=0;i<length();i++) {
+		if (i-from>=p_chars_per_line) {
+			if (last_space==-1) {
+				ret+=substr(from,i-from+1)+"\n";
+			} else {
+				ret+=substr(from,last_space-from)+"\n";
+				i=last_space; //rewind
+			}
+			from=i+1;
+			last_space=-1;
+		} else if (operator[](i)==' ' || operator[](i)=='\t') {
+			last_space=i;
+		} else if (operator[](i)=='\n') {
+			ret+=substr(from,i-from)+"\n";
+			from=i+1;
+			last_space=-1;
+		}
+	}
+
+	if (from<length()) {
+		ret+=substr(from,length());
+	}
+
+	return ret;
+}
+
 String String::c_unescape() const {
 
 	String escaped=*this;
@@ -3088,8 +3119,8 @@ String String::xml_escape(bool p_escape_quotes) const {
 
 	String str=*this;
 	str=str.replace("&","&amp;");
-	str=str.replace("<","&gt;");
-	str=str.replace(">","&lt;");
+	str=str.replace("<","&lt;");
+	str=str.replace(">","&gt;");
 	if (p_escape_quotes) {
 		str=str.replace("'","&apos;");
 		str=str.replace("\"","&quot;");
@@ -3141,12 +3172,12 @@ static _FORCE_INLINE_ int _xml_unescape(const CharType *p_src,int p_src_len,Char
 			} else if (p_src_len>=4 && p_src[1]=='g' && p_src[2]=='t' && p_src[3]==';') {
 
 				if (p_dst)
-					*p_dst='<';
+					*p_dst='>';
 				eat=4;
 			} else if (p_src_len>=4 && p_src[1]=='l' && p_src[2]=='t' && p_src[3]==';') {
 
 				if (p_dst)
-					*p_dst='>';
+					*p_dst='<';
 				eat=4;
 			} else if (p_src_len>=5 && p_src[1]=='a' && p_src[2]=='m' && p_src[3]=='p' && p_src[4]==';') {
 

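word_wrap() breaks at the most recent space or tab once a line exceeds p_chars_per_line, hard-splitting only when a single word is longer than the limit; the xml_escape()/_xml_unescape() hunks fix the swapped < and > entities. A quick sketch of both:

#include "ustring.h"
#include "print_string.h"

void string_examples() {

	String s = "the quick brown fox";
	print_line(s.word_wrap(10));            // "the quick\nbrown fox"
	print_line(String("a<b").xml_escape()); // now "a&lt;b" (previously produced "a&gt;b")
}
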
+ 1 - 0
core/ustring.h

@@ -209,6 +209,7 @@ public:
 	String xml_unescape() const;
 	String c_escape() const;
 	String c_unescape() const;
+	String word_wrap(int p_chars_per_line) const;
 	
 	String percent_encode() const;
 	String percent_decode() const;

+ 10 - 2
core/variant.cpp

@@ -1592,9 +1592,17 @@ Variant::operator String() const {
 		} break;
 		case OBJECT: {
 
-			if (_get_obj().obj)
+			if (_get_obj().obj) {
+				#ifdef DEBUG_ENABLED
+					if (ScriptDebugger::get_singleton() && _get_obj().ref.is_null()) {
+						//only if debugging!
+						if (!ObjectDB::instance_validate(_get_obj().obj)) {
+							return "[Deleted Object]";
+						};
+					};
+				#endif
 				return "["+_get_obj().obj->get_type()+":"+itos(_get_obj().obj->get_instance_ID())+"]";
-			else
+			} else
 				return "[Object:null]";
 
 		} break;

+ 8 - 10
core/variant_call.cpp

@@ -359,6 +359,8 @@ static void _call_##m_type##_##m_method(Variant& r_ret,Variant& p_self,const Var
 	VCALL_LOCALMEM1R(Vector3, dot);
 	VCALL_LOCALMEM1R(Vector3, cross);
 	VCALL_LOCALMEM0R(Vector3, abs);
+	VCALL_LOCALMEM0R(Vector3, floor);
+	VCALL_LOCALMEM0R(Vector3, ceil);
 	VCALL_LOCALMEM1R(Vector3, distance_to);
 	VCALL_LOCALMEM1R(Vector3, distance_squared_to);
 	VCALL_LOCALMEM1R(Vector3, slide);
@@ -753,7 +755,7 @@ static void _call_##m_type##_##m_method(Variant& r_ret,Variant& p_self,const Var
 	}
 
 	static void Matrix32_init2(Variant& r_ret,const Variant** p_args) {
-		
+
 		Matrix32 m(*p_args[0], *p_args[1]);
 		r_ret=m;
 	}
@@ -1133,7 +1135,7 @@ void Variant::get_method_list(List<MethodInfo> *p_list) const {
 		if (fd.returns)
 			ret.name="ret";
 		mi.return_val=ret;
-#endif		
+#endif
 
 		p_list->push_back(mi);
 	}
@@ -1336,6 +1338,8 @@ _VariantCall::addfunc(Variant::m_vtype,Variant::m_ret,_SCS(#m_method),VCALL(m_cl
 	ADDFUNC1(VECTOR3,REAL,Vector3,dot,VECTOR3,"b",varray());
 	ADDFUNC1(VECTOR3,VECTOR3,Vector3,cross,VECTOR3,"b",varray());
 	ADDFUNC0(VECTOR3,VECTOR3,Vector3,abs,varray());
+	ADDFUNC0(VECTOR3,VECTOR3,Vector3,floor,varray());
+	ADDFUNC0(VECTOR3,VECTOR3,Vector3,ceil,varray());
 	ADDFUNC1(VECTOR3,REAL,Vector3,distance_to,VECTOR3,"b",varray());
 	ADDFUNC1(VECTOR3,REAL,Vector3,distance_squared_to,VECTOR3,"b",varray());
 	ADDFUNC1(VECTOR3,VECTOR3,Vector3,slide,VECTOR3,"by",varray());
@@ -1535,10 +1539,10 @@ _VariantCall::addfunc(Variant::m_vtype,Variant::m_ret,_SCS(#m_method),VCALL(m_cl
 	ADDFUNC1(TRANSFORM,NIL,Transform,xform,NIL,"v",varray());
 	ADDFUNC1(TRANSFORM,NIL,Transform,xform_inv,NIL,"v",varray());
 
-#ifdef DEBUG_ENABLED	
+#ifdef DEBUG_ENABLED
 	_VariantCall::type_funcs[Variant::TRANSFORM].functions["xform"].returns=true;
 	_VariantCall::type_funcs[Variant::TRANSFORM].functions["xform_inv"].returns=true;
-#endif	
+#endif
 
 	ADDFUNC0(INPUT_EVENT,BOOL,InputEvent,is_pressed,varray());
 	ADDFUNC1(INPUT_EVENT,BOOL,InputEvent,is_action,STRING,"action",varray());
@@ -1635,9 +1639,3 @@ void unregister_variant_methods() {
 
 
 }
-
-
-
-
-
-

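These hunks expose Vector3's floor() and ceil() to scripts; both round each component independently. A small C++ sketch of the behavior:

#include "math/vector3.h"

void vector3_rounding() {

	Vector3 v(1.7, -2.3, 0.5);
	Vector3 f = v.floor(); // (1, -3, 0)
	Vector3 c = v.ceil();  // (2, -2, 1)
}
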
+ 21 - 0
demos/2d/dynamic_collision_shapes/ball.gd

@@ -0,0 +1,21 @@
+
+extends RigidBody2D
+
+# member variables here, example:
+# var a=2
+# var b="textvar"
+
+var timeout=5
+
+func _process(delta):
+	timeout-=delta
+	if (timeout<1):
+		set_opacity(timeout)
+	if (timeout<0):
+		queue_free()
+func _ready():
+	set_process(true)
+	# Initialization here
+	pass
+
+

BIN
demos/2d/dynamic_collision_shapes/ball.png


BIN
demos/2d/dynamic_collision_shapes/ball.scn


BIN
demos/2d/dynamic_collision_shapes/box.png


BIN
demos/2d/dynamic_collision_shapes/circle.png


+ 23 - 0
demos/2d/dynamic_collision_shapes/dynamic_colobjs.gd

@@ -0,0 +1,23 @@
+
+extends Node2D
+
+# member variables here, example:
+# var a=2
+# var b="textvar"
+const EMIT_INTERVAL=0.1
+var timeout=EMIT_INTERVAL
+
+func _process(delta):
+	timeout-=delta
+	if (timeout<0):
+		timeout=EMIT_INTERVAL
+		var ball = preload("res://ball.scn").instance()
+		ball.set_pos( Vector2(randf() * get_viewport_rect().size.x, 0) )
+		add_child(ball)
+			
+func _ready():
+	set_process(true)
+	# Initialization here
+	pass
+
+

BIN
demos/2d/dynamic_collision_shapes/dynamic_colobjs.scn


+ 4 - 0
demos/2d/dynamic_collision_shapes/engine.cfg

@@ -0,0 +1,4 @@
+[application]
+
+name="Run-Time CollisionShape"
+main_scene="res://dynamic_colobjs.scn"

BIN
demos/2d/dynamic_collision_shapes/poly.png


BIN
demos/2d/isometric/dungeon.scn


BIN
demos/2d/lights_shadows/light_shadows.scn


+ 8 - 0
demos/2d/platformer/coin.gd

@@ -18,3 +18,11 @@ func _ready():
 	# Initialization here
 	pass
 
+
+
+func _on_coin_area_enter( area ):
+	pass # replace with function body
+
+
+func _on_coin_area_enter_shape( area_id, area, area_shape, self_shape ):
+	pass # replace with function body

+ 174 - 169
demos/2d/platformer/enemy.xml

@@ -1,10 +1,10 @@
 <?xml version="1.0" encoding="UTF-8" ?>
-<resource_file type="PackedScene" subresource_count="11" version="1.0" version_name="Godot Engine v1.0.3917-beta1">
-	<ext_resource path="res://sound_explode.*" type="Sample"></ext_resource>
-	<ext_resource path="res://enemy.*" type="Texture"></ext_resource>
-	<ext_resource path="res://enemy.*" type="Script"></ext_resource>
-	<ext_resource path="res://sound_hit.*" type="Sample"></ext_resource>
-	<ext_resource path="res://bullet.*" type="Texture"></ext_resource>
+<resource_file type="PackedScene" subresource_count="12" version="2.0" version_name="Godot Engine v2.0.alpha.custom_build">
+	<ext_resource path="res://bullet.png" type="Texture" index="2"></ext_resource>
+	<ext_resource path="res://enemy.gd" type="Script" index="0"></ext_resource>
+	<ext_resource path="res://enemy.png" type="Texture" index="1"></ext_resource>
+	<ext_resource path="res://sound_hit.wav" type="Sample" index="4"></ext_resource>
+	<ext_resource path="res://sound_explode.wav" type="Sample" index="3"></ext_resource>
 	<resource type="CircleShape2D" path="local://1">
 		<real name="custom_solver_bias"> 0 </real>
 		<real name="radius"> 14 </real>
@@ -21,6 +21,8 @@
 		<dictionary name="tracks/0/keys" shared="false">
 			<string> "cont" </string>
 			<bool> False </bool>
+			<string> "times" </string>
+			<real_array  len="10"> 				0, 0.75, 1.5, 2.25, 3, 3.75, 4.5, 5.25, 6, 6.75 </real_array>
 			<string> "transitions" </string>
 			<real_array  len="10"> 				1, 1, 1, 1, 1, 1, 1, 1, 1, 1 </real_array>
 			<string> "values" </string>
@@ -36,8 +38,33 @@
 				<int> 7 </int>
 				<int> 5 </int>
 			</array>
+		</dictionary>
+
+	</resource>
+	<resource type="Animation" path="local://4">
+		<string name="resource/name"> "walk" </string>
+		<real name="length"> 1.25 </real>
+		<bool name="loop"> True </bool>
+		<real name="step"> 0.25 </real>
+		<string name="tracks/0/type"> "value" </string>
+		<node_path name="tracks/0/path"> "sprite:frame" </node_path>
+		<int name="tracks/0/interp"> 1 </int>
+		<dictionary name="tracks/0/keys" shared="false">
+			<string> "cont" </string>
+			<bool> False </bool>
 			<string> "times" </string>
-			<real_array  len="10"> 				0, 0.75, 1.5, 2.25, 3, 3.75, 4.5, 5.25, 6, 6.75 </real_array>
+			<real_array  len="6"> 				0, 0.25, 0.5, 0.75, 1, 1.25 </real_array>
+			<string> "transitions" </string>
+			<real_array  len="6"> 				1, 1, 1, 1, 1, 1 </real_array>
+			<string> "values" </string>
+			<array  len="6" shared="false">
+				<int> 0 </int>
+				<int> 1 </int>
+				<int> 2 </int>
+				<int> 3 </int>
+				<int> 4 </int>
+				<int> 0 </int>
+			</array>
 		</dictionary>
 
 	</resource>
@@ -52,6 +79,8 @@
 		<dictionary name="tracks/0/keys" shared="false">
 			<string> "cont" </string>
 			<bool> True </bool>
+			<string> "times" </string>
+			<real_array  len="2"> 				3.58422, 4.33851 </real_array>
 			<string> "transitions" </string>
 			<real_array  len="2"> 				1, 1 </real_array>
 			<string> "values" </string>
@@ -59,8 +88,6 @@
 				<real> 1 </real>
 				<real> 0 </real>
 			</array>
-			<string> "times" </string>
-			<real_array  len="2"> 				3.58422, 4.33851 </real_array>
 		</dictionary>
 		<string name="tracks/1/type"> "value" </string>
 		<node_path name="tracks/1/path"> "sprite:frame" </node_path>
@@ -68,14 +95,14 @@
 		<dictionary name="tracks/1/keys" shared="false">
 			<string> "cont" </string>
 			<bool> True </bool>
+			<string> "times" </string>
+			<real_array  len="1"> 				0 </real_array>
 			<string> "transitions" </string>
 			<real_array  len="1"> 				1 </real_array>
 			<string> "values" </string>
 			<array  len="1" shared="false">
 				<int> 4 </int>
 			</array>
-			<string> "times" </string>
-			<real_array  len="1"> 				0 </real_array>
 		</dictionary>
 		<string name="tracks/2/type"> "value" </string>
 		<node_path name="tracks/2/path"> "Particles2D:config/emitting" </node_path>
@@ -83,19 +110,21 @@
 		<dictionary name="tracks/2/keys" shared="false">
 			<string> "cont" </string>
 			<bool> False </bool>
+			<string> "times" </string>
+			<real_array  len="1"> 				3.47394 </real_array>
 			<string> "transitions" </string>
 			<real_array  len="1"> 				1 </real_array>
 			<string> "values" </string>
 			<array  len="1" shared="false">
 				<bool> True </bool>
 			</array>
-			<string> "times" </string>
-			<real_array  len="1"> 				3.47394 </real_array>
 		</dictionary>
 		<string name="tracks/3/type"> "method" </string>
 		<node_path name="tracks/3/path"> "." </node_path>
 		<int name="tracks/3/interp"> 1 </int>
 		<dictionary name="tracks/3/keys" shared="false">
+			<string> "times" </string>
+			<real_array  len="2"> 				3.20357, 5.07305 </real_array>
 			<string> "transitions" </string>
 			<real_array  len="2"> 				1, 1 </real_array>
 			<string> "values" </string>
@@ -115,36 +144,12 @@
 					<string> "_die" </string>
 				</dictionary>
 			</array>
-			<string> "times" </string>
-			<real_array  len="2"> 				3.20357, 5.07305 </real_array>
 		</dictionary>
 
 	</resource>
-	<resource type="Animation" path="local://4">
-		<string name="resource/name"> "walk" </string>
-		<real name="length"> 1.25 </real>
-		<bool name="loop"> True </bool>
-		<real name="step"> 0.25 </real>
-		<string name="tracks/0/type"> "value" </string>
-		<node_path name="tracks/0/path"> "sprite:frame" </node_path>
-		<int name="tracks/0/interp"> 1 </int>
-		<dictionary name="tracks/0/keys" shared="false">
-			<string> "cont" </string>
-			<bool> False </bool>
-			<string> "transitions" </string>
-			<real_array  len="6"> 				1, 1, 1, 1, 1, 1 </real_array>
-			<string> "values" </string>
-			<array  len="6" shared="false">
-				<int> 0 </int>
-				<int> 1 </int>
-				<int> 2 </int>
-				<int> 3 </int>
-				<int> 4 </int>
-				<int> 0 </int>
-			</array>
-			<string> "times" </string>
-			<real_array  len="6"> 				0, 0.25, 0.5, 0.75, 1, 1.25 </real_array>
-		</dictionary>
+	<resource type="ColorRamp" path="local://6">
+		<real_array name="offsets" len="2"> 			0, 1 </real_array>
+		<color_array name="colors" len="2"> 			1, 0.884956, 0.823009, 1, 0.768627, 0.389381, 0, 0 </color_array>
 
 	</resource>
 	<resource type="SampleLibrary" path="local://5">
@@ -154,7 +159,7 @@
 			<string> "pitch" </string>
 			<real> 1 </real>
 			<string> "sample" </string>
-			<resource  resource_type="Sample" path="res://sound_explode.*">  </resource>
+			<resource  external="3">  </resource>
 		</dictionary>
 		<dictionary name="samples/hit" shared="false">
 			<string> "db" </string>
@@ -162,24 +167,21 @@
 			<string> "pitch" </string>
 			<real> 1 </real>
 			<string> "sample" </string>
-			<resource  resource_type="Sample" path="res://sound_hit.*">  </resource>
+			<resource  external="4">  </resource>
 		</dictionary>
 
 	</resource>
 	<main_resource>
 		<dictionary name="_bundled" shared="false">
+			<string> "conn_count" </string>
+			<int> 0 </int>
+			<string> "conns" </string>
+			<int_array  len="0"> 				 </int_array>
 			<string> "names" </string>
-			<string_array  len="132">
+			<string_array  len="107">
 				<string> "enemy" </string>
 				<string> "RigidBody2D" </string>
-				<string> "visibility/visible" </string>
-				<string> "visibility/opacity" </string>
-				<string> "visibility/self_opacity" </string>
-				<string> "visibility/on_top" </string>
-				<string> "transform/pos" </string>
-				<string> "transform/rot" </string>
-				<string> "transform/scale" </string>
-				<string> "shape_count" </string>
+				<string> "input/pickable" </string>
 				<string> "shapes/0/shape" </string>
 				<string> "shapes/0/transform" </string>
 				<string> "shapes/0/trigger" </string>
@@ -189,71 +191,71 @@
 				<string> "shapes/2/shape" </string>
 				<string> "shapes/2/transform" </string>
 				<string> "shapes/2/trigger" </string>
+				<string> "collision/layers" </string>
+				<string> "collision/mask" </string>
 				<string> "mode" </string>
 				<string> "mass" </string>
 				<string> "friction" </string>
 				<string> "bounce" </string>
+				<string> "gravity_scale" </string>
 				<string> "custom_integrator" </string>
 				<string> "continuous_cd" </string>
 				<string> "contacts_reported" </string>
 				<string> "contact_monitor" </string>
-				<string> "active" </string>
+				<string> "sleeping" </string>
 				<string> "can_sleep" </string>
 				<string> "velocity/linear" </string>
 				<string> "velocity/angular" </string>
+				<string> "damp_override/linear" </string>
+				<string> "damp_override/angular" </string>
 				<string> "script/script" </string>
 				<string> "__meta__" </string>
 				<string> "enabler" </string>
 				<string> "VisibilityEnabler2D" </string>
+				<string> "transform/pos" </string>
+				<string> "transform/scale" </string>
 				<string> "rect" </string>
 				<string> "enabler/pause_animations" </string>
 				<string> "enabler/freeze_bodies" </string>
+				<string> "enabler/pause_particles" </string>
+				<string> "enabler/process_parent" </string>
+				<string> "enabler/fixed_process_parent" </string>
 				<string> "anim" </string>
 				<string> "AnimationPlayer" </string>
 				<string> "playback/process_mode" </string>
 				<string> "playback/default_blend_time" </string>
 				<string> "root/root" </string>
 				<string> "anims/idle" </string>
-				<string> "anims/explode" </string>
 				<string> "anims/walk" </string>
+				<string> "anims/explode" </string>
 				<string> "playback/active" </string>
 				<string> "playback/speed" </string>
 				<string> "blend_times" </string>
 				<string> "autoplay" </string>
-				<string> "CollisionShape2D" </string>
-				<string> "shape" </string>
-				<string> "trigger" </string>
-				<string> "CollisionShape2D 2" </string>
-				<string> "CollisionShape2D 3" </string>
 				<string> "sprite" </string>
 				<string> "Sprite" </string>
 				<string> "texture" </string>
-				<string> "centered" </string>
-				<string> "offset" </string>
-				<string> "flip_h" </string>
-				<string> "flip_v" </string>
-				<string> "vframes" </string>
 				<string> "hframes" </string>
 				<string> "frame" </string>
-				<string> "modulate" </string>
-				<string> "region" </string>
-				<string> "region_rect" </string>
+				<string> "CollisionShape2D" </string>
+				<string> "shape" </string>
+				<string> "trigger" </string>
+				<string> "_update_shape_index" </string>
+				<string> "CollisionShape2D 2" </string>
+				<string> "CollisionShape2D 3" </string>
 				<string> "raycast_left" </string>
 				<string> "RayCast2D" </string>
 				<string> "enabled" </string>
 				<string> "cast_to" </string>
+				<string> "layer_mask" </string>
 				<string> "raycast_right" </string>
 				<string> "Particles2D" </string>
+				<string> "visibility/self_opacity" </string>
 				<string> "visibility/blend_mode" </string>
 				<string> "config/amount" </string>
 				<string> "config/lifetime" </string>
-				<string> "config/time_scale" </string>
-				<string> "config/preprocess" </string>
 				<string> "config/emit_timeout" </string>
 				<string> "config/emitting" </string>
-				<string> "config/offset" </string>
-				<string> "config/half_extents" </string>
-				<string> "config/local_space" </string>
 				<string> "config/explosiveness" </string>
 				<string> "config/texture" </string>
 				<string> "params/direction" </string>
@@ -266,32 +268,14 @@
 				<string> "params/radial_accel" </string>
 				<string> "params/tangential_accel" </string>
 				<string> "params/damping" </string>
+				<string> "params/initial_angle" </string>
 				<string> "params/initial_size" </string>
 				<string> "params/final_size" </string>
 				<string> "params/hue_variation" </string>
-				<string> "randomness/direction" </string>
-				<string> "randomness/spread" </string>
-				<string> "randomness/linear_velocity" </string>
+				<string> "params/anim_speed_scale" </string>
+				<string> "params/anim_initial_pos" </string>
 				<string> "randomness/spin_velocity" </string>
-				<string> "randomness/orbit_velocity" </string>
-				<string> "randomness/gravity_direction" </string>
-				<string> "randomness/gravity_strength" </string>
-				<string> "randomness/radial_accel" </string>
-				<string> "randomness/tangential_accel" </string>
-				<string> "randomness/damping" </string>
-				<string> "randomness/initial_size" </string>
-				<string> "randomness/final_size" </string>
-				<string> "randomness/hue_variation" </string>
-				<string> "color_phases/count" </string>
-				<string> "phase_0/pos" </string>
-				<string> "phase_0/color" </string>
-				<string> "phase_1/pos" </string>
-				<string> "phase_1/color" </string>
-				<string> "phase_2/pos" </string>
-				<string> "phase_2/color" </string>
-				<string> "phase_3/pos" </string>
-				<string> "phase_3/color" </string>
-				<string> "emission_points" </string>
+				<string> "color/color_ramp" </string>
 				<string> "sound" </string>
 				<string> "SamplePlayer2D" </string>
 				<string> "params/volume_db" </string>
@@ -303,125 +287,154 @@
 				<string> "config/samples" </string>
 				<string> "config/pitch_random" </string>
 			</string_array>
-			<string> "version" </string>
-			<int> 1 </int>
-			<string> "conn_count" </string>
-			<int> 0 </int>
 			<string> "node_count" </string>
 			<int> 11 </int>
+			<string> "nodes" </string>
+			<int_array  len="285"> 				-1, -1, 1, 0, -1, 29, 2, 0, 3, 1, 4, 2, 5, 0, 6, 1, 7, 3, 8, 0, 9, 1, 10, 4, 11, 0, 12, 5, 13, 5, 14, 6, 15, 7, 16, 8, 17, 8, 18, 7, 19, 0, 20, 9, 21, 10, 22, 0, 23, 0, 24, 11, 25, 12, 26, 8, 27, 13, 28, 13, 29, 14, 30, 15, 0, 0, 0, 32, 31, -1, 8, 33, 16, 34, 17, 35, 18, 36, 11, 37, 11, 38, 11, 39, 0, 40, 0, 0, 0, 0, 42, 41, -1, 10, 43, 5, 44, 8, 45, 19, 46, 20, 47, 21, 48, 22, 49, 11, 50, 23, 51, 24, 52, 25, 0, 0, 0, 54, 53, -1, 3, 55, 26, 56, 27, 57, 10, 0, 0, 0, 58, 58, -1, 4, 33, 28, 59, 1, 60, 0, 61, 29, 0, 0, 0, 58, 62, -1, 4, 33, 30, 59, 1, 60, 0, 61, 29, 0, 0, 0, 58, 63, -1, 4, 33, 31, 59, 1, 60, 0, 61, 29, 0, 0, 0, 65, 64, -1, 4, 33, 32, 66, 11, 67, 33, 68, 5, 0, 0, 0, 65, 69, -1, 4, 33, 34, 66, 11, 67, 33, 68, 5, 0, 0, 0, 70, 70, -1, 26, 71, 35, 72, 5, 73, 36, 74, 37, 75, 37, 76, 0, 77, 38, 78, 39, 79, 8, 80, 40, 81, 41, 82, 42, 83, 8, 84, 8, 85, 43, 86, 8, 87, 8, 88, 8, 89, 8, 90, 42, 91, 23, 92, 8, 93, 7, 94, 8, 95, 7, 96, 44, 0, 0, 0, 98, 97, -1, 8, 99, 8, 100, 7, 101, 7, 102, 45, 103, 7, 104, 46, 105, 47, 106, 8, 0 </int_array>
 			<string> "variants" </string>
-			<array  len="51" shared="false">
-				<bool> True </bool>
-				<real> 1 </real>
-				<vector2> 0, 0 </vector2>
-				<real> 0 </real>
-				<vector2> 1, 1 </vector2>
-				<int> 3 </int>
+			<array  len="48" shared="false">
+				<bool> False </bool>
 				<resource  resource_type="Shape2D" path="local://1">  </resource>
 				<matrix32> 1, -0, 0, 1, -1.08072, -2.16144 </matrix32>
-				<bool> False </bool>
 				<matrix32> 1, -0, 0, 1, 6.48431, 3.24216 </matrix32>
 				<matrix32> 1, -0, 0, 1, -12.495, 3.53415 </matrix32>
+				<int> 1 </int>
 				<int> 2 </int>
+				<real> 1 </real>
+				<real> 0 </real>
+				<int> 0 </int>
 				<int> 4 </int>
-				<resource  resource_type="Script" path="res://enemy.*">  </resource>
+				<bool> True </bool>
+				<vector2> 0, 0 </vector2>
+				<real> -1 </real>
+				<resource  external="0">  </resource>
 				<dictionary  shared="false">
+					<string> "__editor_plugin_screen__" </string>
+					<string> "2D" </string>
 					<string> "__editor_plugin_states__" </string>
 					<dictionary  shared="false">
-						<string> "Script" </string>
-						<dictionary  shared="false">
-							<string> "current" </string>
-							<int> 0 </int>
-							<string> "sources" </string>
-							<array  len="1" shared="false">
-								<string> "res://enemy.gd" </string>
-							</array>
-						</dictionary>
 						<string> "2D" </string>
 						<dictionary  shared="false">
-							<string> "pixel_snap" </string>
+							<string> "ofs" </string>
+							<vector2> -227.625, -197.9 </vector2>
+							<string> "snap_grid" </string>
+							<bool> False </bool>
+							<string> "snap_offset" </string>
+							<vector2> 0, 0 </vector2>
+							<string> "snap_pixel" </string>
+							<bool> False </bool>
+							<string> "snap_relative" </string>
+							<bool> False </bool>
+							<string> "snap_rotation" </string>
 							<bool> False </bool>
+							<string> "snap_rotation_offset" </string>
+							<real> 0 </real>
+							<string> "snap_rotation_step" </string>
+							<real> 0.261799 </real>
+							<string> "snap_show_grid" </string>
+							<bool> False </bool>
+							<string> "snap_step" </string>
+							<vector2> 10, 10 </vector2>
 							<string> "zoom" </string>
 							<real> 1.108033 </real>
-							<string> "ofs" </string>
-							<vector2> -227.625, -197.9 </vector2>
 						</dictionary>
 						<string> "3D" </string>
 						<dictionary  shared="false">
-							<string> "zfar" </string>
-							<real> 500 </real>
+							<string> "ambient_light_color" </string>
+							<color> 0.15, 0.15, 0.15, 1 </color>
+							<string> "default_light" </string>
+							<bool> True </bool>
+							<string> "default_srgb" </string>
+							<bool> False </bool>
+							<string> "deflight_rot_x" </string>
+							<real> 0.942478 </real>
+							<string> "deflight_rot_y" </string>
+							<real> 0.628319 </real>
 							<string> "fov" </string>
 							<real> 45 </real>
+							<string> "show_grid" </string>
+							<bool> True </bool>
+							<string> "show_origin" </string>
+							<bool> True </bool>
+							<string> "viewport_mode" </string>
+							<int> 1 </int>
 							<string> "viewports" </string>
 							<array  len="4" shared="false">
 								<dictionary  shared="false">
 									<string> "distance" </string>
 									<real> 4 </real>
+									<string> "listener" </string>
+									<bool> True </bool>
+									<string> "pos" </string>
+									<vector3> 0, 0, 0 </vector3>
+									<string> "use_environment" </string>
+									<bool> False </bool>
+									<string> "use_orthogonal" </string>
+									<bool> False </bool>
 									<string> "x_rot" </string>
 									<real> 0 </real>
 									<string> "y_rot" </string>
 									<real> 0 </real>
-									<string> "use_orthogonal" </string>
-									<bool> False </bool>
-									<string> "use_environment" </string>
-									<bool> False </bool>
-									<string> "pos" </string>
-									<vector3> 0, 0, 0 </vector3>
 								</dictionary>
 								<dictionary  shared="false">
 									<string> "distance" </string>
 									<real> 4 </real>
+									<string> "listener" </string>
+									<bool> False </bool>
+									<string> "pos" </string>
+									<vector3> 0, 0, 0 </vector3>
+									<string> "use_environment" </string>
+									<bool> False </bool>
+									<string> "use_orthogonal" </string>
+									<bool> False </bool>
 									<string> "x_rot" </string>
 									<real> 0 </real>
 									<string> "y_rot" </string>
 									<real> 0 </real>
-									<string> "use_orthogonal" </string>
-									<bool> False </bool>
-									<string> "use_environment" </string>
-									<bool> False </bool>
-									<string> "pos" </string>
-									<vector3> 0, 0, 0 </vector3>
 								</dictionary>
 								<dictionary  shared="false">
 									<string> "distance" </string>
 									<real> 4 </real>
+									<string> "listener" </string>
+									<bool> False </bool>
+									<string> "pos" </string>
+									<vector3> 0, 0, 0 </vector3>
+									<string> "use_environment" </string>
+									<bool> False </bool>
+									<string> "use_orthogonal" </string>
+									<bool> False </bool>
 									<string> "x_rot" </string>
 									<real> 0 </real>
 									<string> "y_rot" </string>
 									<real> 0 </real>
-									<string> "use_orthogonal" </string>
-									<bool> False </bool>
-									<string> "use_environment" </string>
-									<bool> False </bool>
-									<string> "pos" </string>
-									<vector3> 0, 0, 0 </vector3>
 								</dictionary>
 								<dictionary  shared="false">
 									<string> "distance" </string>
 									<real> 4 </real>
+									<string> "listener" </string>
+									<bool> False </bool>
+									<string> "pos" </string>
+									<vector3> 0, 0, 0 </vector3>
+									<string> "use_environment" </string>
+									<bool> False </bool>
+									<string> "use_orthogonal" </string>
+									<bool> False </bool>
 									<string> "x_rot" </string>
 									<real> 0 </real>
 									<string> "y_rot" </string>
 									<real> 0 </real>
-									<string> "use_orthogonal" </string>
-									<bool> False </bool>
-									<string> "use_environment" </string>
-									<bool> False </bool>
-									<string> "pos" </string>
-									<vector3> 0, 0, 0 </vector3>
 								</dictionary>
 							</array>
-							<string> "viewport_mode" </string>
-							<int> 1 </int>
-							<string> "default_light" </string>
-							<bool> True </bool>
-							<string> "show_grid" </string>
-							<bool> True </bool>
-							<string> "show_origin" </string>
-							<bool> True </bool>
+							<string> "zfar" </string>
+							<real> 500 </real>
 							<string> "znear" </string>
 							<real> 0.1 </real>
 						</dictionary>
+						<string> "Anim" </string>
+						<dictionary  shared="false">
+							<string> "visible" </string>
+							<bool> False </bool>
+						</dictionary>
 					</dictionary>
 					<string> "__editor_run_settings__" </string>
 					<dictionary  shared="false">
@@ -430,28 +443,24 @@
 						<string> "run_mode" </string>
 						<int> 0 </int>
 					</dictionary>
-					<string> "__editor_plugin_screen__" </string>
-					<string> "2D" </string>
 				</dictionary>
 				<vector2> 16.2569, 11.0034 </vector2>
 				<vector2> 23.5056, 10.8629 </vector2>
 				<rect2> -10, -10, 20, 20 </rect2>
-				<int> 1 </int>
 				<node_path> ".." </node_path>
 				<resource  resource_type="Animation" path="local://2">  </resource>
-				<resource  resource_type="Animation" path="local://3">  </resource>
 				<resource  resource_type="Animation" path="local://4">  </resource>
+				<resource  resource_type="Animation" path="local://3">  </resource>
 				<real> 3 </real>
 				<array  len="0" shared="false">
 				</array>
 				<string> "" </string>
+				<resource  external="1">  </resource>
+				<int> 8 </int>
 				<vector2> -1.08072, -2.16144 </vector2>
+				<int> -1 </int>
 				<vector2> 6.48431, 3.24216 </vector2>
 				<vector2> -12.495, 3.53415 </vector2>
-				<resource  resource_type="Texture" path="res://enemy.*">  </resource>
-				<int> 8 </int>
-				<color> 1, 1, 1, 1 </color>
-				<rect2> 0, 0, 0, 0 </rect2>
 				<vector2> -33.2868, -9.34363 </vector2>
 				<vector2> 0, 45 </vector2>
 				<vector2> 29.1987, -9.34363 </vector2>
@@ -459,22 +468,18 @@
 				<int> 32 </int>
 				<real> 0.5 </real>
 				<real> 0.1 </real>
-				<resource  resource_type="Texture" path="res://bullet.*">  </resource>
+				<resource  external="2">  </resource>
 				<real> 180 </real>
 				<real> 90 </real>
 				<real> 2 </real>
 				<real> 9.8 </real>
-				<color> 1, 0.884956, 0.823009, 1 </color>
-				<color> 0.768627, 0.389381, 0, 0 </color>
-				<color> 0, 0, 0, 1 </color>
-				<vector2_array  len="0"> 					 </vector2_array>
+				<resource  resource_type="ColorRamp" path="local://6">  </resource>
 				<real> 2048 </real>
+				<int> 3 </int>
 				<resource  resource_type="SampleLibrary" path="local://5">  </resource>
 			</array>
-			<string> "nodes" </string>
-			<int_array  len="445"> 				-1, -1, 1, 0, -1, 31, 2, 0, 3, 1, 4, 1, 5, 0, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, 8, 13, 6, 14, 9, 15, 8, 16, 6, 17, 10, 18, 8, 19, 11, 20, 1, 21, 3, 22, 3, 23, 8, 24, 8, 25, 12, 26, 8, 27, 0, 28, 0, 29, 2, 30, 3, 31, 13, 32, 14, 0, 0, 0, 34, 33, -1, 10, 2, 0, 3, 1, 4, 1, 5, 0, 6, 15, 7, 3, 8, 16, 35, 17, 36, 0, 37, 0, 0, 0, 0, 39, 38, -1, 10, 40, 18, 41, 3, 42, 19, 43, 20, 44, 21, 45, 22, 46, 0, 47, 23, 48, 24, 49, 25, 0, 0, 0, 50, 50, -1, 9, 2, 0, 3, 1, 4, 1, 5, 0, 6, 26, 7, 3, 8, 4, 51, 6, 52, 8, 0, 0, 0, 50, 53, -1, 9, 2, 0, 3, 1, 4, 1, 5, 0, 6, 27, 7, 3, 8, 4, 51, 6, 52, 8, 0, 0, 0, 50, 54, -1, 9, 2, 0, 3, 1, 4, 1, 5, 0, 6, 28, 7, 3, 8, 4, 51, 6, 52, 8, 0, 0, 0, 56, 55, -1, 18, 2, 0, 3, 1, 4, 1, 5, 0, 6, 2, 7, 3, 8, 4, 57, 29, 58, 0, 59, 2, 60, 8, 61, 8, 62, 18, 63, 30, 64, 12, 65, 31, 66, 8, 67, 32, 0, 0, 0, 69, 68, -1, 9, 2, 0, 3, 1, 4, 1, 5, 0, 6, 33, 7, 3, 8, 4, 70, 0, 71, 34, 0, 0, 0, 69, 72, -1, 9, 2, 0, 3, 1, 4, 1, 5, 0, 6, 35, 7, 3, 8, 4, 70, 0, 71, 34, 0, 0, 0, 73, 73, -1, 55, 2, 0, 3, 1, 4, 36, 5, 0, 74, 18, 6, 2, 7, 3, 8, 4, 75, 37, 76, 38, 77, 1, 78, 3, 79, 38, 80, 8, 81, 2, 82, 2, 83, 0, 84, 39, 85, 40, 86, 3, 87, 41, 88, 42, 89, 43, 90, 3, 91, 3, 92, 44, 93, 3, 94, 3, 95, 3, 96, 43, 97, 23, 98, 3, 99, 3, 100, 3, 101, 3, 102, 1, 103, 3, 104, 3, 105, 3, 106, 3, 107, 3, 108, 3, 109, 3, 110, 3, 111, 3, 112, 11, 113, 3, 114, 45, 115, 1, 116, 46, 117, 1, 118, 47, 119, 1, 120, 47, 121, 48, 0, 0, 0, 123, 122, -1, 15, 2, 0, 3, 1, 4, 1, 5, 0, 6, 2, 7, 3, 8, 4, 124, 3, 125, 1, 126, 1, 127, 49, 128, 1, 129, 5, 130, 50, 131, 3, 0 </int_array>
-			<string> "conns" </string>
-			<int_array  len="0"> 				 </int_array>
+			<string> "version" </string>
+			<int> 1 </int>
 		</dictionary>
 
 	</main_resource>

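A note on the rewritten enemy.xml above: wildcard references such as res://enemy.* are gone from the variant table; the saved scene now points into an external-resource table by index (external="0", external="1", ...), and dictionary keys are emitted in sorted order, presumably so re-saving a scene produces stable diffs. A minimal sketch of resolving such an indexed reference, assuming the loader has already parsed the table of external paths (resolve_external and ext_paths are illustrative, not engine API):

    // Illustrative only: map a <resource external="N"> tag back to a
    // loadable path through the file's external-resource table.
    RES resolve_external(const Vector<String> &ext_paths, int idx, Error *r_error) {

        if (idx < 0 || idx >= ext_paths.size()) {
            if (r_error)
                *r_error = ERR_FILE_CORRUPT; // an out-of-range index means a damaged file
            return RES();
        }
        return ResourceLoader::load(ext_paths[idx]); // delegate to the normal loader
    }
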
+ 3 - 3
demos/2d/platformer/engine.cfg

@@ -4,14 +4,14 @@ name="Platformer"
 main_scene="res://stage.xml"
 icon="res://icon.png"
 name_es="Plataformero"
+target_fps="60"
 
 [display]
 
 width=800
 height=480
-stretch_2d=false
-stretch_mode="viewport"
-stretch_aspect="keep"
+stretch_mode="2d"
+stretch_aspect="keep_height"
 
 [image_loader]
 
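The stretch settings above replace the old stretch_2d boolean: stretch_mode="2d" scales the rendered 2D output to the window, and stretch_aspect="keep_height" preserves the 480 px design height while letting the width vary (target_fps="60" additionally caps the frame rate). As a worked example of the arithmetic only, not engine code, the scale factor under keep_height comes from the window height alone:

    // Sketch: scale factor for stretch_mode="2d" with stretch_aspect="keep_height".
    // Only the vertical ratio matters; the width is free to grow or shrink.
    float stretch_scale_keep_height(int window_h, int design_h /* 480 in this project */) {
        return (float)window_h / (float)design_h; // e.g. 960 / 480 = 2.0
    }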

File diff suppressed because it is too large
+ 157 - 191
demos/2d/platformer/player.xml


File diff suppressed because it is too large
+ 4 - 20
demos/2d/platformer/stage.xml


+ 1 - 0
demos/2d/platformer/tiles_demo.png.flags

@@ -0,0 +1 @@
+filter=false

+ 2 - 1
demos/3d/platformer/enemy.gd

@@ -91,4 +91,5 @@ func _ready():
 	# Initialization here
 	pass
 
-
+func _die():
+	queue_free()

BIN
demos/3d/platformer/enemy.scn


+ 3 - 3
demos/3d/platformer/player.gd

@@ -69,9 +69,9 @@ func _integrate_forces( state ):
 	var lv = state.get_linear_velocity() # linear velocity
 	var g = state.get_total_gravity()
 	var delta = state.get_step()
-	var d = 1.0 - delta*state.get_total_density()
-	if (d<0):
-		d=0
+#	var d = 1.0 - delta*state.get_total_density()
+#	if (d<0):
+#		d=0
 	lv += g * delta #apply gravity
 
 	var anim = ANIM_FLOOR

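The commented-out lines above disable the density-based drag on the player body; within this hunk only the factor d was ever computed, and with the block gone the linear velocity keeps its full magnitude between steps. For reference, the arithmetic the disabled code performed, expressed as a standalone sketch:

    // Sketch of the factor the commented-out GDScript computed:
    // d = 1 - delta * density, clamped at zero so that drag can shrink
    // a velocity but never reverse it.
    float density_damping_factor(float delta, float density) {
        float d = 1.0f - delta * density;
        return d < 0.0f ? 0.0f : d; // e.g. delta = 1/60, density = 1  ->  ~0.9833
    }
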
File diff suppressed because it is too large
+ 190 - 182
doc/base/classes.xml


File diff suppressed because it is too large
+ 123 - 123
doc/engine_classes.xml


+ 5 - 3
drivers/SCsub

@@ -32,15 +32,17 @@ SConscript("rtaudio/SCsub");
 SConscript("nedmalloc/SCsub");
 SConscript("nrex/SCsub");
 SConscript("chibi/SCsub");
-if (env["vorbis"]=="yes" or env["speex"]=="yes" or env["theora"]=="yes"):
+if (env["vorbis"]=="yes" or env["speex"]=="yes" or env["theora"]=="yes" or env["opus"]=="yes"):
         SConscript("ogg/SCsub");
 if (env["vorbis"]=="yes"):
         SConscript("vorbis/SCsub");
+if (env["opus"]=="yes"):
+		SConscript('opus/SCsub');
 if (env["tools"]=="yes"):
 	SConscript("convex_decomp/SCsub");
 
-if env["theora"]=="yes":
-	SConscript("theoraplayer/SCsub")
+#if env["theora"]=="yes":
+#	SConscript("theoraplayer/SCsub")
 if (env["theora"]=="yes"):
 	SConscript("theora/SCsub");
 if (env['speex']=='yes'):

+ 12 - 1
drivers/chibi/event_stream_chibi.cpp

@@ -779,8 +779,10 @@ EventStreamChibi::EventStreamChibi() {
 
 
 
-RES ResourceFormatLoaderChibi::load(const String &p_path,const String& p_original_path) {
+RES ResourceFormatLoaderChibi::load(const String &p_path, const String& p_original_path, Error *r_error) {
 
+	if (r_error)
+		*r_error=ERR_FILE_CANT_OPEN;
 	String el = p_path.extension().to_lower();
 
 	CPFileAccessWrapperImpl f;
@@ -791,6 +793,8 @@ RES ResourceFormatLoaderChibi::load(const String &p_path,const String& p_origina
 		CPLoader_IT loader(&f);
 		CPLoader::Error err = loader.load_song(p_path.utf8().get_data(),&esc->song,false);
 		ERR_FAIL_COND_V(err!=CPLoader::FILE_OK,RES());
+		if (r_error)
+			*r_error=OK;
 
 		return esc;
 
@@ -800,6 +804,8 @@ RES ResourceFormatLoaderChibi::load(const String &p_path,const String& p_origina
 		CPLoader_XM loader(&f);
 		CPLoader::Error err=loader.load_song(p_path.utf8().get_data(),&esc->song,false);
 		ERR_FAIL_COND_V(err!=CPLoader::FILE_OK,RES());
+		if (r_error)
+			*r_error=OK;
 		return esc;
 
 	} else if (el=="s3m") {
@@ -808,6 +814,9 @@ RES ResourceFormatLoaderChibi::load(const String &p_path,const String& p_origina
 		CPLoader_S3M loader(&f);
 		CPLoader::Error err=loader.load_song(p_path.utf8().get_data(),&esc->song,false);
 		ERR_FAIL_COND_V(err!=CPLoader::FILE_OK,RES());
+		if (r_error)
+			*r_error=OK;
+
 		return esc;
 
 	} else if (el=="mod") {
@@ -816,6 +825,8 @@ RES ResourceFormatLoaderChibi::load(const String &p_path,const String& p_origina
 		CPLoader_MOD loader(&f);
 		CPLoader::Error err=loader.load_song(p_path.utf8().get_data(),&esc->song,false);
 		ERR_FAIL_COND_V(err!=CPLoader::FILE_OK,RES());
+		if (r_error)
+			*r_error=OK;
 		return esc;
 	}
 

+ 1 - 1
drivers/chibi/event_stream_chibi.h

@@ -301,7 +301,7 @@ public:
 class ResourceFormatLoaderChibi : public ResourceFormatLoader {
 
 public:
-	virtual RES load(const String &p_path,const String& p_original_path="");
+	virtual RES load(const String &p_path,const String& p_original_path="",Error *r_error=NULL);
 	virtual void get_recognized_extensions(List<String> *p_extensions) const;
 	virtual bool handles_type(const String& p_type) const;
 	virtual String get_resource_type(const String &p_path) const;

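The widened virtual above is the contract every loader in this commit now implements: r_error defaults to NULL so existing call sites compile unchanged, and implementations write a pessimistic error code first, refining it as the load progresses. A minimal sketch of a conforming loader; MyFormatLoader and load_my_format are hypothetical, only the load() signature comes from the header above, and the remaining pure virtuals are omitted for brevity:

    class MyFormatLoader : public ResourceFormatLoader {
    public:
        virtual RES load(const String &p_path, const String &p_original_path = "", Error *r_error = NULL) {

            if (r_error)
                *r_error = ERR_FILE_CANT_OPEN; // pessimistic default, as in the chibi loader
            FileAccess *f = FileAccess::open(p_path, FileAccess::READ);
            if (!f)
                return RES(); // *r_error still explains why
            FileAccessRef fref(f); // closes the file on scope exit, as in the DDS loader

            RES res = load_my_format(f); // hypothetical format-specific parse
            if (r_error)
                *r_error = res.is_null() ? ERR_FILE_CORRUPT : OK; // only OK once everything succeeded
            return res;
        }
        // get_recognized_extensions(), handles_type(), get_resource_type() omitted.
    };
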
+ 2 - 1
drivers/convex_decomp/b2Glue.h

@@ -20,7 +20,8 @@
 #define B2GLUE_H
 
 #include "math_2d.h"
-#include <limits>
+#include <limits.h>
+
 namespace b2ConvexDecomp {
 
 typedef real_t float32;

+ 2 - 2
drivers/convex_decomp/b2Polygon.cpp

@@ -21,8 +21,8 @@
 #include "b2Triangle.h"
 #include "b2Polygon.h"
 
-#include <cmath>
-#include <climits>
+#include <math.h>
+#include <limits.h>
 #include <assert.h>
 #define b2Assert assert
 

+ 1 - 1
drivers/convex_decomp/b2Polygon.h

@@ -22,7 +22,7 @@
 #include "b2Triangle.h"
 #include "stdio.h"
 #include <string.h>
-#include <limits>
+#include <limits.h>
 namespace b2ConvexDecomp {
 
 static bool B2_POLYGON_REPORT_ERRORS = false;

+ 9 - 1
drivers/dds/texture_loader_dds.cpp

@@ -64,8 +64,10 @@ static const DDSFormatInfo dds_format_info[DDS_MAX]={
 };
 
 
-RES ResourceFormatDDS::load(const String &p_path,const String& p_original_path) {
+RES ResourceFormatDDS::load(const String &p_path, const String& p_original_path, Error *r_error) {
 
+	if (r_error)
+		*r_error=ERR_CANT_OPEN;
 
 	Error err;
 	FileAccess *f = FileAccess::open(p_path,FileAccess::READ,&err);
@@ -73,6 +75,8 @@ RES ResourceFormatDDS::load(const String &p_path,const String& p_original_path)
 		return RES();
 
 	FileAccessRef fref(f);
+	if (r_error)
+		*r_error=ERR_FILE_CORRUPT;
 
 	ERR_EXPLAIN("Unable to open DDS texture file: "+p_path);
 	ERR_FAIL_COND_V(err!=OK,RES());
@@ -427,6 +431,10 @@ RES ResourceFormatDDS::load(const String &p_path,const String& p_original_path)
 	Ref<ImageTexture> texture = memnew( ImageTexture );
 	texture->create_from_image(img);
 
+	if (r_error)
+		*r_error=OK;
+
+
 	return texture;
 
 }

+ 1 - 1
drivers/dds/texture_loader_dds.h

@@ -7,7 +7,7 @@
 class ResourceFormatDDS : public ResourceFormatLoader{
 public:
 
-	virtual RES load(const String &p_path,const String& p_original_path="");
+	virtual RES load(const String &p_path,const String& p_original_path="",Error *r_error=NULL);
 	virtual void get_recognized_extensions(List<String> *p_extensions) const;
 	virtual bool handles_type(const String& p_type) const;
 	virtual String get_resource_type(const String &p_path) const;

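Seen from the caller's side, the DDS changes above mean the returned RES alone no longer tells you why a load failed; the out-parameter does. A hedged usage sketch, with the path purely illustrative:

    Error err = OK;
    RES tex = ResourceFormatDDS().load("res://sky.dds", "res://sky.dds", &err);
    if (tex.is_null()) {
        // err now distinguishes a missing file (ERR_CANT_OPEN)
        // from a malformed one (ERR_FILE_CORRUPT).
        print_line("DDS load failed, error " + itos(err));
    }
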
+ 2454 - 2454
drivers/etc1/rg_etc1.cpp

@@ -1,2454 +1,2454 @@
-// File: rg_etc1.cpp - Fast, high quality ETC1 block packer/unpacker - Rich Geldreich <[email protected]>
-// Please see ZLIB license at the end of rg_etc1.h.
-//
-// For more information Ericsson Texture Compression (ETC/ETC1), see:
-// http://www.khronos.org/registry/gles/extensions/OES/OES_compressed_ETC1_RGB8_texture.txt
-//
-// v1.03 - 5/12/13 - Initial public release
-#include "rg_etc1.h"
-
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-//#include <stdio.h>
-#include <math.h>
-#include <stdio.h>
-#pragma warning (disable: 4201) //  nonstandard extension used : nameless struct/union
-
-#if defined(_DEBUG) || defined(DEBUG)
-#define RG_ETC1_BUILD_DEBUG
-#endif
-
-#define RG_ETC1_ASSERT assert
-
-namespace rg_etc1
-{
-
-   inline long labs(long val) {
-        return val < 0 ? -val : val;
-   }
-
-   inline int intabs(int val) {
-
-       return val<0?-val:val;
-   }
-
-   typedef unsigned char uint8;
-   typedef unsigned short uint16;
-   typedef unsigned int uint;
-   typedef unsigned int uint32;
-   typedef long long int64;
-   typedef unsigned long long uint64;
-
-   const uint32 cUINT32_MAX = 0xFFFFFFFFU;
-   const uint64 cUINT64_MAX = 0xFFFFFFFFFFFFFFFFULL; //0xFFFFFFFFFFFFFFFFui64;
-   
-   template<typename T> inline T minimum(T a, T b) { return (a < b) ? a : b; }
-   template<typename T> inline T minimum(T a, T b, T c) { return minimum(minimum(a, b), c); }
-   template<typename T> inline T maximum(T a, T b) { return (a > b) ? a : b; }
-   template<typename T> inline T maximum(T a, T b, T c) { return maximum(maximum(a, b), c); }
-   template<typename T> inline T clamp(T value, T low, T high) { return (value < low) ? low : ((value > high) ? high : value); }
-   template<typename T> inline T square(T value) { return value * value; }
-   template<typename T> inline void zero_object(T& obj) { memset((void*)&obj, 0, sizeof(obj)); }
-   template<typename T> inline void zero_this(T* pObj) { memset((void*)pObj, 0, sizeof(*pObj)); }
-
-   template<class T, size_t N> T decay_array_to_subtype(T (&a)[N]);   
-
-#define RG_ETC1_ARRAY_SIZE(X) (sizeof(X) / sizeof(decay_array_to_subtype(X)))
-
-   enum eNoClamp { cNoClamp };
-
-   struct color_quad_u8
-   {
-      static inline int clamp(int v) { if (v & 0xFFFFFF00U) v = (~(static_cast<int>(v) >> 31)) & 0xFF; return v; }
-
-      struct component_traits { enum { cSigned = false, cFloat = false, cMin = 0U, cMax = 255U }; };
-
-   public:
-      typedef unsigned char component_t;
-      typedef int parameter_t;
-
-      enum { cNumComps = 4 };
-
-      union
-      {
-         struct
-         {
-            component_t r;
-            component_t g;
-            component_t b;
-            component_t a;
-         };
-
-         component_t c[cNumComps];
-
-         uint32 m_u32;
-      };
-
-      inline color_quad_u8()
-      {
-      }
-
-      inline color_quad_u8(const color_quad_u8& other) : m_u32(other.m_u32)
-      {
-      }
-
-      explicit inline color_quad_u8(parameter_t y, parameter_t alpha = component_traits::cMax)
-      {
-         set(y, alpha);
-      }
-
-      inline color_quad_u8(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
-      {
-         set(red, green, blue, alpha);
-      }
-
-      explicit inline color_quad_u8(eNoClamp, parameter_t y, parameter_t alpha = component_traits::cMax)
-      {
-         set_noclamp_y_alpha(y, alpha);
-      }
-
-      inline color_quad_u8(eNoClamp, parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
-      {
-         set_noclamp_rgba(red, green, blue, alpha);
-      }
-
-      inline void clear()
-      {
-         m_u32 = 0;
-      }
-
-      inline color_quad_u8& operator= (const color_quad_u8& other)
-      {
-         m_u32 = other.m_u32;
-         return *this;
-      }
-
-      inline color_quad_u8& set_rgb(const color_quad_u8& other)
-      {
-         r = other.r;
-         g = other.g;
-         b = other.b;
-         return *this;
-      }
-
-      inline color_quad_u8& operator= (parameter_t y)
-      {
-         set(y, component_traits::cMax);
-         return *this;
-      }
-
-      inline color_quad_u8& set(parameter_t y, parameter_t alpha = component_traits::cMax)
-      {
-         y = clamp(y);
-         alpha = clamp(alpha);
-         r = static_cast<component_t>(y);
-         g = static_cast<component_t>(y);
-         b = static_cast<component_t>(y);
-         a = static_cast<component_t>(alpha);
-         return *this;
-      }
-
-      inline color_quad_u8& set_noclamp_y_alpha(parameter_t y, parameter_t alpha = component_traits::cMax)
-      {
-         RG_ETC1_ASSERT( (y >= component_traits::cMin) && (y <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (alpha >= component_traits::cMin) && (alpha <= component_traits::cMax) );
-
-         r = static_cast<component_t>(y);
-         g = static_cast<component_t>(y);
-         b = static_cast<component_t>(y);
-         a = static_cast<component_t>(alpha);
-         return *this;
-      }
-
-      inline color_quad_u8& set(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
-      {
-         r = static_cast<component_t>(clamp(red));
-         g = static_cast<component_t>(clamp(green));
-         b = static_cast<component_t>(clamp(blue));
-         a = static_cast<component_t>(clamp(alpha));
-         return *this;
-      }
-
-      inline color_quad_u8& set_noclamp_rgba(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha)
-      {
-         RG_ETC1_ASSERT( (red >= component_traits::cMin) && (red <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (green >= component_traits::cMin) && (green <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (blue >= component_traits::cMin) && (blue <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (alpha >= component_traits::cMin) && (alpha <= component_traits::cMax) );
-
-         r = static_cast<component_t>(red);
-         g = static_cast<component_t>(green);
-         b = static_cast<component_t>(blue);
-         a = static_cast<component_t>(alpha);
-         return *this;
-      }
-
-      inline color_quad_u8& set_noclamp_rgb(parameter_t red, parameter_t green, parameter_t blue)
-      {
-         RG_ETC1_ASSERT( (red >= component_traits::cMin) && (red <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (green >= component_traits::cMin) && (green <= component_traits::cMax) );
-         RG_ETC1_ASSERT( (blue >= component_traits::cMin) && (blue <= component_traits::cMax) );
-
-         r = static_cast<component_t>(red);
-         g = static_cast<component_t>(green);
-         b = static_cast<component_t>(blue);
-         return *this;
-      }
-
-      static inline parameter_t get_min_comp() { return component_traits::cMin; }
-      static inline parameter_t get_max_comp() { return component_traits::cMax; }
-      static inline bool get_comps_are_signed() { return component_traits::cSigned; }
-
-      inline component_t operator[] (uint i) const { RG_ETC1_ASSERT(i < cNumComps); return c[i]; }
-      inline component_t& operator[] (uint i) { RG_ETC1_ASSERT(i < cNumComps); return c[i]; }
-
-      inline color_quad_u8& set_component(uint i, parameter_t f)
-      {
-         RG_ETC1_ASSERT(i < cNumComps);
-
-         c[i] = static_cast<component_t>(clamp(f));
-
-         return *this;
-      }
-
-      inline color_quad_u8& set_grayscale(parameter_t l)
-      {
-         component_t x = static_cast<component_t>(clamp(l));
-         c[0] = x;
-         c[1] = x;
-         c[2] = x;
-         return *this;
-      }
-
-      inline color_quad_u8& clamp(const color_quad_u8& l, const color_quad_u8& h)
-      {
-         for (uint i = 0; i < cNumComps; i++)
-            c[i] = static_cast<component_t>(rg_etc1::clamp<parameter_t>(c[i], l[i], h[i]));
-         return *this;
-      }
-
-      inline color_quad_u8& clamp(parameter_t l, parameter_t h)
-      {
-         for (uint i = 0; i < cNumComps; i++)
-            c[i] = static_cast<component_t>(rg_etc1::clamp<parameter_t>(c[i], l, h));
-         return *this;
-      }
-
-      // Returns CCIR 601 luma (consistent with color_utils::RGB_To_Y).
-      inline parameter_t get_luma() const
-      {
-         return static_cast<parameter_t>((19595U * r + 38470U * g + 7471U * b + 32768U) >> 16U);
-      }
-
-      // Returns REC 709 luma.
-      inline parameter_t get_luma_rec709() const
-      {
-         return static_cast<parameter_t>((13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U);
-      }
-
-      inline uint squared_distance_rgb(const color_quad_u8& c) const
-      {
-         return rg_etc1::square(r - c.r) + rg_etc1::square(g - c.g) + rg_etc1::square(b - c.b);
-      }
-
-      inline uint squared_distance_rgba(const color_quad_u8& c) const
-      {
-         return rg_etc1::square(r - c.r) + rg_etc1::square(g - c.g) + rg_etc1::square(b - c.b) + rg_etc1::square(a - c.a);
-      }
-
-      inline bool rgb_equals(const color_quad_u8& rhs) const
-      {
-         return (r == rhs.r) && (g == rhs.g) && (b == rhs.b);
-      }
-
-      inline bool operator== (const color_quad_u8& rhs) const
-      {
-         return m_u32 == rhs.m_u32;
-      }
-
-      color_quad_u8& operator+= (const color_quad_u8& other)
-      {
-         for (uint i = 0; i < 4; i++)
-            c[i] = static_cast<component_t>(clamp(c[i] + other.c[i]));
-         return *this;
-      }
-
-      color_quad_u8& operator-= (const color_quad_u8& other)
-      {
-         for (uint i = 0; i < 4; i++)
-            c[i] = static_cast<component_t>(clamp(c[i] - other.c[i]));
-         return *this;
-      }
-
-      friend color_quad_u8 operator+ (const color_quad_u8& lhs, const color_quad_u8& rhs)
-      {
-         color_quad_u8 result(lhs);
-         result += rhs;
-         return result;
-      }
-
-      friend color_quad_u8 operator- (const color_quad_u8& lhs, const color_quad_u8& rhs)
-      {
-         color_quad_u8 result(lhs);
-         result -= rhs;
-         return result;
-      }
-   }; // class color_quad_u8
-
-   struct vec3F
-   {
-      float m_s[3];
-      
-      inline vec3F() { }
-      inline vec3F(float s) { m_s[0] = s; m_s[1] = s; m_s[2] = s; }
-      inline vec3F(float x, float y, float z) { m_s[0] = x; m_s[1] = y; m_s[2] = z; }
-      
-      inline float operator[] (uint i) const { RG_ETC1_ASSERT(i < 3); return m_s[i]; }
-
-      inline vec3F& operator += (const vec3F& other) { for (uint i = 0; i < 3; i++) m_s[i] += other.m_s[i]; return *this; }
-
-      inline vec3F& operator *= (float s) { for (uint i = 0; i < 3; i++) m_s[i] *= s; return *this; }
-   };
-     
-   enum etc_constants
-   {
-      cETC1BytesPerBlock = 8U,
-
-      cETC1SelectorBits = 2U,
-      cETC1SelectorValues = 1U << cETC1SelectorBits,
-      cETC1SelectorMask = cETC1SelectorValues - 1U,
-
-      cETC1BlockShift = 2U,
-      cETC1BlockSize = 1U << cETC1BlockShift,
-
-      cETC1LSBSelectorIndicesBitOffset = 0,
-      cETC1MSBSelectorIndicesBitOffset = 16,
-
-      cETC1FlipBitOffset = 32,
-      cETC1DiffBitOffset = 33,
-
-      cETC1IntenModifierNumBits = 3,
-      cETC1IntenModifierValues = 1 << cETC1IntenModifierNumBits,
-      cETC1RightIntenModifierTableBitOffset = 34,
-      cETC1LeftIntenModifierTableBitOffset = 37,
-
-      // Base+Delta encoding (5 bit bases, 3 bit delta)
-      cETC1BaseColorCompNumBits = 5,
-      cETC1BaseColorCompMax = 1 << cETC1BaseColorCompNumBits,
-
-      cETC1DeltaColorCompNumBits = 3,
-      cETC1DeltaColorComp = 1 << cETC1DeltaColorCompNumBits,
-      cETC1DeltaColorCompMax = 1 << cETC1DeltaColorCompNumBits,
-
-      cETC1BaseColor5RBitOffset = 59,
-      cETC1BaseColor5GBitOffset = 51,
-      cETC1BaseColor5BBitOffset = 43,
-
-      cETC1DeltaColor3RBitOffset = 56,
-      cETC1DeltaColor3GBitOffset = 48,
-      cETC1DeltaColor3BBitOffset = 40,
-
-      // Absolute (non-delta) encoding (two 4-bit per component bases)
-      cETC1AbsColorCompNumBits = 4,
-      cETC1AbsColorCompMax = 1 << cETC1AbsColorCompNumBits,
-
-      cETC1AbsColor4R1BitOffset = 60,
-      cETC1AbsColor4G1BitOffset = 52,
-      cETC1AbsColor4B1BitOffset = 44,
-
-      cETC1AbsColor4R2BitOffset = 56,
-      cETC1AbsColor4G2BitOffset = 48,
-      cETC1AbsColor4B2BitOffset = 40,
-
-      cETC1ColorDeltaMin = -4,
-      cETC1ColorDeltaMax = 3,
-
-      // Delta3:
-      // 0   1   2   3   4   5   6   7
-      // 000 001 010 011 100 101 110 111
-      // 0   1   2   3   -4  -3  -2  -1
-   };
-   
-   static uint8 g_quant5_tab[256+16];
-
-
-   static const int g_etc1_inten_tables[cETC1IntenModifierValues][cETC1SelectorValues] = 
-   { 
-      { -8,  -2,   2,   8 }, { -17,  -5,  5,  17 }, { -29,  -9,   9,  29 }, {  -42, -13, 13,  42 }, 
-      { -60, -18, 18,  60 }, { -80, -24, 24,  80 }, { -106, -33, 33, 106 }, { -183, -47, 47, 183 } 
-   };
-
-   static const uint8 g_etc1_to_selector_index[cETC1SelectorValues] = { 2, 3, 1, 0 };
-   static const uint8 g_selector_index_to_etc1[cETC1SelectorValues] = { 3, 2, 0, 1 };
-      
-   // Given an ETC1 diff/inten_table/selector, and an 8-bit desired color, this table encodes the best packed_color in the low byte, and the abs error in the high byte.
-   static uint16 g_etc1_inverse_lookup[2*8*4][256];      // [diff/inten_table/selector][desired_color]
-
-   // g_color8_to_etc_block_config[color][table_index] = Supplies for each 8-bit color value a list of packed ETC1 diff/intensity table/selectors/packed_colors that map to that color.
-   // To pack: diff | (inten << 1) | (selector << 4) | (packed_c << 8)
-   static const uint16 g_color8_to_etc_block_config_0_255[2][33] =
-   {
-      { 0x0000,  0x0010,  0x0002,  0x0012,  0x0004,  0x0014,  0x0006,  0x0016,  0x0008,  0x0018,  0x000A,  0x001A,  0x000C,  0x001C,  0x000E,  0x001E,
-        0x0001,  0x0011,  0x0003,  0x0013,  0x0005,  0x0015,  0x0007,  0x0017,  0x0009,  0x0019,  0x000B,  0x001B,  0x000D,  0x001D,  0x000F,  0x001F, 0xFFFF },
-      { 0x0F20,  0x0F30,  0x0E32,  0x0F22,  0x0E34,  0x0F24,  0x0D36,  0x0F26,  0x0C38,  0x0E28,  0x0B3A,  0x0E2A,  0x093C,  0x0E2C,  0x053E,  0x0D2E,
-        0x1E31,  0x1F21,  0x1D33,  0x1F23,  0x1C35,  0x1E25,  0x1A37,  0x1E27,  0x1839,  0x1D29,  0x163B,  0x1C2B,  0x133D,  0x1B2D,  0x093F,  0x1A2F, 0xFFFF },
-   };
-
-   // Really only [254][11].
-   static const uint16 g_color8_to_etc_block_config_1_to_254[254][12] = 
-   {
-      { 0x021C, 0x0D0D, 0xFFFF }, { 0x0020, 0x0021, 0x0A0B, 0x061F, 0xFFFF }, { 0x0113, 0x0217, 0xFFFF }, { 0x0116, 0x031E,
-      0x0B0E, 0x0405, 0xFFFF }, { 0x0022, 0x0204, 0x050A, 0x0023, 0xFFFF }, { 0x0111, 0x0319, 0x0809, 0x170F, 0xFFFF }, {
-      0x0303, 0x0215, 0x0607, 0xFFFF }, { 0x0030, 0x0114, 0x0408, 0x0031, 0x0201, 0x051D, 0xFFFF }, { 0x0100, 0x0024, 0x0306,
-      0x0025, 0x041B, 0x0E0D, 0xFFFF }, { 0x021A, 0x0121, 0x0B0B, 0x071F, 0xFFFF }, { 0x0213, 0x0317, 0xFFFF }, { 0x0112,
-      0x0505, 0xFFFF }, { 0x0026, 0x070C, 0x0123, 0x0027, 0xFFFF }, { 0x0211, 0x0909, 0xFFFF }, { 0x0110, 0x0315, 0x0707,
-      0x0419, 0x180F, 0xFFFF }, { 0x0218, 0x0131, 0x0301, 0x0403, 0x061D, 0xFFFF }, { 0x0032, 0x0202, 0x0033, 0x0125, 0x051B,
-      0x0F0D, 0xFFFF }, { 0x0028, 0x031C, 0x0221, 0x0029, 0xFFFF }, { 0x0120, 0x0313, 0x0C0B, 0x081F, 0xFFFF }, { 0x0605,
-      0x0417, 0xFFFF }, { 0x0216, 0x041E, 0x0C0E, 0x0223, 0x0127, 0xFFFF }, { 0x0122, 0x0304, 0x060A, 0x0311, 0x0A09, 0xFFFF
-      }, { 0x0519, 0x190F, 0xFFFF }, { 0x002A, 0x0231, 0x0503, 0x0415, 0x0807, 0x002B, 0x071D, 0xFFFF }, { 0x0130, 0x0214,
-      0x0508, 0x0401, 0x0133, 0x0225, 0x061B, 0xFFFF }, { 0x0200, 0x0124, 0x0406, 0x0321, 0x0129, 0x100D, 0xFFFF }, { 0x031A,
-      0x0D0B, 0x091F, 0xFFFF }, { 0x0413, 0x0705, 0x0517, 0xFFFF }, { 0x0212, 0x0034, 0x0323, 0x0035, 0x0227, 0xFFFF }, {
-      0x0126, 0x080C, 0x0B09, 0xFFFF }, { 0x0411, 0x0619, 0x1A0F, 0xFFFF }, { 0x0210, 0x0331, 0x0603, 0x0515, 0x0907, 0x012B,
-      0xFFFF }, { 0x0318, 0x002C, 0x0501, 0x0233, 0x0325, 0x071B, 0x002D, 0x081D, 0xFFFF }, { 0x0132, 0x0302, 0x0229, 0x110D,
-      0xFFFF }, { 0x0128, 0x041C, 0x0421, 0x0E0B, 0x0A1F, 0xFFFF }, { 0x0220, 0x0513, 0x0617, 0xFFFF }, { 0x0135, 0x0805,
-      0x0327, 0xFFFF }, { 0x0316, 0x051E, 0x0D0E, 0x0423, 0xFFFF }, { 0x0222, 0x0404, 0x070A, 0x0511, 0x0719, 0x0C09, 0x1B0F,
-      0xFFFF }, { 0x0703, 0x0615, 0x0A07, 0x022B, 0xFFFF }, { 0x012A, 0x0431, 0x0601, 0x0333, 0x012D, 0x091D, 0xFFFF }, {
-      0x0230, 0x0314, 0x0036, 0x0608, 0x0425, 0x0037, 0x0329, 0x081B, 0x120D, 0xFFFF }, { 0x0300, 0x0224, 0x0506, 0x0521,
-      0x0F0B, 0x0B1F, 0xFFFF }, { 0x041A, 0x0613, 0x0717, 0xFFFF }, { 0x0235, 0x0905, 0xFFFF }, { 0x0312, 0x0134, 0x0523,
-      0x0427, 0xFFFF }, { 0x0226, 0x090C, 0x002E, 0x0611, 0x0D09, 0x002F, 0xFFFF }, { 0x0715, 0x0B07, 0x0819, 0x032B, 0x1C0F,
-      0xFFFF }, { 0x0310, 0x0531, 0x0701, 0x0803, 0x022D, 0x0A1D, 0xFFFF }, { 0x0418, 0x012C, 0x0433, 0x0525, 0x0137, 0x091B,
-      0x130D, 0xFFFF }, { 0x0232, 0x0402, 0x0621, 0x0429, 0xFFFF }, { 0x0228, 0x051C, 0x0713, 0x100B, 0x0C1F, 0xFFFF }, {
-      0x0320, 0x0335, 0x0A05, 0x0817, 0xFFFF }, { 0x0623, 0x0527, 0xFFFF }, { 0x0416, 0x061E, 0x0E0E, 0x0711, 0x0E09, 0x012F,
-      0xFFFF }, { 0x0322, 0x0504, 0x080A, 0x0919, 0x1D0F, 0xFFFF }, { 0x0631, 0x0903, 0x0815, 0x0C07, 0x042B, 0x032D, 0x0B1D,
-      0xFFFF }, { 0x022A, 0x0801, 0x0533, 0x0625, 0x0237, 0x0A1B, 0xFFFF }, { 0x0330, 0x0414, 0x0136, 0x0708, 0x0721, 0x0529,
-      0x140D, 0xFFFF }, { 0x0400, 0x0324, 0x0606, 0x0038, 0x0039, 0x110B, 0x0D1F, 0xFFFF }, { 0x051A, 0x0813, 0x0B05, 0x0917,
-      0xFFFF }, { 0x0723, 0x0435, 0x0627, 0xFFFF }, { 0x0412, 0x0234, 0x0F09, 0x022F, 0xFFFF }, { 0x0326, 0x0A0C, 0x012E,
-      0x0811, 0x0A19, 0x1E0F, 0xFFFF }, { 0x0731, 0x0A03, 0x0915, 0x0D07, 0x052B, 0xFFFF }, { 0x0410, 0x0901, 0x0633, 0x0725,
-      0x0337, 0x0B1B, 0x042D, 0x0C1D, 0xFFFF }, { 0x0518, 0x022C, 0x0629, 0x150D, 0xFFFF }, { 0x0332, 0x0502, 0x0821, 0x0139,
-      0x120B, 0x0E1F, 0xFFFF }, { 0x0328, 0x061C, 0x0913, 0x0A17, 0xFFFF }, { 0x0420, 0x0535, 0x0C05, 0x0727, 0xFFFF }, {
-      0x0823, 0x032F, 0xFFFF }, { 0x0516, 0x071E, 0x0F0E, 0x0911, 0x0B19, 0x1009, 0x1F0F, 0xFFFF }, { 0x0422, 0x0604, 0x090A,
-      0x0B03, 0x0A15, 0x0E07, 0x062B, 0xFFFF }, { 0x0831, 0x0A01, 0x0733, 0x052D, 0x0D1D, 0xFFFF }, { 0x032A, 0x0825, 0x0437,
-      0x0729, 0x0C1B, 0x160D, 0xFFFF }, { 0x0430, 0x0514, 0x0236, 0x0808, 0x0921, 0x0239, 0x130B, 0x0F1F, 0xFFFF }, { 0x0500,
-      0x0424, 0x0706, 0x0138, 0x0A13, 0x0B17, 0xFFFF }, { 0x061A, 0x0635, 0x0D05, 0xFFFF }, { 0x0923, 0x0827, 0xFFFF }, {
-      0x0512, 0x0334, 0x003A, 0x0A11, 0x1109, 0x003B, 0x042F, 0xFFFF }, { 0x0426, 0x0B0C, 0x022E, 0x0B15, 0x0F07, 0x0C19,
-      0x072B, 0xFFFF }, { 0x0931, 0x0B01, 0x0C03, 0x062D, 0x0E1D, 0xFFFF }, { 0x0510, 0x0833, 0x0925, 0x0537, 0x0D1B, 0x170D,
-      0xFFFF }, { 0x0618, 0x032C, 0x0A21, 0x0339, 0x0829, 0xFFFF }, { 0x0432, 0x0602, 0x0B13, 0x140B, 0x101F, 0xFFFF }, {
-      0x0428, 0x071C, 0x0735, 0x0E05, 0x0C17, 0xFFFF }, { 0x0520, 0x0A23, 0x0927, 0xFFFF }, { 0x0B11, 0x1209, 0x013B, 0x052F,
-      0xFFFF }, { 0x0616, 0x081E, 0x0D19, 0xFFFF }, { 0x0522, 0x0704, 0x0A0A, 0x0A31, 0x0D03, 0x0C15, 0x1007, 0x082B, 0x072D,
-      0x0F1D, 0xFFFF }, { 0x0C01, 0x0933, 0x0A25, 0x0637, 0x0E1B, 0xFFFF }, { 0x042A, 0x0B21, 0x0929, 0x180D, 0xFFFF }, {
-	      0x0530, 0x0614, 0x0336, 0x0908, 0x0439, 0x150B, 0x111F, 0xFFFF }, { 0x0600, 0x0524, 0x0806, 0x0238, 0x0C13, 0x0F05,
-      0x0D17, 0xFFFF }, { 0x071A, 0x0B23, 0x0835, 0x0A27, 0xFFFF }, { 0x1309, 0x023B, 0x062F, 0xFFFF }, { 0x0612, 0x0434,
-      0x013A, 0x0C11, 0x0E19, 0xFFFF }, { 0x0526, 0x0C0C, 0x032E, 0x0B31, 0x0E03, 0x0D15, 0x1107, 0x092B, 0xFFFF }, { 0x0D01,
-      0x0A33, 0x0B25, 0x0737, 0x0F1B, 0x082D, 0x101D, 0xFFFF }, { 0x0610, 0x0A29, 0x190D, 0xFFFF }, { 0x0718, 0x042C, 0x0C21,
-      0x0539, 0x160B, 0x121F, 0xFFFF }, { 0x0532, 0x0702, 0x0D13, 0x0E17, 0xFFFF }, { 0x0528, 0x081C, 0x0935, 0x1005, 0x0B27,
-      0xFFFF }, { 0x0620, 0x0C23, 0x033B, 0x072F, 0xFFFF }, { 0x0D11, 0x0F19, 0x1409, 0xFFFF }, { 0x0716, 0x003C, 0x091E,
-      0x0F03, 0x0E15, 0x1207, 0x0A2B, 0x003D, 0xFFFF }, { 0x0622, 0x0804, 0x0B0A, 0x0C31, 0x0E01, 0x0B33, 0x092D, 0x111D,
-      0xFFFF }, { 0x0C25, 0x0837, 0x0B29, 0x101B, 0x1A0D, 0xFFFF }, { 0x052A, 0x0D21, 0x0639, 0x170B, 0x131F, 0xFFFF }, {
-      0x0630, 0x0714, 0x0436, 0x0A08, 0x0E13, 0x0F17, 0xFFFF }, { 0x0700, 0x0624, 0x0906, 0x0338, 0x0A35, 0x1105, 0xFFFF }, {
-      0x081A, 0x0D23, 0x0C27, 0xFFFF }, { 0x0E11, 0x1509, 0x043B, 0x082F, 0xFFFF }, { 0x0712, 0x0534, 0x023A, 0x0F15, 0x1307,
-      0x1019, 0x0B2B, 0x013D, 0xFFFF }, { 0x0626, 0x0D0C, 0x042E, 0x0D31, 0x0F01, 0x1003, 0x0A2D, 0x121D, 0xFFFF }, { 0x0C33,
-      0x0D25, 0x0937, 0x111B, 0x1B0D, 0xFFFF }, { 0x0710, 0x0E21, 0x0739, 0x0C29, 0xFFFF }, { 0x0818, 0x052C, 0x0F13, 0x180B,
-      0x141F, 0xFFFF }, { 0x0632, 0x0802, 0x0B35, 0x1205, 0x1017, 0xFFFF }, { 0x0628, 0x091C, 0x0E23, 0x0D27, 0xFFFF }, {
-      0x0720, 0x0F11, 0x1609, 0x053B, 0x092F, 0xFFFF }, { 0x1119, 0x023D, 0xFFFF }, { 0x0816, 0x013C, 0x0A1E, 0x0E31, 0x1103,
-      0x1015, 0x1407, 0x0C2B, 0x0B2D, 0x131D, 0xFFFF }, { 0x0722, 0x0904, 0x0C0A, 0x1001, 0x0D33, 0x0E25, 0x0A37, 0x121B,
-      0xFFFF }, { 0x0F21, 0x0D29, 0x1C0D, 0xFFFF }, { 0x062A, 0x0839, 0x190B, 0x151F, 0xFFFF }, { 0x0730, 0x0814, 0x0536,
-      0x0B08, 0x1013, 0x1305, 0x1117, 0xFFFF }, { 0x0800, 0x0724, 0x0A06, 0x0438, 0x0F23, 0x0C35, 0x0E27, 0xFFFF }, { 0x091A,
-      0x1709, 0x063B, 0x0A2F, 0xFFFF }, { 0x1011, 0x1219, 0x033D, 0xFFFF }, { 0x0812, 0x0634, 0x033A, 0x0F31, 0x1203, 0x1115,
-      0x1507, 0x0D2B, 0xFFFF }, { 0x0726, 0x0E0C, 0x052E, 0x1101, 0x0E33, 0x0F25, 0x0B37, 0x131B, 0x0C2D, 0x141D, 0xFFFF }, {
-      0x0E29, 0x1D0D, 0xFFFF }, { 0x0810, 0x1021, 0x0939, 0x1A0B, 0x161F, 0xFFFF }, { 0x0918, 0x062C, 0x1113, 0x1217, 0xFFFF
-      }, { 0x0732, 0x0902, 0x0D35, 0x1405, 0x0F27, 0xFFFF }, { 0x0728, 0x0A1C, 0x1023, 0x073B, 0x0B2F, 0xFFFF }, { 0x0820,
-      0x1111, 0x1319, 0x1809, 0xFFFF }, { 0x1303, 0x1215, 0x1607, 0x0E2B, 0x043D, 0xFFFF }, { 0x0916, 0x023C, 0x0B1E, 0x1031,
-      0x1201, 0x0F33, 0x0D2D, 0x151D, 0xFFFF }, { 0x0822, 0x0A04, 0x0D0A, 0x1025, 0x0C37, 0x0F29, 0x141B, 0x1E0D, 0xFFFF }, {
-      0x1121, 0x0A39, 0x1B0B, 0x171F, 0xFFFF }, { 0x072A, 0x1213, 0x1317, 0xFFFF }, { 0x0830, 0x0914, 0x0636, 0x0C08, 0x0E35,
-      0x1505, 0xFFFF }, { 0x0900, 0x0824, 0x0B06, 0x0538, 0x1123, 0x1027, 0xFFFF }, { 0x0A1A, 0x1211, 0x1909, 0x083B, 0x0C2F,
-      0xFFFF }, { 0x1315, 0x1707, 0x1419, 0x0F2B, 0x053D, 0xFFFF }, { 0x0912, 0x0734, 0x043A, 0x1131, 0x1301, 0x1403, 0x0E2D,
-      0x161D, 0xFFFF }, { 0x0826, 0x0F0C, 0x062E, 0x1033, 0x1125, 0x0D37, 0x151B, 0x1F0D, 0xFFFF }, { 0x1221, 0x0B39, 0x1029,
-      0xFFFF }, { 0x0910, 0x1313, 0x1C0B, 0x181F, 0xFFFF }, { 0x0A18, 0x072C, 0x0F35, 0x1605, 0x1417, 0xFFFF }, { 0x0832,
-      0x0A02, 0x1223, 0x1127, 0xFFFF }, { 0x0828, 0x0B1C, 0x1311, 0x1A09, 0x093B, 0x0D2F, 0xFFFF }, { 0x0920, 0x1519, 0x063D,
-      0xFFFF }, { 0x1231, 0x1503, 0x1415, 0x1807, 0x102B, 0x0F2D, 0x171D, 0xFFFF }, { 0x0A16, 0x033C, 0x0C1E, 0x1401, 0x1133,
-      0x1225, 0x0E37, 0x161B, 0xFFFF }, { 0x0922, 0x0B04, 0x0E0A, 0x1321, 0x1129, 0xFFFF }, { 0x0C39, 0x1D0B, 0x191F, 0xFFFF
-      }, { 0x082A, 0x1413, 0x1705, 0x1517, 0xFFFF }, { 0x0930, 0x0A14, 0x0736, 0x0D08, 0x1323, 0x1035, 0x1227, 0xFFFF }, {
-      0x0A00, 0x0924, 0x0C06, 0x0638, 0x1B09, 0x0A3B, 0x0E2F, 0xFFFF }, { 0x0B1A, 0x1411, 0x1619, 0x073D, 0xFFFF }, { 0x1331,
-      0x1603, 0x1515, 0x1907, 0x112B, 0xFFFF }, { 0x0A12, 0x0834, 0x053A, 0x1501, 0x1233, 0x1325, 0x0F37, 0x171B, 0x102D,
-      0x181D, 0xFFFF }, { 0x0926, 0x072E, 0x1229, 0xFFFF }, { 0x1421, 0x0D39, 0x1E0B, 0x1A1F, 0xFFFF }, { 0x0A10, 0x1513,
-      0x1617, 0xFFFF }, { 0x0B18, 0x082C, 0x1135, 0x1805, 0x1327, 0xFFFF }, { 0x0932, 0x0B02, 0x1423, 0x0B3B, 0x0F2F, 0xFFFF
-      }, { 0x0928, 0x0C1C, 0x1511, 0x1719, 0x1C09, 0xFFFF }, { 0x0A20, 0x1703, 0x1615, 0x1A07, 0x122B, 0x083D, 0xFFFF }, {
-      0x1431, 0x1601, 0x1333, 0x112D, 0x191D, 0xFFFF }, { 0x0B16, 0x043C, 0x0D1E, 0x1425, 0x1037, 0x1329, 0x181B, 0xFFFF }, {
-      0x0A22, 0x0C04, 0x0F0A, 0x1521, 0x0E39, 0x1F0B, 0x1B1F, 0xFFFF }, { 0x1613, 0x1717, 0xFFFF }, { 0x092A, 0x1235, 0x1905,
-      0xFFFF }, { 0x0A30, 0x0B14, 0x0836, 0x0E08, 0x1523, 0x1427, 0xFFFF }, { 0x0B00, 0x0A24, 0x0D06, 0x0738, 0x1611, 0x1D09,
-      0x0C3B, 0x102F, 0xFFFF }, { 0x0C1A, 0x1715, 0x1B07, 0x1819, 0x132B, 0x093D, 0xFFFF }, { 0x1531, 0x1701, 0x1803, 0x122D,
-      0x1A1D, 0xFFFF }, { 0x0B12, 0x0934, 0x063A, 0x1433, 0x1525, 0x1137, 0x191B, 0xFFFF }, { 0x0A26, 0x003E, 0x082E, 0x1621,
-      0x0F39, 0x1429, 0x003F, 0xFFFF }, { 0x1713, 0x1C1F, 0xFFFF }, { 0x0B10, 0x1335, 0x1A05, 0x1817, 0xFFFF }, { 0x0C18,
-      0x092C, 0x1623, 0x1527, 0xFFFF }, { 0x0A32, 0x0C02, 0x1711, 0x1E09, 0x0D3B, 0x112F, 0xFFFF }, { 0x0A28, 0x0D1C, 0x1919,
-      0x0A3D, 0xFFFF }, { 0x0B20, 0x1631, 0x1903, 0x1815, 0x1C07, 0x142B, 0x132D, 0x1B1D, 0xFFFF }, { 0x1801, 0x1533, 0x1625,
-      0x1237, 0x1A1B, 0xFFFF }, { 0x0C16, 0x053C, 0x0E1E, 0x1721, 0x1529, 0x013F, 0xFFFF }, { 0x0B22, 0x0D04, 0x1039, 0x1D1F,
-      0xFFFF }, { 0x1813, 0x1B05, 0x1917, 0xFFFF }, { 0x0A2A, 0x1723, 0x1435, 0x1627, 0xFFFF }, { 0x0B30, 0x0C14, 0x0936,
-      0x0F08, 0x1F09, 0x0E3B, 0x122F, 0xFFFF }, { 0x0C00, 0x0B24, 0x0E06, 0x0838, 0x1811, 0x1A19, 0x0B3D, 0xFFFF }, { 0x0D1A,
-      0x1731, 0x1A03, 0x1915, 0x1D07, 0x152B, 0xFFFF }, { 0x1901, 0x1633, 0x1725, 0x1337, 0x1B1B, 0x142D, 0x1C1D, 0xFFFF }, {
-      0x0C12, 0x0A34, 0x073A, 0x1629, 0x023F, 0xFFFF }, { 0x0B26, 0x013E, 0x092E, 0x1821, 0x1139, 0x1E1F, 0xFFFF }, { 0x1913,
-      0x1A17, 0xFFFF }, { 0x0C10, 0x1535, 0x1C05, 0x1727, 0xFFFF }, { 0x0D18, 0x0A2C, 0x1823, 0x0F3B, 0x132F, 0xFFFF }, {
-      0x0B32, 0x0D02, 0x1911, 0x1B19, 0xFFFF }, { 0x0B28, 0x0E1C, 0x1B03, 0x1A15, 0x1E07, 0x162B, 0x0C3D, 0xFFFF }, { 0x0C20,
-      0x1831, 0x1A01, 0x1733, 0x152D, 0x1D1D, 0xFFFF }, { 0x1825, 0x1437, 0x1729, 0x1C1B, 0x033F, 0xFFFF }, { 0x0D16, 0x063C,
-      0x0F1E, 0x1921, 0x1239, 0x1F1F, 0xFFFF }, { 0x0C22, 0x0E04, 0x1A13, 0x1B17, 0xFFFF }, { 0x1635, 0x1D05, 0xFFFF }, {
-      0x0B2A, 0x1923, 0x1827, 0xFFFF }, { 0x0C30, 0x0D14, 0x0A36, 0x1A11, 0x103B, 0x142F, 0xFFFF }, { 0x0D00, 0x0C24, 0x0F06,
-      0x0938, 0x1B15, 0x1F07, 0x1C19, 0x172B, 0x0D3D, 0xFFFF }, { 0x0E1A, 0x1931, 0x1B01, 0x1C03, 0x162D, 0x1E1D, 0xFFFF }, {
-      0x1833, 0x1925, 0x1537, 0x1D1B, 0xFFFF }, { 0x0D12, 0x0B34, 0x083A, 0x1A21, 0x1339, 0x1829, 0x043F, 0xFFFF }, { 0x0C26,
-      0x023E, 0x0A2E, 0x1B13, 0xFFFF }, { 0x1735, 0x1E05, 0x1C17, 0xFFFF }, { 0x0D10, 0x1A23, 0x1927, 0xFFFF }, { 0x0E18,
-      0x0B2C, 0x1B11, 0x113B, 0x152F, 0xFFFF }, { 0x0C32, 0x0E02, 0x1D19, 0x0E3D, 0xFFFF }, { 0x0C28, 0x0F1C, 0x1A31, 0x1D03,
-      0x1C15, 0x182B, 0x172D, 0x1F1D, 0xFFFF }, { 0x0D20, 0x1C01, 0x1933, 0x1A25, 0x1637, 0x1E1B, 0xFFFF }, { 0x1B21, 0x1929,
-      0x053F, 0xFFFF }, { 0x0E16, 0x073C, 0x1439, 0xFFFF }, { 0x0D22, 0x0F04, 0x1C13, 0x1F05, 0x1D17, 0xFFFF }, { 0x1B23,
-      0x1835, 0x1A27, 0xFFFF }, { 0x0C2A, 0x123B, 0x162F, 0xFFFF }, { 0x0D30, 0x0E14, 0x0B36, 0x1C11, 0x1E19, 0x0F3D, 0xFFFF
-      }, { 0x0E00, 0x0D24, 0x0A38, 0x1B31, 0x1E03, 0x1D15, 0x192B, 0xFFFF }, { 0x0F1A, 0x1D01, 0x1A33, 0x1B25, 0x1737, 0x1F1B,
-      0x182D, 0xFFFF }, { 0x1A29, 0x063F, 0xFFFF }, { 0x0E12, 0x0C34, 0x093A, 0x1C21, 0x1539, 0xFFFF }, { 0x0D26, 0x033E,
-      0x0B2E, 0x1D13, 0x1E17, 0xFFFF }, { 0x1935, 0x1B27, 0xFFFF }, { 0x0E10, 0x1C23, 0x133B, 0x172F, 0xFFFF }, { 0x0F18,
-      0x0C2C, 0x1D11, 0x1F19, 0xFFFF }, { 0x0D32, 0x0F02, 0x1F03, 0x1E15, 0x1A2B, 0x103D, 0xFFFF }, { 0x0D28, 0x1C31, 0x1E01,
-      0x1B33, 0x192D, 0xFFFF }, { 0x0E20, 0x1C25, 0x1837, 0x1B29, 0x073F, 0xFFFF }, { 0x1D21, 0x1639, 0xFFFF }, { 0x0F16,
-      0x083C, 0x1E13, 0x1F17, 0xFFFF }, { 0x0E22, 0x1A35, 0xFFFF }, { 0x1D23, 0x1C27, 0xFFFF }, { 0x0D2A, 0x1E11, 0x143B,
-      0x182F, 0xFFFF }, { 0x0E30, 0x0F14, 0x0C36, 0x1F15, 0x1B2B, 0x113D, 0xFFFF }, { 0x0F00, 0x0E24, 0x0B38, 0x1D31, 0x1F01,
-      0x1A2D, 0xFFFF }, { 0x1C33, 0x1D25, 0x1937, 0xFFFF }, { 0x1E21, 0x1739, 0x1C29, 0x083F, 0xFFFF }, { 0x0F12, 0x0D34,
-      0x0A3A, 0x1F13, 0xFFFF }, { 0x0E26, 0x043E, 0x0C2E, 0x1B35, 0xFFFF }, { 0x1E23, 0x1D27, 0xFFFF }, { 0x0F10, 0x1F11,
-      0x153B, 0x192F, 0xFFFF }, { 0x0D2C, 0x123D, 0xFFFF },
-   };
-
-   struct etc1_block
-   {
-      // big endian uint64:
-      // bit ofs:  56  48  40  32  24  16   8   0
-      // byte ofs: b0, b1, b2, b3, b4, b5, b6, b7 
-      union 
-      {
-         uint64 m_uint64;
-         uint8 m_bytes[8];
-      };
-
-      uint8 m_low_color[2];
-      uint8 m_high_color[2];
-
-      enum { cNumSelectorBytes = 4 };
-      uint8 m_selectors[cNumSelectorBytes];
-
-      inline void clear()
-      {
-         zero_this(this);
-      }
-
-      inline uint get_byte_bits(uint ofs, uint num) const
-      {
-         RG_ETC1_ASSERT((ofs + num) <= 64U);
-         RG_ETC1_ASSERT(num && (num <= 8U));
-         RG_ETC1_ASSERT((ofs >> 3) == ((ofs + num - 1) >> 3));
-         const uint byte_ofs = 7 - (ofs >> 3);
-         const uint byte_bit_ofs = ofs & 7;
-         return (m_bytes[byte_ofs] >> byte_bit_ofs) & ((1 << num) - 1);
-      }
-
-      inline void set_byte_bits(uint ofs, uint num, uint bits)
-      {
-         RG_ETC1_ASSERT((ofs + num) <= 64U);
-         RG_ETC1_ASSERT(num && (num < 32U));
-         RG_ETC1_ASSERT((ofs >> 3) == ((ofs + num - 1) >> 3));
-         RG_ETC1_ASSERT(bits < (1U << num));
-         const uint byte_ofs = 7 - (ofs >> 3);
-         const uint byte_bit_ofs = ofs & 7;
-         const uint mask = (1 << num) - 1;
-         m_bytes[byte_ofs] &= ~(mask << byte_bit_ofs);
-         m_bytes[byte_ofs] |= (bits << byte_bit_ofs);
-      }
-
-      // false = left/right subblocks
-      // true = upper/lower subblocks
-      inline bool get_flip_bit() const 
-      {
-         return (m_bytes[3] & 1) != 0;
-      }   
-
-      inline void set_flip_bit(bool flip)
-      {
-         m_bytes[3] &= ~1;
-         m_bytes[3] |= static_cast<uint8>(flip);
-      }
-
-      inline bool get_diff_bit() const
-      {
-         return (m_bytes[3] & 2) != 0;
-      }
-
-      inline void set_diff_bit(bool diff)
-      {
-         m_bytes[3] &= ~2;
-         m_bytes[3] |= (static_cast<uint>(diff) << 1);
-      }
-
-      // Returns intensity modifier table (0-7) used by subblock subblock_id.
-      // subblock_id=0 left/top (CW 1), 1=right/bottom (CW 2)
-      inline uint get_inten_table(uint subblock_id) const
-      {
-         RG_ETC1_ASSERT(subblock_id < 2);
-         const uint ofs = subblock_id ? 2 : 5;
-         return (m_bytes[3] >> ofs) & 7;
-      }
-
-      // Sets intensity modifier table (0-7) used by subblock subblock_id (0 or 1)
-      inline void set_inten_table(uint subblock_id, uint t)
-      {
-         RG_ETC1_ASSERT(subblock_id < 2);
-         RG_ETC1_ASSERT(t < 8);
-         const uint ofs = subblock_id ? 2 : 5;
-         m_bytes[3] &= ~(7 << ofs);
-         m_bytes[3] |= (t << ofs);
-      }
-
-      // Returned selector value ranges from 0-3 and is a direct index into g_etc1_inten_tables.
-      inline uint get_selector(uint x, uint y) const
-      {
-         RG_ETC1_ASSERT((x | y) < 4);
-
-         const uint bit_index = x * 4 + y;
-         const uint byte_bit_ofs = bit_index & 7;
-         const uint8 *p = &m_bytes[7 - (bit_index >> 3)];
-         const uint lsb = (p[0] >> byte_bit_ofs) & 1;
-         const uint msb = (p[-2] >> byte_bit_ofs) & 1;
-         const uint val = lsb | (msb << 1);
-
-         return g_etc1_to_selector_index[val];
-      }
-
-      // Selector "val" ranges from 0-3 and is a direct index into g_etc1_inten_tables.
-      inline void set_selector(uint x, uint y, uint val)
-      {
-         RG_ETC1_ASSERT((x | y | val) < 4);
-         const uint bit_index = x * 4 + y;
-
-         uint8 *p = &m_bytes[7 - (bit_index >> 3)];
-
-         const uint byte_bit_ofs = bit_index & 7;
-         const uint mask = 1 << byte_bit_ofs;
-
-         const uint etc1_val = g_selector_index_to_etc1[val];
-
-         const uint lsb = etc1_val & 1;
-         const uint msb = etc1_val >> 1;
-
-         p[0] &= ~mask;
-         p[0] |= (lsb << byte_bit_ofs);
-
-         p[-2] &= ~mask;
-         p[-2] |= (msb << byte_bit_ofs);
-      }
-
-      inline void set_base4_color(uint idx, uint16 c)
-      {
-         if (idx)
-         {
-            set_byte_bits(cETC1AbsColor4R2BitOffset, 4, (c >> 8) & 15);
-            set_byte_bits(cETC1AbsColor4G2BitOffset, 4, (c >> 4) & 15);
-            set_byte_bits(cETC1AbsColor4B2BitOffset, 4, c & 15);
-         }
-         else
-         {
-            set_byte_bits(cETC1AbsColor4R1BitOffset, 4, (c >> 8) & 15);
-            set_byte_bits(cETC1AbsColor4G1BitOffset, 4, (c >> 4) & 15);
-            set_byte_bits(cETC1AbsColor4B1BitOffset, 4, c & 15);
-         }
-      }
-
-      inline uint16 get_base4_color(uint idx) const
-      {
-         uint r, g, b;
-         if (idx)
-         {
-            r = get_byte_bits(cETC1AbsColor4R2BitOffset, 4);
-            g = get_byte_bits(cETC1AbsColor4G2BitOffset, 4);
-            b = get_byte_bits(cETC1AbsColor4B2BitOffset, 4);
-         }
-         else
-         {
-            r = get_byte_bits(cETC1AbsColor4R1BitOffset, 4);
-            g = get_byte_bits(cETC1AbsColor4G1BitOffset, 4);
-            b = get_byte_bits(cETC1AbsColor4B1BitOffset, 4);
-         }
-         return static_cast<uint16>(b | (g << 4U) | (r << 8U));
-      }
-
-      inline void set_base5_color(uint16 c)
-      {
-         set_byte_bits(cETC1BaseColor5RBitOffset, 5, (c >> 10) & 31);
-         set_byte_bits(cETC1BaseColor5GBitOffset, 5, (c >> 5) & 31);
-         set_byte_bits(cETC1BaseColor5BBitOffset, 5, c & 31);
-      }
-
-      inline uint16 get_base5_color() const
-      {
-         const uint r = get_byte_bits(cETC1BaseColor5RBitOffset, 5);
-         const uint g = get_byte_bits(cETC1BaseColor5GBitOffset, 5);
-         const uint b = get_byte_bits(cETC1BaseColor5BBitOffset, 5);
-         return static_cast<uint16>(b | (g << 5U) | (r << 10U));
-      }
-
-      void set_delta3_color(uint16 c)
-      {
-         set_byte_bits(cETC1DeltaColor3RBitOffset, 3, (c >> 6) & 7);
-         set_byte_bits(cETC1DeltaColor3GBitOffset, 3, (c >> 3) & 7);
-         set_byte_bits(cETC1DeltaColor3BBitOffset, 3, c & 7);
-      }
-
-      inline uint16 get_delta3_color() const
-      {
-         const uint r = get_byte_bits(cETC1DeltaColor3RBitOffset, 3);
-         const uint g = get_byte_bits(cETC1DeltaColor3GBitOffset, 3);
-         const uint b = get_byte_bits(cETC1DeltaColor3BBitOffset, 3);
-         return static_cast<uint16>(b | (g << 3U) | (r << 6U));
-      }
-
-      // Base color 5
-      static uint16 pack_color5(const color_quad_u8& color, bool scaled, uint bias = 127U);
-      static uint16 pack_color5(uint r, uint g, uint b, bool scaled, uint bias = 127U);
-
-      static color_quad_u8 unpack_color5(uint16 packed_color5, bool scaled, uint alpha = 255U);
-      static void unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color, bool scaled);
-
-      static bool unpack_color5(color_quad_u8& result, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha = 255U);
-      static bool unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha = 255U);
-
-      // Delta color 3
-      // Inputs range from -4 to 3 (cETC1ColorDeltaMin to cETC1ColorDeltaMax)
-      static uint16 pack_delta3(int r, int g, int b);
-
-      // Results range from -4 to 3 (cETC1ColorDeltaMin to cETC1ColorDeltaMax)
-      static void unpack_delta3(int& r, int& g, int& b, uint16 packed_delta3);
-
-      // Abs color 4
-      static uint16 pack_color4(const color_quad_u8& color, bool scaled, uint bias = 127U);
-      static uint16 pack_color4(uint r, uint g, uint b, bool scaled, uint bias = 127U);
-
-      static color_quad_u8 unpack_color4(uint16 packed_color4, bool scaled, uint alpha = 255U);
-      static void unpack_color4(uint& r, uint& g, uint& b, uint16 packed_color4, bool scaled);
-
-      // subblock colors
-      static void get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint table_idx);
-      static bool get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint16 packed_delta3, uint table_idx);
-      static void get_abs_subblock_colors(color_quad_u8* pDst, uint16 packed_color4, uint table_idx);
-
-      static inline void unscaled_to_scaled_color(color_quad_u8& dst, const color_quad_u8& src, bool color4)
-      {
-         if (color4)
-         {
-            dst.r = src.r | (src.r << 4);
-            dst.g = src.g | (src.g << 4);
-            dst.b = src.b | (src.b << 4);
-         }
-         else
-         {
-            dst.r = (src.r >> 2) | (src.r << 3);
-            dst.g = (src.g >> 2) | (src.g << 3);
-            dst.b = (src.b >> 2) | (src.b << 3);
-         }
-         dst.a = src.a;
-      }
-   };
-
-   // Returns pointer to sorted array.
-   template<typename T, typename Q>
-   T* indirect_radix_sort(uint num_indices, T* pIndices0, T* pIndices1, const Q* pKeys, uint key_ofs, uint key_size, bool init_indices)
-   {  
-      RG_ETC1_ASSERT((key_ofs >= 0) && (key_ofs < sizeof(T)));
-      RG_ETC1_ASSERT((key_size >= 1) && (key_size <= 4));
-
-      if (init_indices)
-      {
-         T* p = pIndices0;
-         T* q = pIndices0 + (num_indices >> 1) * 2;
-         uint i;
-         for (i = 0; p != q; p += 2, i += 2)
-         {
-            p[0] = static_cast<T>(i);
-            p[1] = static_cast<T>(i + 1); 
-         }
-
-         if (num_indices & 1)
-            *p = static_cast<T>(i);
-      }
-
-      uint hist[256 * 4];
-
-      memset(hist, 0, sizeof(hist[0]) * 256 * key_size);
-
-#define RG_ETC1_GET_KEY(p) (*(const uint*)((const uint8*)(pKeys + *(p)) + key_ofs))
-#define RG_ETC1_GET_KEY_FROM_INDEX(i) (*(const uint*)((const uint8*)(pKeys + (i)) + key_ofs))
-
-      if (key_size == 4)
-      {
-         T* p = pIndices0;
-         T* q = pIndices0 + num_indices;
-         for ( ; p != q; p++)
-         {
-            const uint key = RG_ETC1_GET_KEY(p);
-
-            hist[        key        & 0xFF]++;
-            hist[256 + ((key >>  8) & 0xFF)]++;
-            hist[512 + ((key >> 16) & 0xFF)]++;
-            hist[768 + ((key >> 24) & 0xFF)]++;
-         }
-      }
-      else if (key_size == 3)
-      {
-         T* p = pIndices0;
-         T* q = pIndices0 + num_indices;
-         for ( ; p != q; p++)
-         {
-            const uint key = RG_ETC1_GET_KEY(p);
-
-            hist[        key        & 0xFF]++;
-            hist[256 + ((key >>  8) & 0xFF)]++;
-            hist[512 + ((key >> 16) & 0xFF)]++;
-         }
-      }   
-      else if (key_size == 2)
-      {
-         T* p = pIndices0;
-         T* q = pIndices0 + (num_indices >> 1) * 2;
-
-         for ( ; p != q; p += 2)
-         {
-            const uint key0 = RG_ETC1_GET_KEY(p);
-            const uint key1 = RG_ETC1_GET_KEY(p+1);
-
-            hist[        key0         & 0xFF]++;
-            hist[256 + ((key0 >>  8) & 0xFF)]++;
-
-            hist[        key1        & 0xFF]++;
-            hist[256 + ((key1 >>  8) & 0xFF)]++;
-         }
-
-         if (num_indices & 1)
-         {
-            const uint key = RG_ETC1_GET_KEY(p);
-
-            hist[        key        & 0xFF]++;
-            hist[256 + ((key >>  8) & 0xFF)]++;
-         }
-      }      
-      else
-      {
-         RG_ETC1_ASSERT(key_size == 1);
-         if (key_size != 1)
-            return NULL;
-
-         T* p = pIndices0;
-         T* q = pIndices0 + (num_indices >> 1) * 2;
-
-         for ( ; p != q; p += 2)
-         {
-            const uint key0 = RG_ETC1_GET_KEY(p);
-            const uint key1 = RG_ETC1_GET_KEY(p+1);
-
-            hist[key0 & 0xFF]++;
-            hist[key1 & 0xFF]++;
-         }
-
-         if (num_indices & 1)
-         {
-            const uint key = RG_ETC1_GET_KEY(p);
-
-            hist[key & 0xFF]++;
-         }
-      }      
-
-      T* pCur = pIndices0;
-      T* pNew = pIndices1;
-
-      for (uint pass = 0; pass < key_size; pass++)
-      {
-         const uint* pHist = &hist[pass << 8];
-
-         uint offsets[256];
-
-         uint cur_ofs = 0;
-         for (uint i = 0; i < 256; i += 2)
-         {
-            offsets[i] = cur_ofs;
-            cur_ofs += pHist[i];
-
-            offsets[i+1] = cur_ofs;
-            cur_ofs += pHist[i+1];
-         }
-
-         const uint pass_shift = pass << 3;
-
-         T* p = pCur;
-         T* q = pCur + (num_indices >> 1) * 2;
-
-         for ( ; p != q; p += 2)
-         {
-            uint index0 = p[0];
-            uint index1 = p[1];
-
-            uint c0 = (RG_ETC1_GET_KEY_FROM_INDEX(index0) >> pass_shift) & 0xFF;
-            uint c1 = (RG_ETC1_GET_KEY_FROM_INDEX(index1) >> pass_shift) & 0xFF;
-
-            if (c0 == c1)
-            {
-               uint dst_offset0 = offsets[c0];
-
-               offsets[c0] = dst_offset0 + 2;
-
-               pNew[dst_offset0] = static_cast<T>(index0);
-               pNew[dst_offset0 + 1] = static_cast<T>(index1);
-            }
-            else
-            {
-               uint dst_offset0 = offsets[c0]++;
-               uint dst_offset1 = offsets[c1]++;
-
-               pNew[dst_offset0] = static_cast<T>(index0);
-               pNew[dst_offset1] = static_cast<T>(index1);
-            }
-         }
-
-         if (num_indices & 1)
-         {
-            uint index = *p;
-            uint c = (RG_ETC1_GET_KEY_FROM_INDEX(index) >> pass_shift) & 0xFF;
-
-            uint dst_offset = offsets[c];
-            offsets[c] = dst_offset + 1;
-
-            pNew[dst_offset] = static_cast<T>(index);
-         }
-
-         T* t = pCur;
-         pCur = pNew;
-         pNew = t;
-      }            
-
-      return pCur;
-   }
-
-#undef RG_ETC1_GET_KEY
-#undef RG_ETC1_GET_KEY_FROM_INDEX
-
-   uint16 etc1_block::pack_color5(const color_quad_u8& color, bool scaled, uint bias)
-   {
-      return pack_color5(color.r, color.g, color.b, scaled, bias);
-   }
-   
-   uint16 etc1_block::pack_color5(uint r, uint g, uint b, bool scaled, uint bias)
-   {
-      if (scaled)
-      {
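-         // Quantize 8-bit components to 5 bits; 'bias' controls rounding
-         // (a bias near 128 approximates round-to-nearest).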
-         r = (r * 31U + bias) / 255U;
-         g = (g * 31U + bias) / 255U;
-         b = (b * 31U + bias) / 255U;
-      }
-
-      r = rg_etc1::minimum(r, 31U);
-      g = rg_etc1::minimum(g, 31U);
-      b = rg_etc1::minimum(b, 31U);
-
-      return static_cast<uint16>(b | (g << 5U) | (r << 10U));
-   }
-
-   color_quad_u8 etc1_block::unpack_color5(uint16 packed_color5, bool scaled, uint alpha)
-   {
-      uint b = packed_color5 & 31U;
-      uint g = (packed_color5 >> 5U) & 31U;
-      uint r = (packed_color5 >> 10U) & 31U;
-
-      if (scaled)
-      {
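-         // Expand 5-bit components to 8 bits by replicating the top bits
-         // (e.g. 0b11111 -> 0b11111111), matching the hardware decoder.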
-         b = (b << 3U) | (b >> 2U);
-         g = (g << 3U) | (g >> 2U);
-         r = (r << 3U) | (r >> 2U);
-      }
-
-      return color_quad_u8(cNoClamp, r, g, b, rg_etc1::minimum(alpha, 255U));
-   }
-
-   void etc1_block::unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, bool scaled)
-   {
-      color_quad_u8 c(unpack_color5(packed_color5, scaled, 0));
-      r = c.r;
-      g = c.g;
-      b = c.b;
-   }
-
-   bool etc1_block::unpack_color5(color_quad_u8& result, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha)
-   {
-      int dc_r, dc_g, dc_b;
-      unpack_delta3(dc_r, dc_g, dc_b, packed_delta3);
-      
-      int b = (packed_color5 & 31U) + dc_b;
-      int g = ((packed_color5 >> 5U) & 31U) + dc_g;
-      int r = ((packed_color5 >> 10U) & 31U) + dc_r;
-
-      bool success = true;
-      if (static_cast<uint>(r | g | b) > 31U)
-      {
-         success = false;
-         r = rg_etc1::clamp<int>(r, 0, 31);
-         g = rg_etc1::clamp<int>(g, 0, 31);
-         b = rg_etc1::clamp<int>(b, 0, 31);
-      }
-
-      if (scaled)
-      {
-         b = (b << 3U) | (b >> 2U);
-         g = (g << 3U) | (g >> 2U);
-         r = (r << 3U) | (r >> 2U);
-      }
-
-      result.set_noclamp_rgba(r, g, b, rg_etc1::minimum(alpha, 255U));
-      return success;
-   }
-
-   bool etc1_block::unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha)
-   {
-      color_quad_u8 result;
-      const bool success = unpack_color5(result, packed_color5, packed_delta3, scaled, alpha);
-      r = result.r;
-      g = result.g;
-      b = result.b;
-      return success;
-   }
-     
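-   // Each delta is a signed value in [cETC1ColorDeltaMin, cETC1ColorDeltaMax] = [-4, 3],
-   // stored as a 3-bit two's-complement field (e.g. -1 -> 7), packed as rrrgggbbb.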
-   uint16 etc1_block::pack_delta3(int r, int g, int b)
-   {
-      RG_ETC1_ASSERT((r >= cETC1ColorDeltaMin) && (r <= cETC1ColorDeltaMax));
-      RG_ETC1_ASSERT((g >= cETC1ColorDeltaMin) && (g <= cETC1ColorDeltaMax));
-      RG_ETC1_ASSERT((b >= cETC1ColorDeltaMin) && (b <= cETC1ColorDeltaMax));
-      if (r < 0) r += 8;
-      if (g < 0) g += 8;
-      if (b < 0) b += 8;
-      return static_cast<uint16>(b | (g << 3) | (r << 6));
-   }
-   
-   void etc1_block::unpack_delta3(int& r, int& g, int& b, uint16 packed_delta3)
-   {
-      r = (packed_delta3 >> 6) & 7;
-      g = (packed_delta3 >> 3) & 7;
-      b = packed_delta3 & 7;
-      if (r >= 4) r -= 8;
-      if (g >= 4) g -= 8;
-      if (b >= 4) b -= 8;
-   }
-
-   uint16 etc1_block::pack_color4(const color_quad_u8& color, bool scaled, uint bias)
-   {
-      return pack_color4(color.r, color.g, color.b, scaled, bias);
-   }
-   
-   uint16 etc1_block::pack_color4(uint r, uint g, uint b, bool scaled, uint bias)
-   {
-      if (scaled)
-      {
-         r = (r * 15U + bias) / 255U;
-         g = (g * 15U + bias) / 255U;
-         b = (b * 15U + bias) / 255U;
-      }
-
-      r = rg_etc1::minimum(r, 15U);
-      g = rg_etc1::minimum(g, 15U);
-      b = rg_etc1::minimum(b, 15U);
-
-      return static_cast<uint16>(b | (g << 4U) | (r << 8U));
-   }
-
-   color_quad_u8 etc1_block::unpack_color4(uint16 packed_color4, bool scaled, uint alpha)
-   {
-      uint b = packed_color4 & 15U;
-      uint g = (packed_color4 >> 4U) & 15U;
-      uint r = (packed_color4 >> 8U) & 15U;
-
-      if (scaled)
-      {
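-         // Expand 4-bit components to 8 bits by duplicating the nibble (0xF -> 0xFF).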
-         b = (b << 4U) | b;
-         g = (g << 4U) | g;
-         r = (r << 4U) | r;
-      }
-
-      return color_quad_u8(cNoClamp, r, g, b, rg_etc1::minimum(alpha, 255U));
-   }
-   
-   void etc1_block::unpack_color4(uint& r, uint& g, uint& b, uint16 packed_color4, bool scaled)
-   {
-      color_quad_u8 c(unpack_color4(packed_color4, scaled, 0));
-      r = c.r;
-      g = c.g;
-      b = c.b;
-   }
-
-   void etc1_block::get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint table_idx)
-   {
-      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
-      const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0];
-
-      uint r, g, b;
-      unpack_color5(r, g, b, packed_color5, true);
-
-      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
-
-      const int y0 = pInten_modifer_table[0];
-      pDst[0].set(ir + y0, ig + y0, ib + y0);
-
-      const int y1 = pInten_modifer_table[1];
-      pDst[1].set(ir + y1, ig + y1, ib + y1);
-
-      const int y2 = pInten_modifer_table[2];
-      pDst[2].set(ir + y2, ig + y2, ib + y2);
-
-      const int y3 = pInten_modifer_table[3];
-      pDst[3].set(ir + y3, ig + y3, ib + y3);
-   }
-   
-   bool etc1_block::get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint16 packed_delta3, uint table_idx)
-   {
-      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
-      const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0];
-
-      uint r, g, b;
-      bool success = unpack_color5(r, g, b, packed_color5, packed_delta3, true);
-
-      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
-
-      const int y0 = pInten_modifer_table[0];
-      pDst[0].set(ir + y0, ig + y0, ib + y0);
-
-      const int y1 = pInten_modifer_table[1];
-      pDst[1].set(ir + y1, ig + y1, ib + y1);
-
-      const int y2 = pInten_modifer_table[2];
-      pDst[2].set(ir + y2, ig + y2, ib + y2);
-
-      const int y3 = pInten_modifer_table[3];
-      pDst[3].set(ir + y3, ig + y3, ib + y3);
-
-      return success;
-   }
-   
-   void etc1_block::get_abs_subblock_colors(color_quad_u8* pDst, uint16 packed_color4, uint table_idx)
-   {
-      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
-      const int *pInten_modifer_table = &g_etc1_inten_tables[table_idx][0];
-
-      uint r, g, b;
-      unpack_color4(r, g, b, packed_color4, true);
-      
-      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
-
-      const int y0 = pInten_modifer_table[0];
-      pDst[0].set(ir + y0, ig + y0, ib + y0);
-      
-      const int y1 = pInten_modifer_table[1];
-      pDst[1].set(ir + y1, ig + y1, ib + y1);
-
-      const int y2 = pInten_modifer_table[2];
-      pDst[2].set(ir + y2, ig + y2, ib + y2);
-
-      const int y3 = pInten_modifer_table[3];
-      pDst[3].set(ir + y3, ig + y3, ib + y3);
-   }
-      
-   bool unpack_etc1_block(const void* pETC1_block, unsigned int* pDst_pixels_rgba, bool preserve_alpha)
-   {
-      color_quad_u8* pDst = reinterpret_cast<color_quad_u8*>(pDst_pixels_rgba);
-      const etc1_block& block = *static_cast<const etc1_block*>(pETC1_block);
-
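-      // An ETC1 block is 8 bytes: base colors, table indices and mode bits in
-      // bytes 0-3, then 32 selector bits (two planes of one bit per pixel) in bytes 4-7.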
-      const bool diff_flag = block.get_diff_bit();
-      const bool flip_flag = block.get_flip_bit();
-      const uint table_index0 = block.get_inten_table(0);
-      const uint table_index1 = block.get_inten_table(1);
-
-      color_quad_u8 subblock_colors0[4];
-      color_quad_u8 subblock_colors1[4];
-      bool success = true;
-
-      if (diff_flag)
-      {
-         const uint16 base_color5 = block.get_base5_color();
-         const uint16 delta_color3 = block.get_delta3_color();
-         etc1_block::get_diff_subblock_colors(subblock_colors0, base_color5, table_index0);
-            
-         if (!etc1_block::get_diff_subblock_colors(subblock_colors1, base_color5, delta_color3, table_index1))
-            success = false;
-      }
-      else
-      {
-         const uint16 base_color4_0 = block.get_base4_color(0);
-         etc1_block::get_abs_subblock_colors(subblock_colors0, base_color4_0, table_index0);
-
-         const uint16 base_color4_1 = block.get_base4_color(1);
-         etc1_block::get_abs_subblock_colors(subblock_colors1, base_color4_1, table_index1);
-      }
-
-      if (preserve_alpha)
-      {
-         if (flip_flag)
-         {
-            for (uint y = 0; y < 2; y++)
-            {
-               pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]);
-               pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]);
-               pDst[2].set_rgb(subblock_colors0[block.get_selector(2, y)]);
-               pDst[3].set_rgb(subblock_colors0[block.get_selector(3, y)]);
-               pDst += 4;
-            }
-
-            for (uint y = 2; y < 4; y++)
-            {
-               pDst[0].set_rgb(subblock_colors1[block.get_selector(0, y)]);
-               pDst[1].set_rgb(subblock_colors1[block.get_selector(1, y)]);
-               pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]);
-               pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]);
-               pDst += 4;
-            }
-         }
-         else
-         {
-            for (uint y = 0; y < 4; y++)
-            {
-               pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]);
-               pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]);
-               pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]);
-               pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]);
-               pDst += 4;
-            }
-         }
-      }
-      else 
-      {
-         if (flip_flag)
-         {
-            // 0000
-            // 0000
-            // 1111
-            // 1111
-            for (uint y = 0; y < 2; y++)
-            {
-               pDst[0] = subblock_colors0[block.get_selector(0, y)];
-               pDst[1] = subblock_colors0[block.get_selector(1, y)];
-               pDst[2] = subblock_colors0[block.get_selector(2, y)];
-               pDst[3] = subblock_colors0[block.get_selector(3, y)];
-               pDst += 4;
-            }
-
-            for (uint y = 2; y < 4; y++)
-            {
-               pDst[0] = subblock_colors1[block.get_selector(0, y)];
-               pDst[1] = subblock_colors1[block.get_selector(1, y)];
-               pDst[2] = subblock_colors1[block.get_selector(2, y)];
-               pDst[3] = subblock_colors1[block.get_selector(3, y)];
-               pDst += 4;
-            }
-         }
-         else
-         {
-            // 0011
-            // 0011
-            // 0011
-            // 0011
-            for (uint y = 0; y < 4; y++)
-            {
-               pDst[0] = subblock_colors0[block.get_selector(0, y)];
-               pDst[1] = subblock_colors0[block.get_selector(1, y)];
-               pDst[2] = subblock_colors1[block.get_selector(2, y)];
-               pDst[3] = subblock_colors1[block.get_selector(3, y)];
-               pDst += 4;
-            }
-         }
-      }
-      
-      return success;
-   }
-
-   struct etc1_solution_coordinates
-   {
-      inline etc1_solution_coordinates() :
-         m_unscaled_color(0, 0, 0, 0),
-         m_inten_table(0),
-         m_color4(false)
-      {
-      }
-
-      inline etc1_solution_coordinates(uint r, uint g, uint b, uint inten_table, bool color4) :
-         m_unscaled_color(r, g, b, 255),
-         m_inten_table(inten_table),
-         m_color4(color4)
-      {
-      }
-
-      inline etc1_solution_coordinates(const color_quad_u8& c, uint inten_table, bool color4) :
-         m_unscaled_color(c),
-         m_inten_table(inten_table),
-         m_color4(color4)
-      {
-      }
-
-      inline etc1_solution_coordinates(const etc1_solution_coordinates& other)
-      {
-         *this = other;
-      }
-
-      inline etc1_solution_coordinates& operator= (const etc1_solution_coordinates& rhs)
-      {
-         m_unscaled_color = rhs.m_unscaled_color;
-         m_inten_table = rhs.m_inten_table;
-         m_color4 = rhs.m_color4;
-         return *this;
-      }
-
-      inline void clear()
-      {
-         m_unscaled_color.clear();
-         m_inten_table = 0;
-         m_color4 = false;
-      }
-
-      inline color_quad_u8 get_scaled_color() const
-      {
-         int br, bg, bb;
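-         // Expand the stored 444 or 555 components back to 888 exactly as the
-         // decoder does: nibble duplication for 444, bit replication for 555.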
-         if (m_color4)
-         {
-            br = m_unscaled_color.r | (m_unscaled_color.r << 4);
-            bg = m_unscaled_color.g | (m_unscaled_color.g << 4);
-            bb = m_unscaled_color.b | (m_unscaled_color.b << 4);
-         }
-         else
-         {
-            br = (m_unscaled_color.r >> 2) | (m_unscaled_color.r << 3);
-            bg = (m_unscaled_color.g >> 2) | (m_unscaled_color.g << 3);
-            bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
-         }
-         return color_quad_u8(br, bg, bb);
-      }
-
-      inline void get_block_colors(color_quad_u8* pBlock_colors)
-      {
-         int br, bg, bb;
-         if (m_color4)
-         {
-            br = m_unscaled_color.r | (m_unscaled_color.r << 4);
-            bg = m_unscaled_color.g | (m_unscaled_color.g << 4);
-            bb = m_unscaled_color.b | (m_unscaled_color.b << 4);
-         }
-         else
-         {
-            br = (m_unscaled_color.r >> 2) | (m_unscaled_color.r << 3);
-            bg = (m_unscaled_color.g >> 2) | (m_unscaled_color.g << 3);
-            bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
-         }
-         const int* pInten_table = g_etc1_inten_tables[m_inten_table];
-         pBlock_colors[0].set(br + pInten_table[0], bg + pInten_table[0], bb + pInten_table[0]);
-         pBlock_colors[1].set(br + pInten_table[1], bg + pInten_table[1], bb + pInten_table[1]);
-         pBlock_colors[2].set(br + pInten_table[2], bg + pInten_table[2], bb + pInten_table[2]);
-         pBlock_colors[3].set(br + pInten_table[3], bg + pInten_table[3], bb + pInten_table[3]);
-      }
-
-      color_quad_u8 m_unscaled_color;
-      uint m_inten_table;
-      bool m_color4;
-   };
-
-   class etc1_optimizer
-   {
-      etc1_optimizer(const etc1_optimizer&);
-      etc1_optimizer& operator= (const etc1_optimizer&);
-
-   public:
-      etc1_optimizer()
-      {
-         clear();
-      }
-
-      void clear()
-      {
-         m_pParams = NULL;
-         m_pResult = NULL;
-         m_pSorted_luma = NULL;
-         m_pSorted_luma_indices = NULL;
-      }
-
-      struct params : etc1_pack_params
-      {
-         params()
-         {
-            clear();
-         }
-
-         params(const etc1_pack_params& base_params) :
-            etc1_pack_params(base_params)
-         {
-            clear_optimizer_params();
-         }
-
-         void clear()
-         {
-            etc1_pack_params::clear();
-            clear_optimizer_params();
-         }
-
-         void clear_optimizer_params()
-         {
-            m_num_src_pixels = 0;
-            m_pSrc_pixels = 0;
-
-            m_use_color4 = false;
-            static const int s_default_scan_delta[] = { 0 };
-            m_pScan_deltas = s_default_scan_delta;
-            m_scan_delta_size = 1;
-
-            m_base_color5.clear();
-            m_constrain_against_base_color5 = false;
-         }
-
-         uint m_num_src_pixels;
-         const color_quad_u8* m_pSrc_pixels;
-
-         bool m_use_color4;
-         const int* m_pScan_deltas;
-         uint m_scan_delta_size;
-
-         color_quad_u8 m_base_color5;
-         bool m_constrain_against_base_color5;
-      };
-
-      struct results
-      {
-         uint64 m_error;
-         color_quad_u8 m_block_color_unscaled;
-         uint m_block_inten_table;
-         uint m_n;
-         uint8* m_pSelectors;
-         bool m_block_color4;
-
-         inline results& operator= (const results& rhs)
-         {
-            m_block_color_unscaled = rhs.m_block_color_unscaled;
-            m_block_color4 = rhs.m_block_color4;
-            m_block_inten_table = rhs.m_block_inten_table;
-            m_error = rhs.m_error;
-            RG_ETC1_ASSERT(m_n == rhs.m_n);
-            memcpy(m_pSelectors, rhs.m_pSelectors, rhs.m_n);
-            return *this;
-         }
-      };
-
-      void init(const params& params, results& result);
-      bool compute();
-
-   private:      
-      struct potential_solution
-      {
-         potential_solution() : m_coords(), m_error(cUINT64_MAX), m_valid(false)
-         {
-         }
-
-         etc1_solution_coordinates  m_coords;
-         uint8                      m_selectors[8];
-         uint64                     m_error;
-         bool                       m_valid;
-
-         void clear()
-         {
-            m_coords.clear();
-            m_error = cUINT64_MAX;
-            m_valid = false;
-         }
-      };
-
-      const params* m_pParams;
-      results* m_pResult;
-
-      int m_limit;
-
-      vec3F m_avg_color;
-      int m_br, m_bg, m_bb;
-      uint16 m_luma[8];
-      uint32 m_sorted_luma[2][8];
-      const uint32* m_pSorted_luma_indices;
-      uint32* m_pSorted_luma;
-
-      uint8 m_selectors[8];
-      uint8 m_best_selectors[8];
-
-      potential_solution m_best_solution;
-      potential_solution m_trial_solution;
-      uint8 m_temp_selectors[8];
-
-      bool evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
-      bool evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
-   };
-      
-   bool etc1_optimizer::compute()
-   {
-      const uint n = m_pParams->m_num_src_pixels;
-      const int scan_delta_size = m_pParams->m_scan_delta_size;
-      
-      // Scan through a subset of the 3D lattice centered around the avg block color trying each 3D (555 or 444) lattice point as a potential block color.
-      // Each time a better solution is found, try to refine the current solution's block color based on the current selectors and intensity table index.
-      for (int zdi = 0; zdi < scan_delta_size; zdi++)
-      {
-         const int zd = m_pParams->m_pScan_deltas[zdi];
-         const int mbb = m_bb + zd;
-         if (mbb < 0) continue; else if (mbb > m_limit) break;
-         
-         for (int ydi = 0; ydi < scan_delta_size; ydi++)
-         {
-            const int yd = m_pParams->m_pScan_deltas[ydi];
-            const int mbg = m_bg + yd;
-            if (mbg < 0) continue; else if (mbg > m_limit) break;
-
-            for (int xdi = 0; xdi < scan_delta_size; xdi++)
-            {
-               const int xd = m_pParams->m_pScan_deltas[xdi];
-               const int mbr = m_br + xd;
-               if (mbr < 0) continue; else if (mbr > m_limit) break;
-      
-               etc1_solution_coordinates coords(mbr, mbg, mbb, 0, m_pParams->m_use_color4);
-               if (m_pParams->m_quality == cHighQuality)
-               {
-                  if (!evaluate_solution(coords, m_trial_solution, &m_best_solution))
-                     continue;
-               }
-               else
-               {
-                  if (!evaluate_solution_fast(coords, m_trial_solution, &m_best_solution))
-                     continue;
-               }
-               
-               // Now we have the input block, the avg. color of the input pixels, a set of trial selector indices, and the block color+intensity index.
-               // Now, for each component, attempt to refine the current solution by solving a simple linear equation. For example, for 4 colors:
-               // The goal is:
-               // pixel0 - (block_color+inten_table[selector0]) + pixel1 - (block_color+inten_table[selector1]) + pixel2 - (block_color+inten_table[selector2]) + pixel3 - (block_color+inten_table[selector3]) = 0
-               // Rearranging this:
-               // (pixel0 + pixel1 + pixel2 + pixel3) - (block_color+inten_table[selector0]) - (block_color+inten_table[selector1]) - (block_color+inten_table[selector2]) - (block_color+inten_table[selector3]) = 0
-               // (pixel0 + pixel1 + pixel2 + pixel3) - block_color - inten_table[selector0] - block_color - inten_table[selector1] - block_color - inten_table[selector2] - block_color - inten_table[selector3] = 0
-               // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - inten_table[selector0] - inten_table[selector1] - inten_table[selector2] - inten_table[selector3] = 0
-               // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3]) = 0
-               // (pixel0 + pixel1 + pixel2 + pixel3)/4 - block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4 = 0
-               // block_color = (pixel0 + pixel1 + pixel2 + pixel3)/4 - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4
-               // So what this means:
-               // optimal_block_color = avg_input - avg_inten_delta
-               // So the optimal block color can be computed by taking the average block color and subtracting the current average of the intensity delta.
-               // Unfortunately, optimal_block_color must then be quantized to 555 or 444 so it's not always possible to improve matters using this formula.
-               // Also, the above formula is for unclamped intensity deltas. The actual implementation takes into account clamping.
-
-               const uint max_refinement_trials = (m_pParams->m_quality == cLowQuality) ? 2 : (((xd | yd | zd) == 0) ? 4 : 2);
-               for (uint refinement_trial = 0; refinement_trial < max_refinement_trials; refinement_trial++)
-               {
-                  const uint8* pSelectors = m_best_solution.m_selectors;
-                  const int* pInten_table = g_etc1_inten_tables[m_best_solution.m_coords.m_inten_table];
-
-                  int delta_sum_r = 0, delta_sum_g = 0, delta_sum_b = 0;
-                  const color_quad_u8 base_color(m_best_solution.m_coords.get_scaled_color());
-                  for (uint r = 0; r < n; r++)
-                  {
-                     const uint s = *pSelectors++;
-                     const int inten_delta = pInten_table[s];
-                     // Compute the actual delta being applied to each pixel, taking into account clamping.
-                     delta_sum_r += rg_etc1::clamp<int>(base_color.r + inten_delta, 0, 255) - base_color.r;
-                     delta_sum_g += rg_etc1::clamp<int>(base_color.g + inten_delta, 0, 255) - base_color.g;
-                     delta_sum_b += rg_etc1::clamp<int>(base_color.b + inten_delta, 0, 255) - base_color.b;
-                  }
-                  if ((!delta_sum_r) && (!delta_sum_g) && (!delta_sum_b))
-                     break;
-                  const float avg_delta_r_f = static_cast<float>(delta_sum_r) / n;
-                  const float avg_delta_g_f = static_cast<float>(delta_sum_g) / n;
-                  const float avg_delta_b_f = static_cast<float>(delta_sum_b) / n;
-                  // Cast to int (not uint): the refined value can be negative before clamping.
-                  const int br1 = rg_etc1::clamp<int>(static_cast<int>((m_avg_color[0] - avg_delta_r_f) * m_limit / 255.0f + .5f), 0, m_limit);
-                  const int bg1 = rg_etc1::clamp<int>(static_cast<int>((m_avg_color[1] - avg_delta_g_f) * m_limit / 255.0f + .5f), 0, m_limit);
-                  const int bb1 = rg_etc1::clamp<int>(static_cast<int>((m_avg_color[2] - avg_delta_b_f) * m_limit / 255.0f + .5f), 0, m_limit);
-                  
-                  bool skip = false;
-                  
-                  if ((mbr == br1) && (mbg == bg1) && (mbb == bb1))
-                     skip = true;
-                  else if ((br1 == m_best_solution.m_coords.m_unscaled_color.r) && (bg1 == m_best_solution.m_coords.m_unscaled_color.g) && (bb1 == m_best_solution.m_coords.m_unscaled_color.b))
-                     skip = true;
-                  else if ((m_br == br1) && (m_bg == bg1) && (m_bb == bb1))
-                     skip = true;
-
-                  if (skip)
-                     break;
-
-                  etc1_solution_coordinates coords1(br1, bg1, bb1, 0, m_pParams->m_use_color4);
-                  if (m_pParams->m_quality == cHighQuality)
-                  {
-                     if (!evaluate_solution(coords1, m_trial_solution, &m_best_solution)) 
-                        break;
-                  }
-                  else
-                  {
-                     if (!evaluate_solution_fast(coords1, m_trial_solution, &m_best_solution))
-                        break;
-                  }
-
-               }  // refinement_trial
-
-            } // xdi
-         } // ydi
-      } // zdi
-
-      if (!m_best_solution.m_valid)
-      {
-         m_pResult->m_error = cUINT64_MAX; // m_error is 64-bit, so use the matching sentinel
-         return false;
-      }
-      
-      const uint8* pSelectors = m_best_solution.m_selectors;
-
-#ifdef RG_ETC1_BUILD_DEBUG
-      {
-         color_quad_u8 block_colors[4];
-         m_best_solution.m_coords.get_block_colors(block_colors);
-
-         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
-         uint64 actual_error = 0;
-         for (uint i = 0; i < n; i++)
-            actual_error += pSrc_pixels[i].squared_distance_rgb(block_colors[pSelectors[i]]);
-         
-         RG_ETC1_ASSERT(actual_error == m_best_solution.m_error);
-      }
-#endif      
-      
-      m_pResult->m_error = m_best_solution.m_error;
-
-      m_pResult->m_block_color_unscaled = m_best_solution.m_coords.m_unscaled_color;
-      m_pResult->m_block_color4 = m_best_solution.m_coords.m_color4;
-      
-      m_pResult->m_block_inten_table = m_best_solution.m_coords.m_inten_table;
-      memcpy(m_pResult->m_pSelectors, pSelectors, n);
-      m_pResult->m_n = n;
-
-      return true;
-   }
-
-   void etc1_optimizer::init(const params& p, results& r)
-   {
-      // This version is hardcoded for 8 pixel subblocks.
-      RG_ETC1_ASSERT(p.m_num_src_pixels == 8);
-      
-      m_pParams = &p;
-      m_pResult = &r;
-                  
-      const uint n = 8;
-      
-      m_limit = m_pParams->m_use_color4 ? 15 : 31;
-
-      vec3F avg_color(0.0f);
-
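-      // Accumulate the average color and project each pixel onto the (1,1,1)
-      // intensity axis (its r+g+b sum); these projections drive the fast selector search.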
-      for (uint i = 0; i < n; i++)
-      {
-         const color_quad_u8& c = m_pParams->m_pSrc_pixels[i];
-         const vec3F fc(c.r, c.g, c.b);
-
-         avg_color += fc;
-
-         m_luma[i] = static_cast<uint16>(c.r + c.g + c.b);
-         m_sorted_luma[0][i] = i;
-      }
-      avg_color *= (1.0f / static_cast<float>(n));
-      m_avg_color = avg_color;
-
-      m_br = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[0] * m_limit / 255.0f + .5f), 0, m_limit);
-      m_bg = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[1] * m_limit / 255.0f + .5f), 0, m_limit);
-      m_bb = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[2] * m_limit / 255.0f + .5f), 0, m_limit);
-
-      if (m_pParams->m_quality <= cMediumQuality)
-      {
-         m_pSorted_luma_indices = indirect_radix_sort(n, m_sorted_luma[0], m_sorted_luma[1], m_luma, 0, sizeof(m_luma[0]), false);
-         m_pSorted_luma = m_sorted_luma[0];
-         if (m_pSorted_luma_indices == m_sorted_luma[0])
-            m_pSorted_luma = m_sorted_luma[1];
-      
-         for (uint i = 0; i < n; i++)
-            m_pSorted_luma[i] = m_luma[m_pSorted_luma_indices[i]];
-      }
-      
-      m_best_solution.m_coords.clear();
-      m_best_solution.m_valid = false;
-      m_best_solution.m_error = cUINT64_MAX;
-   }
-
-   bool etc1_optimizer::evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
-   {
-      trial_solution.m_valid = false;
-
-      if (m_pParams->m_constrain_against_base_color5)
-      {
-         const int dr = coords.m_unscaled_color.r - m_pParams->m_base_color5.r;
-         const int dg = coords.m_unscaled_color.g - m_pParams->m_base_color5.g;
-         const int db = coords.m_unscaled_color.b - m_pParams->m_base_color5.b;
-
-         if ((rg_etc1::minimum(dr, dg, db) < cETC1ColorDeltaMin) || (rg_etc1::maximum(dr, dg, db) > cETC1ColorDeltaMax))
-            return false;
-      }
-
-      const color_quad_u8 base_color(coords.get_scaled_color());
-      
-      const uint n = 8;
-            
-      trial_solution.m_error = cUINT64_MAX;
-            
-      for (uint inten_table = 0; inten_table < cETC1IntenModifierValues; inten_table++)
-      {
-         const int* pInten_table = g_etc1_inten_tables[inten_table];
-
-         color_quad_u8 block_colors[4];
-         for (uint s = 0; s < 4; s++)
-         {
-            const int yd = pInten_table[s];
-            block_colors[s].set(base_color.r + yd, base_color.g + yd, base_color.b + yd, 0);
-         }
-         
-         uint64 total_error = 0;
-         
-         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
-         for (uint c = 0; c < n; c++)
-         {
-            const color_quad_u8& src_pixel = *pSrc_pixels++;
-            
-            uint best_selector_index = 0;
-            uint best_error = rg_etc1::square(src_pixel.r - block_colors[0].r) + rg_etc1::square(src_pixel.g - block_colors[0].g) + rg_etc1::square(src_pixel.b - block_colors[0].b);
-
-            uint trial_error = rg_etc1::square(src_pixel.r - block_colors[1].r) + rg_etc1::square(src_pixel.g - block_colors[1].g) + rg_etc1::square(src_pixel.b - block_colors[1].b);
-            if (trial_error < best_error)
-            {
-               best_error = trial_error;
-               best_selector_index = 1;
-            }
-
-            trial_error = rg_etc1::square(src_pixel.r - block_colors[2].r) + rg_etc1::square(src_pixel.g - block_colors[2].g) + rg_etc1::square(src_pixel.b - block_colors[2].b);
-            if (trial_error < best_error)
-            {
-               best_error = trial_error;
-               best_selector_index = 2;
-            }
-
-            trial_error = rg_etc1::square(src_pixel.r - block_colors[3].r) + rg_etc1::square(src_pixel.g - block_colors[3].g) + rg_etc1::square(src_pixel.b - block_colors[3].b);
-            if (trial_error < best_error)
-            {
-               best_error = trial_error;
-               best_selector_index = 3;
-            }
-
-            m_temp_selectors[c] = static_cast<uint8>(best_selector_index);
-
-            total_error += best_error;
-            if (total_error >= trial_solution.m_error)
-               break;
-         }
-         
-         if (total_error < trial_solution.m_error)
-         {
-            trial_solution.m_error = total_error;
-            trial_solution.m_coords.m_inten_table = inten_table;
-            memcpy(trial_solution.m_selectors, m_temp_selectors, 8);
-            trial_solution.m_valid = true;
-         }
-      }
-      trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
-      trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
-
-      bool success = false;
-      if (pBest_solution)
-      {
-         if (trial_solution.m_error < pBest_solution->m_error)
-         {
-            *pBest_solution = trial_solution;
-            success = true;
-         }
-      }
-
-      return success;
-   }
-
-   bool etc1_optimizer::evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
-   {
-      if (m_pParams->m_constrain_against_base_color5)
-      {
-         const int dr = coords.m_unscaled_color.r - m_pParams->m_base_color5.r;
-         const int dg = coords.m_unscaled_color.g - m_pParams->m_base_color5.g;
-         const int db = coords.m_unscaled_color.b - m_pParams->m_base_color5.b;
-
-         if ((rg_etc1::minimum(dr, dg, db) < cETC1ColorDeltaMin) || (rg_etc1::maximum(dr, dg, db) > cETC1ColorDeltaMax))
-         {
-            trial_solution.m_valid = false;
-            return false;
-         }
-      }
-
-      const color_quad_u8 base_color(coords.get_scaled_color());
-
-      const uint n = 8;
-      
-      trial_solution.m_error = cUINT64_MAX;
-
-      for (int inten_table = cETC1IntenModifierValues - 1; inten_table >= 0; --inten_table)
-      {
-         const int* pInten_table = g_etc1_inten_tables[inten_table];
-
-         uint block_inten[4];
-         color_quad_u8 block_colors[4];
-         for (uint s = 0; s < 4; s++)
-         {
-            const int yd = pInten_table[s];
-            color_quad_u8 block_color(base_color.r + yd, base_color.g + yd, base_color.b + yd, 0);
-            block_colors[s] = block_color;
-            block_inten[s] = block_color.r + block_color.g + block_color.b;
-         }
-
-         // evaluate_solution_fast() enforces/assumes a total ordering of the input colors along the intensity (1,1,1) axis to more quickly classify the inputs to selectors.
-         // The input colors have been presorted along the projection onto this axis, and ETC1 block colors are always ordered along the intensity axis, so this classification is fast.
-         // 0   1   2   3
-         //   01  12  23
-         const uint block_inten_midpoints[3] = { block_inten[0] + block_inten[1], block_inten[1] + block_inten[2], block_inten[2] + block_inten[3] };
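-         // Comparing (luma * 2) against these pairwise sums picks the nearest block
-         // color without dividing the midpoints by two.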
-
-         uint64 total_error = 0;
-         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
-         if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0])
-         {
-            if (block_inten[0] > m_pSorted_luma[n - 1])
-            {
-               const uint min_error = intabs(block_inten[0] - m_pSorted_luma[n - 1]);
-               if (min_error >= trial_solution.m_error)
-                  continue;
-            }
-
-            memset(&m_temp_selectors[0], 0, n);
-
-            for (uint c = 0; c < n; c++)
-               total_error += block_colors[0].squared_distance_rgb(pSrc_pixels[c]);
-         }
-         else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2])
-         {
-            if (m_pSorted_luma[0] > block_inten[3])
-            {
-               const uint min_error = intabs(m_pSorted_luma[0] - block_inten[3]);
-               if (min_error >= trial_solution.m_error)
-                  continue;
-            }
-
-            memset(&m_temp_selectors[0], 3, n);
-
-            for (uint c = 0; c < n; c++)
-               total_error += block_colors[3].squared_distance_rgb(pSrc_pixels[c]);
-         }
-         else
-         {
-            uint cur_selector = 0, c;
-            for (c = 0; c < n; c++)
-            {
-               const uint y = m_pSorted_luma[c];
-               while ((y * 2) >= block_inten_midpoints[cur_selector])
-                  if (++cur_selector > 2)
-                     goto done;
-               const uint sorted_pixel_index = m_pSorted_luma_indices[c];
-               m_temp_selectors[sorted_pixel_index] = static_cast<uint8>(cur_selector);
-               total_error += block_colors[cur_selector].squared_distance_rgb(pSrc_pixels[sorted_pixel_index]);
-            }
-done:
-            while (c < n)
-            {
-               const uint sorted_pixel_index = m_pSorted_luma_indices[c];
-               m_temp_selectors[sorted_pixel_index] = 3;
-               total_error += block_colors[3].squared_distance_rgb(pSrc_pixels[sorted_pixel_index]);
-               ++c;
-            }
-         }
-
-         if (total_error < trial_solution.m_error)
-         {
-            trial_solution.m_error = total_error;
-            trial_solution.m_coords.m_inten_table = inten_table;
-            memcpy(trial_solution.m_selectors, m_temp_selectors, n);
-            trial_solution.m_valid = true;
-            if (!total_error)
-               break;
-         }
-      }
-      trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
-      trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
-      
-      bool success = false;
-      if (pBest_solution)
-      {
-         if (trial_solution.m_error < pBest_solution->m_error)
-         {
-            *pBest_solution = trial_solution;
-            success = true;
-         }
-      }
-
-      return success;
-   }
-         
-   static uint etc1_decode_value(uint diff, uint inten, uint selector, uint packed_c)
-   {
-      const uint limit = diff ? 32 : 16; (void)limit; // only referenced by the assert below
-      RG_ETC1_ASSERT((diff < 2) && (inten < 8) && (selector < 4) && (packed_c < limit));
-      int c;
-      if (diff)
-         c = (packed_c >> 2) | (packed_c << 3);
-      else 
-         c = packed_c | (packed_c << 4);
-      c += g_etc1_inten_tables[inten][selector];
-      c = rg_etc1::clamp<int>(c, 0, 255);
-      return c;
-   }
-
-   static inline int mul_8bit(int a, int b) { int t = a*b + 128; return (t + (t >> 8)) >> 8; }
-
-   void pack_etc1_block_init()
-   {
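-      // For every (diff, inten, selector, 8-bit color) combination, find the packed
-      // base component that decodes closest to that color. Each inverse-table entry
-      // stores the best packed component in its low byte and its error in the high byte.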
-      for (uint diff = 0; diff < 2; diff++)
-      {
-         const uint limit = diff ? 32 : 16;
-
-         for (uint inten = 0; inten < 8; inten++)
-         {
-            for (uint selector = 0; selector < 4; selector++)
-            {
-               const uint inverse_table_index = diff + (inten << 1) + (selector << 4);
-               for (uint color = 0; color < 256; color++)
-               {
-                  uint best_error = cUINT32_MAX, best_packed_c = 0;
-                  for (uint packed_c = 0; packed_c < limit; packed_c++)
-                  {
-                     int v = etc1_decode_value(diff, inten, selector, packed_c);
-                     uint err = labs(v - static_cast<int>(color));
-                     if (err < best_error)
-                     {
-                        best_error = err;
-                        best_packed_c = packed_c;
-                        if (!best_error) 
-                           break;
-                     }
-                  }
-                  RG_ETC1_ASSERT(best_error <= 255);
-                  g_etc1_inverse_lookup[inverse_table_index][color] = static_cast<uint16>(best_packed_c | (best_error << 8));
-               }
-            }
-         }
-      }
-      
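-      // Build the ditherer's 5-bit quantization table: g_quant5_tab maps an 8-bit
-      // value (biased by 8) to the nearest 555-expanded 8-bit value.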
-      uint expand5[32];
-      for(int i = 0; i < 32; i++)
-         expand5[i] = (i << 3) | (i >> 2);
-
-      for(int i = 0; i < 256 + 16; i++)
-      {
-         int v = clamp<int>(i - 8, 0, 255);
-         g_quant5_tab[i] = static_cast<uint8>(expand5[mul_8bit(v,31)]);
-      }
-   }
-
-   // Packs solid color blocks efficiently using a set of small precomputed tables.
-   // For random 888 inputs, MSE results are better than Ericsson's ETC1 packer in "slow" mode ~9.5% of the time, slightly worse only ~.01% of the time, and equal the rest of the time.
-   static uint64 pack_etc1_block_solid_color(etc1_block& block, const uint8* pColor, etc1_pack_params& pack_params)
-   {
-      (void)pack_params; // unused here; the cast silences unused-parameter warnings
-      RG_ETC1_ASSERT(g_etc1_inverse_lookup[0][255]);
-            
-      static uint s_next_comp[4] = { 1, 2, 0, 1 };
-            
-      uint best_error = cUINT32_MAX, best_i = 0;
-      int best_x = 0, best_packed_c1 = 0, best_packed_c2 = 0;
-
-      // For each possible 8-bit value, there is a precomputed list of diff/inten/selector configurations that allow that 8-bit value to be encoded with no error.
-      for (uint i = 0; i < 3; i++)
-      {
-         const uint c1 = pColor[s_next_comp[i]], c2 = pColor[s_next_comp[i + 1]];
-
-         const int delta_range = 1;
-         for (int delta = -delta_range; delta <= delta_range; delta++)
-         {
-            const int c_plus_delta = rg_etc1::clamp<int>(pColor[i] + delta, 0, 255);
-
-            const uint16* pTable;
-            if (!c_plus_delta)
-               pTable = g_color8_to_etc_block_config_0_255[0];
-            else if (c_plus_delta == 255)
-               pTable = g_color8_to_etc_block_config_0_255[1];
-            else
-               pTable = g_color8_to_etc_block_config_1_to_254[c_plus_delta - 1];
-
-            do
-            {
-               const uint x = *pTable++;
-
-#ifdef RG_ETC1_BUILD_DEBUG
-               const uint diff = x & 1;
-               const uint inten = (x >> 1) & 7;
-               const uint selector = (x >> 4) & 3;
-               const uint p0 = (x >> 8) & 255;
-               RG_ETC1_ASSERT(etc1_decode_value(diff, inten, selector, p0) == (uint)c_plus_delta);
-#endif
-
-               const uint16* pInverse_table = g_etc1_inverse_lookup[x & 0xFF];
-               uint16 p1 = pInverse_table[c1];
-               uint16 p2 = pInverse_table[c2];
-               const uint trial_error = rg_etc1::square(c_plus_delta - pColor[i]) + rg_etc1::square(p1 >> 8) + rg_etc1::square(p2 >> 8);
-               if (trial_error < best_error)
-               {
-                  best_error = trial_error;
-                  best_x = x;
-                  best_packed_c1 = p1 & 0xFF;
-                  best_packed_c2 = p2 & 0xFF;
-                  best_i = i;
-                  if (!best_error)
-                     goto found_perfect_match;
-               }
-            } while (*pTable != 0xFFFF);
-         }
-      }
-found_perfect_match:
-
-      const uint diff = best_x & 1;
-      const uint inten = (best_x >> 1) & 7;
-
-      block.m_bytes[3] = static_cast<uint8>(((inten | (inten << 3)) << 2) | (diff << 1));
-                        
-      const uint etc1_selector = g_selector_index_to_etc1[(best_x >> 4) & 3];
-      *reinterpret_cast<uint16*>(&block.m_bytes[4]) = (etc1_selector & 2) ? 0xFFFF : 0;
-      *reinterpret_cast<uint16*>(&block.m_bytes[6]) = (etc1_selector & 1) ? 0xFFFF : 0;
-
-      const uint best_packed_c0 = (best_x >> 8) & 255;
-      if (diff)
-      {
-         block.m_bytes[best_i] = static_cast<uint8>(best_packed_c0 << 3);
-         block.m_bytes[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1 << 3);
-         block.m_bytes[s_next_comp[best_i+1]] = static_cast<uint8>(best_packed_c2 << 3);
-      }
-      else
-      {
-         block.m_bytes[best_i] = static_cast<uint8>(best_packed_c0 | (best_packed_c0 << 4));
-         block.m_bytes[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1 | (best_packed_c1 << 4));
-         block.m_bytes[s_next_comp[best_i+1]] = static_cast<uint8>(best_packed_c2 | (best_packed_c2 << 4));
-      }
-
-      return best_error;
-   }
-      
-   static uint pack_etc1_block_solid_color_constrained(
-      etc1_optimizer::results& results, 
-      uint num_colors, const uint8* pColor, 
-      etc1_pack_params& pack_params, 
-      bool use_diff,
-      const color_quad_u8* pBase_color5_unscaled)
-   {
-      RG_ETC1_ASSERT(g_etc1_inverse_lookup[0][255]);
-
-      (void)pack_params; // unused here; the cast silences unused-parameter warnings
-      static uint s_next_comp[4] = { 1, 2, 0, 1 };
-
-      uint best_error = cUINT32_MAX, best_i = 0;
-      int best_x = 0, best_packed_c1 = 0, best_packed_c2 = 0;
-
-      // For each possible 8-bit value, there is a precomputed list of diff/inten/selector configurations that allow that 8-bit value to be encoded with no error.
-      for (uint i = 0; i < 3; i++)
-      {
-         const uint c1 = pColor[s_next_comp[i]], c2 = pColor[s_next_comp[i + 1]];
-
-         const int delta_range = 1;
-         for (int delta = -delta_range; delta <= delta_range; delta++)
-         {
-            const int c_plus_delta = rg_etc1::clamp<int>(pColor[i] + delta, 0, 255);
-
-            const uint16* pTable;
-            if (!c_plus_delta)
-               pTable = g_color8_to_etc_block_config_0_255[0];
-            else if (c_plus_delta == 255)
-               pTable = g_color8_to_etc_block_config_0_255[1];
-            else
-               pTable = g_color8_to_etc_block_config_1_to_254[c_plus_delta - 1];
-
-            do
-            {
-               const uint x = *pTable++;
-               const uint diff = x & 1;
-               if (static_cast<uint>(use_diff) != diff)
-               {
-                  if (*pTable == 0xFFFF)
-                     break;
-                  continue;
-               }
-
-               if ((diff) && (pBase_color5_unscaled))
-               {
-                  const int p0 = (x >> 8) & 255;
-                  const int base_delta = p0 - static_cast<int>(pBase_color5_unscaled->c[i]);
-                  if ((base_delta < cETC1ColorDeltaMin) || (base_delta > cETC1ColorDeltaMax))
-                  {
-                     if (*pTable == 0xFFFF)
-                        break;
-                     continue;
-                  }
-               }
-
-#ifdef RG_ETC1_BUILD_DEBUG
-               {
-                  const uint inten = (x >> 1) & 7;
-                  const uint selector = (x >> 4) & 3;
-                  const uint p0 = (x >> 8) & 255;
-                  RG_ETC1_ASSERT(etc1_decode_value(diff, inten, selector, p0) == (uint)c_plus_delta);
-               }
-#endif
-
-               const uint16* pInverse_table = g_etc1_inverse_lookup[x & 0xFF];
-               uint16 p1 = pInverse_table[c1];
-               uint16 p2 = pInverse_table[c2];
-
-               if ((diff) && (pBase_color5_unscaled))
-               {
-                  int delta1 = (p1 & 0xFF) - static_cast<int>(pBase_color5_unscaled->c[s_next_comp[i]]);
-                  int delta2 = (p2 & 0xFF) - static_cast<int>(pBase_color5_unscaled->c[s_next_comp[i + 1]]);
-                  if ((delta1 < cETC1ColorDeltaMin) || (delta1 > cETC1ColorDeltaMax) || (delta2 < cETC1ColorDeltaMin) || (delta2 > cETC1ColorDeltaMax))
-                  {
-                     if (*pTable == 0xFFFF)
-                        break;
-                     continue;
-                  }
-               }
-
-               const uint trial_error = rg_etc1::square(c_plus_delta - pColor[i]) + rg_etc1::square(p1 >> 8) + rg_etc1::square(p2 >> 8);
-               if (trial_error < best_error)
-               {
-                  best_error = trial_error;
-                  best_x = x;
-                  best_packed_c1 = p1 & 0xFF;
-                  best_packed_c2 = p2 & 0xFF;
-                  best_i = i;
-                  if (!best_error)
-                     goto found_perfect_match;
-               }
-            } while (*pTable != 0xFFFF);
-         }
-      }
-found_perfect_match:
-
-      if (best_error == cUINT32_MAX)
-         return best_error;
-
-      best_error *= num_colors;
-
-      results.m_n = num_colors;
-      results.m_block_color4 = !(best_x & 1);
-      results.m_block_inten_table = (best_x >> 1) & 7;
-      memset(results.m_pSelectors, (best_x >> 4) & 3, num_colors);
-
-      const uint best_packed_c0 = (best_x >> 8) & 255;
-      results.m_block_color_unscaled[best_i] = static_cast<uint8>(best_packed_c0);
-      results.m_block_color_unscaled[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1);
-      results.m_block_color_unscaled[s_next_comp[best_i + 1]] = static_cast<uint8>(best_packed_c2);
-      results.m_error = best_error;
-      
-      return best_error;
-   }
-
-   // Function originally from RYG's public domain real-time DXT1 compressor, modified for 555.
-   static void dither_block_555(color_quad_u8* dest, const color_quad_u8* block)
-   {
-      int err[8],*ep1 = err,*ep2 = err+4;
-      uint8 *quant = g_quant5_tab+8;
-
-      memset(dest, 0xFF, sizeof(color_quad_u8)*16);
-
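-      // Floyd-Steinberg-style error diffusion: each pixel's quantization error is
-      // spread to its neighbors with 7/16, 3/16, 5/16 and 1/16 weights (the >> 4
-      // below), with ep1/ep2 swapping roles as the current/previous error rows.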
-      // process channels separately
-      for(int ch=0;ch<3;ch++)
-      {
-         uint8* bp = (uint8*)block;
-         uint8* dp = (uint8*)dest;
-
-         bp += ch; dp += ch;
-
-         memset(err,0, sizeof(err));
-         for(int y = 0; y < 4; y++)
-         {
-            // pixel 0
-            dp[ 0] = quant[bp[ 0] + ((3*ep2[1] + 5*ep2[0]) >> 4)];
-            ep1[0] = bp[ 0] - dp[ 0];
-
-            // pixel 1
-            dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)];
-            ep1[1] = bp[ 4] - dp[ 4];
-
-            // pixel 2
-            dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)];
-            ep1[2] = bp[ 8] - dp[ 8];
-
-            // pixel 3
-            dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)];
-            ep1[3] = bp[12] - dp[12];
-
-            // advance to next line
-            int* tmp = ep1; ep1 = ep2; ep2 = tmp;
-            bp += 16;
-            dp += 16;
-         }
-      }
-   }
-
-   unsigned int pack_etc1_block(void* pETC1_block, const unsigned int* pSrc_pixels_rgba, etc1_pack_params& pack_params)
-   {
-      const color_quad_u8* pSrc_pixels = reinterpret_cast<const color_quad_u8*>(pSrc_pixels_rgba);
-      etc1_block& dst_block = *static_cast<etc1_block*>(pETC1_block);
-
-#ifdef RG_ETC1_BUILD_DEBUG
-      // Ensure all alpha values are 0xFF.
-      for (uint i = 0; i < 16; i++)
-      {
-         RG_ETC1_ASSERT(pSrc_pixels[i].a == 255);
-      }
-#endif
-
-      color_quad_u8 src_pixel0(pSrc_pixels[0]);
-
-      // Check for solid block.
-      const uint32 first_pixel_u32 = pSrc_pixels->m_u32;
-      int r;
-      for (r = 15; r >= 1; --r)
-         if (pSrc_pixels[r].m_u32 != first_pixel_u32)
-            break;
-      if (!r)
-         return static_cast<unsigned int>(16 * pack_etc1_block_solid_color(dst_block, &pSrc_pixels[0].r, pack_params));
-      
-      color_quad_u8 dithered_pixels[16];
-      if (pack_params.m_dithering)
-      {
-         dither_block_555(dithered_pixels, pSrc_pixels);
-         pSrc_pixels = dithered_pixels;
-      }
-
-      etc1_optimizer optimizer;
-
-      uint64 best_error = cUINT64_MAX;
-      uint best_flip = 0, best_use_color4 = 0;
-      
-      uint8 best_selectors[2][8];
-      etc1_optimizer::results best_results[2];
-      for (uint i = 0; i < 2; i++)
-      {
-         best_results[i].m_n = 8;
-         best_results[i].m_pSelectors = best_selectors[i];
-      }
-      
-      uint8 selectors[3][8];
-      etc1_optimizer::results results[3];
-      
-      for (uint i = 0; i < 3; i++)
-      {
-         results[i].m_n = 8;
-         results[i].m_pSelectors = selectors[i];
-      }
-            
-      color_quad_u8 subblock_pixels[8];
-
-      etc1_optimizer::params params(pack_params);
-      params.m_num_src_pixels = 8;
-      params.m_pSrc_pixels = subblock_pixels;
-
-      for (uint flip = 0; flip < 2; flip++)
-      {
-         for (uint use_color4 = 0; use_color4 < 2; use_color4++)
-         {
-            uint64 trial_error = 0;
-
-            uint subblock;
-            for (subblock = 0; subblock < 2; subblock++)
-            {
-               if (flip)
-                  memcpy(subblock_pixels, pSrc_pixels + subblock * 8, sizeof(color_quad_u8) * 8);
-               else
-               {
-                  const color_quad_u8* pSrc_col = pSrc_pixels + subblock * 2;
-                  subblock_pixels[0] = pSrc_col[0]; subblock_pixels[1] = pSrc_col[4]; subblock_pixels[2] = pSrc_col[8]; subblock_pixels[3] = pSrc_col[12];
-                  subblock_pixels[4] = pSrc_col[1]; subblock_pixels[5] = pSrc_col[5]; subblock_pixels[6] = pSrc_col[9]; subblock_pixels[7] = pSrc_col[13];
-               }
-
-               results[2].m_error = cUINT64_MAX;
-               if ((params.m_quality >= cMediumQuality) && ((subblock) || (use_color4)))
-               {
-                  const uint32 subblock_pixel0_u32 = subblock_pixels[0].m_u32;
-                  for (r = 7; r >= 1; --r)
-                     if (subblock_pixels[r].m_u32 != subblock_pixel0_u32)
-                        break;
-                  if (!r)
-                  {
-                     pack_etc1_block_solid_color_constrained(results[2], 8, &subblock_pixels[0].r, pack_params, !use_color4, (subblock && !use_color4) ? &results[0].m_block_color_unscaled : NULL);
-                  }
-               }
-
-               params.m_use_color4 = (use_color4 != 0);
-               params.m_constrain_against_base_color5 = false;
-
-               if ((!use_color4) && (subblock))
-               {
-                  params.m_constrain_against_base_color5 = true;
-                  params.m_base_color5 = results[0].m_block_color_unscaled;
-               }
-                              
-               if (params.m_quality == cHighQuality)
-               {
-                  static const int s_scan_delta_0_to_4[] = { -4, -3, -2, -1, 0, 1, 2, 3, 4 };
-                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0_to_4);
-                  params.m_pScan_deltas = s_scan_delta_0_to_4;
-               }
-               else if (params.m_quality == cMediumQuality)
-               {
-                  static const int s_scan_delta_0_to_1[] = { -1, 0, 1 };
-                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0_to_1);
-                  params.m_pScan_deltas = s_scan_delta_0_to_1;
-               }
-               else
-               {
-                  static const int s_scan_delta_0[] = { 0 };
-                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0);
-                  params.m_pScan_deltas = s_scan_delta_0;
-               }
-               
-               optimizer.init(params, results[subblock]);
-               if (!optimizer.compute())
-                  break;
-                              
-               if (params.m_quality >= cMediumQuality)
-               {
-                  // TODO: Fix fairly arbitrary/unrefined thresholds that control how far away to scan for potentially better solutions.
-                  const uint refinement_error_thresh0 = 3000;
-                  const uint refinement_error_thresh1 = 6000;
-                  if (results[subblock].m_error > refinement_error_thresh0)
-                  {
-                     if (params.m_quality == cMediumQuality)
-                     {
-                        static const int s_scan_delta_2_to_3[] = { -3, -2, 2, 3 };
-                        params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_2_to_3);
-                        params.m_pScan_deltas = s_scan_delta_2_to_3;
-                     }
-                     else
-                     {
-                        static const int s_scan_delta_5_to_5[] = { -5, 5 };
-                        static const int s_scan_delta_5_to_8[] = { -8, -7, -6, -5, 5, 6, 7, 8 };
-                        if (results[subblock].m_error > refinement_error_thresh1)
-                        {
-                           params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_5_to_8);
-                           params.m_pScan_deltas = s_scan_delta_5_to_8;
-                        }
-                        else
-                        {
-                           params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_5_to_5);
-                           params.m_pScan_deltas = s_scan_delta_5_to_5;
-                        }
-                     }
-
-                     if (!optimizer.compute())
-                        break;
-                  }
-
-                  if (results[2].m_error < results[subblock].m_error)
-                     results[subblock] = results[2];
-               }
-                            
-               trial_error += results[subblock].m_error;
-               if (trial_error >= best_error)
-                  break;
-            }
-
-            if (subblock < 2)
-               continue;
-
-            best_error = trial_error;
-            best_results[0] = results[0];
-            best_results[1] = results[1];
-            best_flip = flip;
-            best_use_color4 = use_color4;
-            
-         } // use_color4
-
-      } // flip
-
-      int dr = best_results[1].m_block_color_unscaled.r - best_results[0].m_block_color_unscaled.r;
-      int dg = best_results[1].m_block_color_unscaled.g - best_results[0].m_block_color_unscaled.g;
-      int db = best_results[1].m_block_color_unscaled.b - best_results[0].m_block_color_unscaled.b;
-      RG_ETC1_ASSERT(best_use_color4 || ((rg_etc1::minimum(dr, dg, db) >= cETC1ColorDeltaMin) && (rg_etc1::maximum(dr, dg, db) <= cETC1ColorDeltaMax)));
-           
-      if (best_use_color4)
-      {
-         dst_block.m_bytes[0] = static_cast<uint8>(best_results[1].m_block_color_unscaled.r | (best_results[0].m_block_color_unscaled.r << 4));
-         dst_block.m_bytes[1] = static_cast<uint8>(best_results[1].m_block_color_unscaled.g | (best_results[0].m_block_color_unscaled.g << 4));
-         dst_block.m_bytes[2] = static_cast<uint8>(best_results[1].m_block_color_unscaled.b | (best_results[0].m_block_color_unscaled.b << 4));
-      }
-      else
-      {
-         if (dr < 0) dr += 8;
-         dst_block.m_bytes[0] = static_cast<uint8>((best_results[0].m_block_color_unscaled.r << 3) | dr);
-         if (dg < 0) dg += 8;
-         dst_block.m_bytes[1] = static_cast<uint8>((best_results[0].m_block_color_unscaled.g << 3) | dg);
-         if (db < 0) db += 8;
-         dst_block.m_bytes[2] = static_cast<uint8>((best_results[0].m_block_color_unscaled.b << 3) | db);
-      }
-      
-      dst_block.m_bytes[3] = static_cast<uint8>( (best_results[1].m_block_inten_table << 2) | (best_results[0].m_block_inten_table << 5) | ((~best_use_color4 & 1) << 1) | best_flip );
-      
-      uint selector0 = 0, selector1 = 0;
-      if (best_flip)
-      {
-         // flipped:
-         // { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 },               
-         // { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 } 
-         //
-         // { 0, 2 }, { 1, 2 }, { 2, 2 }, { 3, 2 },
-         // { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 }
-         const uint8* pSelectors0 = best_results[0].m_pSelectors;
-         const uint8* pSelectors1 = best_results[1].m_pSelectors;
-         for (int x = 3; x >= 0; --x)
-         {
-            uint b;
-            b = g_selector_index_to_etc1[pSelectors1[4 + x]];
-            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-            b = g_selector_index_to_etc1[pSelectors1[x]];
-            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-            b = g_selector_index_to_etc1[pSelectors0[4 + x]];
-            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-            b = g_selector_index_to_etc1[pSelectors0[x]];
-            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-         }
-      }
-      else
-      {
-         // non-flipped:
-         // { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 },
-         // { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 }
-         //
-         // { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
-         // { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 }
-         for (int subblock = 1; subblock >= 0; --subblock)
-         {
-            const uint8* pSelectors = best_results[subblock].m_pSelectors + 4;
-            for (uint i = 0; i < 2; i++)
-            {
-               uint b;
-               b = g_selector_index_to_etc1[pSelectors[3]];
-               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-               b = g_selector_index_to_etc1[pSelectors[2]];
-               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-               b = g_selector_index_to_etc1[pSelectors[1]];
-               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-               b = g_selector_index_to_etc1[pSelectors[0]];
-               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
-
-               pSelectors -= 4;
-            }
-         }
-      }
-                  
-      dst_block.m_bytes[4] = static_cast<uint8>(selector1 >> 8); dst_block.m_bytes[5] = static_cast<uint8>(selector1 & 0xFF);
-      dst_block.m_bytes[6] = static_cast<uint8>(selector0 >> 8); dst_block.m_bytes[7] = static_cast<uint8>(selector0 & 0xFF);
-
-      return static_cast<unsigned int>(best_error);
-   }
-
-} // namespace rg_etc1
+// File: rg_etc1.cpp - Fast, high quality ETC1 block packer/unpacker - Rich Geldreich <[email protected]>
+// Please see ZLIB license at the end of rg_etc1.h.
+//
+// For more information on Ericsson Texture Compression (ETC/ETC1), see:
+// http://www.khronos.org/registry/gles/extensions/OES/OES_compressed_ETC1_RGB8_texture.txt
+//
+// v1.03 - 5/12/13 - Initial public release
+#include "rg_etc1.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4201) // nonstandard extension used: nameless struct/union
+#endif
+
+#if defined(_DEBUG) || defined(DEBUG)
+#define RG_ETC1_BUILD_DEBUG
+#endif
+
+#define RG_ETC1_ASSERT assert
+
+namespace rg_etc1
+{
+
+   // Local absolute-value helpers, kept self-contained so the compressor does not depend on the C library overloads.
+   inline long labs(long val) {
+      return val < 0 ? -val : val;
+   }
+
+   inline int intabs(int val) {
+      return val < 0 ? -val : val;
+   }
+
+   typedef unsigned char uint8;
+   typedef unsigned short uint16;
+   typedef unsigned int uint;
+   typedef unsigned int uint32;
+   typedef long long int64;
+   typedef unsigned long long uint64;
+
+   const uint32 cUINT32_MAX = 0xFFFFFFFFU;
+   const uint64 cUINT64_MAX = 0xFFFFFFFFFFFFFFFFULL; //0xFFFFFFFFFFFFFFFFui64;
+   
+   template<typename T> inline T minimum(T a, T b) { return (a < b) ? a : b; }
+   template<typename T> inline T minimum(T a, T b, T c) { return minimum(minimum(a, b), c); }
+   template<typename T> inline T maximum(T a, T b) { return (a > b) ? a : b; }
+   template<typename T> inline T maximum(T a, T b, T c) { return maximum(maximum(a, b), c); }
+   template<typename T> inline T clamp(T value, T low, T high) { return (value < low) ? low : ((value > high) ? high : value); }
+   template<typename T> inline T square(T value) { return value * value; }
+   template<typename T> inline void zero_object(T& obj) { memset((void*)&obj, 0, sizeof(obj)); }
+   template<typename T> inline void zero_this(T* pObj) { memset((void*)pObj, 0, sizeof(*pObj)); }
+
+   template<class T, size_t N> T decay_array_to_subtype(T (&a)[N]);   
+
+#define RG_ETC1_ARRAY_SIZE(X) (sizeof(X) / sizeof(decay_array_to_subtype(X)))
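+   // decay_array_to_subtype is only referenced inside sizeof(), so it never needs a definition; it also makes
+   // RG_ETC1_ARRAY_SIZE fail to compile when given a pointer, since a pointer does not bind to T (&)[N].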
+
+   enum eNoClamp { cNoClamp };
+
+   struct color_quad_u8
+   {
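+      // Branchless clamp to [0, 255]: v & 0xFFFFFF00 is non-zero only when v lies outside the byte range,
+      // and (~(v >> 31)) & 0xFF then yields 0 for negative v and 255 for v > 255.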
+      static inline int clamp(int v) { if (v & 0xFFFFFF00U) v = (~(static_cast<int>(v) >> 31)) & 0xFF; return v; }
+
+      struct component_traits { enum { cSigned = false, cFloat = false, cMin = 0U, cMax = 255U }; };
+
+   public:
+      typedef unsigned char component_t;
+      typedef int parameter_t;
+
+      enum { cNumComps = 4 };
+
+      union
+      {
+         struct
+         {
+            component_t r;
+            component_t g;
+            component_t b;
+            component_t a;
+         };
+
+         component_t c[cNumComps];
+
+         uint32 m_u32;
+      };
+
+      inline color_quad_u8()
+      {
+      }
+
+      inline color_quad_u8(const color_quad_u8& other) : m_u32(other.m_u32)
+      {
+      }
+
+      explicit inline color_quad_u8(parameter_t y, parameter_t alpha = component_traits::cMax)
+      {
+         set(y, alpha);
+      }
+
+      inline color_quad_u8(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
+      {
+         set(red, green, blue, alpha);
+      }
+
+      explicit inline color_quad_u8(eNoClamp, parameter_t y, parameter_t alpha = component_traits::cMax)
+      {
+         set_noclamp_y_alpha(y, alpha);
+      }
+
+      inline color_quad_u8(eNoClamp, parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
+      {
+         set_noclamp_rgba(red, green, blue, alpha);
+      }
+
+      inline void clear()
+      {
+         m_u32 = 0;
+      }
+
+      inline color_quad_u8& operator= (const color_quad_u8& other)
+      {
+         m_u32 = other.m_u32;
+         return *this;
+      }
+
+      inline color_quad_u8& set_rgb(const color_quad_u8& other)
+      {
+         r = other.r;
+         g = other.g;
+         b = other.b;
+         return *this;
+      }
+
+      inline color_quad_u8& operator= (parameter_t y)
+      {
+         set(y, component_traits::cMax);
+         return *this;
+      }
+
+      inline color_quad_u8& set(parameter_t y, parameter_t alpha = component_traits::cMax)
+      {
+         y = clamp(y);
+         alpha = clamp(alpha);
+         r = static_cast<component_t>(y);
+         g = static_cast<component_t>(y);
+         b = static_cast<component_t>(y);
+         a = static_cast<component_t>(alpha);
+         return *this;
+      }
+
+      inline color_quad_u8& set_noclamp_y_alpha(parameter_t y, parameter_t alpha = component_traits::cMax)
+      {
+         RG_ETC1_ASSERT( (y >= component_traits::cMin) && (y <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (alpha >= component_traits::cMin) && (alpha <= component_traits::cMax) );
+
+         r = static_cast<component_t>(y);
+         g = static_cast<component_t>(y);
+         b = static_cast<component_t>(y);
+         a = static_cast<component_t>(alpha);
+         return *this;
+      }
+
+      inline color_quad_u8& set(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha = component_traits::cMax)
+      {
+         r = static_cast<component_t>(clamp(red));
+         g = static_cast<component_t>(clamp(green));
+         b = static_cast<component_t>(clamp(blue));
+         a = static_cast<component_t>(clamp(alpha));
+         return *this;
+      }
+
+      inline color_quad_u8& set_noclamp_rgba(parameter_t red, parameter_t green, parameter_t blue, parameter_t alpha)
+      {
+         RG_ETC1_ASSERT( (red >= component_traits::cMin) && (red <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (green >= component_traits::cMin) && (green <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (blue >= component_traits::cMin) && (blue <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (alpha >= component_traits::cMin) && (alpha <= component_traits::cMax) );
+
+         r = static_cast<component_t>(red);
+         g = static_cast<component_t>(green);
+         b = static_cast<component_t>(blue);
+         a = static_cast<component_t>(alpha);
+         return *this;
+      }
+
+      inline color_quad_u8& set_noclamp_rgb(parameter_t red, parameter_t green, parameter_t blue)
+      {
+         RG_ETC1_ASSERT( (red >= component_traits::cMin) && (red <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (green >= component_traits::cMin) && (green <= component_traits::cMax) );
+         RG_ETC1_ASSERT( (blue >= component_traits::cMin) && (blue <= component_traits::cMax) );
+
+         r = static_cast<component_t>(red);
+         g = static_cast<component_t>(green);
+         b = static_cast<component_t>(blue);
+         return *this;
+      }
+
+      static inline parameter_t get_min_comp() { return component_traits::cMin; }
+      static inline parameter_t get_max_comp() { return component_traits::cMax; }
+      static inline bool get_comps_are_signed() { return component_traits::cSigned; }
+
+      inline component_t operator[] (uint i) const { RG_ETC1_ASSERT(i < cNumComps); return c[i]; }
+      inline component_t& operator[] (uint i) { RG_ETC1_ASSERT(i < cNumComps); return c[i]; }
+
+      inline color_quad_u8& set_component(uint i, parameter_t f)
+      {
+         RG_ETC1_ASSERT(i < cNumComps);
+
+         c[i] = static_cast<component_t>(clamp(f));
+
+         return *this;
+      }
+
+      inline color_quad_u8& set_grayscale(parameter_t l)
+      {
+         component_t x = static_cast<component_t>(clamp(l));
+         c[0] = x;
+         c[1] = x;
+         c[2] = x;
+         return *this;
+      }
+
+      inline color_quad_u8& clamp(const color_quad_u8& l, const color_quad_u8& h)
+      {
+         for (uint i = 0; i < cNumComps; i++)
+            c[i] = static_cast<component_t>(rg_etc1::clamp<parameter_t>(c[i], l[i], h[i]));
+         return *this;
+      }
+
+      inline color_quad_u8& clamp(parameter_t l, parameter_t h)
+      {
+         for (uint i = 0; i < cNumComps; i++)
+            c[i] = static_cast<component_t>(rg_etc1::clamp<parameter_t>(c[i], l, h));
+         return *this;
+      }
+
+      // Returns CCIR 601 luma (consistent with color_utils::RGB_To_Y); the fixed-point weights are
+      // round(65536 * {0.299, 0.587, 0.114}) and the +32768 term rounds the 16-bit fraction.
+      inline parameter_t get_luma() const
+      {
+         return static_cast<parameter_t>((19595U * r + 38470U * g + 7471U * b + 32768U) >> 16U);
+      }
+
+      // Returns REC 709 luma.
+      inline parameter_t get_luma_rec709() const
+      {
+         return static_cast<parameter_t>((13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U);
+      }
+
+      inline uint squared_distance_rgb(const color_quad_u8& c) const
+      {
+         return rg_etc1::square(r - c.r) + rg_etc1::square(g - c.g) + rg_etc1::square(b - c.b);
+      }
+
+      inline uint squared_distance_rgba(const color_quad_u8& c) const
+      {
+         return rg_etc1::square(r - c.r) + rg_etc1::square(g - c.g) + rg_etc1::square(b - c.b) + rg_etc1::square(a - c.a);
+      }
+
+      inline bool rgb_equals(const color_quad_u8& rhs) const
+      {
+         return (r == rhs.r) && (g == rhs.g) && (b == rhs.b);
+      }
+
+      inline bool operator== (const color_quad_u8& rhs) const
+      {
+         return m_u32 == rhs.m_u32;
+      }
+
+      color_quad_u8& operator+= (const color_quad_u8& other)
+      {
+         for (uint i = 0; i < 4; i++)
+            c[i] = static_cast<component_t>(clamp(c[i] + other.c[i]));
+         return *this;
+      }
+
+      color_quad_u8& operator-= (const color_quad_u8& other)
+      {
+         for (uint i = 0; i < 4; i++)
+            c[i] = static_cast<component_t>(clamp(c[i] - other.c[i]));
+         return *this;
+      }
+
+      friend color_quad_u8 operator+ (const color_quad_u8& lhs, const color_quad_u8& rhs)
+      {
+         color_quad_u8 result(lhs);
+         result += rhs;
+         return result;
+      }
+
+      friend color_quad_u8 operator- (const color_quad_u8& lhs, const color_quad_u8& rhs)
+      {
+         color_quad_u8 result(lhs);
+         result -= rhs;
+         return result;
+      }
+   }; // class color_quad_u8
+
+   struct vec3F
+   {
+      float m_s[3];
+      
+      inline vec3F() { }
+      inline vec3F(float s) { m_s[0] = s; m_s[1] = s; m_s[2] = s; }
+      inline vec3F(float x, float y, float z) { m_s[0] = x; m_s[1] = y; m_s[2] = z; }
+      
+      inline float operator[] (uint i) const { RG_ETC1_ASSERT(i < 3); return m_s[i]; }
+
+      inline vec3F& operator += (const vec3F& other) { for (uint i = 0; i < 3; i++) m_s[i] += other.m_s[i]; return *this; }
+
+      inline vec3F& operator *= (float s) { for (uint i = 0; i < 3; i++) m_s[i] *= s; return *this; }
+   };
+     
+   enum etc_constants
+   {
+      cETC1BytesPerBlock = 8U,
+
+      cETC1SelectorBits = 2U,
+      cETC1SelectorValues = 1U << cETC1SelectorBits,
+      cETC1SelectorMask = cETC1SelectorValues - 1U,
+
+      cETC1BlockShift = 2U,
+      cETC1BlockSize = 1U << cETC1BlockShift,
+
+      cETC1LSBSelectorIndicesBitOffset = 0,
+      cETC1MSBSelectorIndicesBitOffset = 16,
+
+      cETC1FlipBitOffset = 32,
+      cETC1DiffBitOffset = 33,
+
+      cETC1IntenModifierNumBits = 3,
+      cETC1IntenModifierValues = 1 << cETC1IntenModifierNumBits,
+      cETC1RightIntenModifierTableBitOffset = 34,
+      cETC1LeftIntenModifierTableBitOffset = 37,
+
+      // Base+Delta encoding (5 bit bases, 3 bit delta)
+      cETC1BaseColorCompNumBits = 5,
+      cETC1BaseColorCompMax = 1 << cETC1BaseColorCompNumBits,
+
+      cETC1DeltaColorCompNumBits = 3,
+      cETC1DeltaColorComp = 1 << cETC1DeltaColorCompNumBits,
+      cETC1DeltaColorCompMax = 1 << cETC1DeltaColorCompNumBits,
+
+      cETC1BaseColor5RBitOffset = 59,
+      cETC1BaseColor5GBitOffset = 51,
+      cETC1BaseColor5BBitOffset = 43,
+
+      cETC1DeltaColor3RBitOffset = 56,
+      cETC1DeltaColor3GBitOffset = 48,
+      cETC1DeltaColor3BBitOffset = 40,
+
+      // Absolute (non-delta) encoding (two 4-bit per component bases)
+      cETC1AbsColorCompNumBits = 4,
+      cETC1AbsColorCompMax = 1 << cETC1AbsColorCompNumBits,
+
+      cETC1AbsColor4R1BitOffset = 60,
+      cETC1AbsColor4G1BitOffset = 52,
+      cETC1AbsColor4B1BitOffset = 44,
+
+      cETC1AbsColor4R2BitOffset = 56,
+      cETC1AbsColor4G2BitOffset = 48,
+      cETC1AbsColor4B2BitOffset = 40,
+
+      cETC1ColorDeltaMin = -4,
+      cETC1ColorDeltaMax = 3,
+
+      // Delta3:
+      // 0   1   2   3   4   5   6   7
+      // 000 001 010 011 100 101 110 111
+      // 0   1   2   3   -4  -3  -2  -1
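+      // Example: a delta of (-3, 2, 1) packs as r=101, g=010, b=001, i.e. pack_delta3(-3, 2, 1) == 0x151.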
+   };
+   
+   static uint8 g_quant5_tab[256+16];
+
+
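+   // Intensity modifier tables from the ETC1 specification (table codewords 0-7); each row holds the four
+   // per-selector offsets added to a subblock's base color.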
+   static const int g_etc1_inten_tables[cETC1IntenModifierValues][cETC1SelectorValues] = 
+   { 
+      { -8,  -2,   2,   8 }, { -17,  -5,  5,  17 }, { -29,  -9,   9,  29 }, {  -42, -13, 13,  42 }, 
+      { -60, -18, 18,  60 }, { -80, -24, 24,  80 }, { -106, -33, 33, 106 }, { -183, -47, 47, 183 } 
+   };
+
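+   // A raw 2-bit ETC1 selector stores (MSB = sign, LSB = magnitude), while internal selectors 0-3 index the
+   // intensity rows in ascending order; these two tables convert between the orderings.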
+   static const uint8 g_etc1_to_selector_index[cETC1SelectorValues] = { 2, 3, 1, 0 };
+   static const uint8 g_selector_index_to_etc1[cETC1SelectorValues] = { 3, 2, 0, 1 };
+      
+   // Given an ETC1 diff/inten_table/selector, and an 8-bit desired color, this table encodes the best packed_color in the low byte, and the abs error in the high byte.
+   static uint16 g_etc1_inverse_lookup[2*8*4][256];      // [diff/inten_table/selector][desired_color]
+
+   // g_color8_to_etc_block_config[color][table_index] = Supplies for each 8-bit color value a list of packed ETC1 diff/intensity table/selectors/packed_colors that map to that color.
+   // To pack: diff | (inten << 1) | (selector << 4) | (packed_c << 8)
+   static const uint16 g_color8_to_etc_block_config_0_255[2][33] =
+   {
+      { 0x0000,  0x0010,  0x0002,  0x0012,  0x0004,  0x0014,  0x0006,  0x0016,  0x0008,  0x0018,  0x000A,  0x001A,  0x000C,  0x001C,  0x000E,  0x001E,
+        0x0001,  0x0011,  0x0003,  0x0013,  0x0005,  0x0015,  0x0007,  0x0017,  0x0009,  0x0019,  0x000B,  0x001B,  0x000D,  0x001D,  0x000F,  0x001F, 0xFFFF },
+      { 0x0F20,  0x0F30,  0x0E32,  0x0F22,  0x0E34,  0x0F24,  0x0D36,  0x0F26,  0x0C38,  0x0E28,  0x0B3A,  0x0E2A,  0x093C,  0x0E2C,  0x053E,  0x0D2E,
+        0x1E31,  0x1F21,  0x1D33,  0x1F23,  0x1C35,  0x1E25,  0x1A37,  0x1E27,  0x1839,  0x1D29,  0x163B,  0x1C2B,  0x133D,  0x1B2D,  0x093F,  0x1A2F, 0xFFFF },
+   };
+
+   // Declared [254][12], but really only [254][11] is used; each row ends with a 0xFFFF sentinel.
+   static const uint16 g_color8_to_etc_block_config_1_to_254[254][12] = 
+   {
+      { 0x021C, 0x0D0D, 0xFFFF }, { 0x0020, 0x0021, 0x0A0B, 0x061F, 0xFFFF }, { 0x0113, 0x0217, 0xFFFF }, { 0x0116, 0x031E,
+      0x0B0E, 0x0405, 0xFFFF }, { 0x0022, 0x0204, 0x050A, 0x0023, 0xFFFF }, { 0x0111, 0x0319, 0x0809, 0x170F, 0xFFFF }, {
+      0x0303, 0x0215, 0x0607, 0xFFFF }, { 0x0030, 0x0114, 0x0408, 0x0031, 0x0201, 0x051D, 0xFFFF }, { 0x0100, 0x0024, 0x0306,
+      0x0025, 0x041B, 0x0E0D, 0xFFFF }, { 0x021A, 0x0121, 0x0B0B, 0x071F, 0xFFFF }, { 0x0213, 0x0317, 0xFFFF }, { 0x0112,
+      0x0505, 0xFFFF }, { 0x0026, 0x070C, 0x0123, 0x0027, 0xFFFF }, { 0x0211, 0x0909, 0xFFFF }, { 0x0110, 0x0315, 0x0707,
+      0x0419, 0x180F, 0xFFFF }, { 0x0218, 0x0131, 0x0301, 0x0403, 0x061D, 0xFFFF }, { 0x0032, 0x0202, 0x0033, 0x0125, 0x051B,
+      0x0F0D, 0xFFFF }, { 0x0028, 0x031C, 0x0221, 0x0029, 0xFFFF }, { 0x0120, 0x0313, 0x0C0B, 0x081F, 0xFFFF }, { 0x0605,
+      0x0417, 0xFFFF }, { 0x0216, 0x041E, 0x0C0E, 0x0223, 0x0127, 0xFFFF }, { 0x0122, 0x0304, 0x060A, 0x0311, 0x0A09, 0xFFFF
+      }, { 0x0519, 0x190F, 0xFFFF }, { 0x002A, 0x0231, 0x0503, 0x0415, 0x0807, 0x002B, 0x071D, 0xFFFF }, { 0x0130, 0x0214,
+      0x0508, 0x0401, 0x0133, 0x0225, 0x061B, 0xFFFF }, { 0x0200, 0x0124, 0x0406, 0x0321, 0x0129, 0x100D, 0xFFFF }, { 0x031A,
+      0x0D0B, 0x091F, 0xFFFF }, { 0x0413, 0x0705, 0x0517, 0xFFFF }, { 0x0212, 0x0034, 0x0323, 0x0035, 0x0227, 0xFFFF }, {
+      0x0126, 0x080C, 0x0B09, 0xFFFF }, { 0x0411, 0x0619, 0x1A0F, 0xFFFF }, { 0x0210, 0x0331, 0x0603, 0x0515, 0x0907, 0x012B,
+      0xFFFF }, { 0x0318, 0x002C, 0x0501, 0x0233, 0x0325, 0x071B, 0x002D, 0x081D, 0xFFFF }, { 0x0132, 0x0302, 0x0229, 0x110D,
+      0xFFFF }, { 0x0128, 0x041C, 0x0421, 0x0E0B, 0x0A1F, 0xFFFF }, { 0x0220, 0x0513, 0x0617, 0xFFFF }, { 0x0135, 0x0805,
+      0x0327, 0xFFFF }, { 0x0316, 0x051E, 0x0D0E, 0x0423, 0xFFFF }, { 0x0222, 0x0404, 0x070A, 0x0511, 0x0719, 0x0C09, 0x1B0F,
+      0xFFFF }, { 0x0703, 0x0615, 0x0A07, 0x022B, 0xFFFF }, { 0x012A, 0x0431, 0x0601, 0x0333, 0x012D, 0x091D, 0xFFFF }, {
+      0x0230, 0x0314, 0x0036, 0x0608, 0x0425, 0x0037, 0x0329, 0x081B, 0x120D, 0xFFFF }, { 0x0300, 0x0224, 0x0506, 0x0521,
+      0x0F0B, 0x0B1F, 0xFFFF }, { 0x041A, 0x0613, 0x0717, 0xFFFF }, { 0x0235, 0x0905, 0xFFFF }, { 0x0312, 0x0134, 0x0523,
+      0x0427, 0xFFFF }, { 0x0226, 0x090C, 0x002E, 0x0611, 0x0D09, 0x002F, 0xFFFF }, { 0x0715, 0x0B07, 0x0819, 0x032B, 0x1C0F,
+      0xFFFF }, { 0x0310, 0x0531, 0x0701, 0x0803, 0x022D, 0x0A1D, 0xFFFF }, { 0x0418, 0x012C, 0x0433, 0x0525, 0x0137, 0x091B,
+      0x130D, 0xFFFF }, { 0x0232, 0x0402, 0x0621, 0x0429, 0xFFFF }, { 0x0228, 0x051C, 0x0713, 0x100B, 0x0C1F, 0xFFFF }, {
+      0x0320, 0x0335, 0x0A05, 0x0817, 0xFFFF }, { 0x0623, 0x0527, 0xFFFF }, { 0x0416, 0x061E, 0x0E0E, 0x0711, 0x0E09, 0x012F,
+      0xFFFF }, { 0x0322, 0x0504, 0x080A, 0x0919, 0x1D0F, 0xFFFF }, { 0x0631, 0x0903, 0x0815, 0x0C07, 0x042B, 0x032D, 0x0B1D,
+      0xFFFF }, { 0x022A, 0x0801, 0x0533, 0x0625, 0x0237, 0x0A1B, 0xFFFF }, { 0x0330, 0x0414, 0x0136, 0x0708, 0x0721, 0x0529,
+      0x140D, 0xFFFF }, { 0x0400, 0x0324, 0x0606, 0x0038, 0x0039, 0x110B, 0x0D1F, 0xFFFF }, { 0x051A, 0x0813, 0x0B05, 0x0917,
+      0xFFFF }, { 0x0723, 0x0435, 0x0627, 0xFFFF }, { 0x0412, 0x0234, 0x0F09, 0x022F, 0xFFFF }, { 0x0326, 0x0A0C, 0x012E,
+      0x0811, 0x0A19, 0x1E0F, 0xFFFF }, { 0x0731, 0x0A03, 0x0915, 0x0D07, 0x052B, 0xFFFF }, { 0x0410, 0x0901, 0x0633, 0x0725,
+      0x0337, 0x0B1B, 0x042D, 0x0C1D, 0xFFFF }, { 0x0518, 0x022C, 0x0629, 0x150D, 0xFFFF }, { 0x0332, 0x0502, 0x0821, 0x0139,
+      0x120B, 0x0E1F, 0xFFFF }, { 0x0328, 0x061C, 0x0913, 0x0A17, 0xFFFF }, { 0x0420, 0x0535, 0x0C05, 0x0727, 0xFFFF }, {
+      0x0823, 0x032F, 0xFFFF }, { 0x0516, 0x071E, 0x0F0E, 0x0911, 0x0B19, 0x1009, 0x1F0F, 0xFFFF }, { 0x0422, 0x0604, 0x090A,
+      0x0B03, 0x0A15, 0x0E07, 0x062B, 0xFFFF }, { 0x0831, 0x0A01, 0x0733, 0x052D, 0x0D1D, 0xFFFF }, { 0x032A, 0x0825, 0x0437,
+      0x0729, 0x0C1B, 0x160D, 0xFFFF }, { 0x0430, 0x0514, 0x0236, 0x0808, 0x0921, 0x0239, 0x130B, 0x0F1F, 0xFFFF }, { 0x0500,
+      0x0424, 0x0706, 0x0138, 0x0A13, 0x0B17, 0xFFFF }, { 0x061A, 0x0635, 0x0D05, 0xFFFF }, { 0x0923, 0x0827, 0xFFFF }, {
+      0x0512, 0x0334, 0x003A, 0x0A11, 0x1109, 0x003B, 0x042F, 0xFFFF }, { 0x0426, 0x0B0C, 0x022E, 0x0B15, 0x0F07, 0x0C19,
+      0x072B, 0xFFFF }, { 0x0931, 0x0B01, 0x0C03, 0x062D, 0x0E1D, 0xFFFF }, { 0x0510, 0x0833, 0x0925, 0x0537, 0x0D1B, 0x170D,
+      0xFFFF }, { 0x0618, 0x032C, 0x0A21, 0x0339, 0x0829, 0xFFFF }, { 0x0432, 0x0602, 0x0B13, 0x140B, 0x101F, 0xFFFF }, {
+      0x0428, 0x071C, 0x0735, 0x0E05, 0x0C17, 0xFFFF }, { 0x0520, 0x0A23, 0x0927, 0xFFFF }, { 0x0B11, 0x1209, 0x013B, 0x052F,
+      0xFFFF }, { 0x0616, 0x081E, 0x0D19, 0xFFFF }, { 0x0522, 0x0704, 0x0A0A, 0x0A31, 0x0D03, 0x0C15, 0x1007, 0x082B, 0x072D,
+      0x0F1D, 0xFFFF }, { 0x0C01, 0x0933, 0x0A25, 0x0637, 0x0E1B, 0xFFFF }, { 0x042A, 0x0B21, 0x0929, 0x180D, 0xFFFF }, {
+      0x0530, 0x0614, 0x0336, 0x0908, 0x0439, 0x150B, 0x111F, 0xFFFF }, { 0x0600, 0x0524, 0x0806, 0x0238, 0x0C13, 0x0F05,
+      0x0D17, 0xFFFF }, { 0x071A, 0x0B23, 0x0835, 0x0A27, 0xFFFF }, { 0x1309, 0x023B, 0x062F, 0xFFFF }, { 0x0612, 0x0434,
+      0x013A, 0x0C11, 0x0E19, 0xFFFF }, { 0x0526, 0x0C0C, 0x032E, 0x0B31, 0x0E03, 0x0D15, 0x1107, 0x092B, 0xFFFF }, { 0x0D01,
+      0x0A33, 0x0B25, 0x0737, 0x0F1B, 0x082D, 0x101D, 0xFFFF }, { 0x0610, 0x0A29, 0x190D, 0xFFFF }, { 0x0718, 0x042C, 0x0C21,
+      0x0539, 0x160B, 0x121F, 0xFFFF }, { 0x0532, 0x0702, 0x0D13, 0x0E17, 0xFFFF }, { 0x0528, 0x081C, 0x0935, 0x1005, 0x0B27,
+      0xFFFF }, { 0x0620, 0x0C23, 0x033B, 0x072F, 0xFFFF }, { 0x0D11, 0x0F19, 0x1409, 0xFFFF }, { 0x0716, 0x003C, 0x091E,
+      0x0F03, 0x0E15, 0x1207, 0x0A2B, 0x003D, 0xFFFF }, { 0x0622, 0x0804, 0x0B0A, 0x0C31, 0x0E01, 0x0B33, 0x092D, 0x111D,
+      0xFFFF }, { 0x0C25, 0x0837, 0x0B29, 0x101B, 0x1A0D, 0xFFFF }, { 0x052A, 0x0D21, 0x0639, 0x170B, 0x131F, 0xFFFF }, {
+      0x0630, 0x0714, 0x0436, 0x0A08, 0x0E13, 0x0F17, 0xFFFF }, { 0x0700, 0x0624, 0x0906, 0x0338, 0x0A35, 0x1105, 0xFFFF }, {
+      0x081A, 0x0D23, 0x0C27, 0xFFFF }, { 0x0E11, 0x1509, 0x043B, 0x082F, 0xFFFF }, { 0x0712, 0x0534, 0x023A, 0x0F15, 0x1307,
+      0x1019, 0x0B2B, 0x013D, 0xFFFF }, { 0x0626, 0x0D0C, 0x042E, 0x0D31, 0x0F01, 0x1003, 0x0A2D, 0x121D, 0xFFFF }, { 0x0C33,
+      0x0D25, 0x0937, 0x111B, 0x1B0D, 0xFFFF }, { 0x0710, 0x0E21, 0x0739, 0x0C29, 0xFFFF }, { 0x0818, 0x052C, 0x0F13, 0x180B,
+      0x141F, 0xFFFF }, { 0x0632, 0x0802, 0x0B35, 0x1205, 0x1017, 0xFFFF }, { 0x0628, 0x091C, 0x0E23, 0x0D27, 0xFFFF }, {
+      0x0720, 0x0F11, 0x1609, 0x053B, 0x092F, 0xFFFF }, { 0x1119, 0x023D, 0xFFFF }, { 0x0816, 0x013C, 0x0A1E, 0x0E31, 0x1103,
+      0x1015, 0x1407, 0x0C2B, 0x0B2D, 0x131D, 0xFFFF }, { 0x0722, 0x0904, 0x0C0A, 0x1001, 0x0D33, 0x0E25, 0x0A37, 0x121B,
+      0xFFFF }, { 0x0F21, 0x0D29, 0x1C0D, 0xFFFF }, { 0x062A, 0x0839, 0x190B, 0x151F, 0xFFFF }, { 0x0730, 0x0814, 0x0536,
+      0x0B08, 0x1013, 0x1305, 0x1117, 0xFFFF }, { 0x0800, 0x0724, 0x0A06, 0x0438, 0x0F23, 0x0C35, 0x0E27, 0xFFFF }, { 0x091A,
+      0x1709, 0x063B, 0x0A2F, 0xFFFF }, { 0x1011, 0x1219, 0x033D, 0xFFFF }, { 0x0812, 0x0634, 0x033A, 0x0F31, 0x1203, 0x1115,
+      0x1507, 0x0D2B, 0xFFFF }, { 0x0726, 0x0E0C, 0x052E, 0x1101, 0x0E33, 0x0F25, 0x0B37, 0x131B, 0x0C2D, 0x141D, 0xFFFF }, {
+      0x0E29, 0x1D0D, 0xFFFF }, { 0x0810, 0x1021, 0x0939, 0x1A0B, 0x161F, 0xFFFF }, { 0x0918, 0x062C, 0x1113, 0x1217, 0xFFFF
+      }, { 0x0732, 0x0902, 0x0D35, 0x1405, 0x0F27, 0xFFFF }, { 0x0728, 0x0A1C, 0x1023, 0x073B, 0x0B2F, 0xFFFF }, { 0x0820,
+      0x1111, 0x1319, 0x1809, 0xFFFF }, { 0x1303, 0x1215, 0x1607, 0x0E2B, 0x043D, 0xFFFF }, { 0x0916, 0x023C, 0x0B1E, 0x1031,
+      0x1201, 0x0F33, 0x0D2D, 0x151D, 0xFFFF }, { 0x0822, 0x0A04, 0x0D0A, 0x1025, 0x0C37, 0x0F29, 0x141B, 0x1E0D, 0xFFFF }, {
+      0x1121, 0x0A39, 0x1B0B, 0x171F, 0xFFFF }, { 0x072A, 0x1213, 0x1317, 0xFFFF }, { 0x0830, 0x0914, 0x0636, 0x0C08, 0x0E35,
+      0x1505, 0xFFFF }, { 0x0900, 0x0824, 0x0B06, 0x0538, 0x1123, 0x1027, 0xFFFF }, { 0x0A1A, 0x1211, 0x1909, 0x083B, 0x0C2F,
+      0xFFFF }, { 0x1315, 0x1707, 0x1419, 0x0F2B, 0x053D, 0xFFFF }, { 0x0912, 0x0734, 0x043A, 0x1131, 0x1301, 0x1403, 0x0E2D,
+      0x161D, 0xFFFF }, { 0x0826, 0x0F0C, 0x062E, 0x1033, 0x1125, 0x0D37, 0x151B, 0x1F0D, 0xFFFF }, { 0x1221, 0x0B39, 0x1029,
+      0xFFFF }, { 0x0910, 0x1313, 0x1C0B, 0x181F, 0xFFFF }, { 0x0A18, 0x072C, 0x0F35, 0x1605, 0x1417, 0xFFFF }, { 0x0832,
+      0x0A02, 0x1223, 0x1127, 0xFFFF }, { 0x0828, 0x0B1C, 0x1311, 0x1A09, 0x093B, 0x0D2F, 0xFFFF }, { 0x0920, 0x1519, 0x063D,
+      0xFFFF }, { 0x1231, 0x1503, 0x1415, 0x1807, 0x102B, 0x0F2D, 0x171D, 0xFFFF }, { 0x0A16, 0x033C, 0x0C1E, 0x1401, 0x1133,
+      0x1225, 0x0E37, 0x161B, 0xFFFF }, { 0x0922, 0x0B04, 0x0E0A, 0x1321, 0x1129, 0xFFFF }, { 0x0C39, 0x1D0B, 0x191F, 0xFFFF
+      }, { 0x082A, 0x1413, 0x1705, 0x1517, 0xFFFF }, { 0x0930, 0x0A14, 0x0736, 0x0D08, 0x1323, 0x1035, 0x1227, 0xFFFF }, {
+      0x0A00, 0x0924, 0x0C06, 0x0638, 0x1B09, 0x0A3B, 0x0E2F, 0xFFFF }, { 0x0B1A, 0x1411, 0x1619, 0x073D, 0xFFFF }, { 0x1331,
+      0x1603, 0x1515, 0x1907, 0x112B, 0xFFFF }, { 0x0A12, 0x0834, 0x053A, 0x1501, 0x1233, 0x1325, 0x0F37, 0x171B, 0x102D,
+      0x181D, 0xFFFF }, { 0x0926, 0x072E, 0x1229, 0xFFFF }, { 0x1421, 0x0D39, 0x1E0B, 0x1A1F, 0xFFFF }, { 0x0A10, 0x1513,
+      0x1617, 0xFFFF }, { 0x0B18, 0x082C, 0x1135, 0x1805, 0x1327, 0xFFFF }, { 0x0932, 0x0B02, 0x1423, 0x0B3B, 0x0F2F, 0xFFFF
+      }, { 0x0928, 0x0C1C, 0x1511, 0x1719, 0x1C09, 0xFFFF }, { 0x0A20, 0x1703, 0x1615, 0x1A07, 0x122B, 0x083D, 0xFFFF }, {
+      0x1431, 0x1601, 0x1333, 0x112D, 0x191D, 0xFFFF }, { 0x0B16, 0x043C, 0x0D1E, 0x1425, 0x1037, 0x1329, 0x181B, 0xFFFF }, {
+      0x0A22, 0x0C04, 0x0F0A, 0x1521, 0x0E39, 0x1F0B, 0x1B1F, 0xFFFF }, { 0x1613, 0x1717, 0xFFFF }, { 0x092A, 0x1235, 0x1905,
+      0xFFFF }, { 0x0A30, 0x0B14, 0x0836, 0x0E08, 0x1523, 0x1427, 0xFFFF }, { 0x0B00, 0x0A24, 0x0D06, 0x0738, 0x1611, 0x1D09,
+      0x0C3B, 0x102F, 0xFFFF }, { 0x0C1A, 0x1715, 0x1B07, 0x1819, 0x132B, 0x093D, 0xFFFF }, { 0x1531, 0x1701, 0x1803, 0x122D,
+      0x1A1D, 0xFFFF }, { 0x0B12, 0x0934, 0x063A, 0x1433, 0x1525, 0x1137, 0x191B, 0xFFFF }, { 0x0A26, 0x003E, 0x082E, 0x1621,
+      0x0F39, 0x1429, 0x003F, 0xFFFF }, { 0x1713, 0x1C1F, 0xFFFF }, { 0x0B10, 0x1335, 0x1A05, 0x1817, 0xFFFF }, { 0x0C18,
+      0x092C, 0x1623, 0x1527, 0xFFFF }, { 0x0A32, 0x0C02, 0x1711, 0x1E09, 0x0D3B, 0x112F, 0xFFFF }, { 0x0A28, 0x0D1C, 0x1919,
+      0x0A3D, 0xFFFF }, { 0x0B20, 0x1631, 0x1903, 0x1815, 0x1C07, 0x142B, 0x132D, 0x1B1D, 0xFFFF }, { 0x1801, 0x1533, 0x1625,
+      0x1237, 0x1A1B, 0xFFFF }, { 0x0C16, 0x053C, 0x0E1E, 0x1721, 0x1529, 0x013F, 0xFFFF }, { 0x0B22, 0x0D04, 0x1039, 0x1D1F,
+      0xFFFF }, { 0x1813, 0x1B05, 0x1917, 0xFFFF }, { 0x0A2A, 0x1723, 0x1435, 0x1627, 0xFFFF }, { 0x0B30, 0x0C14, 0x0936,
+      0x0F08, 0x1F09, 0x0E3B, 0x122F, 0xFFFF }, { 0x0C00, 0x0B24, 0x0E06, 0x0838, 0x1811, 0x1A19, 0x0B3D, 0xFFFF }, { 0x0D1A,
+      0x1731, 0x1A03, 0x1915, 0x1D07, 0x152B, 0xFFFF }, { 0x1901, 0x1633, 0x1725, 0x1337, 0x1B1B, 0x142D, 0x1C1D, 0xFFFF }, {
+      0x0C12, 0x0A34, 0x073A, 0x1629, 0x023F, 0xFFFF }, { 0x0B26, 0x013E, 0x092E, 0x1821, 0x1139, 0x1E1F, 0xFFFF }, { 0x1913,
+      0x1A17, 0xFFFF }, { 0x0C10, 0x1535, 0x1C05, 0x1727, 0xFFFF }, { 0x0D18, 0x0A2C, 0x1823, 0x0F3B, 0x132F, 0xFFFF }, {
+      0x0B32, 0x0D02, 0x1911, 0x1B19, 0xFFFF }, { 0x0B28, 0x0E1C, 0x1B03, 0x1A15, 0x1E07, 0x162B, 0x0C3D, 0xFFFF }, { 0x0C20,
+      0x1831, 0x1A01, 0x1733, 0x152D, 0x1D1D, 0xFFFF }, { 0x1825, 0x1437, 0x1729, 0x1C1B, 0x033F, 0xFFFF }, { 0x0D16, 0x063C,
+      0x0F1E, 0x1921, 0x1239, 0x1F1F, 0xFFFF }, { 0x0C22, 0x0E04, 0x1A13, 0x1B17, 0xFFFF }, { 0x1635, 0x1D05, 0xFFFF }, {
+      0x0B2A, 0x1923, 0x1827, 0xFFFF }, { 0x0C30, 0x0D14, 0x0A36, 0x1A11, 0x103B, 0x142F, 0xFFFF }, { 0x0D00, 0x0C24, 0x0F06,
+      0x0938, 0x1B15, 0x1F07, 0x1C19, 0x172B, 0x0D3D, 0xFFFF }, { 0x0E1A, 0x1931, 0x1B01, 0x1C03, 0x162D, 0x1E1D, 0xFFFF }, {
+      0x1833, 0x1925, 0x1537, 0x1D1B, 0xFFFF }, { 0x0D12, 0x0B34, 0x083A, 0x1A21, 0x1339, 0x1829, 0x043F, 0xFFFF }, { 0x0C26,
+      0x023E, 0x0A2E, 0x1B13, 0xFFFF }, { 0x1735, 0x1E05, 0x1C17, 0xFFFF }, { 0x0D10, 0x1A23, 0x1927, 0xFFFF }, { 0x0E18,
+      0x0B2C, 0x1B11, 0x113B, 0x152F, 0xFFFF }, { 0x0C32, 0x0E02, 0x1D19, 0x0E3D, 0xFFFF }, { 0x0C28, 0x0F1C, 0x1A31, 0x1D03,
+      0x1C15, 0x182B, 0x172D, 0x1F1D, 0xFFFF }, { 0x0D20, 0x1C01, 0x1933, 0x1A25, 0x1637, 0x1E1B, 0xFFFF }, { 0x1B21, 0x1929,
+      0x053F, 0xFFFF }, { 0x0E16, 0x073C, 0x1439, 0xFFFF }, { 0x0D22, 0x0F04, 0x1C13, 0x1F05, 0x1D17, 0xFFFF }, { 0x1B23,
+      0x1835, 0x1A27, 0xFFFF }, { 0x0C2A, 0x123B, 0x162F, 0xFFFF }, { 0x0D30, 0x0E14, 0x0B36, 0x1C11, 0x1E19, 0x0F3D, 0xFFFF
+      }, { 0x0E00, 0x0D24, 0x0A38, 0x1B31, 0x1E03, 0x1D15, 0x192B, 0xFFFF }, { 0x0F1A, 0x1D01, 0x1A33, 0x1B25, 0x1737, 0x1F1B,
+      0x182D, 0xFFFF }, { 0x1A29, 0x063F, 0xFFFF }, { 0x0E12, 0x0C34, 0x093A, 0x1C21, 0x1539, 0xFFFF }, { 0x0D26, 0x033E,
+      0x0B2E, 0x1D13, 0x1E17, 0xFFFF }, { 0x1935, 0x1B27, 0xFFFF }, { 0x0E10, 0x1C23, 0x133B, 0x172F, 0xFFFF }, { 0x0F18,
+      0x0C2C, 0x1D11, 0x1F19, 0xFFFF }, { 0x0D32, 0x0F02, 0x1F03, 0x1E15, 0x1A2B, 0x103D, 0xFFFF }, { 0x0D28, 0x1C31, 0x1E01,
+      0x1B33, 0x192D, 0xFFFF }, { 0x0E20, 0x1C25, 0x1837, 0x1B29, 0x073F, 0xFFFF }, { 0x1D21, 0x1639, 0xFFFF }, { 0x0F16,
+      0x083C, 0x1E13, 0x1F17, 0xFFFF }, { 0x0E22, 0x1A35, 0xFFFF }, { 0x1D23, 0x1C27, 0xFFFF }, { 0x0D2A, 0x1E11, 0x143B,
+      0x182F, 0xFFFF }, { 0x0E30, 0x0F14, 0x0C36, 0x1F15, 0x1B2B, 0x113D, 0xFFFF }, { 0x0F00, 0x0E24, 0x0B38, 0x1D31, 0x1F01,
+      0x1A2D, 0xFFFF }, { 0x1C33, 0x1D25, 0x1937, 0xFFFF }, { 0x1E21, 0x1739, 0x1C29, 0x083F, 0xFFFF }, { 0x0F12, 0x0D34,
+      0x0A3A, 0x1F13, 0xFFFF }, { 0x0E26, 0x043E, 0x0C2E, 0x1B35, 0xFFFF }, { 0x1E23, 0x1D27, 0xFFFF }, { 0x0F10, 0x1F11,
+      0x153B, 0x192F, 0xFFFF }, { 0x0D2C, 0x123D, 0xFFFF },
+   };
+
+   struct etc1_block
+   {
+      // big endian uint64:
+      // bit ofs:  56  48  40  32  24  16   8   0
+      // byte ofs: b0, b1, b2, b3, b4, b5, b6, b7 
+      union 
+      {
+         uint64 m_uint64;
+         uint8 m_bytes[8];
+      };
+
+      uint8 m_low_color[2];
+      uint8 m_high_color[2];
+
+      enum { cNumSelectorBytes = 4 };
+      uint8 m_selectors[cNumSelectorBytes];
+
+      inline void clear()
+      {
+         zero_this(this);
+      }
+
+      inline uint get_byte_bits(uint ofs, uint num) const
+      {
+         RG_ETC1_ASSERT((ofs + num) <= 64U);
+         RG_ETC1_ASSERT(num && (num <= 8U));
+         RG_ETC1_ASSERT((ofs >> 3) == ((ofs + num - 1) >> 3));
+         const uint byte_ofs = 7 - (ofs >> 3);
+         const uint byte_bit_ofs = ofs & 7;
+         return (m_bytes[byte_ofs] >> byte_bit_ofs) & ((1 << num) - 1);
+      }
+
+      inline void set_byte_bits(uint ofs, uint num, uint bits)
+      {
+         RG_ETC1_ASSERT((ofs + num) <= 64U);
+         RG_ETC1_ASSERT(num && (num < 32U));
+         RG_ETC1_ASSERT((ofs >> 3) == ((ofs + num - 1) >> 3));
+         RG_ETC1_ASSERT(bits < (1U << num));
+         const uint byte_ofs = 7 - (ofs >> 3);
+         const uint byte_bit_ofs = ofs & 7;
+         const uint mask = (1 << num) - 1;
+         m_bytes[byte_ofs] &= ~(mask << byte_bit_ofs);
+         m_bytes[byte_ofs] |= (bits << byte_bit_ofs);
+      }
+
+      // false = left/right subblocks
+      // true = upper/lower subblocks
+      inline bool get_flip_bit() const 
+      {
+         return (m_bytes[3] & 1) != 0;
+      }   
+
+      inline void set_flip_bit(bool flip)
+      {
+         m_bytes[3] &= ~1;
+         m_bytes[3] |= static_cast<uint8>(flip);
+      }
+
+      inline bool get_diff_bit() const
+      {
+         return (m_bytes[3] & 2) != 0;
+      }
+
+      inline void set_diff_bit(bool diff)
+      {
+         m_bytes[3] &= ~2;
+         m_bytes[3] |= (static_cast<uint>(diff) << 1);
+      }
+
+      // Returns intensity modifier table (0-7) used by subblock subblock_id.
+      // subblock_id=0 left/top (CW 1), 1=right/bottom (CW 2)
+      inline uint get_inten_table(uint subblock_id) const
+      {
+         RG_ETC1_ASSERT(subblock_id < 2);
+         const uint ofs = subblock_id ? 2 : 5;
+         return (m_bytes[3] >> ofs) & 7;
+      }
+
+      // Sets intensity modifier table (0-7) used by subblock subblock_id (0 or 1)
+      inline void set_inten_table(uint subblock_id, uint t)
+      {
+         RG_ETC1_ASSERT(subblock_id < 2);
+         RG_ETC1_ASSERT(t < 8);
+         const uint ofs = subblock_id ? 2 : 5;
+         m_bytes[3] &= ~(7 << ofs);
+         m_bytes[3] |= (t << ofs);
+      }
+
+      // Returned selector value ranges from 0-3 and is a direct index into g_etc1_inten_tables.
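+      // The 16 selector LSBs occupy bytes 6-7 of the block and the MSBs bytes 4-5; e.g. (x, y) = (2, 1) gives
+      // bit_index 9, i.e. bit 1 of m_bytes[6] (LSB) and bit 1 of m_bytes[4] (MSB).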
+      inline uint get_selector(uint x, uint y) const
+      {
+         RG_ETC1_ASSERT((x | y) < 4);
+
+         const uint bit_index = x * 4 + y;
+         const uint byte_bit_ofs = bit_index & 7;
+         const uint8 *p = &m_bytes[7 - (bit_index >> 3)];
+         const uint lsb = (p[0] >> byte_bit_ofs) & 1;
+         const uint msb = (p[-2] >> byte_bit_ofs) & 1;
+         const uint val = lsb | (msb << 1);
+
+         return g_etc1_to_selector_index[val];
+      }
+
+      // Selector "val" ranges from 0-3 and is a direct index into g_etc1_inten_tables.
+      inline void set_selector(uint x, uint y, uint val)
+      {
+         RG_ETC1_ASSERT((x | y | val) < 4);
+         const uint bit_index = x * 4 + y;
+
+         uint8 *p = &m_bytes[7 - (bit_index >> 3)];
+
+         const uint byte_bit_ofs = bit_index & 7;
+         const uint mask = 1 << byte_bit_ofs;
+
+         const uint etc1_val = g_selector_index_to_etc1[val];
+
+         const uint lsb = etc1_val & 1;
+         const uint msb = etc1_val >> 1;
+
+         p[0] &= ~mask;
+         p[0] |= (lsb << byte_bit_ofs);
+
+         p[-2] &= ~mask;
+         p[-2] |= (msb << byte_bit_ofs);
+      }
+
+      inline void set_base4_color(uint idx, uint16 c)
+      {
+         if (idx)
+         {
+            set_byte_bits(cETC1AbsColor4R2BitOffset, 4, (c >> 8) & 15);
+            set_byte_bits(cETC1AbsColor4G2BitOffset, 4, (c >> 4) & 15);
+            set_byte_bits(cETC1AbsColor4B2BitOffset, 4, c & 15);
+         }
+         else
+         {
+            set_byte_bits(cETC1AbsColor4R1BitOffset, 4, (c >> 8) & 15);
+            set_byte_bits(cETC1AbsColor4G1BitOffset, 4, (c >> 4) & 15);
+            set_byte_bits(cETC1AbsColor4B1BitOffset, 4, c & 15);
+         }
+      }
+
+      inline uint16 get_base4_color(uint idx) const
+      {
+         uint r, g, b;
+         if (idx)
+         {
+            r = get_byte_bits(cETC1AbsColor4R2BitOffset, 4);
+            g = get_byte_bits(cETC1AbsColor4G2BitOffset, 4);
+            b = get_byte_bits(cETC1AbsColor4B2BitOffset, 4);
+         }
+         else
+         {
+            r = get_byte_bits(cETC1AbsColor4R1BitOffset, 4);
+            g = get_byte_bits(cETC1AbsColor4G1BitOffset, 4);
+            b = get_byte_bits(cETC1AbsColor4B1BitOffset, 4);
+         }
+         return static_cast<uint16>(b | (g << 4U) | (r << 8U));
+      }
+
+      inline void set_base5_color(uint16 c)
+      {
+         set_byte_bits(cETC1BaseColor5RBitOffset, 5, (c >> 10) & 31);
+         set_byte_bits(cETC1BaseColor5GBitOffset, 5, (c >> 5) & 31);
+         set_byte_bits(cETC1BaseColor5BBitOffset, 5, c & 31);
+      }
+
+      inline uint16 get_base5_color() const
+      {
+         const uint r = get_byte_bits(cETC1BaseColor5RBitOffset, 5);
+         const uint g = get_byte_bits(cETC1BaseColor5GBitOffset, 5);
+         const uint b = get_byte_bits(cETC1BaseColor5BBitOffset, 5);
+         return static_cast<uint16>(b | (g << 5U) | (r << 10U));
+      }
+
+      void set_delta3_color(uint16 c)
+      {
+         set_byte_bits(cETC1DeltaColor3RBitOffset, 3, (c >> 6) & 7);
+         set_byte_bits(cETC1DeltaColor3GBitOffset, 3, (c >> 3) & 7);
+         set_byte_bits(cETC1DeltaColor3BBitOffset, 3, c & 7);
+      }
+
+      inline uint16 get_delta3_color() const
+      {
+         const uint r = get_byte_bits(cETC1DeltaColor3RBitOffset, 3);
+         const uint g = get_byte_bits(cETC1DeltaColor3GBitOffset, 3);
+         const uint b = get_byte_bits(cETC1DeltaColor3BBitOffset, 3);
+         return static_cast<uint16>(b | (g << 3U) | (r << 6U));
+      }
+
+      // Base color 5
+      static uint16 pack_color5(const color_quad_u8& color, bool scaled, uint bias = 127U);
+      static uint16 pack_color5(uint r, uint g, uint b, bool scaled, uint bias = 127U);
+
+      static color_quad_u8 unpack_color5(uint16 packed_color5, bool scaled, uint alpha = 255U);
+      static void unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color, bool scaled);
+
+      static bool unpack_color5(color_quad_u8& result, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha = 255U);
+      static bool unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha = 255U);
+
+      // Delta color 3
+      // Inputs range from -4 to 3 (cETC1ColorDeltaMin to cETC1ColorDeltaMax)
+      static uint16 pack_delta3(int r, int g, int b);
+
+      // Results range from -4 to 3 (cETC1ColorDeltaMin to cETC1ColorDeltaMax)
+      static void unpack_delta3(int& r, int& g, int& b, uint16 packed_delta3);
+
+      // Abs color 4
+      static uint16 pack_color4(const color_quad_u8& color, bool scaled, uint bias = 127U);
+      static uint16 pack_color4(uint r, uint g, uint b, bool scaled, uint bias = 127U);
+
+      static color_quad_u8 unpack_color4(uint16 packed_color4, bool scaled, uint alpha = 255U);
+      static void unpack_color4(uint& r, uint& g, uint& b, uint16 packed_color4, bool scaled);
+
+      // subblock colors
+      static void get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint table_idx);
+      static bool get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint16 packed_delta3, uint table_idx);
+      static void get_abs_subblock_colors(color_quad_u8* pDst, uint16 packed_color4, uint table_idx);
+
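+      // Expands an unscaled 4- or 5-bit base color to 8 bits by bit replication (4-bit abcd -> abcdabcd,
+      // 5-bit abcde -> abcdeabc).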
+      static inline void unscaled_to_scaled_color(color_quad_u8& dst, const color_quad_u8& src, bool color4)
+      {
+         if (color4)
+         {
+            dst.r = src.r | (src.r << 4);
+            dst.g = src.g | (src.g << 4);
+            dst.b = src.b | (src.b << 4);
+         }
+         else
+         {
+            dst.r = (src.r >> 2) | (src.r << 3);
+            dst.g = (src.g >> 2) | (src.g << 3);
+            dst.b = (src.b >> 2) | (src.b << 3);
+         }
+         dst.a = src.a;
+      }
+   };
+
+   // Returns pointer to sorted array.
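+   // LSB-first byte-wise radix sort of indices into pKeys: the histograms for all passes are gathered in one
+   // initial scan, and indices are processed in pairs to reduce loop overhead. The sorted result lands in
+   // pIndices0 or pIndices1 depending on whether key_size is even or odd, hence the returned pointer.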
+   template<typename T, typename Q>
+   T* indirect_radix_sort(uint num_indices, T* pIndices0, T* pIndices1, const Q* pKeys, uint key_ofs, uint key_size, bool init_indices)
+   {  
+      RG_ETC1_ASSERT((key_ofs >= 0) && (key_ofs < sizeof(T)));
+      RG_ETC1_ASSERT((key_size >= 1) && (key_size <= 4));
+
+      if (init_indices)
+      {
+         T* p = pIndices0;
+         T* q = pIndices0 + (num_indices >> 1) * 2;
+         uint i;
+         for (i = 0; p != q; p += 2, i += 2)
+         {
+            p[0] = static_cast<T>(i);
+            p[1] = static_cast<T>(i + 1); 
+         }
+
+         if (num_indices & 1)
+            *p = static_cast<T>(i);
+      }
+
+      uint hist[256 * 4];
+
+      memset(hist, 0, sizeof(hist[0]) * 256 * key_size);
+
+#define RG_ETC1_GET_KEY(p) (*(const uint*)((const uint8*)(pKeys + *(p)) + key_ofs))
+#define RG_ETC1_GET_KEY_FROM_INDEX(i) (*(const uint*)((const uint8*)(pKeys + (i)) + key_ofs))
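+// Note: both macros read a full 32-bit word at an arbitrary byte offset, which assumes a little-endian target
+// that tolerates unaligned loads.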
+
+      if (key_size == 4)
+      {
+         T* p = pIndices0;
+         T* q = pIndices0 + num_indices;
+         for ( ; p != q; p++)
+         {
+            const uint key = RG_ETC1_GET_KEY(p);
+
+            hist[        key        & 0xFF]++;
+            hist[256 + ((key >>  8) & 0xFF)]++;
+            hist[512 + ((key >> 16) & 0xFF)]++;
+            hist[768 + ((key >> 24) & 0xFF)]++;
+         }
+      }
+      else if (key_size == 3)
+      {
+         T* p = pIndices0;
+         T* q = pIndices0 + num_indices;
+         for ( ; p != q; p++)
+         {
+            const uint key = RG_ETC1_GET_KEY(p);
+
+            hist[        key        & 0xFF]++;
+            hist[256 + ((key >>  8) & 0xFF)]++;
+            hist[512 + ((key >> 16) & 0xFF)]++;
+         }
+      }   
+      else if (key_size == 2)
+      {
+         T* p = pIndices0;
+         T* q = pIndices0 + (num_indices >> 1) * 2;
+
+         for ( ; p != q; p += 2)
+         {
+            const uint key0 = RG_ETC1_GET_KEY(p);
+            const uint key1 = RG_ETC1_GET_KEY(p+1);
+
+            hist[        key0         & 0xFF]++;
+            hist[256 + ((key0 >>  8) & 0xFF)]++;
+
+            hist[        key1        & 0xFF]++;
+            hist[256 + ((key1 >>  8) & 0xFF)]++;
+         }
+
+         if (num_indices & 1)
+         {
+            const uint key = RG_ETC1_GET_KEY(p);
+
+            hist[        key        & 0xFF]++;
+            hist[256 + ((key >>  8) & 0xFF)]++;
+         }
+      }      
+      else
+      {
+         RG_ETC1_ASSERT(key_size == 1);
+         if (key_size != 1)
+            return NULL;
+
+         T* p = pIndices0;
+         T* q = pIndices0 + (num_indices >> 1) * 2;
+
+         for ( ; p != q; p += 2)
+         {
+            const uint key0 = RG_ETC1_GET_KEY(p);
+            const uint key1 = RG_ETC1_GET_KEY(p+1);
+
+            hist[key0 & 0xFF]++;
+            hist[key1 & 0xFF]++;
+         }
+
+         if (num_indices & 1)
+         {
+            const uint key = RG_ETC1_GET_KEY(p);
+
+            hist[key & 0xFF]++;
+         }
+      }      
+
+      T* pCur = pIndices0;
+      T* pNew = pIndices1;
+
+      for (uint pass = 0; pass < key_size; pass++)
+      {
+         const uint* pHist = &hist[pass << 8];
+
+         uint offsets[256];
+
+         uint cur_ofs = 0;
+         for (uint i = 0; i < 256; i += 2)
+         {
+            offsets[i] = cur_ofs;
+            cur_ofs += pHist[i];
+
+            offsets[i+1] = cur_ofs;
+            cur_ofs += pHist[i+1];
+         }
+
+         const uint pass_shift = pass << 3;
+
+         T* p = pCur;
+         T* q = pCur + (num_indices >> 1) * 2;
+
+         for ( ; p != q; p += 2)
+         {
+            uint index0 = p[0];
+            uint index1 = p[1];
+
+            uint c0 = (RG_ETC1_GET_KEY_FROM_INDEX(index0) >> pass_shift) & 0xFF;
+            uint c1 = (RG_ETC1_GET_KEY_FROM_INDEX(index1) >> pass_shift) & 0xFF;
+
+            if (c0 == c1)
+            {
+               uint dst_offset0 = offsets[c0];
+
+               offsets[c0] = dst_offset0 + 2;
+
+               pNew[dst_offset0] = static_cast<T>(index0);
+               pNew[dst_offset0 + 1] = static_cast<T>(index1);
+            }
+            else
+            {
+               uint dst_offset0 = offsets[c0]++;
+               uint dst_offset1 = offsets[c1]++;
+
+               pNew[dst_offset0] = static_cast<T>(index0);
+               pNew[dst_offset1] = static_cast<T>(index1);
+            }
+         }
+
+         if (num_indices & 1)
+         {
+            uint index = *p;
+            uint c = (RG_ETC1_GET_KEY_FROM_INDEX(index) >> pass_shift) & 0xFF;
+
+            uint dst_offset = offsets[c];
+            offsets[c] = dst_offset + 1;
+
+            pNew[dst_offset] = static_cast<T>(index);
+         }
+
+         T* t = pCur;
+         pCur = pNew;
+         pNew = t;
+      }            
+
+      return pCur;
+   }
+
+#undef RG_ETC1_GET_KEY
+#undef RG_ETC1_GET_KEY_FROM_INDEX
+
+   uint16 etc1_block::pack_color5(const color_quad_u8& color, bool scaled, uint bias)
+   {
+      return pack_color5(color.r, color.g, color.b, scaled, bias);
+   }
+   
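+   // With the default bias of 127, the scaled path maps 8-bit components to 5 bits with (approximately)
+   // round-to-nearest quantization.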
+   uint16 etc1_block::pack_color5(uint r, uint g, uint b, bool scaled, uint bias)
+   {
+      if (scaled)
+      {
+         r = (r * 31U + bias) / 255U;
+         g = (g * 31U + bias) / 255U;
+         b = (b * 31U + bias) / 255U;
+      }
+
+      r = rg_etc1::minimum(r, 31U);
+      g = rg_etc1::minimum(g, 31U);
+      b = rg_etc1::minimum(b, 31U);
+
+      return static_cast<uint16>(b | (g << 5U) | (r << 10U));
+   }
+
+   color_quad_u8 etc1_block::unpack_color5(uint16 packed_color5, bool scaled, uint alpha)
+   {
+      uint b = packed_color5 & 31U;
+      uint g = (packed_color5 >> 5U) & 31U;
+      uint r = (packed_color5 >> 10U) & 31U;
+
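+      // 5-to-8-bit expansion by bit replication: abcde -> abcdeabc.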
+      if (scaled)
+      {
+         b = (b << 3U) | (b >> 2U);
+         g = (g << 3U) | (g >> 2U);
+         r = (r << 3U) | (r >> 2U);
+      }
+
+      return color_quad_u8(cNoClamp, r, g, b, rg_etc1::minimum(alpha, 255U));
+   }
+
+   void etc1_block::unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, bool scaled)
+   {
+      color_quad_u8 c(unpack_color5(packed_color5, scaled, 0));
+      r = c.r;
+      g = c.g;
+      b = c.b;
+   }
+
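+   // Applies packed_delta3 to packed_color5; returns false (and clamps) when any resulting component falls
+   // outside [0, 31], which marks a technically invalid differential block.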
+   bool etc1_block::unpack_color5(color_quad_u8& result, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha)
+   {
+      int dc_r, dc_g, dc_b;
+      unpack_delta3(dc_r, dc_g, dc_b, packed_delta3);
+      
+      int b = (packed_color5 & 31U) + dc_b;
+      int g = ((packed_color5 >> 5U) & 31U) + dc_g;
+      int r = ((packed_color5 >> 10U) & 31U) + dc_r;
+
+      bool success = true;
+      if (static_cast<uint>(r | g | b) > 31U)
+      {
+         success = false;
+         r = rg_etc1::clamp<int>(r, 0, 31);
+         g = rg_etc1::clamp<int>(g, 0, 31);
+         b = rg_etc1::clamp<int>(b, 0, 31);
+      }
+
+      if (scaled)
+      {
+         b = (b << 3U) | (b >> 2U);
+         g = (g << 3U) | (g >> 2U);
+         r = (r << 3U) | (r >> 2U);
+      }
+
+      result.set_noclamp_rgba(r, g, b, rg_etc1::minimum(alpha, 255U));
+      return success;
+   }
+
+   bool etc1_block::unpack_color5(uint& r, uint& g, uint& b, uint16 packed_color5, uint16 packed_delta3, bool scaled, uint alpha)
+   {
+      color_quad_u8 result;
+      const bool success = unpack_color5(result, packed_color5, packed_delta3, scaled, alpha);
+      r = result.r;
+      g = result.g;
+      b = result.b;
+      return success;
+   }
+     
+   uint16 etc1_block::pack_delta3(int r, int g, int b)
+   {
+      RG_ETC1_ASSERT((r >= cETC1ColorDeltaMin) && (r <= cETC1ColorDeltaMax));
+      RG_ETC1_ASSERT((g >= cETC1ColorDeltaMin) && (g <= cETC1ColorDeltaMax));
+      RG_ETC1_ASSERT((b >= cETC1ColorDeltaMin) && (b <= cETC1ColorDeltaMax));
+      if (r < 0) r += 8;
+      if (g < 0) g += 8;
+      if (b < 0) b += 8;
+      return static_cast<uint16>(b | (g << 3) | (r << 6));
+   }
+   
+   void etc1_block::unpack_delta3(int& r, int& g, int& b, uint16 packed_delta3)
+   {
+      r = (packed_delta3 >> 6) & 7;
+      g = (packed_delta3 >> 3) & 7;
+      b = packed_delta3 & 7;
+      if (r >= 4) r -= 8;
+      if (g >= 4) g -= 8;
+      if (b >= 4) b -= 8;
+   }
+
+   uint16 etc1_block::pack_color4(const color_quad_u8& color, bool scaled, uint bias)
+   {
+      return pack_color4(color.r, color.g, color.b, scaled, bias);
+   }
+   
+   uint16 etc1_block::pack_color4(uint r, uint g, uint b, bool scaled, uint bias)
+   {
+      if (scaled)
+      {
+         r = (r * 15U + bias) / 255U;
+         g = (g * 15U + bias) / 255U;
+         b = (b * 15U + bias) / 255U;
+      }
+
+      r = rg_etc1::minimum(r, 15U);
+      g = rg_etc1::minimum(g, 15U);
+      b = rg_etc1::minimum(b, 15U);
+
+      return static_cast<uint16>(b | (g << 4U) | (r << 8U));
+   }
+
+   color_quad_u8 etc1_block::unpack_color4(uint16 packed_color4, bool scaled, uint alpha)
+   {
+      uint b = packed_color4 & 15U;
+      uint g = (packed_color4 >> 4U) & 15U;
+      uint r = (packed_color4 >> 8U) & 15U;
+
+      if (scaled)
+      {
+         b = (b << 4U) | b;
+         g = (g << 4U) | g;
+         r = (r << 4U) | r;
+      }
+
+      return color_quad_u8(cNoClamp, r, g, b, rg_etc1::minimum(alpha, 255U));
+   }
+   
+   void etc1_block::unpack_color4(uint& r, uint& g, uint& b, uint16 packed_color4, bool scaled)
+   {
+      color_quad_u8 c(unpack_color4(packed_color4, scaled, 0));
+      r = c.r;
+      g = c.g;
+      b = c.b;
+   }
+
+   void etc1_block::get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint table_idx)
+   {
+      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
+      const int *pInten_modifier_table = &g_etc1_inten_tables[table_idx][0];
+
+      uint r, g, b;
+      unpack_color5(r, g, b, packed_color5, true);
+
+      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
+
+      const int y0 = pInten_modifier_table[0];
+      pDst[0].set(ir + y0, ig + y0, ib + y0);
+
+      const int y1 = pInten_modifier_table[1];
+      pDst[1].set(ir + y1, ig + y1, ib + y1);
+
+      const int y2 = pInten_modifier_table[2];
+      pDst[2].set(ir + y2, ig + y2, ib + y2);
+
+      const int y3 = pInten_modifier_table[3];
+      pDst[3].set(ir + y3, ig + y3, ib + y3);
+   }
+   
+   bool etc1_block::get_diff_subblock_colors(color_quad_u8* pDst, uint16 packed_color5, uint16 packed_delta3, uint table_idx)
+   {
+      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
+      const int *pInten_modifier_table = &g_etc1_inten_tables[table_idx][0];
+
+      uint r, g, b;
+      bool success = unpack_color5(r, g, b, packed_color5, packed_delta3, true);
+
+      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
+
+      const int y0 = pInten_modifier_table[0];
+      pDst[0].set(ir + y0, ig + y0, ib + y0);
+
+      const int y1 = pInten_modifier_table[1];
+      pDst[1].set(ir + y1, ig + y1, ib + y1);
+
+      const int y2 = pInten_modifier_table[2];
+      pDst[2].set(ir + y2, ig + y2, ib + y2);
+
+      const int y3 = pInten_modifier_table[3];
+      pDst[3].set(ir + y3, ig + y3, ib + y3);
+
+      return success;
+   }
+   
+   void etc1_block::get_abs_subblock_colors(color_quad_u8* pDst, uint16 packed_color4, uint table_idx)
+   {
+      RG_ETC1_ASSERT(table_idx < cETC1IntenModifierValues);
+      const int *pInten_modifier_table = &g_etc1_inten_tables[table_idx][0];
+
+      uint r, g, b;
+      unpack_color4(r, g, b, packed_color4, true);
+
+      const int ir = static_cast<int>(r), ig = static_cast<int>(g), ib = static_cast<int>(b);
+
+      const int y0 = pInten_modifier_table[0];
+      pDst[0].set(ir + y0, ig + y0, ib + y0);
+
+      const int y1 = pInten_modifier_table[1];
+      pDst[1].set(ir + y1, ig + y1, ib + y1);
+
+      const int y2 = pInten_modifier_table[2];
+      pDst[2].set(ir + y2, ig + y2, ib + y2);
+
+      const int y3 = pInten_modifier_table[3];
+      pDst[3].set(ir + y3, ig + y3, ib + y3);
+   }
+      
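+   // Decodes one 8-byte ETC1 block into 16 RGBA pixels (pDst_pixels_rgba). Returns false if a differential
+   // block's base + delta color overflowed; clamped pixel values are still written in that case.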
+   bool unpack_etc1_block(const void* pETC1_block, unsigned int* pDst_pixels_rgba, bool preserve_alpha)
+   {
+      color_quad_u8* pDst = reinterpret_cast<color_quad_u8*>(pDst_pixels_rgba);
+      const etc1_block& block = *static_cast<const etc1_block*>(pETC1_block);
+
+      const bool diff_flag = block.get_diff_bit();
+      const bool flip_flag = block.get_flip_bit();
+      const uint table_index0 = block.get_inten_table(0);
+      const uint table_index1 = block.get_inten_table(1);
+
+      color_quad_u8 subblock_colors0[4];
+      color_quad_u8 subblock_colors1[4];
+      bool success = true;
+
+      if (diff_flag)
+      {
+         const uint16 base_color5 = block.get_base5_color();
+         const uint16 delta_color3 = block.get_delta3_color();
+         etc1_block::get_diff_subblock_colors(subblock_colors0, base_color5, table_index0);
+            
+         if (!etc1_block::get_diff_subblock_colors(subblock_colors1, base_color5, delta_color3, table_index1))
+            success = false;
+      }
+      else
+      {
+         const uint16 base_color4_0 = block.get_base4_color(0);
+         etc1_block::get_abs_subblock_colors(subblock_colors0, base_color4_0, table_index0);
+
+         const uint16 base_color4_1 = block.get_base4_color(1);
+         etc1_block::get_abs_subblock_colors(subblock_colors1, base_color4_1, table_index1);
+      }
+
+      if (preserve_alpha)
+      {
+         if (flip_flag)
+         {
+            for (uint y = 0; y < 2; y++)
+            {
+               pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]);
+               pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]);
+               pDst[2].set_rgb(subblock_colors0[block.get_selector(2, y)]);
+               pDst[3].set_rgb(subblock_colors0[block.get_selector(3, y)]);
+               pDst += 4;
+            }
+
+            for (uint y = 2; y < 4; y++)
+            {
+               pDst[0].set_rgb(subblock_colors1[block.get_selector(0, y)]);
+               pDst[1].set_rgb(subblock_colors1[block.get_selector(1, y)]);
+               pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]);
+               pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]);
+               pDst += 4;
+            }
+         }
+         else
+         {
+            for (uint y = 0; y < 4; y++)
+            {
+               pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]);
+               pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]);
+               pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]);
+               pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]);
+               pDst += 4;
+            }
+         }
+      }
+      else 
+      {
+         if (flip_flag)
+         {
+            // 0000
+            // 0000
+            // 1111
+            // 1111
+            for (uint y = 0; y < 2; y++)
+            {
+               pDst[0] = subblock_colors0[block.get_selector(0, y)];
+               pDst[1] = subblock_colors0[block.get_selector(1, y)];
+               pDst[2] = subblock_colors0[block.get_selector(2, y)];
+               pDst[3] = subblock_colors0[block.get_selector(3, y)];
+               pDst += 4;
+            }
+
+            for (uint y = 2; y < 4; y++)
+            {
+               pDst[0] = subblock_colors1[block.get_selector(0, y)];
+               pDst[1] = subblock_colors1[block.get_selector(1, y)];
+               pDst[2] = subblock_colors1[block.get_selector(2, y)];
+               pDst[3] = subblock_colors1[block.get_selector(3, y)];
+               pDst += 4;
+            }
+         }
+         else
+         {
+            // 0011
+            // 0011
+            // 0011
+            // 0011
+            for (uint y = 0; y < 4; y++)
+            {
+               pDst[0] = subblock_colors0[block.get_selector(0, y)];
+               pDst[1] = subblock_colors0[block.get_selector(1, y)];
+               pDst[2] = subblock_colors1[block.get_selector(2, y)];
+               pDst[3] = subblock_colors1[block.get_selector(3, y)];
+               pDst += 4;
+            }
+         }
+      }
+      
+      return success;
+   }
+
+   struct etc1_solution_coordinates
+   {
+      inline etc1_solution_coordinates() :
+         m_unscaled_color(0, 0, 0, 0),
+         m_inten_table(0),
+         m_color4(false)
+      {
+      }
+
+      inline etc1_solution_coordinates(uint r, uint g, uint b, uint inten_table, bool color4) :
+         m_unscaled_color(r, g, b, 255),
+         m_inten_table(inten_table),
+         m_color4(color4)
+      {
+      }
+
+      inline etc1_solution_coordinates(const color_quad_u8& c, uint inten_table, bool color4) :
+         m_unscaled_color(c),
+         m_inten_table(inten_table),
+         m_color4(color4)
+      {
+      }
+
+      inline etc1_solution_coordinates(const etc1_solution_coordinates& other)
+      {
+         *this = other;
+      }
+
+      inline etc1_solution_coordinates& operator= (const etc1_solution_coordinates& rhs)
+      {
+         m_unscaled_color = rhs.m_unscaled_color;
+         m_inten_table = rhs.m_inten_table;
+         m_color4 = rhs.m_color4;
+         return *this;
+      }
+
+      inline void clear()
+      {
+         m_unscaled_color.clear();
+         m_inten_table = 0;
+         m_color4 = false;
+      }
+
+      inline color_quad_u8 get_scaled_color() const
+      {
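+         // Expand the quantized base color to 888: 4-bit components replicate the nibble (c | (c << 4)),
+         // while 5-bit components replicate the top bits ((c << 3) | (c >> 2), written below with the operands swapped).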
+         int br, bg, bb;
+         if (m_color4)
+         {
+            br = m_unscaled_color.r | (m_unscaled_color.r << 4);
+            bg = m_unscaled_color.g | (m_unscaled_color.g << 4);
+            bb = m_unscaled_color.b | (m_unscaled_color.b << 4);
+         }
+         else
+         {
+            br = (m_unscaled_color.r >> 2) | (m_unscaled_color.r << 3);
+            bg = (m_unscaled_color.g >> 2) | (m_unscaled_color.g << 3);
+            bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
+         }
+         return color_quad_u8(br, bg, bb);
+      }
+
+      inline void get_block_colors(color_quad_u8* pBlock_colors)
+      {
+         int br, bg, bb;
+         if (m_color4)
+         {
+            br = m_unscaled_color.r | (m_unscaled_color.r << 4);
+            bg = m_unscaled_color.g | (m_unscaled_color.g << 4);
+            bb = m_unscaled_color.b | (m_unscaled_color.b << 4);
+         }
+         else
+         {
+            br = (m_unscaled_color.r >> 2) | (m_unscaled_color.r << 3);
+            bg = (m_unscaled_color.g >> 2) | (m_unscaled_color.g << 3);
+            bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
+         }
+         const int* pInten_table = g_etc1_inten_tables[m_inten_table];
+         pBlock_colors[0].set(br + pInten_table[0], bg + pInten_table[0], bb + pInten_table[0]);
+         pBlock_colors[1].set(br + pInten_table[1], bg + pInten_table[1], bb + pInten_table[1]);
+         pBlock_colors[2].set(br + pInten_table[2], bg + pInten_table[2], bb + pInten_table[2]);
+         pBlock_colors[3].set(br + pInten_table[3], bg + pInten_table[3], bb + pInten_table[3]);
+      }
+
+      color_quad_u8 m_unscaled_color;
+      uint m_inten_table;
+      bool m_color4;
+   };
+
+   class etc1_optimizer
+   {
+      etc1_optimizer(const etc1_optimizer&);
+      etc1_optimizer& operator= (const etc1_optimizer&);
+
+   public:
+      etc1_optimizer()
+      {
+         clear();
+      }
+
+      void clear()
+      {
+         m_pParams = NULL;
+         m_pResult = NULL;
+         m_pSorted_luma = NULL;
+         m_pSorted_luma_indices = NULL;
+      }
+
+      struct params : etc1_pack_params
+      {
+         params()
+         {
+            clear();
+         }
+
+         params(const etc1_pack_params& base_params) : 
+            etc1_pack_params(base_params)
+         {
+            clear_optimizer_params();
+         }
+
+         void clear()
+         {
+            etc1_pack_params::clear();
+            clear_optimizer_params();
+         }
+
+         void clear_optimizer_params()
+         {
+            m_num_src_pixels = 0;
+            m_pSrc_pixels = 0;
+
+            m_use_color4 = false;
+            static const int s_default_scan_delta[] = { 0 };
+            m_pScan_deltas = s_default_scan_delta;
+            m_scan_delta_size = 1;
+
+            m_base_color5.clear();
+            m_constrain_against_base_color5 = false;
+         }
+
+         uint m_num_src_pixels;
+         const color_quad_u8* m_pSrc_pixels;
+
+         bool m_use_color4;
+         const int* m_pScan_deltas;
+         uint m_scan_delta_size;
+
+         color_quad_u8 m_base_color5;
+         bool m_constrain_against_base_color5;
+      };
+
+      struct results
+      {
+         uint64 m_error;
+         color_quad_u8 m_block_color_unscaled;
+         uint m_block_inten_table;
+         uint m_n;
+         uint8* m_pSelectors;
+         bool m_block_color4;
+
+         inline results& operator= (const results& rhs)
+         {
+            m_block_color_unscaled = rhs.m_block_color_unscaled;
+            m_block_color4 = rhs.m_block_color4;
+            m_block_inten_table = rhs.m_block_inten_table;
+            m_error = rhs.m_error;
+            RG_ETC1_ASSERT(m_n == rhs.m_n);
+            memcpy(m_pSelectors, rhs.m_pSelectors, rhs.m_n);
+            return *this;
+         }
+      };
+
+      void init(const params& params, results& result);
+      bool compute();
+
+   private:      
+      struct potential_solution
+      {
+         potential_solution() : m_coords(), m_error(cUINT64_MAX), m_valid(false)
+         {
+         }
+
+         etc1_solution_coordinates  m_coords;
+         uint8                      m_selectors[8];
+         uint64                     m_error;
+         bool                       m_valid;
+
+         void clear()
+         {
+            m_coords.clear();
+            m_error = cUINT64_MAX;
+            m_valid = false;
+         }
+      };
+
+      const params* m_pParams;
+      results* m_pResult;
+
+      int m_limit;
+
+      vec3F m_avg_color;
+      int m_br, m_bg, m_bb;
+      uint16 m_luma[8];
+      uint32 m_sorted_luma[2][8];
+      const uint32* m_pSorted_luma_indices;
+      uint32* m_pSorted_luma;
+
+      uint8 m_selectors[8];
+      uint8 m_best_selectors[8];
+
+      potential_solution m_best_solution;
+      potential_solution m_trial_solution;
+      uint8 m_temp_selectors[8];
+
+      bool evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
+      bool evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
+   };
+      
+   bool etc1_optimizer::compute()
+   {
+      const uint n = m_pParams->m_num_src_pixels;
+      const int scan_delta_size = m_pParams->m_scan_delta_size;
+      
+      // Scan through a subset of the 3D lattice centered around the avg block color, trying each 3D (555 or 444) lattice point as a potential block color.
+      // Each time a better solution is found, try to refine the current solution's block color based on the current selectors and intensity table index.
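+      // m_pScan_deltas holds the 1-D offsets (e.g. { -1, 0, 1 }) applied independently to each quantized
+      // channel, so the loops below visit up to scan_delta_size^3 lattice points around (m_br, m_bg, m_bb).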
+      for (int zdi = 0; zdi < scan_delta_size; zdi++)
+      {
+         const int zd = m_pParams->m_pScan_deltas[zdi];
+         const int mbb = m_bb + zd;
+         if (mbb < 0) continue; else if (mbb > m_limit) break;
+         
+         for (int ydi = 0; ydi < scan_delta_size; ydi++)
+         {
+            const int yd = m_pParams->m_pScan_deltas[ydi];
+            const int mbg = m_bg + yd;
+            if (mbg < 0) continue; else if (mbg > m_limit) break;
+
+            for (int xdi = 0; xdi < scan_delta_size; xdi++)
+            {
+               const int xd = m_pParams->m_pScan_deltas[xdi];
+               const int mbr = m_br + xd;
+               if (mbr < 0) continue; else if (mbr > m_limit) break;
+      
+               etc1_solution_coordinates coords(mbr, mbg, mbb, 0, m_pParams->m_use_color4);
+               if (m_pParams->m_quality == cHighQuality)
+               {
+                  if (!evaluate_solution(coords, m_trial_solution, &m_best_solution))
+                     continue;
+               }
+               else
+               {
+                  if (!evaluate_solution_fast(coords, m_trial_solution, &m_best_solution))
+                     continue;
+               }
+               
+               // At this point we have the input block, the avg. color of the input pixels, a set of trial selector indices, and the block color+intensity index.
+               // For each component, attempt to refine the current solution by solving a simple linear equation. For example, with 4 pixels:
+               // The goal is:
+               // pixel0 - (block_color+inten_table[selector0]) + pixel1 - (block_color+inten_table[selector1]) + pixel2 - (block_color+inten_table[selector2]) + pixel3 - (block_color+inten_table[selector3]) = 0
+               // Rearranging this:
+               // (pixel0 + pixel1 + pixel2 + pixel3) - (block_color+inten_table[selector0]) - (block_color+inten_table[selector1]) - (block_color+inten_table[selector2]) - (block_color+inten_table[selector3]) = 0
+               // (pixel0 + pixel1 + pixel2 + pixel3) - block_color - inten_table[selector0] - block_color - inten_table[selector1] - block_color - inten_table[selector2] - block_color - inten_table[selector3] = 0
+               // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - inten_table[selector0] - inten_table[selector1] - inten_table[selector2] - inten_table[selector3] = 0
+               // (pixel0 + pixel1 + pixel2 + pixel3) - 4*block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3]) = 0
+               // (pixel0 + pixel1 + pixel2 + pixel3)/4 - block_color - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4 = 0
+               // block_color = (pixel0 + pixel1 + pixel2 + pixel3)/4 - (inten_table[selector0] + inten_table[selector1] + inten_table[selector2] + inten_table[selector3])/4
+               // So what this means:
+               // optimal_block_color = avg_input - avg_inten_delta
+               // So the optimal block color can be computed by taking the average block color and subtracting the current average of the intensity delta.
+               // Unfortunately, optimal_block_color must then be quantized to 555 or 444 so it's not always possible to improve matters using this formula.
+               // Also, the above formula is for unclamped intensity deltas. The actual implementation takes into account clamping.
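+               // Illustrative example (hypothetical values): if a channel of the subblock averages 100 and the
+               // current selectors' clamped intensity deltas sum to +40 over the 8 pixels, then avg_inten_delta = 5,
+               // so the refined target is 100 - 5 = 95, which is then re-quantized to the 555/444 lattice and re-evaluated.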
+
+               const uint max_refinement_trials = (m_pParams->m_quality == cLowQuality) ? 2 : (((xd | yd | zd) == 0) ? 4 : 2);
+               for (uint refinement_trial = 0; refinement_trial < max_refinement_trials; refinement_trial++)
+               {
+                  const uint8* pSelectors = m_best_solution.m_selectors;
+                  const int* pInten_table = g_etc1_inten_tables[m_best_solution.m_coords.m_inten_table];
+
+                  int delta_sum_r = 0, delta_sum_g = 0, delta_sum_b = 0;
+                  const color_quad_u8 base_color(m_best_solution.m_coords.get_scaled_color());
+                  for (uint r = 0; r < n; r++)
+                  {
+                     const uint s = *pSelectors++;
+                     const int yd = pInten_table[s];
+                     // Compute actual delta being applied to each pixel, taking into account clamping.
+                     delta_sum_r += rg_etc1::clamp<int>(base_color.r + yd, 0, 255) - base_color.r;
+                     delta_sum_g += rg_etc1::clamp<int>(base_color.g + yd, 0, 255) - base_color.g;
+                     delta_sum_b += rg_etc1::clamp<int>(base_color.b + yd, 0, 255) - base_color.b;
+                  }
+                  if ((!delta_sum_r) && (!delta_sum_g) && (!delta_sum_b))
+                     break;
+                  const float avg_delta_r_f = static_cast<float>(delta_sum_r) / n;
+                  const float avg_delta_g_f = static_cast<float>(delta_sum_g) / n;
+                  const float avg_delta_b_f = static_cast<float>(delta_sum_b) / n;
+                  const int br1 = rg_etc1::clamp<int>(static_cast<uint>((m_avg_color[0] - avg_delta_r_f) * m_limit / 255.0f + .5f), 0, m_limit);
+                  const int bg1 = rg_etc1::clamp<int>(static_cast<uint>((m_avg_color[1] - avg_delta_g_f) * m_limit / 255.0f + .5f), 0, m_limit);
+                  const int bb1 = rg_etc1::clamp<int>(static_cast<uint>((m_avg_color[2] - avg_delta_b_f) * m_limit / 255.0f + .5f), 0, m_limit);
+                  
+                  bool skip = false;
+                  
+                  if ((mbr == br1) && (mbg == bg1) && (mbb == bb1))
+                     skip = true;
+                  else if ((br1 == m_best_solution.m_coords.m_unscaled_color.r) && (bg1 == m_best_solution.m_coords.m_unscaled_color.g) && (bb1 == m_best_solution.m_coords.m_unscaled_color.b))
+                     skip = true;
+                  else if ((m_br == br1) && (m_bg == bg1) && (m_bb == bb1))
+                     skip = true;
+
+                  if (skip)
+                     break;
+
+                  etc1_solution_coordinates coords1(br1, bg1, bb1, 0, m_pParams->m_use_color4);
+                  if (m_pParams->m_quality == cHighQuality)
+                  {
+                     if (!evaluate_solution(coords1, m_trial_solution, &m_best_solution)) 
+                        break;
+                  }
+                  else
+                  {
+                     if (!evaluate_solution_fast(coords1, m_trial_solution, &m_best_solution))
+                        break;
+                  }
+
+               }  // refinement_trial
+
+            } // xdi
+         } // ydi
+      } // zdi
+
+      if (!m_best_solution.m_valid)
+      {
+         m_pResult->m_error = cUINT32_MAX;
+         return false;
+      }
+      
+      const uint8* pSelectors = m_best_solution.m_selectors;
+
+#ifdef RG_ETC1_BUILD_DEBUG
+      {
+         color_quad_u8 block_colors[4];
+         m_best_solution.m_coords.get_block_colors(block_colors);
+
+         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
+         uint64 actual_error = 0;
+         for (uint i = 0; i < n; i++)
+            actual_error += pSrc_pixels[i].squared_distance_rgb(block_colors[pSelectors[i]]);
+         
+         RG_ETC1_ASSERT(actual_error == m_best_solution.m_error);
+      }
+#endif      
+      
+      m_pResult->m_error = m_best_solution.m_error;
+
+      m_pResult->m_block_color_unscaled = m_best_solution.m_coords.m_unscaled_color;
+      m_pResult->m_block_color4 = m_best_solution.m_coords.m_color4;
+      
+      m_pResult->m_block_inten_table = m_best_solution.m_coords.m_inten_table;
+      memcpy(m_pResult->m_pSelectors, pSelectors, n);
+      m_pResult->m_n = n;
+
+      return true;
+   }
+
+   void etc1_optimizer::init(const params& p, results& r)
+   {
+      // This version is hardcoded for 8 pixel subblocks.
+      RG_ETC1_ASSERT(p.m_num_src_pixels == 8);
+      
+      m_pParams = &p;
+      m_pResult = &r;
+                  
+      const uint n = 8;
+      
+      m_limit = m_pParams->m_use_color4 ? 15 : 31;
+
+      vec3F avg_color(0.0f);
+
+      for (uint i = 0; i < n; i++)
+      {
+         const color_quad_u8& c = m_pParams->m_pSrc_pixels[i];
+         const vec3F fc(c.r, c.g, c.b);
+
+         avg_color += fc;
+
+         m_luma[i] = static_cast<uint16>(c.r + c.g + c.b);
+         m_sorted_luma[0][i] = i;
+      }
+      avg_color *= (1.0f / static_cast<float>(n));
+      m_avg_color = avg_color;
+
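+      // Quantize the average color to the 555/444 lattice with round-to-nearest: c * limit / 255 + 0.5.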
+      m_br = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[0] * m_limit / 255.0f + .5f), 0, m_limit);
+      m_bg = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[1] * m_limit / 255.0f + .5f), 0, m_limit);
+      m_bb = rg_etc1::clamp<int>(static_cast<uint>(m_avg_color[2] * m_limit / 255.0f + .5f), 0, m_limit);
+
+      if (m_pParams->m_quality <= cMediumQuality)
+      {
+         m_pSorted_luma_indices = indirect_radix_sort(n, m_sorted_luma[0], m_sorted_luma[1], m_luma, 0, sizeof(m_luma[0]), false);
+         m_pSorted_luma = m_sorted_luma[0];
+         if (m_pSorted_luma_indices == m_sorted_luma[0])
+            m_pSorted_luma = m_sorted_luma[1];
+      
+         for (uint i = 0; i < n; i++)
+            m_pSorted_luma[i] = m_luma[m_pSorted_luma_indices[i]];
+      }
+      
+      m_best_solution.m_coords.clear();
+      m_best_solution.m_valid = false;
+      m_best_solution.m_error = cUINT64_MAX;
+   }
+
+   bool etc1_optimizer::evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
+   {
+      trial_solution.m_valid = false;
+
+      if (m_pParams->m_constrain_against_base_color5)
+      {
+         const int dr = coords.m_unscaled_color.r - m_pParams->m_base_color5.r;
+         const int dg = coords.m_unscaled_color.g - m_pParams->m_base_color5.g;
+         const int db = coords.m_unscaled_color.b - m_pParams->m_base_color5.b;
+
+         if ((rg_etc1::minimum(dr, dg, db) < cETC1ColorDeltaMin) || (rg_etc1::maximum(dr, dg, db) > cETC1ColorDeltaMax))
+            return false;
+      }
+
+      const color_quad_u8 base_color(coords.get_scaled_color());
+      
+      const uint n = 8;
+            
+      trial_solution.m_error = cUINT64_MAX;
+            
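+      // Exhaustively try all 8 intensity tables; for each, assign every pixel to the nearest of the 4
+      // candidate block colors (squared RGB distance) and keep the table/selectors with the lowest total error.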
+      for (uint inten_table = 0; inten_table < cETC1IntenModifierValues; inten_table++)
+      {
+         const int* pInten_table = g_etc1_inten_tables[inten_table];
+
+         color_quad_u8 block_colors[4];
+         for (uint s = 0; s < 4; s++)
+         {
+            const int yd = pInten_table[s];
+            block_colors[s].set(base_color.r + yd, base_color.g + yd, base_color.b + yd, 0);
+         }
+         
+         uint64 total_error = 0;
+         
+         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
+         for (uint c = 0; c < n; c++)
+         {
+            const color_quad_u8& src_pixel = *pSrc_pixels++;
+            
+            uint best_selector_index = 0;
+            uint best_error = rg_etc1::square(src_pixel.r - block_colors[0].r) + rg_etc1::square(src_pixel.g - block_colors[0].g) + rg_etc1::square(src_pixel.b - block_colors[0].b);
+
+            uint trial_error = rg_etc1::square(src_pixel.r - block_colors[1].r) + rg_etc1::square(src_pixel.g - block_colors[1].g) + rg_etc1::square(src_pixel.b - block_colors[1].b);
+            if (trial_error < best_error)
+            {
+               best_error = trial_error;
+               best_selector_index = 1;
+            }
+
+            trial_error = rg_etc1::square(src_pixel.r - block_colors[2].r) + rg_etc1::square(src_pixel.g - block_colors[2].g) + rg_etc1::square(src_pixel.b - block_colors[2].b);
+            if (trial_error < best_error)
+            {
+               best_error = trial_error;
+               best_selector_index = 2;
+            }
+
+            trial_error = rg_etc1::square(src_pixel.r - block_colors[3].r) + rg_etc1::square(src_pixel.g - block_colors[3].g) + rg_etc1::square(src_pixel.b - block_colors[3].b);
+            if (trial_error < best_error)
+            {
+               best_error = trial_error;
+               best_selector_index = 3;
+            }
+
+            m_temp_selectors[c] = static_cast<uint8>(best_selector_index);
+
+            total_error += best_error;
+            if (total_error >= trial_solution.m_error)
+               break;
+         }
+         
+         if (total_error < trial_solution.m_error)
+         {
+            trial_solution.m_error = total_error;
+            trial_solution.m_coords.m_inten_table = inten_table;
+            memcpy(trial_solution.m_selectors, m_temp_selectors, 8);
+            trial_solution.m_valid = true;
+         }
+      }
+      trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
+      trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
+
+      bool success = false;
+      if (pBest_solution)
+      {
+         if (trial_solution.m_error < pBest_solution->m_error)
+         {
+            *pBest_solution = trial_solution;
+            success = true;
+         }
+      }
+
+      return success;
+   }
+
+   bool etc1_optimizer::evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
+   {
+      if (m_pParams->m_constrain_against_base_color5)
+      {
+         const int dr = coords.m_unscaled_color.r - m_pParams->m_base_color5.r;
+         const int dg = coords.m_unscaled_color.g - m_pParams->m_base_color5.g;
+         const int db = coords.m_unscaled_color.b - m_pParams->m_base_color5.b;
+
+         if ((rg_etc1::minimum(dr, dg, db) < cETC1ColorDeltaMin) || (rg_etc1::maximum(dr, dg, db) > cETC1ColorDeltaMax))
+         {
+            trial_solution.m_valid = false;
+            return false;
+         }
+      }
+
+      const color_quad_u8 base_color(coords.get_scaled_color());
+
+      const uint n = 8;
+      
+      trial_solution.m_error = cUINT64_MAX;
+
+      for (int inten_table = cETC1IntenModifierValues - 1; inten_table >= 0; --inten_table)
+      {
+         const int* pInten_table = g_etc1_inten_tables[inten_table];
+
+         uint block_inten[4];
+         color_quad_u8 block_colors[4];
+         for (uint s = 0; s < 4; s++)
+         {
+            const int yd = pInten_table[s];
+            color_quad_u8 block_color(base_color.r + yd, base_color.g + yd, base_color.b + yd, 0);
+            block_colors[s] = block_color;
+            block_inten[s] = block_color.r + block_color.g + block_color.b;
+         }
+
+         // evaluate_solution_fast() enforces/assumes a total ordering of the input colors along the intensity (1,1,1) axis to more quickly classify the inputs to selectors.
+         // The input colors have been presorted along the projection onto this axis, and ETC1 block colors are always ordered along the intensity axis, so this classification is fast.
+         // 0   1   2   3
+         //   01  12  23
+         const uint block_inten_midpoints[3] = { block_inten[0] + block_inten[1], block_inten[1] + block_inten[2], block_inten[2] + block_inten[3] };
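+         // Comparing (luma * 2) against these summed midpoints avoids a divide: y * 2 >= inten[s] + inten[s+1]
+         // is equivalent to y >= (inten[s] + inten[s+1]) / 2, i.e. y lies above the midpoint between colors s and s+1.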
+
+         uint64 total_error = 0;
+         const color_quad_u8* pSrc_pixels = m_pParams->m_pSrc_pixels;
+         if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0])
+         {
+            if (block_inten[0] > m_pSorted_luma[n - 1])
+            {
+               const uint min_error = intabs(block_inten[0] - m_pSorted_luma[n - 1]);
+               if (min_error >= trial_solution.m_error)
+                  continue;
+            }
+
+            memset(&m_temp_selectors[0], 0, n);
+
+            for (uint c = 0; c < n; c++)
+               total_error += block_colors[0].squared_distance_rgb(pSrc_pixels[c]);
+         }
+         else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2])
+         {
+            if (m_pSorted_luma[0] > block_inten[3])
+            {
+               const uint min_error = intabs(m_pSorted_luma[0] - block_inten[3]);
+               if (min_error >= trial_solution.m_error)
+                  continue;
+            }
+
+            memset(&m_temp_selectors[0], 3, n);
+
+            for (uint c = 0; c < n; c++)
+               total_error += block_colors[3].squared_distance_rgb(pSrc_pixels[c]);
+         }
+         else
+         {
+            uint cur_selector = 0, c;
+            for (c = 0; c < n; c++)
+            {
+               const uint y = m_pSorted_luma[c];
+               while ((y * 2) >= block_inten_midpoints[cur_selector])
+                  if (++cur_selector > 2)
+                     goto done;
+               const uint sorted_pixel_index = m_pSorted_luma_indices[c];
+               m_temp_selectors[sorted_pixel_index] = static_cast<uint8>(cur_selector);
+               total_error += block_colors[cur_selector].squared_distance_rgb(pSrc_pixels[sorted_pixel_index]);
+            }
+done:
+            while (c < n)
+            {
+               const uint sorted_pixel_index = m_pSorted_luma_indices[c];
+               m_temp_selectors[sorted_pixel_index] = 3;
+               total_error += block_colors[3].squared_distance_rgb(pSrc_pixels[sorted_pixel_index]);
+               ++c;
+            }
+         }
+
+         if (total_error < trial_solution.m_error)
+         {
+            trial_solution.m_error = total_error;
+            trial_solution.m_coords.m_inten_table = inten_table;
+            memcpy(trial_solution.m_selectors, m_temp_selectors, n);
+            trial_solution.m_valid = true;
+            if (!total_error)
+               break;
+         }
+      }
+      trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
+      trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
+      
+      bool success = false;
+      if (pBest_solution)
+      {
+         if (trial_solution.m_error < pBest_solution->m_error)
+         {
+            *pBest_solution = trial_solution;
+            success = true;
+         }
+      }
+
+      return success;
+   }
+         
+   static uint etc1_decode_value(uint diff, uint inten, uint selector, uint packed_c)
+   {
+      const uint limit = diff ? 32 : 16; limit; // the bare "limit;" statement only silences an unused-variable warning when RG_ETC1_ASSERT compiles to nothing
+      RG_ETC1_ASSERT((diff < 2) && (inten < 8) && (selector < 4) && (packed_c < limit));
+      int c;
+      if (diff)
+         c = (packed_c >> 2) | (packed_c << 3);
+      else 
+         c = packed_c | (packed_c << 4);
+      c += g_etc1_inten_tables[inten][selector];
+      c = rg_etc1::clamp<int>(c, 0, 255);
+      return c;
+   }
+
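+   // Computes round(a * b / 255) without a division: t = a*b + 128, then (t + (t >> 8)) >> 8.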
+   static inline int mul_8bit(int a, int b) { int t = a*b + 128; return (t + (t >> 8)) >> 8; }
+
+   void pack_etc1_block_init()
+   {
+      for (uint diff = 0; diff < 2; diff++)
+      {
+         const uint limit = diff ? 32 : 16;
+
+         for (uint inten = 0; inten < 8; inten++)
+         {
+            for (uint selector = 0; selector < 4; selector++)
+            {
+               const uint inverse_table_index = diff + (inten << 1) + (selector << 4);
+               for (uint color = 0; color < 256; color++)
+               {
+                  uint best_error = cUINT32_MAX, best_packed_c = 0;
+                  for (uint packed_c = 0; packed_c < limit; packed_c++)
+                  {
+                     int v = etc1_decode_value(diff, inten, selector, packed_c);
+                     uint err = labs(v - static_cast<int>(color));
+                     //printf("err: %d - %u = %u\n",v,color,err);
+                     if (err < best_error)
+                     {
+                        best_error = err;
+                        best_packed_c = packed_c;
+                        if (!best_error) 
+                           break;
+                     }
+                  }
+                  RG_ETC1_ASSERT(best_error <= 255);
+                  g_etc1_inverse_lookup[inverse_table_index][color] = static_cast<uint16>(best_packed_c | (best_error << 8));
+               }
+            }
+         }
+      }
+      
+      uint expand5[32];
+      for(int i = 0; i < 32; i++)
+         expand5[i] = (i << 3) | (i >> 2);
+
+      for(int i = 0; i < 256 + 16; i++)
+      {
+         int v = clamp<int>(i - 8, 0, 255);
+         g_quant5_tab[i] = static_cast<uint8>(expand5[mul_8bit(v,31)]);
+      }
+   }
+
+   // Packs solid color blocks efficiently using a set of small precomputed tables.
+   // For random 888 inputs, MSE results are better than Ericsson's ETC1 packer in "slow" mode ~9.5% of the time, slightly worse only ~.01% of the time, and equal the rest of the time.
+   static uint64 pack_etc1_block_solid_color(etc1_block& block, const uint8* pColor, etc1_pack_params& pack_params)
+   {
+      pack_params; // referenced only to silence an unused-parameter warning
+      RG_ETC1_ASSERT(g_etc1_inverse_lookup[0][255]);
+            
+      static uint s_next_comp[4] = { 1, 2, 0, 1 };
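+      // s_next_comp cycles R->G->B->R: for trial component i, the other two channels are
+      // s_next_comp[i] and s_next_comp[i + 1].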
+            
+      uint best_error = cUINT32_MAX, best_i = 0;
+      int best_x = 0, best_packed_c1 = 0, best_packed_c2 = 0;
+
+      // For each possible 8-bit value, there is a precomputed list of diff/inten/selector configurations that allow that 8-bit value to be encoded with no error.
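+      // Each table entry packs a config as: bit 0 = diff, bits 1-3 = inten, bits 4-5 = selector,
+      // bits 8-15 = the packed base color p0; each per-value list is terminated by 0xFFFF.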
+      for (uint i = 0; i < 3; i++)
+      {
+         const uint c1 = pColor[s_next_comp[i]], c2 = pColor[s_next_comp[i + 1]];
+
+         const int delta_range = 1;
+         for (int delta = -delta_range; delta <= delta_range; delta++)
+         {
+            const int c_plus_delta = rg_etc1::clamp<int>(pColor[i] + delta, 0, 255);
+
+            const uint16* pTable;
+            if (!c_plus_delta)
+               pTable = g_color8_to_etc_block_config_0_255[0];
+            else if (c_plus_delta == 255)
+               pTable = g_color8_to_etc_block_config_0_255[1];
+            else
+               pTable = g_color8_to_etc_block_config_1_to_254[c_plus_delta - 1];
+
+            do
+            {
+               const uint x = *pTable++;
+
+#ifdef RG_ETC1_BUILD_DEBUG
+               const uint diff = x & 1;
+               const uint inten = (x >> 1) & 7;
+               const uint selector = (x >> 4) & 3;
+               const uint p0 = (x >> 8) & 255;
+               RG_ETC1_ASSERT(etc1_decode_value(diff, inten, selector, p0) == (uint)c_plus_delta);
+#endif
+
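+               // Inverse-lookup entries pack (error << 8) | packed_value, so >> 8 yields the per-channel error.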
+               const uint16* pInverse_table = g_etc1_inverse_lookup[x & 0xFF];
+               uint16 p1 = pInverse_table[c1];
+               uint16 p2 = pInverse_table[c2];
+               const uint trial_error = rg_etc1::square(c_plus_delta - pColor[i]) + rg_etc1::square(p1 >> 8) + rg_etc1::square(p2 >> 8);
+               if (trial_error < best_error)
+               {
+                  best_error = trial_error;
+                  best_x = x;
+                  best_packed_c1 = p1 & 0xFF;
+                  best_packed_c2 = p2 & 0xFF;
+                  best_i = i;
+                  if (!best_error)
+                     goto found_perfect_match;
+               }
+            } while (*pTable != 0xFFFF);
+         }
+      }
+found_perfect_match:
+
+      const uint diff = best_x & 1;
+      const uint inten = (best_x >> 1) & 7;
+
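+      // A solid block reuses one color/intensity for both subblocks: byte 3 stores the intensity index
+      // twice ((inten | (inten << 3)) << 2), sets the diff bit, and leaves the flip bit clear.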
+      block.m_bytes[3] = static_cast<uint8>(((inten | (inten << 3)) << 2) | (diff << 1));
+                        
+      const uint etc1_selector = g_selector_index_to_etc1[(best_x >> 4) & 3];
+      *reinterpret_cast<uint16*>(&block.m_bytes[4]) = (etc1_selector & 2) ? 0xFFFF : 0;
+      *reinterpret_cast<uint16*>(&block.m_bytes[6]) = (etc1_selector & 1) ? 0xFFFF : 0;
+
+      const uint best_packed_c0 = (best_x >> 8) & 255;
+      if (diff)
+      {
+         block.m_bytes[best_i] = static_cast<uint8>(best_packed_c0 << 3);
+         block.m_bytes[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1 << 3);
+         block.m_bytes[s_next_comp[best_i+1]] = static_cast<uint8>(best_packed_c2 << 3);
+      }
+      else
+      {
+         block.m_bytes[best_i] = static_cast<uint8>(best_packed_c0 | (best_packed_c0 << 4));
+         block.m_bytes[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1 | (best_packed_c1 << 4));
+         block.m_bytes[s_next_comp[best_i+1]] = static_cast<uint8>(best_packed_c2 | (best_packed_c2 << 4));
+      }
+
+      return best_error;
+   }
+      
+   static uint pack_etc1_block_solid_color_constrained(
+      etc1_optimizer::results& results, 
+      uint num_colors, const uint8* pColor, 
+      etc1_pack_params& pack_params, 
+      bool use_diff,
+      const color_quad_u8* pBase_color5_unscaled)
+   {
+      RG_ETC1_ASSERT(g_etc1_inverse_lookup[0][255]);
+
+      pack_params; // referenced only to silence an unused-parameter warning
+      static uint s_next_comp[4] = { 1, 2, 0, 1 };
+
+      uint best_error = cUINT32_MAX, best_i = 0;
+      int best_x = 0, best_packed_c1 = 0, best_packed_c2 = 0;
+
+      // For each possible 8-bit value, there is a precomputed list of diff/inten/selector configurations that allow that 8-bit value to be encoded with no error.
+      for (uint i = 0; i < 3; i++)
+      {
+         const uint c1 = pColor[s_next_comp[i]], c2 = pColor[s_next_comp[i + 1]];
+
+         const int delta_range = 1;
+         for (int delta = -delta_range; delta <= delta_range; delta++)
+         {
+            const int c_plus_delta = rg_etc1::clamp<int>(pColor[i] + delta, 0, 255);
+
+            const uint16* pTable;
+            if (!c_plus_delta)
+               pTable = g_color8_to_etc_block_config_0_255[0];
+            else if (c_plus_delta == 255)
+               pTable = g_color8_to_etc_block_config_0_255[1];
+            else
+               pTable = g_color8_to_etc_block_config_1_to_254[c_plus_delta - 1];
+
+            do
+            {
+               const uint x = *pTable++;
+               const uint diff = x & 1;
+               if (static_cast<uint>(use_diff) != diff)
+               {
+                  if (*pTable == 0xFFFF)
+                     break;
+                  continue;
+               }
+
+               if ((diff) && (pBase_color5_unscaled))
+               {
+                  const int p0 = (x >> 8) & 255;
+                  int delta = p0 - static_cast<int>(pBase_color5_unscaled->c[i]);
+                  if ((delta < cETC1ColorDeltaMin) || (delta > cETC1ColorDeltaMax))
+                  {
+                     if (*pTable == 0xFFFF)
+                        break;
+                     continue;
+                  }
+               }
+
+#ifdef RG_ETC1_BUILD_DEBUG
+               {
+                  const uint inten = (x >> 1) & 7;
+                  const uint selector = (x >> 4) & 3;
+                  const uint p0 = (x >> 8) & 255;
+                  RG_ETC1_ASSERT(etc1_decode_value(diff, inten, selector, p0) == (uint)c_plus_delta);
+               }
+#endif
+
+               const uint16* pInverse_table = g_etc1_inverse_lookup[x & 0xFF];
+               uint16 p1 = pInverse_table[c1];
+               uint16 p2 = pInverse_table[c2];
+
+               if ((diff) && (pBase_color5_unscaled))
+               {
+                  int delta1 = (p1 & 0xFF) - static_cast<int>(pBase_color5_unscaled->c[s_next_comp[i]]);
+                  int delta2 = (p2 & 0xFF) - static_cast<int>(pBase_color5_unscaled->c[s_next_comp[i + 1]]);
+                  if ((delta1 < cETC1ColorDeltaMin) || (delta1 > cETC1ColorDeltaMax) || (delta2 < cETC1ColorDeltaMin) || (delta2 > cETC1ColorDeltaMax))
+                  {
+                     if (*pTable == 0xFFFF)
+                        break;
+                     continue;
+                  }
+               }
+
+               const uint trial_error = rg_etc1::square(c_plus_delta - pColor[i]) + rg_etc1::square(p1 >> 8) + rg_etc1::square(p2 >> 8);
+               if (trial_error < best_error)
+               {
+                  best_error = trial_error;
+                  best_x = x;
+                  best_packed_c1 = p1 & 0xFF;
+                  best_packed_c2 = p2 & 0xFF;
+                  best_i = i;
+                  if (!best_error)
+                     goto found_perfect_match;
+               }
+            } while (*pTable != 0xFFFF);
+         }
+      }
+found_perfect_match:
+
+      if (best_error == cUINT32_MAX)
+         return best_error;
+
+      best_error *= num_colors;
+
+      results.m_n = num_colors;
+      results.m_block_color4 = !(best_x & 1);
+      results.m_block_inten_table = (best_x >> 1) & 7;
+      memset(results.m_pSelectors, (best_x >> 4) & 3, num_colors);
+
+      const uint best_packed_c0 = (best_x >> 8) & 255;
+      results.m_block_color_unscaled[best_i] = static_cast<uint8>(best_packed_c0);
+      results.m_block_color_unscaled[s_next_comp[best_i]] = static_cast<uint8>(best_packed_c1);
+      results.m_block_color_unscaled[s_next_comp[best_i + 1]] = static_cast<uint8>(best_packed_c2);
+      results.m_error = best_error;
+      
+      return best_error;
+   }
+
+   // Function originally from RYG's public domain real-time DXT1 compressor, modified for 555.
+   static void dither_block_555(color_quad_u8* dest, const color_quad_u8* block)
+   {
+      int err[8],*ep1 = err,*ep2 = err+4;
+      uint8 *quant = g_quant5_tab+8;
+
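+      // Error-diffusion dither with Floyd-Steinberg-style 7/5/3/1 sixteenth weights (the >> 4 is the /16);
+      // ep1/ep2 swap roles as the current/previous error rows on each scanline.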
+      memset(dest, 0xFF, sizeof(color_quad_u8)*16);
+
+      // process channels separately
+      for(int ch=0;ch<3;ch++)
+      {
+         uint8* bp = (uint8*)block;
+         uint8* dp = (uint8*)dest;
+
+         bp += ch; dp += ch;
+
+         memset(err,0, sizeof(err));
+         for(int y = 0; y < 4; y++)
+         {
+            // pixel 0
+            dp[ 0] = quant[bp[ 0] + ((3*ep2[1] + 5*ep2[0]) >> 4)];
+            ep1[0] = bp[ 0] - dp[ 0];
+
+            // pixel 1
+            dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)];
+            ep1[1] = bp[ 4] - dp[ 4];
+
+            // pixel 2
+            dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)];
+            ep1[2] = bp[ 8] - dp[ 8];
+
+            // pixel 3
+            dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)];
+            ep1[3] = bp[12] - dp[12];
+
+            // advance to next line
+            int* tmp = ep1; ep1 = ep2; ep2 = tmp;
+            bp += 16;
+            dp += 16;
+         }
+      }
+   }
+
+   unsigned int pack_etc1_block(void* pETC1_block, const unsigned int* pSrc_pixels_rgba, etc1_pack_params& pack_params)
+   {
+      const color_quad_u8* pSrc_pixels = reinterpret_cast<const color_quad_u8*>(pSrc_pixels_rgba);
+      etc1_block& dst_block = *static_cast<etc1_block*>(pETC1_block);
+
+#ifdef RG_ETC1_BUILD_DEBUG
+      // Ensure all alpha values are 0xFF.
+      for (uint i = 0; i < 16; i++)
+      {
+         RG_ETC1_ASSERT(pSrc_pixels[i].a == 255);
+      }
+#endif
+
+      color_quad_u8 src_pixel0(pSrc_pixels[0]);
+
+      // Check for solid block.
+      const uint32 first_pixel_u32 = pSrc_pixels->m_u32;
+      int r;
+      for (r = 15; r >= 1; --r)
+         if (pSrc_pixels[r].m_u32 != first_pixel_u32)
+            break;
+      if (!r)
+         return static_cast<unsigned int>(16 * pack_etc1_block_solid_color(dst_block, &pSrc_pixels[0].r, pack_params));
+      
+      color_quad_u8 dithered_pixels[16];
+      if (pack_params.m_dithering)
+      {
+         dither_block_555(dithered_pixels, pSrc_pixels);
+         pSrc_pixels = dithered_pixels;
+      }
+
+      etc1_optimizer optimizer;
+
+      uint64 best_error = cUINT64_MAX;
+      uint best_flip = false, best_use_color4 = false;
+      
+      uint8 best_selectors[2][8];
+      etc1_optimizer::results best_results[2];
+      for (uint i = 0; i < 2; i++)
+      {
+         best_results[i].m_n = 8;
+         best_results[i].m_pSelectors = best_selectors[i];
+      }
+      
+      uint8 selectors[3][8];
+      etc1_optimizer::results results[3];
+      
+      for (uint i = 0; i < 3; i++)
+      {
+         results[i].m_n = 8;
+         results[i].m_pSelectors = selectors[i];
+      }
+            
+      color_quad_u8 subblock_pixels[8];
+
+      etc1_optimizer::params params(pack_params);
+      params.m_num_src_pixels = 8;
+      params.m_pSrc_pixels = subblock_pixels;
+
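+      // Try all four encodings (flip x diff/individual color modes); for each, optimize both 8-pixel
+      // subblocks and keep the combination with the lowest total error.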
+      for (uint flip = 0; flip < 2; flip++)
+      {
+         for (uint use_color4 = 0; use_color4 < 2; use_color4++)
+         {
+            uint64 trial_error = 0;
+
+            uint subblock;
+            for (subblock = 0; subblock < 2; subblock++)
+            {
+               if (flip)
+                  memcpy(subblock_pixels, pSrc_pixels + subblock * 8, sizeof(color_quad_u8) * 8);
+               else
+               {
+                  const color_quad_u8* pSrc_col = pSrc_pixels + subblock * 2;
+                  subblock_pixels[0] = pSrc_col[0]; subblock_pixels[1] = pSrc_col[4]; subblock_pixels[2] = pSrc_col[8]; subblock_pixels[3] = pSrc_col[12];
+                  subblock_pixels[4] = pSrc_col[1]; subblock_pixels[5] = pSrc_col[5]; subblock_pixels[6] = pSrc_col[9]; subblock_pixels[7] = pSrc_col[13];
+               }
+
+               results[2].m_error = cUINT64_MAX;
+               if ((params.m_quality >= cMediumQuality) && ((subblock) || (use_color4)))
+               {
+                  const uint32 subblock_pixel0_u32 = subblock_pixels[0].m_u32;
+                  for (r = 7; r >= 1; --r)
+                     if (subblock_pixels[r].m_u32 != subblock_pixel0_u32)
+                        break;
+                  if (!r)
+                  {
+                     pack_etc1_block_solid_color_constrained(results[2], 8, &subblock_pixels[0].r, pack_params, !use_color4, (subblock && !use_color4) ? &results[0].m_block_color_unscaled : NULL);
+                  }
+               }
+
+               params.m_use_color4 = (use_color4 != 0);
+               params.m_constrain_against_base_color5 = false;
+
+               if ((!use_color4) && (subblock))
+               {
+                  params.m_constrain_against_base_color5 = true;
+                  params.m_base_color5 = results[0].m_block_color_unscaled;
+               }
+                              
+               if (params.m_quality == cHighQuality)
+               {
+                  static const int s_scan_delta_0_to_4[] = { -4, -3, -2, -1, 0, 1, 2, 3, 4 };
+                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0_to_4);
+                  params.m_pScan_deltas = s_scan_delta_0_to_4;
+               }
+               else if (params.m_quality == cMediumQuality)
+               {
+                  static const int s_scan_delta_0_to_1[] = { -1, 0, 1 };
+                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0_to_1);
+                  params.m_pScan_deltas = s_scan_delta_0_to_1;
+               }
+               else
+               {
+                  static const int s_scan_delta_0[] = { 0 };
+                  params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_0);
+                  params.m_pScan_deltas = s_scan_delta_0;
+               }
+               
+               optimizer.init(params, results[subblock]);
+               if (!optimizer.compute())
+                  break;
+                              
+               if (params.m_quality >= cMediumQuality)
+               {
+                  // TODO: Fix fairly arbitrary/unrefined thresholds that control how far away to scan for potentially better solutions.
+                  const uint refinement_error_thresh0 = 3000;
+                  const uint refinement_error_thresh1 = 6000;
+                  if (results[subblock].m_error > refinement_error_thresh0)
+                  {
+                     if (params.m_quality == cMediumQuality)
+                     {
+                        static const int s_scan_delta_2_to_3[] = { -3, -2, 2, 3 };
+                        params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_2_to_3);
+                        params.m_pScan_deltas = s_scan_delta_2_to_3;
+                     }
+                     else
+                     {
+                        static const int s_scan_delta_5_to_5[] = { -5, 5 };
+                        static const int s_scan_delta_5_to_8[] = { -8, -7, -6, -5, 5, 6, 7, 8 };
+                        if (results[subblock].m_error > refinement_error_thresh1)
+                        {
+                           params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_5_to_8);
+                           params.m_pScan_deltas = s_scan_delta_5_to_8;
+                        }
+                        else
+                        {
+                           params.m_scan_delta_size = RG_ETC1_ARRAY_SIZE(s_scan_delta_5_to_5);
+                           params.m_pScan_deltas = s_scan_delta_5_to_5;
+                        }
+                     }
+
+                     if (!optimizer.compute())
+                        break;
+                  }
+
+                  if (results[2].m_error < results[subblock].m_error)
+                     results[subblock] = results[2];
+               }
+                            
+               trial_error += results[subblock].m_error;
+               if (trial_error >= best_error)
+                  break;
+            }
+
+            if (subblock < 2)
+               continue;
+
+            best_error = trial_error;
+            best_results[0] = results[0];
+            best_results[1] = results[1];
+            best_flip = flip;
+            best_use_color4 = use_color4;
+            
+         } // use_color4
+
+      } // flip
+
+      int dr = best_results[1].m_block_color_unscaled.r - best_results[0].m_block_color_unscaled.r;
+      int dg = best_results[1].m_block_color_unscaled.g - best_results[0].m_block_color_unscaled.g;
+      int db = best_results[1].m_block_color_unscaled.b - best_results[0].m_block_color_unscaled.b;
+      RG_ETC1_ASSERT(best_use_color4 || ((rg_etc1::minimum(dr, dg, db) >= cETC1ColorDeltaMin) && (rg_etc1::maximum(dr, dg, db) <= cETC1ColorDeltaMax)));
+           
+      if (best_use_color4)
+      {
+         dst_block.m_bytes[0] = static_cast<uint8>(best_results[1].m_block_color_unscaled.r | (best_results[0].m_block_color_unscaled.r << 4));
+         dst_block.m_bytes[1] = static_cast<uint8>(best_results[1].m_block_color_unscaled.g | (best_results[0].m_block_color_unscaled.g << 4));
+         dst_block.m_bytes[2] = static_cast<uint8>(best_results[1].m_block_color_unscaled.b | (best_results[0].m_block_color_unscaled.b << 4));
+      }
+      else
+      {
+         if (dr < 0) dr += 8;
+         dst_block.m_bytes[0] = static_cast<uint8>((best_results[0].m_block_color_unscaled.r << 3) | dr);
+         if (dg < 0) dg += 8;
+         dst_block.m_bytes[1] = static_cast<uint8>((best_results[0].m_block_color_unscaled.g << 3) | dg);
+         if (db < 0) db += 8;
+         dst_block.m_bytes[2] = static_cast<uint8>((best_results[0].m_block_color_unscaled.b << 3) | db);
+      }
+      
+      dst_block.m_bytes[3] = static_cast<uint8>( (best_results[1].m_block_inten_table << 2) | (best_results[0].m_block_inten_table << 5) | ((~best_use_color4 & 1) << 1) | best_flip );
+      
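+      // ETC1 stores the 16 2-bit selectors as two bitplanes: selector1 accumulates the MSBs (written to
+      // bytes 4-5) and selector0 the LSBs (bytes 6-7), one bit per pixel.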
+      uint selector0 = 0, selector1 = 0;
+      if (best_flip)
+      {
+         // flipped:
+         // { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 },               
+         // { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 } 
+         //
+         // { 0, 2 }, { 1, 2 }, { 2, 2 }, { 3, 2 },
+         // { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 }
+         const uint8* pSelectors0 = best_results[0].m_pSelectors;
+         const uint8* pSelectors1 = best_results[1].m_pSelectors;
+         for (int x = 3; x >= 0; --x)
+         {
+            uint b;
+            b = g_selector_index_to_etc1[pSelectors1[4 + x]];
+            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+            b = g_selector_index_to_etc1[pSelectors1[x]];
+            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+            b = g_selector_index_to_etc1[pSelectors0[4 + x]];
+            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+            b = g_selector_index_to_etc1[pSelectors0[x]];
+            selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+         }
+      }
+      else
+      {
+         // non-flipped:
+         // { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 },
+         // { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 }
+         //
+         // { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
+         // { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 }
+         for (int subblock = 1; subblock >= 0; --subblock)
+         {
+            const uint8* pSelectors = best_results[subblock].m_pSelectors + 4;
+            for (uint i = 0; i < 2; i++)
+            {
+               uint b;
+               b = g_selector_index_to_etc1[pSelectors[3]];
+               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+               b = g_selector_index_to_etc1[pSelectors[2]];
+               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+               b = g_selector_index_to_etc1[pSelectors[1]];
+               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+               b = g_selector_index_to_etc1[pSelectors[0]];
+               selector0 = (selector0 << 1) | (b & 1); selector1 = (selector1 << 1) | (b >> 1);
+
+               pSelectors -= 4;
+            }
+         }
+      }
+                  
+      dst_block.m_bytes[4] = static_cast<uint8>(selector1 >> 8); dst_block.m_bytes[5] = static_cast<uint8>(selector1 & 0xFF);
+      dst_block.m_bytes[6] = static_cast<uint8>(selector0 >> 8); dst_block.m_bytes[7] = static_cast<uint8>(selector0 & 0xFF);
+
+      return static_cast<unsigned int>(best_error);
+   }
+
+} // namespace rg_etc1

+ 76 - 76
drivers/etc1/rg_etc1.h

@@ -1,76 +1,76 @@
-// File: rg_etc1.h - Fast, high quality ETC1 block packer/unpacker - Rich Geldreich <[email protected]>
-// Please see ZLIB license at the end of this file.
-#pragma once
-
-namespace rg_etc1
-{
-   // Unpacks an 8-byte ETC1 compressed block to a block of 4x4 32bpp RGBA pixels.
-   // Returns false if the block is invalid. Invalid blocks will still be unpacked with clamping.
-   // This function is thread safe, and does not dynamically allocate any memory.
-   // If preserve_alpha is true, the alpha channel of the destination pixels will not be overwritten. Otherwise, alpha will be set to 255.
-   bool unpack_etc1_block(const void *pETC1_block, unsigned int* pDst_pixels_rgba, bool preserve_alpha = false);
-
-   // Quality setting = the higher the quality, the slower. 
-   // To pack large textures, it is highly recommended to call pack_etc1_block() in parallel, on different blocks, from multiple threads (particularly when using cHighQuality).
-   enum etc1_quality
-   { 
-      cLowQuality,
-      cMediumQuality,
-      cHighQuality,
-   };
-      
-   struct etc1_pack_params
-   {
-      etc1_quality m_quality;
-      bool m_dithering;
-                              
-      inline etc1_pack_params() 
-      {
-         clear();
-      }
-
-      void clear()
-      {
-         m_quality = cHighQuality;
-         m_dithering = false;
-      }
-   };
-
-   // Important: pack_etc1_block_init() must be called before calling pack_etc1_block().
-   void pack_etc1_block_init();
-
-   // Packs a 4x4 block of 32bpp RGBA pixels to an 8-byte ETC1 block.
-   // 32-bit RGBA pixels must always be arranged as (R,G,B,A) (R first, A last) in memory, independent of platform endianness. A should always be 255.
-   // Returns squared error of result.
-   // This function is thread safe, and does not dynamically allocate any memory.
-   // pack_etc1_block() does not currently support "perceptual" colorspace metrics - it primarily optimizes for RGB RMSE.
-   unsigned int pack_etc1_block(void* pETC1_block, const unsigned int* pSrc_pixels_rgba, etc1_pack_params& pack_params);
-            
-} // namespace rg_etc1
-
-//------------------------------------------------------------------------------
-//
-// rg_etc1 uses the ZLIB license:
-// http://opensource.org/licenses/Zlib
-//
-// Copyright (c) 2012 Rich Geldreich
-//
-// This software is provided 'as-is', without any express or implied
-// warranty.  In no event will the authors be held liable for any damages
-// arising from the use of this software.
-//
-// Permission is granted to anyone to use this software for any purpose,
-// including commercial applications, and to alter it and redistribute it
-// freely, subject to the following restrictions:
-//
-// 1. The origin of this software must not be misrepresented; you must not
-// claim that you wrote the original software. If you use this software
-// in a product, an acknowledgment in the product documentation would be
-// appreciated but is not required.
-//
-// 2. Altered source versions must be plainly marked as such, and must not be
-// misrepresented as being the original software.
-//
-// 3. This notice may not be removed or altered from any source distribution.
-//
-//------------------------------------------------------------------------------
+// File: rg_etc1.h - Fast, high quality ETC1 block packer/unpacker - Rich Geldreich <[email protected]>
+// Please see ZLIB license at the end of this file.
+#pragma once
+
+namespace rg_etc1
+{
+   // Unpacks an 8-byte ETC1 compressed block to a block of 4x4 32bpp RGBA pixels.
+   // Returns false if the block is invalid. Invalid blocks will still be unpacked with clamping.
+   // This function is thread safe, and does not dynamically allocate any memory.
+   // If preserve_alpha is true, the alpha channel of the destination pixels will not be overwritten. Otherwise, alpha will be set to 255.
+   bool unpack_etc1_block(const void *pETC1_block, unsigned int* pDst_pixels_rgba, bool preserve_alpha = false);
+
+   // Quality setting = the higher the quality, the slower. 
+   // To pack large textures, it is highly recommended to call pack_etc1_block() in parallel, on different blocks, from multiple threads (particularly when using cHighQuality).
+   enum etc1_quality
+   { 
+      cLowQuality,
+      cMediumQuality,
+      cHighQuality,
+   };
+      
+   struct etc1_pack_params
+   {
+      etc1_quality m_quality;
+      bool m_dithering;
+                              
+      inline etc1_pack_params() 
+      {
+         clear();
+      }
+
+      void clear()
+      {
+         m_quality = cHighQuality;
+         m_dithering = false;
+      }
+   };
+
+   // Important: pack_etc1_block_init() must be called before calling pack_etc1_block().
+   void pack_etc1_block_init();
+
+   // Packs a 4x4 block of 32bpp RGBA pixels to an 8-byte ETC1 block.
+   // 32-bit RGBA pixels must always be arranged as (R,G,B,A) (R first, A last) in memory, independent of platform endianness. A should always be 255.
+   // Returns squared error of result.
+   // This function is thread safe, and does not dynamically allocate any memory.
+   // pack_etc1_block() does not currently support "perceptual" colorspace metrics - it primarily optimizes for RGB RMSE.
+   unsigned int pack_etc1_block(void* pETC1_block, const unsigned int* pSrc_pixels_rgba, etc1_pack_params& pack_params);
+            
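+   // Minimal usage sketch (illustrative only; the local names below are hypothetical):
+   //
+   //    rg_etc1::pack_etc1_block_init();            // once, before any packing
+   //    unsigned int pixels[16] = { /* 4x4 RGBA pixels, A = 255 */ };
+   //    unsigned char block[8];
+   //    rg_etc1::etc1_pack_params p;                // defaults: cHighQuality, no dithering
+   //    unsigned int err = rg_etc1::pack_etc1_block(block, pixels, p);
+   //    unsigned int decoded[16];
+   //    rg_etc1::unpack_etc1_block(block, decoded); // round-trip decode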
+} // namespace rg_etc1
+
+//------------------------------------------------------------------------------
+//
+// rg_etc1 uses the ZLIB license:
+// http://opensource.org/licenses/Zlib
+//
+// Copyright (c) 2012 Rich Geldreich
+//
+// This software is provided 'as-is', without any express or implied
+// warranty.  In no event will the authors be held liable for any damages
+// arising from the use of this software.
+//
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+//
+// 1. The origin of this software must not be misrepresented; you must not
+// claim that you wrote the original software. If you use this software
+// in a product, an acknowledgment in the product documentation would be
+// appreciated but is not required.
+//
+// 2. Altered source versions must be plainly marked as such, and must not be
+// misrepresented as being the original software.
+//
+// 3. This notice may not be removed or altered from any source distribution.
+//
+//------------------------------------------------------------------------------

+ 4 - 0
drivers/gl_context/glew.c

@@ -1,3 +1,7 @@
+#ifdef __HAIKU__
+	#undef GLEW_ENABLED
+#endif
+
 #ifdef GLEW_ENABLED
 /*
 ** The OpenGL Extension Wrangler Library

+ 56 - 5
drivers/gles2/rasterizer_gles2.cpp

@@ -1408,6 +1408,40 @@ GLuint RasterizerGLES2::_texture_get_name(RID p_tex) {
 	return texture->tex_id;
 };
 
+void RasterizerGLES2::texture_set_path(RID p_texture,const String& p_path) {
+	Texture * texture = texture_owner.get(p_texture);
+	ERR_FAIL_COND(!texture);
+
+	texture->path=p_path;
+
+}
+
+String RasterizerGLES2::texture_get_path(RID p_texture) const{
+
+	Texture * texture = texture_owner.get(p_texture);
+	ERR_FAIL_COND_V(!texture,String());
+	return texture->path;
+}
+void RasterizerGLES2::texture_debug_usage(List<VS::TextureInfo> *r_info){
+
+	List<RID> textures;
+	texture_owner.get_owned_list(&textures);
+
+	for (List<RID>::Element *E=textures.front();E;E=E->next()) {
+
+		Texture *t = texture_owner.get(E->get());
+		if (!t)
+			continue;
+		VS::TextureInfo tinfo;
+		tinfo.path=t->path;
+		tinfo.format=t->format;
+		tinfo.size.x=t->alloc_width;
+		tinfo.size.y=t->alloc_height;
+		tinfo.bytes=t->total_data_size;
+		r_info->push_back(tinfo);
+	}
+
+}
 
 /* SHADER API */
 
@@ -4145,7 +4179,7 @@ void RasterizerGLES2::begin_frame() {
 
 	//fragment_lighting=Globals::get_singleton()->get("rasterizer/use_fragment_lighting");
 #ifdef TOOLS_ENABLED
-	canvas_shader.set_conditional(CanvasShaderGLES2::USE_PIXEL_SNAP,GLOBAL_DEF("rasterizer/use_pixel_snap",false));
+	canvas_shader.set_conditional(CanvasShaderGLES2::USE_PIXEL_SNAP,GLOBAL_DEF("display/use_2d_pixel_snap",false));
 	shadow_filter=ShadowFilterTechnique(int(Globals::get_singleton()->get("rasterizer/shadow_filter")));
 #endif
 
@@ -4160,7 +4194,6 @@ void RasterizerGLES2::begin_frame() {
 	time_delta=time-last_time;
 	last_time=time;
 	frame++;
-	clear_viewport(Color(1,0,0.5));
 
 	_rinfo.vertex_count=0;
 	_rinfo.object_count=0;
@@ -5970,6 +6003,10 @@ void RasterizerGLES2::_render(const Geometry *p_geometry,const Material *p_mater
 			if (element_count==0)
 				return;
 
+			if (mm->visible>=0) {
+				element_count=MIN(element_count,mm->visible);
+			}
+
 			const MultiMesh::Element *elements=&mm->elements[0];
 
 			_rinfo.vertex_count+=s->array_len*element_count;
@@ -9161,7 +9198,11 @@ void RasterizerGLES2::_canvas_item_setup_shader_params(CanvasItemMaterial *mater
 		glBindTexture(GL_TEXTURE_2D,framebuffer.sample_color);
 		if (framebuffer.scale==1 && !canvas_texscreen_used) {
 #ifdef GLEW_ENABLED
-			glReadBuffer(GL_COLOR_ATTACHMENT0);
+			if (current_rt) {
+				glReadBuffer(GL_COLOR_ATTACHMENT0);
+			} else {
+				glReadBuffer(GL_BACK);
+			}
 #endif
 			glCopyTexSubImage2D(GL_TEXTURE_2D,0,x,y,x,y,viewport.width,viewport.height);
 //			if (current_clip) {
@@ -9341,7 +9382,11 @@ void RasterizerGLES2::canvas_render_items(CanvasItem *p_item_list,int p_z,const
 			glBindTexture(GL_TEXTURE_2D,framebuffer.sample_color);
 
 #ifdef GLEW_ENABLED
-			glReadBuffer(GL_COLOR_ATTACHMENT0);
+			if (current_rt) {
+				glReadBuffer(GL_COLOR_ATTACHMENT0);
+			} else {
+				glReadBuffer(GL_BACK);
+			}
 #endif
 			glCopyTexSubImage2D(GL_TEXTURE_2D,0,x,y,x,y,w,h);
 //			if (current_clip) {
@@ -10804,7 +10849,7 @@ void RasterizerGLES2::init() {
 	copy_shader.set_conditional(CopyShaderGLES2::USE_8BIT_HDR,!use_fp16_fb);
 	canvas_shader.set_conditional(CanvasShaderGLES2::USE_DEPTH_SHADOWS,read_depth_supported);
 
-	canvas_shader.set_conditional(CanvasShaderGLES2::USE_PIXEL_SNAP,GLOBAL_DEF("rasterizer/use_pixel_snap",false));
+	canvas_shader.set_conditional(CanvasShaderGLES2::USE_PIXEL_SNAP,GLOBAL_DEF("display/use_2d_pixel_snap",false));
 
 	npo2_textures_available=true;
 	//fragment_lighting=false;
@@ -11188,6 +11233,12 @@ RasterizerGLES2::RasterizerGLES2(bool p_compress_arrays,bool p_keep_ram_copy,boo
 	tc0_idx=0;
 };
 
+void RasterizerGLES2::restore_framebuffer() {
+
+	glBindFramebuffer(GL_FRAMEBUFFER, base_framebuffer);
+	
+}
+
 RasterizerGLES2::~RasterizerGLES2() {
 
 	memdelete_arr(skinned_buffer);
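The texture_debug_usage() added above fills a caller-supplied list rather than allocating one. A minimal sketch of consuming it, using only the VS::TextureInfo fields populated in this hunk (path, size, bytes); 'rasterizer' is a hypothetical handle to any live RasterizerGLES2 instance:

    // Sketch: dump per-texture video memory usage gathered by the rasterizer.
    List<VS::TextureInfo> infos;
    rasterizer->texture_debug_usage(&infos);     // 'rasterizer' is assumed, not from this patch
    for (List<VS::TextureInfo>::Element *E = infos.front(); E; E = E->next()) {
        const VS::TextureInfo &ti = E->get();
        print_line(ti.path + ": " + itos((int)ti.size.x) + "x" + itos((int)ti.size.y) +
                ", " + itos(ti.bytes) + " bytes");
    }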

+ 7 - 0
drivers/gles2/rasterizer_gles2.h

@@ -115,6 +115,7 @@ class RasterizerGLES2 : public Rasterizer {
 
 	struct Texture {
 
+		String path;
 		uint32_t flags;
 		int width,height;
 		int alloc_width, alloc_height;
@@ -1325,6 +1326,10 @@ public:
 	virtual void texture_set_size_override(RID p_texture,int p_width, int p_height);
 	virtual void texture_set_reload_hook(RID p_texture,ObjectID p_owner,const StringName& p_function) const;
 
+	virtual void texture_set_path(RID p_texture,const String& p_path);
+	virtual String texture_get_path(RID p_texture) const;
+	virtual void texture_debug_usage(List<VS::TextureInfo> *r_info);
+
 	GLuint _texture_get_name(RID p_tex);
 
 	/* SHADER API */
@@ -1695,6 +1700,8 @@ public:
 	void reload_vram();
 
 	virtual bool has_feature(VS::Features p_feature) const;
+	
+	virtual void restore_framebuffer();
 
 	static RasterizerGLES2* get_singleton();
 

+ 11 - 11
drivers/gles2/shader_compiler_gles2.cpp

@@ -132,18 +132,18 @@ String ShaderCompilerGLES2::dump_node_code(SL::Node *p_node,int p_level,bool p_a
 			SL::BlockNode *bnode=(SL::BlockNode*)p_node;
 
 			//variables
-			code+="{"ENDL;
+            code+="{" ENDL;
 			for(Map<StringName,SL::DataType>::Element *E=bnode->variables.front();E;E=E->next()) {
 
-				code+=_mktab(p_level)+_typestr(E->value())+" "+replace_string(E->key())+";"ENDL;
+                code+=_mktab(p_level)+_typestr(E->value())+" "+replace_string(E->key())+";" ENDL;
 			}
 
 			for(int i=0;i<bnode->statements.size();i++) {
 
-				code+=_mktab(p_level)+dump_node_code(bnode->statements[i],p_level)+";"ENDL;
+                code+=_mktab(p_level)+dump_node_code(bnode->statements[i],p_level)+";" ENDL;
 			}
 
-			code+="}"ENDL;
+            code+="}" ENDL;
 
 		} break;
 		case SL::Node::TYPE_VARIABLE: {
@@ -489,15 +489,15 @@ String ShaderCompilerGLES2::dump_node_code(SL::Node *p_node,int p_level,bool p_a
 			SL::ControlFlowNode *cfnode=(SL::ControlFlowNode*)p_node;
 			if (cfnode->flow_op==SL::FLOW_OP_IF) {
 
-				code+="if ("+dump_node_code(cfnode->statements[0],p_level)+") {"ENDL;
+                code+="if ("+dump_node_code(cfnode->statements[0],p_level)+") {" ENDL;
 				code+=dump_node_code(cfnode->statements[1],p_level+1);
 				if (cfnode->statements.size()==3) {
 
-					code+="} else {"ENDL;
+                    code+="} else {" ENDL;
 					code+=dump_node_code(cfnode->statements[2],p_level+1);
 				}
 
-				code+="}"ENDL;
+                code+="}" ENDL;
 
 			} else if (cfnode->flow_op==SL::FLOW_OP_RETURN) {
 
@@ -560,7 +560,7 @@ Error ShaderCompilerGLES2::compile_node(SL::ProgramNode *p_program) {
 		ubase=uniforms->size();
 	for(Map<StringName,SL::Uniform>::Element *E=p_program->uniforms.front();E;E=E->next()) {
 
-		String uline="uniform "+_typestr(E->get().type)+" _"+E->key().operator String()+";"ENDL;
+        String uline="uniform "+_typestr(E->get().type)+" _"+E->key().operator String()+";" ENDL;
 
 		global_code+=uline;
 		if (uniforms) {
@@ -593,10 +593,10 @@ Error ShaderCompilerGLES2::compile_node(SL::ProgramNode *p_program) {
 			header+=_typestr(fnode->arguments[i].type)+" "+replace_string(fnode->arguments[i].name);
 		}
 
-		header+=") {"ENDL;
+        header+=") {" ENDL;
 		String fcode=header;
 		fcode+=dump_node_code(fnode->body,1);
-		fcode+="}"ENDL;
+        fcode+="}" ENDL;
 		global_code+=fcode;
 
 	}
@@ -605,7 +605,7 @@ Error ShaderCompilerGLES2::compile_node(SL::ProgramNode *p_program) {
 
 		StringName varname=E->key();
 		String newvarname=replace_string(varname);
-		global_code+="uniform "+_typestr(E->get())+" "+newvarname+";"ENDL;
+        global_code+="uniform "+_typestr(E->get())+" "+newvarname+";" ENDL;
 	}*/
 
 	code=dump_node_code(p_program,0);
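The whitespace added before ENDL throughout this file is not cosmetic: since C++11, a string literal immediately followed by an identifier is lexed as a user-defined literal suffix, so "}"ENDL no longer expands the macro and fails to compile. A minimal illustration (assuming ENDL expands to a string literal such as "\n"):

    // Assumption: ENDL expands to a string literal, e.g.:
    #define ENDL "\n"
    // String bad = "}"ENDL;  // ill-formed since C++11: ENDL lexed as a literal suffix
    String good = "}" ENDL;   // OK: adjacent string literals concatenate at compile time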

+ 64 - 69
drivers/mpc/audio_stream_mpc.cpp

@@ -1,7 +1,7 @@
 #include "audio_stream_mpc.h"
 
 
-Error AudioStreamMPC::_open_file() {
+Error AudioStreamPlaybackMPC::_open_file() {
 
 	if (f) {
 		memdelete(f);
@@ -41,7 +41,7 @@ Error AudioStreamMPC::_open_file() {
 	return OK;
 }
 
-void AudioStreamMPC::_close_file() {
+void AudioStreamPlaybackMPC::_close_file() {
 
 	if (f) {
 		memdelete(f);
@@ -52,7 +52,7 @@ void AudioStreamMPC::_close_file() {
 	data_ofs=0;
 }
 
-int AudioStreamMPC::_read_file(void *p_dst,int p_bytes) {
+int AudioStreamPlaybackMPC::_read_file(void *p_dst,int p_bytes) {
 
 	if (f)
 		return f->get_buffer((uint8_t*)p_dst,p_bytes);
@@ -68,7 +68,7 @@ int AudioStreamMPC::_read_file(void *p_dst,int p_bytes) {
 	return p_bytes;
 }
 
-bool AudioStreamMPC::_seek_file(int p_pos){
+bool AudioStreamPlaybackMPC::_seek_file(int p_pos){
 
 	if (p_pos<0 || p_pos>streamlen)
 		return false;
@@ -83,7 +83,7 @@ bool AudioStreamMPC::_seek_file(int p_pos){
 	return true;
 
 }
-int AudioStreamMPC::_tell_file()  const{
+int AudioStreamPlaybackMPC::_tell_file()  const{
 
 	if (f)
 		return f->get_pos();
@@ -93,13 +93,13 @@ int AudioStreamMPC::_tell_file()  const{
 
 }
 
-int AudioStreamMPC::_sizeof_file() const{
+int AudioStreamPlaybackMPC::_sizeof_file() const{
 
 	//print_line("sizeof file, get: "+itos(streamlen));
 	return streamlen;
 }
 
-bool AudioStreamMPC::_canseek_file() const{
+bool AudioStreamPlaybackMPC::_canseek_file() const{
 
 	//print_line("canseek file, get true");
 	return true;
@@ -107,51 +107,46 @@ bool AudioStreamMPC::_canseek_file() const{
 
 /////////////////////
 
-mpc_int32_t AudioStreamMPC::_mpc_read(mpc_reader *p_reader,void *p_dst, mpc_int32_t p_bytes) {
+mpc_int32_t AudioStreamPlaybackMPC::_mpc_read(mpc_reader *p_reader,void *p_dst, mpc_int32_t p_bytes) {
 
-	AudioStreamMPC *smpc=(AudioStreamMPC *)p_reader->data;
+	AudioStreamPlaybackMPC *smpc=(AudioStreamPlaybackMPC *)p_reader->data;
 	return smpc->_read_file(p_dst,p_bytes);
 }
 
-mpc_bool_t AudioStreamMPC::_mpc_seek(mpc_reader *p_reader,mpc_int32_t p_offset) {
+mpc_bool_t AudioStreamPlaybackMPC::_mpc_seek(mpc_reader *p_reader,mpc_int32_t p_offset) {
 
-	AudioStreamMPC *smpc=(AudioStreamMPC *)p_reader->data;
+	AudioStreamPlaybackMPC *smpc=(AudioStreamPlaybackMPC *)p_reader->data;
 	return smpc->_seek_file(p_offset);
 
 }
-mpc_int32_t AudioStreamMPC::_mpc_tell(mpc_reader *p_reader) {
+mpc_int32_t AudioStreamPlaybackMPC::_mpc_tell(mpc_reader *p_reader) {
 
-	AudioStreamMPC *smpc=(AudioStreamMPC *)p_reader->data;
+	AudioStreamPlaybackMPC *smpc=(AudioStreamPlaybackMPC *)p_reader->data;
 	return smpc->_tell_file();
 
 }
-mpc_int32_t AudioStreamMPC::_mpc_get_size(mpc_reader *p_reader) {
+mpc_int32_t AudioStreamPlaybackMPC::_mpc_get_size(mpc_reader *p_reader) {
 
-	AudioStreamMPC *smpc=(AudioStreamMPC *)p_reader->data;
+	AudioStreamPlaybackMPC *smpc=(AudioStreamPlaybackMPC *)p_reader->data;
 	return smpc->_sizeof_file();
 
 
 }
-mpc_bool_t AudioStreamMPC::_mpc_canseek(mpc_reader *p_reader) {
+mpc_bool_t AudioStreamPlaybackMPC::_mpc_canseek(mpc_reader *p_reader) {
 
-	AudioStreamMPC *smpc=(AudioStreamMPC *)p_reader->data;
+	AudioStreamPlaybackMPC *smpc=(AudioStreamPlaybackMPC *)p_reader->data;
 	return smpc->_canseek_file();
 }
 
 
 
-bool AudioStreamMPC::_can_mix() const {
 
-	return /*active &&*/ !paused;
-}
-
-
-void AudioStreamMPC::update() {
+int AudioStreamPlaybackMPC::mix(int16_t* p_buffer,int p_frames) {
 
 	if (!active || paused)
-		return;
+		return 0;
 
-	int todo=get_todo();
+	int todo=p_frames;
 
 	while(todo>MPC_DECODER_BUFFER_LENGTH/si.channels) {
 
@@ -162,7 +157,7 @@ void AudioStreamMPC::update() {
 		mpc_status err = mpc_demux_decode(demux, &frame);
 		if (frame.bits!=-1) {
 
-			int16_t *dst_buff = get_write_buffer();
+			int16_t *dst_buff = p_buffer;
 
 #ifdef MPC_FIXED_POINT
 
@@ -185,21 +180,21 @@ void AudioStreamMPC::update() {
 #endif
 
 			int frames = frame.samples;
-			write(frames);
+			p_buffer+=si.channels*frames;
 			todo-=frames;
 		} else {
 
 			if (err != MPC_STATUS_OK) {
 
 				stop();
-				ERR_EXPLAIN("Error decoding MPC");
-				ERR_FAIL();
+				ERR_PRINT("Error decoding MPC");
+				break;
 			} else {
 
 				//finished
 				if (!loop) {
 					stop();
-					return;
+					break;
 				} else {
 
 
@@ -213,9 +208,11 @@ void AudioStreamMPC::update() {
 			}
 		}
 	}
+
+	return p_frames-todo;
 }
 
-Error AudioStreamMPC::_reload() {
+Error AudioStreamPlaybackMPC::_reload() {
 
 	ERR_FAIL_COND_V(demux!=NULL, ERR_FILE_ALREADY_IN_USE);
 
@@ -224,30 +221,39 @@ Error AudioStreamMPC::_reload() {
 
 	demux = mpc_demux_init(&reader);
 	ERR_FAIL_COND_V(!demux,ERR_CANT_CREATE);
-
 	mpc_demux_get_info(demux,  &si);
-	_setup(si.channels,si.sample_freq,MPC_DECODER_BUFFER_LENGTH*2/si.channels);
 
 	return OK;
 }
 
-void AudioStreamMPC::set_file(const String& p_file) {
+void AudioStreamPlaybackMPC::set_file(const String& p_file) {
 
 	file=p_file;
 
+	Error err = _open_file();
+	ERR_FAIL_COND(err!=OK);
+	demux = mpc_demux_init(&reader);
+	ERR_FAIL_COND(!demux);
+	mpc_demux_get_info(demux,  &si);
+	stream_min_size=MPC_DECODER_BUFFER_LENGTH*2/si.channels;
+	stream_rate=si.sample_freq;
+	stream_channels=si.channels;
+
+	mpc_demux_exit(demux);
+	demux=NULL;
+	_close_file();
+
 }
 
 
-String AudioStreamMPC::get_file() const {
+String AudioStreamPlaybackMPC::get_file() const {
 
 	return file;
 }
 
 
-void AudioStreamMPC::play() {
-
+void AudioStreamPlaybackMPC::play(float p_offset) {
 
-	_THREAD_SAFE_METHOD_
 
 	if (active)
 		stop();
@@ -262,9 +268,9 @@ void AudioStreamMPC::play() {
 
 }
 
-void AudioStreamMPC::stop()  {
+void AudioStreamPlaybackMPC::stop()  {
+
 
-	_THREAD_SAFE_METHOD_
 	if (!active)
 		return;
 	if (demux) {
@@ -275,70 +281,58 @@ void AudioStreamMPC::stop()  {
 	active=false;
 
 }
-bool AudioStreamMPC::is_playing() const  {
+bool AudioStreamPlaybackMPC::is_playing() const  {
 
-	return active || (get_total() - get_todo() -1 > 0);
+	return active;
 }
 
-void AudioStreamMPC::set_paused(bool p_paused)  {
 
-	paused=p_paused;
-}
-bool AudioStreamMPC::is_paused(bool p_paused) const  {
-
-	return paused;
-}
-
-void AudioStreamMPC::set_loop(bool p_enable)  {
+void AudioStreamPlaybackMPC::set_loop(bool p_enable)  {
 
 	loop=p_enable;
 }
-bool AudioStreamMPC::has_loop() const  {
+bool AudioStreamPlaybackMPC::has_loop() const  {
 
 	return loop;
 }
 
-float AudioStreamMPC::get_length() const {
+float AudioStreamPlaybackMPC::get_length() const {
 
 	return 0;
 }
 
-String AudioStreamMPC::get_stream_name() const {
+String AudioStreamPlaybackMPC::get_stream_name() const {
 
 	return "";
 }
 
-int AudioStreamMPC::get_loop_count() const {
+int AudioStreamPlaybackMPC::get_loop_count() const {
 
 	return 0;
 }
 
-float AudioStreamMPC::get_pos() const {
+float AudioStreamPlaybackMPC::get_pos() const {
 
 	return 0;
 }
-void AudioStreamMPC::seek_pos(float p_time) {
+void AudioStreamPlaybackMPC::seek_pos(float p_time) {
 
 
 }
 
-AudioStream::UpdateMode AudioStreamMPC::get_update_mode() const {
 
-	return UPDATE_THREAD;
-}
+void AudioStreamPlaybackMPC::_bind_methods() {
 
-void AudioStreamMPC::_bind_methods() {
-
-	ObjectTypeDB::bind_method(_MD("set_file","name"),&AudioStreamMPC::set_file);
-	ObjectTypeDB::bind_method(_MD("get_file"),&AudioStreamMPC::get_file);
+	ObjectTypeDB::bind_method(_MD("set_file","name"),&AudioStreamPlaybackMPC::set_file);
+	ObjectTypeDB::bind_method(_MD("get_file"),&AudioStreamPlaybackMPC::get_file);
 
 	ADD_PROPERTYNZ( PropertyInfo(Variant::STRING,"file",PROPERTY_HINT_FILE,"mpc"), _SCS("set_file"), _SCS("get_file"));
 
 }
 
-AudioStreamMPC::AudioStreamMPC() {
+AudioStreamPlaybackMPC::AudioStreamPlaybackMPC() {
 
-	preload=true;
+	preload=false;
 	f=NULL;
 	streamlen=0;
 	data_ofs=0;
@@ -356,7 +350,7 @@ AudioStreamMPC::AudioStreamMPC() {
 
 }
 
-AudioStreamMPC::~AudioStreamMPC() {
+AudioStreamPlaybackMPC::~AudioStreamPlaybackMPC() {
 
 	stop();
 
@@ -366,8 +360,9 @@ AudioStreamMPC::~AudioStreamMPC() {
 
 
 
-RES ResourceFormatLoaderAudioStreamMPC::load(const String &p_path,const String& p_original_path) {
-
+RES ResourceFormatLoaderAudioStreamMPC::load(const String &p_path, const String& p_original_path, Error *r_error) {
+	if (r_error)
+		*r_error=OK; //streamed so it will always work..
 	AudioStreamMPC *mpc_stream = memnew(AudioStreamMPC);
 	mpc_stream->set_file(p_path);
 	return Ref<AudioStreamMPC>(mpc_stream);
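This file converts the decoder from the old push model (update() writing into an internal resampling buffer) to a pull model: mix() fills a caller-provided interleaved buffer and returns the number of frames it produced, with 0 meaning inactive or paused. A minimal sketch of a caller under that contract, assuming mix() and instance_playback() are exposed on the base classes as declared in the header below:

    // Sketch: pulling decoded frames from a playback object.
    Ref<AudioStreamPlayback> pb = stream->instance_playback(); // stream: a Ref<AudioStreamMPC>
    const int frames = 1024;
    int16_t buf[frames * 2];            // interleaved samples; assumes stereo here
    int mixed = pb->mix(buf, frames);   // returns <= frames; 0 means stopped or paused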

+ 34 - 13
drivers/mpc/audio_stream_mpc.h

@@ -1,18 +1,17 @@
 #ifndef AUDIO_STREAM_MPC_H
 #define AUDIO_STREAM_MPC_H
 
-#include "scene/resources/audio_stream_resampled.h"
+#include "scene/resources/audio_stream.h"
 #include "os/file_access.h"
 #include "mpc/mpcdec.h"
 #include "os/thread_safe.h"
 #include "io/resource_loader.h"
 //#include "../libmpcdec/decoder.h"
 //#include "../libmpcdec/internal.h"
-class AudioStreamMPC : public AudioStreamResampled {
 
-	OBJ_TYPE( AudioStreamMPC, AudioStreamResampled );
+class AudioStreamPlaybackMPC : public AudioStreamPlayback {
 
-	_THREAD_SAFE_CLASS_
+	OBJ_TYPE( AudioStreamPlaybackMPC, AudioStreamPlayback );
 
 	bool preload;
 	FileAccess *f;
@@ -39,7 +38,9 @@ class AudioStreamMPC : public AudioStreamResampled {
 	static mpc_int32_t _mpc_get_size(mpc_reader *p_reader);
 	static mpc_bool_t _mpc_canseek(mpc_reader *p_reader);
 
-	virtual bool _can_mix() const ;
+	int stream_min_size;
+	int stream_rate;
+	int stream_channels;
 
 protected:
 	Error _open_file();
@@ -59,12 +60,10 @@ public:
 	void set_file(const String& p_file);
 	String get_file() const;
 
-	virtual void play();
+	virtual void play(float p_offset=0);
 	virtual void stop();
 	virtual bool is_playing() const;
 
-	virtual void set_paused(bool p_paused);
-	virtual bool is_paused(bool p_paused) const;
 
 	virtual void set_loop(bool p_enable);
 	virtual bool has_loop() const;
@@ -78,17 +77,39 @@ public:
 	virtual float get_pos() const;
 	virtual void seek_pos(float p_time);
 
-	virtual UpdateMode get_update_mode() const;
-	virtual void update();
+	virtual int get_channels() const { return stream_channels; }
+	virtual int get_mix_rate() const { return stream_rate; }
 
-	AudioStreamMPC();
-	~AudioStreamMPC();
+	virtual int get_minimum_buffer_size() const { return stream_min_size; }
+	virtual int mix(int16_t* p_buffer,int p_frames);
+
+	virtual void set_loop_restart_time(float p_time) {  }
+
+	AudioStreamPlaybackMPC();
+	~AudioStreamPlaybackMPC();
 };
 
+class AudioStreamMPC : public AudioStream {
+
+	OBJ_TYPE( AudioStreamMPC, AudioStream );
+
+	String file;
+public:
+
+	Ref<AudioStreamPlayback> instance_playback() {
+		Ref<AudioStreamPlaybackMPC> pb = memnew( AudioStreamPlaybackMPC );
+		pb->set_file(file);
+		return pb;
+	}
+
+	void set_file(const String& p_file) { file=p_file; }
+
+
+};
 
 class ResourceFormatLoaderAudioStreamMPC : public ResourceFormatLoader {
 public:
-	virtual RES load(const String &p_path,const String& p_original_path="");
+	virtual RES load(const String &p_path,const String& p_original_path="",Error *r_error=NULL);
 	virtual void get_recognized_extensions(List<String> *p_extensions) const;
 	virtual bool handles_type(const String& p_type) const;
 	virtual String get_resource_type(const String &p_path) const;
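With the resource/playback split above, AudioStreamMPC only carries the file path while each AudioStreamPlaybackMPC owns its own demuxer state, so several players can decode the same stream independently. A sketch of the factory pattern as declared above (the file path is hypothetical):

    // Sketch: one stream resource, two independent playbacks.
    Ref<AudioStreamMPC> stream = memnew(AudioStreamMPC);
    stream->set_file("res://music/theme.mpc");                // hypothetical path
    Ref<AudioStreamPlayback> a = stream->instance_playback();
    Ref<AudioStreamPlayback> b = stream->instance_playback(); // separate decoder state
    a->play();                                                // b is unaffected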

+ 5814 - 5814
drivers/nedmalloc/malloc.c.h

@@ -1,5814 +1,5814 @@
-#ifdef NEDMALLOC_ENABLED
-/*
-  This is a version (aka dlmalloc) of malloc/free/realloc written by
-  Doug Lea and released to the public domain, as explained at
-  http://creativecommons.org/licenses/publicdomain.  Send questions,
-  comments, complaints, performance data, etc to [email protected]
-
-* Version 2.8.4 Wed May 27 09:56:23 2009  Doug Lea  (dl at gee)
-
-   Note: There may be an updated version of this malloc obtainable at
-           ftp://gee.cs.oswego.edu/pub/misc/malloc.c
-         Check before installing!
-
-* Quickstart
-
-  This library is all in one file to simplify the most common usage:
-  ftp it, compile it (-O3), and link it into another program. All of
-  the compile-time options default to reasonable values for use on
-  most platforms.  You might later want to step through various
-  compile-time and dynamic tuning options.
-
-  For convenience, an include file for code using this malloc is at:
-     ftp://gee.cs.oswego.edu/pub/misc/malloc-2.8.4.h
-  You don't really need this .h file unless you call functions not
-  defined in your system include files.  The .h file contains only the
-  excerpts from this file needed for using this malloc on ANSI C/C++
-  systems, so long as you haven't changed compile-time options about
-  naming and tuning parameters.  If you do, then you can create your
-  own malloc.h that does include all settings by cutting at the point
-  indicated below. Note that you may already by default be using a C
-  library containing a malloc that is based on some version of this
-  malloc (for example in linux). You might still want to use the one
-  in this file to customize settings or to avoid overheads associated
-  with library versions.
-
-* Vital statistics:
-
-  Supported pointer/size_t representation:       4 or 8 bytes
-       size_t MUST be an unsigned type of the same width as
-       pointers. (If you are using an ancient system that declares
-       size_t as a signed type, or need it to be a different width
-       than pointers, you can use a previous release of this malloc
-       (e.g. 2.7.2) supporting these.)
-
-  Alignment:                                     8 bytes (default)
-       This suffices for nearly all current machines and C compilers.
-       However, you can define MALLOC_ALIGNMENT to be wider than this
-       if necessary (up to 128bytes), at the expense of using more space.
-
-  Minimum overhead per allocated chunk:   4 or  8 bytes (if 4byte sizes)
-                                          8 or 16 bytes (if 8byte sizes)
-       Each malloced chunk has a hidden word of overhead holding size
-       and status information, and additional cross-check word
-       if FOOTERS is defined.
-
-  Minimum allocated size: 4-byte ptrs:  16 bytes    (including overhead)
-                          8-byte ptrs:  32 bytes    (including overhead)
-
-       Even a request for zero bytes (i.e., malloc(0)) returns a
-       pointer to something of the minimum allocatable size.
-       The maximum overhead wastage (i.e., number of extra bytes
-       allocated than were requested in malloc) is less than or equal
-       to the minimum size, except for requests >= mmap_threshold that
-       are serviced via mmap(), where the worst case wastage is about
-       32 bytes plus the remainder from a system page (the minimal
-       mmap unit); typically 4096 or 8192 bytes.
-
-  Security: static-safe; optionally more or less
-       The "security" of malloc refers to the ability of malicious
-       code to accentuate the effects of errors (for example, freeing
-       space that is not currently malloc'ed or overwriting past the
-       ends of chunks) in code that calls malloc.  This malloc
-       guarantees not to modify any memory locations below the base of
-       heap, i.e., static variables, even in the presence of usage
-       errors.  The routines additionally detect most improper frees
-       and reallocs.  All this holds as long as the static bookkeeping
-       for malloc itself is not corrupted by some other means.  This
-       is only one aspect of security -- these checks do not, and
-       cannot, detect all possible programming errors.
-
-       If FOOTERS is defined nonzero, then each allocated chunk
-       carries an additional check word to verify that it was malloced
-       from its space.  These check words are the same within each
-       execution of a program using malloc, but differ across
-       executions, so externally crafted fake chunks cannot be
-       freed. This improves security by rejecting frees/reallocs that
-       could corrupt heap memory, in addition to the checks preventing
-       writes to statics that are always on.  This may further improve
-       security at the expense of time and space overhead.  (Note that
-       FOOTERS may also be worth using with MSPACES.)
-
-       By default detected errors cause the program to abort (calling
-       "abort()"). You can override this to instead proceed past
-       errors by defining PROCEED_ON_ERROR.  In this case, a bad free
-       has no effect, and a malloc that encounters a bad address
-       caused by user overwrites will ignore the bad address by
-       dropping pointers and indices to all known memory. This may
-       be appropriate for programs that should continue if at all
-       possible in the face of programming errors, although they may
-       run out of memory because dropped memory is never reclaimed.
-
-       If you don't like either of these options, you can define
-       CORRUPTION_ERROR_ACTION and USAGE_ERROR_ACTION to do anything
-       else. And if if you are sure that your program using malloc has
-       no errors or vulnerabilities, you can define INSECURE to 1,
-       which might (or might not) provide a small performance improvement.
-
-  Thread-safety: NOT thread-safe unless USE_LOCKS defined
-       When USE_LOCKS is defined, each public call to malloc, free,
-       etc is surrounded with either a pthread mutex or a win32
-       spinlock (depending on WIN32). This is not especially fast, and
-       can be a major bottleneck.  It is designed only to provide
-       minimal protection in concurrent environments, and to provide a
-       basis for extensions.  If you are using malloc in a concurrent
-       program, consider instead using nedmalloc
-       (http://www.nedprod.com/programs/portable/nedmalloc/) or
-       ptmalloc (See http://www.malloc.de), which are derived
-       from versions of this malloc.
-
-  System requirements: Any combination of MORECORE and/or MMAP/MUNMAP
-       This malloc can use unix sbrk or any emulation (invoked using
-       the CALL_MORECORE macro) and/or mmap/munmap or any emulation
-       (invoked using CALL_MMAP/CALL_MUNMAP) to get and release system
-       memory.  On most unix systems, it tends to work best if both
-       MORECORE and MMAP are enabled.  On Win32, it uses emulations
-       based on VirtualAlloc. It also uses common C library functions
-       like memset.
-
-  Compliance: I believe it is compliant with the Single Unix Specification
-       (See http://www.unix.org). Also SVID/XPG, ANSI C, and probably
-       others as well.
-
-* Overview of algorithms
-
-  This is not the fastest, most space-conserving, most portable, or
-  most tunable malloc ever written. However it is among the fastest
-  while also being among the most space-conserving, portable and
-  tunable.  Consistent balance across these factors results in a good
-  general-purpose allocator for malloc-intensive programs.
-
-  In most ways, this malloc is a best-fit allocator. Generally, it
-  chooses the best-fitting existing chunk for a request, with ties
-  broken in approximately least-recently-used order. (This strategy
-  normally maintains low fragmentation.) However, for requests less
-  than 256bytes, it deviates from best-fit when there is not an
-  exactly fitting available chunk by preferring to use space adjacent
-  to that used for the previous small request, as well as by breaking
-  ties in approximately most-recently-used order. (These enhance
-  locality of series of small allocations.)  And for very large requests
-  (>= 256Kb by default), it relies on system memory mapping
-  facilities, if supported.  (This helps avoid carrying around and
-  possibly fragmenting memory used only for large chunks.)
-
-  All operations (except malloc_stats and mallinfo) have execution
-  times that are bounded by a constant factor of the number of bits in
-  a size_t, not counting any clearing in calloc or copying in realloc,
-  or actions surrounding MORECORE and MMAP that have times
-  proportional to the number of non-contiguous regions returned by
-  system allocation routines, which is often just 1. In real-time
-  applications, you can optionally suppress segment traversals using
-  NO_SEGMENT_TRAVERSAL, which assures bounded execution even when
-  system allocators return non-contiguous spaces, at the typical
-  expense of carrying around more memory and increased fragmentation.
-
-  The implementation is not very modular and seriously overuses
-  macros. Perhaps someday all C compilers will do as good a job
-  inlining modular code as can now be done by brute-force expansion,
-  but now, enough of them seem not to.
-
-  Some compilers issue a lot of warnings about code that is
-  dead/unreachable only on some platforms, and also about intentional
-  uses of negation on unsigned types. All known cases of each can be
-  ignored.
-
-  For a longer but out of date high-level description, see
-     http://gee.cs.oswego.edu/dl/html/malloc.html
-
-* MSPACES
-  If MSPACES is defined, then in addition to malloc, free, etc.,
-  this file also defines mspace_malloc, mspace_free, etc. These
-  are versions of malloc routines that take an "mspace" argument
-  obtained using create_mspace, to control all internal bookkeeping.
-  If ONLY_MSPACES is defined, only these versions are compiled.
-  So if you would like to use this allocator for only some allocations,
-  and your system malloc for others, you can compile with
-  ONLY_MSPACES and then do something like...
-    static mspace mymspace = create_mspace(0,0); // for example
-    #define mymalloc(bytes)  mspace_malloc(mymspace, bytes)
-
-  (Note: If you only need one instance of an mspace, you can instead
-  use "USE_DL_PREFIX" to relabel the global malloc.)
-
-  You can similarly create thread-local allocators by storing
-  mspaces as thread-locals. For example:
-    static __thread mspace tlms = 0;
-    void*  tlmalloc(size_t bytes) {
-      if (tlms == 0) tlms = create_mspace(0, 0);
-      return mspace_malloc(tlms, bytes);
-    }
-    void  tlfree(void* mem) { mspace_free(tlms, mem); }
-
-  Unless FOOTERS is defined, each mspace is completely independent.
-  You cannot allocate from one and free to another (although
-  conformance is only weakly checked, so usage errors are not always
-  caught). If FOOTERS is defined, then each chunk carries around a tag
-  indicating its originating mspace, and frees are directed to their
-  originating spaces.
-
- -------------------------  Compile-time options ---------------------------
-
-Be careful in setting #define values for numerical constants of type
-size_t. On some systems, literal values are not automatically extended
-to size_t precision unless they are explicitly casted. You can also
-use the symbolic values MAX_SIZE_T, SIZE_T_ONE, etc below.
-
-WIN32                    default: defined if _WIN32 defined
-  Defining WIN32 sets up defaults for MS environment and compilers.
-  Otherwise defaults are for unix. Beware that there seem to be some
-  cases where this malloc might not be a pure drop-in replacement for
-  Win32 malloc: Random-looking failures from Win32 GDI API's (eg;
-  SetDIBits()) may be due to bugs in some video driver implementations
-  when pixel buffers are malloc()ed, and the region spans more than
-  one VirtualAlloc()ed region. Because dlmalloc uses a small (64Kb)
-  default granularity, pixel buffers may straddle virtual allocation
-  regions more often than when using the Microsoft allocator.  You can
-  avoid this by using VirtualAlloc() and VirtualFree() for all pixel
-  buffers rather than using malloc().  If this is not possible,
-  recompile this malloc with a larger DEFAULT_GRANULARITY.
-
-MALLOC_ALIGNMENT         default: (size_t)8
-  Controls the minimum alignment for malloc'ed chunks.  It must be a
-  power of two and at least 8, even on machines for which smaller
-  alignments would suffice. It may be defined as larger than this
-  though. Note however that code and data structures are optimized for
-  the case of 8-byte alignment.
-
-MSPACES                  default: 0 (false)
-  If true, compile in support for independent allocation spaces.
-  This is only supported if HAVE_MMAP is true.
-
-ONLY_MSPACES             default: 0 (false)
-  If true, only compile in mspace versions, not regular versions.
-
-USE_LOCKS                default: 0 (false)
-  Causes each call to each public routine to be surrounded with
-  pthread or WIN32 mutex lock/unlock. (If set true, this can be
-  overridden on a per-mspace basis for mspace versions.) If set to a
-  non-zero value other than 1, locks are used, but their
-  implementation is left out, so lock functions must be supplied manually,
-  as described below.
-
-USE_SPIN_LOCKS           default: 1 iff USE_LOCKS and on x86 using gcc or MSC
-  If true, uses custom spin locks for locking. This is currently
-  supported only for x86 platforms using gcc or recent MS compilers.
-  Otherwise, posix locks or win32 critical sections are used.
-
-FOOTERS                  default: 0
-  If true, provide extra checking and dispatching by placing
-  information in the footers of allocated chunks. This adds
-  space and time overhead.
-
-INSECURE                 default: 0
-  If true, omit checks for usage errors and heap space overwrites.
-
-USE_DL_PREFIX            default: NOT defined
-  Causes compiler to prefix all public routines with the string 'dl'.
-  This can be useful when you only want to use this malloc in one part
-  of a program, using your regular system malloc elsewhere.
-
-ABORT                    default: defined as abort()
-  Defines how to abort on failed checks.  On most systems, a failed
-  check cannot die with an "assert" or even print an informative
-  message, because the underlying print routines in turn call malloc,
-  which will fail again.  Generally, the best policy is to simply call
-  abort(). It's not very useful to do more than this because many
-  errors due to overwriting will show up as address faults (null, odd
-  addresses etc) rather than malloc-triggered checks, so will also
-  abort.  Also, most compilers know that abort() does not return, so
-  can better optimize code conditionally calling it.
-
-PROCEED_ON_ERROR           default: defined as 0 (false)
-  Controls whether detected bad addresses cause them to bypassed
-  rather than aborting. If set, detected bad arguments to free and
-  realloc are ignored. And all bookkeeping information is zeroed out
-  upon a detected overwrite of freed heap space, thus losing the
-  ability to ever return it from malloc again, but enabling the
-  application to proceed. If PROCEED_ON_ERROR is defined, the
-  static variable malloc_corruption_error_count is compiled in
-  and can be examined to see if errors have occurred. This option
-  generates slower code than the default abort policy.
-
-DEBUG                    default: NOT defined
-  The DEBUG setting is mainly intended for people trying to modify
-  this code or diagnose problems when porting to new platforms.
-  However, it may also be able to better isolate user errors than just
-  using runtime checks.  The assertions in the check routines spell
-  out in more detail the assumptions and invariants underlying the
-  algorithms.  The checking is fairly extensive, and will slow down
-  execution noticeably. Calling malloc_stats or mallinfo with DEBUG
-  set will attempt to check every non-mmapped allocated and free chunk
-  in the course of computing the summaries.
-
-ABORT_ON_ASSERT_FAILURE   default: defined as 1 (true)
-  Debugging assertion failures can be nearly impossible if your
-  version of the assert macro causes malloc to be called, which will
-  lead to a cascade of further failures, blowing the runtime stack.
-  ABORT_ON_ASSERT_FAILURE cause assertions failures to call abort(),
-  which will usually make debugging easier.
-
-MALLOC_FAILURE_ACTION     default: sets errno to ENOMEM, or no-op on win32
-  The action to take before "return 0" when malloc fails to be able to
-  return memory because there is none available.
-
-HAVE_MORECORE             default: 1 (true) unless win32 or ONLY_MSPACES
-  True if this system supports sbrk or an emulation of it.
-
-MORECORE                  default: sbrk
-  The name of the sbrk-style system routine to call to obtain more
-  memory.  See below for guidance on writing custom MORECORE
-  functions. The type of the argument to sbrk/MORECORE varies across
-  systems.  It cannot be size_t, because it supports negative
-  arguments, so it is normally the signed type of the same width as
-  size_t (sometimes declared as "intptr_t").  It doesn't much matter
-  though. Internally, we only call it with arguments less than half
-  the max value of a size_t, which should work across all reasonable
-  possibilities, although sometimes generating compiler warnings.
-
-MORECORE_CONTIGUOUS       default: 1 (true) if HAVE_MORECORE
-  If true, take advantage of fact that consecutive calls to MORECORE
-  with positive arguments always return contiguous increasing
-  addresses.  This is true of unix sbrk. It does not hurt too much to
-  set it true anyway, since malloc copes with non-contiguities.
-  Setting it false when definitely non-contiguous saves time
-  and possibly wasted space it would take to discover this though.
-
-MORECORE_CANNOT_TRIM      default: NOT defined
-  True if MORECORE cannot release space back to the system when given
-  negative arguments. This is generally necessary only if you are
-  using a hand-crafted MORECORE function that cannot handle negative
-  arguments.
-
-NO_SEGMENT_TRAVERSAL       default: 0
-  If non-zero, suppresses traversals of memory segments
-  returned by either MORECORE or CALL_MMAP. This disables
-  merging of segments that are contiguous, and selectively
-  releasing them to the OS if unused, but bounds execution times.
-
-HAVE_MMAP                 default: 1 (true)
-  True if this system supports mmap or an emulation of it.  If so, and
-  HAVE_MORECORE is not true, MMAP is used for all system
-  allocation. If set and HAVE_MORECORE is true as well, MMAP is
-  primarily used to directly allocate very large blocks. It is also
-  used as a backup strategy in cases where MORECORE fails to provide
-  space from system. Note: A single call to MUNMAP is assumed to be
-  able to unmap memory that may have be allocated using multiple calls
-  to MMAP, so long as they are adjacent.
-
-HAVE_MREMAP               default: 1 on linux, else 0
-  If true realloc() uses mremap() to re-allocate large blocks and
-  extend or shrink allocation spaces.
-
-MMAP_CLEARS               default: 1 except on WINCE.
-  True if mmap clears memory so calloc doesn't need to. This is true
-  for standard unix mmap using /dev/zero and on WIN32 except for WINCE.
-
-USE_BUILTIN_FFS            default: 0 (i.e., not used)
-  Causes malloc to use the builtin ffs() function to compute indices.
-  Some compilers may recognize and intrinsify ffs to be faster than the
-  supplied C version. Also, the case of x86 using gcc is special-cased
-  to an asm instruction, so is already as fast as it can be, and so
-  this setting has no effect. Similarly for Win32 under recent MS compilers.
-  (On most x86s, the asm version is only slightly faster than the C version.)
-
-malloc_getpagesize         default: derive from system includes, or 4096.
-  The system page size. To the extent possible, this malloc manages
-  memory from the system in page-size units.  This may be (and
-  usually is) a function rather than a constant. This is ignored
-  if WIN32, where page size is determined using getSystemInfo during
-  initialization. This may be several megabytes if ENABLE_LARGE_PAGES
-  is enabled.
-
-ENABLE_LARGE_PAGES         default: NOT defined
-  Causes the system page size to be the value of GetLargePageMinimum()
-  if that function is available (Windows Server 2003/Vista or later).
-  This allows the use of large page entries in the MMU which can
-  significantly improve performance in large working set applications
-  as TLB cache load is reduced by a factor of three. Note that enabling
-  this option is equal to locking the process' memory in current
-  implementations of Windows and requires the SE_LOCK_MEMORY_PRIVILEGE
-  to be held by the process in order to succeed.
-
-USE_DEV_RANDOM             default: 0 (i.e., not used)
-  Causes malloc to use /dev/random to initialize secure magic seed for
-  stamping footers. Otherwise, the current time is used.
-
-NO_MALLINFO                default: 0
-  If defined, don't compile "mallinfo". This can be a simple way
-  of dealing with mismatches between system declarations and
-  those in this file.
-
-MALLINFO_FIELD_TYPE        default: size_t
-  The type of the fields in the mallinfo struct. This was originally
-  defined as "int" in SVID etc, but is more usefully defined as
-  size_t. The value is used only if  HAVE_USR_INCLUDE_MALLOC_H is not set
-
-REALLOC_ZERO_BYTES_FREES    default: not defined
-  This should be set if a call to realloc with zero bytes should
-  be the same as a call to free. Some people think it should. Otherwise,
-  since this malloc returns a unique pointer for malloc(0), so does
-  realloc(p, 0).
-
-LACKS_UNISTD_H, LACKS_FCNTL_H, LACKS_SYS_PARAM_H, LACKS_SYS_MMAN_H
-LACKS_STRINGS_H, LACKS_STRING_H, LACKS_SYS_TYPES_H,  LACKS_ERRNO_H
-LACKS_STDLIB_H                default: NOT defined unless on WIN32
-  Define these if your system does not have these header files.
-  You might need to manually insert some of the declarations they provide.
-
-DEFAULT_GRANULARITY        default: page size if MORECORE_CONTIGUOUS,
-                                system_info.dwAllocationGranularity in WIN32,
-                                GetLargePageMinimum() if ENABLE_LARGE_PAGES,
-                                otherwise 64K.
-      Also settable using mallopt(M_GRANULARITY, x)
-  The unit for allocating and deallocating memory from the system.  On
-  most systems with contiguous MORECORE, there is no reason to
-  make this more than a page. However, systems with MMAP tend to
-  either require or encourage larger granularities.  You can increase
-  this value to prevent system allocation functions to be called so
-  often, especially if they are slow.  The value must be at least one
-  page and must be a power of two.  Setting to 0 causes initialization
-  to either page size or win32 region size.  (Note: In previous
-  versions of malloc, the equivalent of this option was called
-  "TOP_PAD")
-
-DEFAULT_GRANULARITY_ALIGNED default: undefined (which means page size)
-  Whether to enforce alignment when allocating and deallocating memory
-  from the system i.e. the base address of all allocations will be
-  aligned to DEFAULT_GRANULARITY if it is set. Note that enabling this carries
-  some overhead as multiple calls must now be made when probing for a valid
-  aligned value, however it does greatly ease the checking for whether
-  a given memory pointer was allocated by this allocator rather than
-  some other.
-
-DEFAULT_TRIM_THRESHOLD    default: 2MB
-      Also settable using mallopt(M_TRIM_THRESHOLD, x)
-  The maximum amount of unused top-most memory to keep before
-  releasing via malloc_trim in free().  Automatic trimming is mainly
-  useful in long-lived programs using contiguous MORECORE.  Because
-  trimming via sbrk can be slow on some systems, and can sometimes be
-  wasteful (in cases where programs immediately afterward allocate
-  more large chunks) the value should be high enough so that your
-  overall system performance would improve by releasing this much
-  memory.  As a rough guide, you might set to a value close to the
-  average size of a process (program) running on your system.
-  Releasing this much memory would allow such a process to run in
-  memory.  Generally, it is worth tuning trim thresholds when a
-  program undergoes phases where several large chunks are allocated
-  and released in ways that can reuse each other's storage, perhaps
-  mixed with phases where there are no such chunks at all. The trim
-  value must be greater than page size to have any useful effect.  To
-  disable trimming completely, you can set to MAX_SIZE_T. Note that the trick
-  some people use of mallocing a huge space and then freeing it at
-  program startup, in an attempt to reserve system memory, doesn't
-  have the intended effect under automatic trimming, since that memory
-  will immediately be returned to the system.
-
-DEFAULT_MMAP_THRESHOLD       default: 256K
-      Also settable using mallopt(M_MMAP_THRESHOLD, x)
-  The request size threshold for using MMAP to directly service a
-  request. Requests of at least this size that cannot be allocated
-  using already-existing space will be serviced via mmap.  (If enough
-  normal freed space already exists it is used instead.)  Using mmap
-  segregates relatively large chunks of memory so that they can be
-  individually obtained and released from the host system. A request
-  serviced through mmap is never reused by any other request (at least
-  not directly; the system may just so happen to remap successive
-  requests to the same locations).  Segregating space in this way has
-  the benefits that: Mmapped space can always be individually released
-  back to the system, which helps keep the system level memory demands
-  of a long-lived program low.  Also, mapped memory doesn't become
-  `locked' between other chunks, as can happen with normally allocated
-  chunks, which means that even trimming via malloc_trim would not
-  release them.  However, it has the disadvantage that the space
-  cannot be reclaimed, consolidated, and then used to service later
-  requests, as happens with normal chunks.  The advantages of mmap
-  nearly always outweigh disadvantages for "large" chunks, but the
-  value of "large" may vary across systems.  The default is an
-  empirically derived value that works well in most systems. You can
-  disable mmap by setting to MAX_SIZE_T.
-
-MAX_RELEASE_CHECK_RATE   default: 4095 unless not HAVE_MMAP
-  The number of consolidated frees between checks to release
-  unused segments when freeing. When using non-contiguous segments,
-  especially with multiple mspaces, checking only for topmost space
-  doesn't always suffice to trigger trimming. To compensate for this,
-  free() will, with a period of MAX_RELEASE_CHECK_RATE (or the
-  current number of segments, if greater) try to release unused
-  segments to the OS when freeing chunks that result in
-  consolidation. The best value for this parameter is a compromise
-  between slowing down frees with relatively costly checks that
-  rarely trigger versus holding on to unused memory. To effectively
-  disable, set to MAX_SIZE_T. This may lead to a very slight speed
-  improvement at the expense of carrying around more memory.
-*/
-
-/* Version identifier to allow people to support multiple versions */
-#ifndef DLMALLOC_VERSION
-#define DLMALLOC_VERSION 20804
-#endif /* DLMALLOC_VERSION */
-
-#ifndef WIN32
-#ifdef _WIN32
-#define WIN32 1
-#endif  /* _WIN32 */
-#ifdef _WIN32_WCE
-#define LACKS_FCNTL_H
-#define WIN32 1
-#endif /* _WIN32_WCE */
-#endif  /* WIN32 */
-#ifdef WIN32
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-#include <tchar.h>
-#define HAVE_MMAP 1
-#define HAVE_MORECORE 0
-#define LACKS_UNISTD_H
-#define LACKS_SYS_PARAM_H
-#define LACKS_SYS_MMAN_H
-#define LACKS_STRING_H
-#define LACKS_STRINGS_H
-#define LACKS_SYS_TYPES_H
-#define LACKS_ERRNO_H
-#ifndef MALLOC_FAILURE_ACTION
-#define MALLOC_FAILURE_ACTION
-#endif /* MALLOC_FAILURE_ACTION */
-#ifdef _WIN32_WCE /* WINCE reportedly does not clear */
-#define MMAP_CLEARS 0
-#else
-#define MMAP_CLEARS 1
-#endif /* _WIN32_WCE */
-#endif  /* WIN32 */
-
-#if defined(DARWIN) || defined(_DARWIN)
-/* Mac OSX docs advise not to use sbrk; it seems better to use mmap */
-#ifndef HAVE_MORECORE
-#define HAVE_MORECORE 0
-#define HAVE_MMAP 1
-/* OSX allocators provide 16 byte alignment */
-#ifndef MALLOC_ALIGNMENT
-#define MALLOC_ALIGNMENT ((size_t)16U)
-#endif
-#endif  /* HAVE_MORECORE */
-#endif  /* DARWIN */
-
-#ifndef LACKS_SYS_TYPES_H
-#include <sys/types.h>  /* For size_t */
-#endif  /* LACKS_SYS_TYPES_H */
-
-#if (defined(__GNUC__) && ((defined(__i386__) || defined(__x86_64__)))) || (defined(_MSC_VER) && _MSC_VER>=1310)
-#define SPIN_LOCKS_AVAILABLE 1
-#else
-#define SPIN_LOCKS_AVAILABLE 0
-#endif
-
-/* The maximum possible size_t value has all bits set */
-#define MAX_SIZE_T           (~(size_t)0)
-
-#ifndef ONLY_MSPACES
-#define ONLY_MSPACES 0     /* define to a value */
-#else
-#define ONLY_MSPACES 1
-#endif  /* ONLY_MSPACES */
-#ifndef MSPACES
-#if ONLY_MSPACES
-#define MSPACES 1
-#else   /* ONLY_MSPACES */
-#define MSPACES 0
-#endif  /* ONLY_MSPACES */
-#endif  /* MSPACES */
-#ifndef MALLOC_ALIGNMENT
-#define MALLOC_ALIGNMENT ((size_t)8U)
-#endif  /* MALLOC_ALIGNMENT */
-#ifndef FOOTERS
-#define FOOTERS 0
-#endif  /* FOOTERS */
-#ifndef ABORT
-#define ABORT  abort()
-#endif  /* ABORT */
-#ifndef ABORT_ON_ASSERT_FAILURE
-#define ABORT_ON_ASSERT_FAILURE 1
-#endif  /* ABORT_ON_ASSERT_FAILURE */
-#ifndef PROCEED_ON_ERROR
-#define PROCEED_ON_ERROR 0
-#endif  /* PROCEED_ON_ERROR */
-#ifndef USE_LOCKS
-#define USE_LOCKS 0
-#endif  /* USE_LOCKS */
-#ifndef USE_SPIN_LOCKS
-#if USE_LOCKS && SPIN_LOCKS_AVAILABLE
-#define USE_SPIN_LOCKS 1
-#else
-#define USE_SPIN_LOCKS 0
-#endif /* USE_LOCKS && SPIN_LOCKS_AVAILABLE. */
-#endif /* USE_SPIN_LOCKS */
-#ifndef INSECURE
-#define INSECURE 0
-#endif  /* INSECURE */
-#ifndef HAVE_MMAP
-#define HAVE_MMAP 1
-#endif  /* HAVE_MMAP */
-#ifndef MMAP_CLEARS
-#define MMAP_CLEARS 1
-#endif  /* MMAP_CLEARS */
-#ifndef HAVE_MREMAP
-#ifdef linux
-#define HAVE_MREMAP 1
-#else   /* linux */
-#define HAVE_MREMAP 0
-#endif  /* linux */
-#endif  /* HAVE_MREMAP */
-#ifndef MALLOC_FAILURE_ACTION
-#define MALLOC_FAILURE_ACTION  errno = ENOMEM;
-#endif  /* MALLOC_FAILURE_ACTION */
-#ifndef HAVE_MORECORE
-#if ONLY_MSPACES
-#define HAVE_MORECORE 0
-#else   /* ONLY_MSPACES */
-#define HAVE_MORECORE 1
-#endif  /* ONLY_MSPACES */
-#endif  /* HAVE_MORECORE */
-#if !HAVE_MORECORE
-#define MORECORE_CONTIGUOUS 0
-#else   /* !HAVE_MORECORE */
-#define MORECORE_DEFAULT sbrk
-#ifndef MORECORE_CONTIGUOUS
-#define MORECORE_CONTIGUOUS 1
-#endif  /* MORECORE_CONTIGUOUS */
-#endif  /* HAVE_MORECORE */
-#ifndef DEFAULT_GRANULARITY
-#if (MORECORE_CONTIGUOUS || defined(WIN32))
-#define DEFAULT_GRANULARITY (0)  /* 0 means to compute in init_mparams */
-#else   /* MORECORE_CONTIGUOUS */
-#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U)
-#endif  /* MORECORE_CONTIGUOUS */
-#endif  /* DEFAULT_GRANULARITY */
-#ifndef DEFAULT_TRIM_THRESHOLD
-#ifndef MORECORE_CANNOT_TRIM
-#define DEFAULT_TRIM_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U)
-#else   /* MORECORE_CANNOT_TRIM */
-#define DEFAULT_TRIM_THRESHOLD MAX_SIZE_T
-#endif  /* MORECORE_CANNOT_TRIM */
-#endif  /* DEFAULT_TRIM_THRESHOLD */
-#ifndef DEFAULT_MMAP_THRESHOLD
-#if HAVE_MMAP
-#define DEFAULT_MMAP_THRESHOLD ((size_t)256U * (size_t)1024U)
-#else   /* HAVE_MMAP */
-#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T
-#endif  /* HAVE_MMAP */
-#endif  /* DEFAULT_MMAP_THRESHOLD */
-#ifndef MAX_RELEASE_CHECK_RATE
-#if HAVE_MMAP
-#define MAX_RELEASE_CHECK_RATE 4095
-#else
-#define MAX_RELEASE_CHECK_RATE MAX_SIZE_T
-#endif /* HAVE_MMAP */
-#endif /* MAX_RELEASE_CHECK_RATE */
-#ifndef USE_BUILTIN_FFS
-#define USE_BUILTIN_FFS 0
-#endif  /* USE_BUILTIN_FFS */
-#ifndef USE_DEV_RANDOM
-#define USE_DEV_RANDOM 0
-#endif  /* USE_DEV_RANDOM */
-#ifndef NO_MALLINFO
-#define NO_MALLINFO 0
-#endif  /* NO_MALLINFO */
-#ifndef MALLINFO_FIELD_TYPE
-#define MALLINFO_FIELD_TYPE size_t
-#endif  /* MALLINFO_FIELD_TYPE */
-#ifndef NO_SEGMENT_TRAVERSAL
-#define NO_SEGMENT_TRAVERSAL 0
-#endif /* NO_SEGMENT_TRAVERSAL */
-
-/*
-  mallopt tuning options.  SVID/XPG defines four standard parameter
-  numbers for mallopt, normally defined in malloc.h.  None of these
-  are used in this malloc, so setting them has no effect. But this
-  malloc does support the following options.
-*/
-
-#define M_TRIM_THRESHOLD     (-1)
-#define M_GRANULARITY        (-2)
-#define M_MMAP_THRESHOLD     (-3)
-
-/* ------------------------ Mallinfo declarations ------------------------ */
-
-#if !NO_MALLINFO
-/*
-  This version of malloc supports the standard SVID/XPG mallinfo
-  routine that returns a struct containing usage properties and
-  statistics. It should work on any system that has a
-  /usr/include/malloc.h defining struct mallinfo.  The main
-  declaration needed is the mallinfo struct that is returned (by-copy)
-  by mallinfo().  The malloinfo struct contains a bunch of fields that
-  are not even meaningful in this version of malloc.  These fields are
-  are instead filled by mallinfo() with other numbers that might be of
-  interest.
-
-  HAVE_USR_INCLUDE_MALLOC_H should be set if you have a
-  /usr/include/malloc.h file that includes a declaration of struct
-  mallinfo.  If so, it is included; else a compliant version is
-  declared below.  These must be precisely the same for mallinfo() to
-  work.  The original SVID version of this struct, defined on most
-  systems with mallinfo, declares all fields as ints. But some others
-  define as unsigned long. If your system defines the fields using a
-  type of different width than listed here, you MUST #include your
-  system version and #define HAVE_USR_INCLUDE_MALLOC_H.
-*/
-
-/* #define HAVE_USR_INCLUDE_MALLOC_H */
-
-#ifdef HAVE_USR_INCLUDE_MALLOC_H
-#include "/usr/include/malloc.h"
-#else /* HAVE_USR_INCLUDE_MALLOC_H */
-#ifndef STRUCT_MALLINFO_DECLARED
-#define STRUCT_MALLINFO_DECLARED 1
-struct mallinfo {
-  MALLINFO_FIELD_TYPE arena;    /* non-mmapped space allocated from system */
-  MALLINFO_FIELD_TYPE ordblks;  /* number of free chunks */
-  MALLINFO_FIELD_TYPE smblks;   /* always 0 */
-  MALLINFO_FIELD_TYPE hblks;    /* always 0 */
-  MALLINFO_FIELD_TYPE hblkhd;   /* space in mmapped regions */
-  MALLINFO_FIELD_TYPE usmblks;  /* maximum total allocated space */
-  MALLINFO_FIELD_TYPE fsmblks;  /* always 0 */
-  MALLINFO_FIELD_TYPE uordblks; /* total allocated space */
-  MALLINFO_FIELD_TYPE fordblks; /* total free space */
-  MALLINFO_FIELD_TYPE keepcost; /* releasable (via malloc_trim) space */
-};
-#endif /* STRUCT_MALLINFO_DECLARED */
-#endif /* HAVE_USR_INCLUDE_MALLOC_H */
-#endif /* NO_MALLINFO */
-
-/*
-  Try to persuade compilers to inline. The most critical functions for
-  inlining are defined as macros, so these aren't used for them.
-*/
-
-#ifndef FORCEINLINE
-  #if defined(__GNUC__)
-#define FORCEINLINE __inline __attribute__ ((always_inline))
-  #elif defined(_MSC_VER)
-    #define FORCEINLINE __forceinline
-  #endif
-#endif
-#ifndef NOINLINE
-  #if defined(__GNUC__)
-    #define NOINLINE __attribute__ ((noinline))
-  #elif defined(_MSC_VER)
-    #define NOINLINE __declspec(noinline)
-  #else
-    #define NOINLINE
-  #endif
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#ifndef FORCEINLINE
- #define FORCEINLINE inline
-#endif
-#endif /* __cplusplus */
-#ifndef FORCEINLINE
- #define FORCEINLINE
-#endif
-
-#if !ONLY_MSPACES
-
-/* ------------------- Declarations of public routines ------------------- */
-
-#ifndef USE_DL_PREFIX
-#define dlcalloc               calloc
-#define dlfree                 free
-#define dlmalloc               malloc
-#define dlmemalign             memalign
-#define dlrealloc              realloc
-#define dlvalloc               valloc
-#define dlpvalloc              pvalloc
-#define dlmallinfo             mallinfo
-#define dlmallopt              mallopt
-#define dlmalloc_trim          malloc_trim
-#define dlmalloc_stats         malloc_stats
-#define dlmalloc_usable_size   malloc_usable_size
-#define dlmalloc_footprint     malloc_footprint
-#define dlmalloc_max_footprint malloc_max_footprint
-#define dlindependent_calloc   independent_calloc
-#define dlindependent_comalloc independent_comalloc
-#endif /* USE_DL_PREFIX */
-
-
-/*
-  malloc(size_t n)
-  Returns a pointer to a newly allocated chunk of at least n bytes, or
-  null if no space is available, in which case errno is set to ENOMEM
-  on ANSI C systems.
-
-  If n is zero, malloc returns a minimum-sized chunk. (The minimum
-  size is 16 bytes on most 32bit systems, and 32 bytes on 64bit
-  systems.)  Note that size_t is an unsigned type, so calls with
-  arguments that would be negative if signed are interpreted as
-  requests for huge amounts of space, which will often fail. The
-  maximum supported value of n differs across systems, but is in all
-  cases less than the maximum representable value of a size_t.
-*/
-void* dlmalloc(size_t);
-
-/*
-  free(void* p)
-  Releases the chunk of memory pointed to by p, that had been previously
-  allocated using malloc or a related routine such as realloc.
-  It has no effect if p is null. If p was not malloced or already
-  freed, free(p) will by default cause the current program to abort.
-*/
-void  dlfree(void*);
-
-/*
-  calloc(size_t n_elements, size_t element_size);
-  Returns a pointer to n_elements * element_size bytes, with all locations
-  set to zero.
-*/
-void* dlcalloc(size_t, size_t);
-
-/*
-  realloc(void* p, size_t n)
-  Returns a pointer to a chunk of size n that contains the same data
-  as does chunk p up to the minimum of (n, p's size) bytes, or null
-  if no space is available.
-
-  The returned pointer may or may not be the same as p. The algorithm
-  prefers extending p in most cases when possible, otherwise it
-  employs the equivalent of a malloc-copy-free sequence.
-
-  If p is null, realloc is equivalent to malloc.
-
-  If space is not available, realloc returns null, errno is set (if on
-  ANSI) and p is NOT freed.
-
-  if n is for fewer bytes than already held by p, the newly unused
-  space is lopped off and freed if possible.  realloc with a size
-  argument of zero (re)allocates a minimum-sized chunk.
-
-  The old unix realloc convention of allowing the last-free'd chunk
-  to be used as an argument to realloc is not supported.
-*/
-
-void* dlrealloc(void*, size_t);
-
-/*
-  memalign(size_t alignment, size_t n);
-  Returns a pointer to a newly allocated chunk of n bytes, aligned
-  in accord with the alignment argument.
-
-  The alignment argument should be a power of two. If the argument is
-  not a power of two, the nearest greater power is used.
-  8-byte alignment is guaranteed by normal malloc calls, so don't
-  bother calling memalign with an argument of 8 or less.
-
-  Overreliance on memalign is a sure way to fragment space.
-*/
-void* dlmemalign(size_t, size_t);
-
-/*
-  valloc(size_t n);
-  Equivalent to memalign(pagesize, n), where pagesize is the page
-  size of the system. If the pagesize is unknown, 4096 is used.
-*/
-void* dlvalloc(size_t);
-
-/*
-  mallopt(int parameter_number, int parameter_value)
-  Sets tunable parameters The format is to provide a
-  (parameter-number, parameter-value) pair.  mallopt then sets the
-  corresponding parameter to the argument value if it can (i.e., so
-  long as the value is meaningful), and returns 1 if successful else
-  0.  To workaround the fact that mallopt is specified to use int,
-  not size_t parameters, the value -1 is specially treated as the
-  maximum unsigned size_t value.
-
-  SVID/XPG/ANSI defines four standard param numbers for mallopt,
-  normally defined in malloc.h.  None of these are use in this malloc,
-  so setting them has no effect. But this malloc also supports other
-  options in mallopt. See below for details.  Briefly, supported
-  parameters are as follows (listed defaults are for "typical"
-  configurations).
-
-  Symbol            param #  default    allowed param values
-  M_TRIM_THRESHOLD     -1   2*1024*1024   any   (-1 disables)
-  M_GRANULARITY        -2     page size   any power of 2 >= page size
-  M_MMAP_THRESHOLD     -3      256*1024   any   (or 0 if no MMAP support)
-*/
-int dlmallopt(int, int);
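-
-/*
-  For example, an illustrative sketch using the parameter numbers
-  tabulated above: disable trimming entirely and lower the mmap
-  threshold so that requests of 64k and up use mmap directly.
-
-  dlmallopt(M_TRIM_THRESHOLD, -1);      // -1 stands for maximum size_t
-  dlmallopt(M_MMAP_THRESHOLD, 64*1024);
-*/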
-
-/*
-  malloc_footprint();
-  Returns the number of bytes obtained from the system.  The total
-  number of bytes allocated by malloc, realloc etc., is less than this
-  value. Unlike mallinfo, this function returns only a precomputed
-  result, so can be called frequently to monitor memory consumption.
-  Even if locks are otherwise defined, this function does not use them,
-  so results might not be up to date.
-*/
-size_t dlmalloc_footprint(void);
-
-/*
-  malloc_max_footprint();
-  Returns the maximum number of bytes obtained from the system. This
-  value will be greater than current footprint if deallocated space
-  has been reclaimed by the system. The peak number of bytes allocated
-  by malloc, realloc etc., is less than this value. Unlike mallinfo,
-  this function returns only a precomputed result, so can be called
-  frequently to monitor memory consumption.  Even if locks are
-  otherwise defined, this function does not use them, so results might
-  not be up to date.
-*/
-size_t dlmalloc_max_footprint(void);
-
-#if !NO_MALLINFO
-/*
-  mallinfo()
-  Returns (by copy) a struct containing various summary statistics:
-
-  arena:     current total non-mmapped bytes allocated from system
-  ordblks:   the number of free chunks
-  smblks:    always zero.
-  hblks:     current number of mmapped regions
-  hblkhd:    total bytes held in mmapped regions
-  usmblks:   the maximum total allocated space. This will be greater
-                than current total if trimming has occurred.
-  fsmblks:   always zero
-  uordblks:  current total allocated space (normal or mmapped)
-  fordblks:  total free space
-  keepcost:  the maximum number of bytes that could ideally be released
-               back to system via malloc_trim. ("ideally" means that
-               it ignores page restrictions etc.)
-
-  Because these fields are ints, but internal bookkeeping may
-  be kept as longs, the reported values may wrap around zero and
-  thus be inaccurate.
-*/
-struct mallinfo dlmallinfo(void);
-#endif /* NO_MALLINFO */
-
-/*
-  independent_calloc(size_t n_elements, size_t element_size, void* chunks[]);
-
-  independent_calloc is similar to calloc, but instead of returning a
-  single cleared space, it returns an array of pointers to n_elements
-  independent elements that can hold contents of size element_size, each
-  of which starts out cleared, and can be independently freed,
-  realloc'ed etc. The elements are guaranteed to be adjacently
-  allocated (this is not guaranteed to occur with multiple callocs or
-  mallocs), which may also improve cache locality in some
-  applications.
-
-  The "chunks" argument is optional (i.e., may be null, which is
-  probably the most typical usage). If it is null, the returned array
-  is itself dynamically allocated and should also be freed when it is
-  no longer needed. Otherwise, the chunks array must be of at least
-  n_elements in length. It is filled in with the pointers to the
-  chunks.
-
-  In either case, independent_calloc returns this pointer array, or
-  null if the allocation failed.  If n_elements is zero and "chunks"
-  is null, it returns a chunk representing an array with zero elements
-  (which should be freed if not wanted).
-
-  Each element must be individually freed when it is no longer
-  needed. If you'd like to instead be able to free all at once, you
-  should instead use regular calloc and assign pointers into this
-  space to represent elements.  (In this case though, you cannot
-  independently free elements.)
-
-  independent_calloc simplifies and speeds up implementations of many
-  kinds of pools.  It may also be useful when constructing large data
-  structures that initially have a fixed number of fixed-sized nodes,
-  but the number is not known at compile time, and some of the nodes
-  may later need to be freed. For example:
-
-  struct Node { int item; struct Node* next; };
-
-  struct Node* build_list() {
-    struct Node** pool;
-    int i;
-    int n = read_number_of_nodes_needed();
-    if (n <= 0) return 0;
-    pool = (struct Node**)independent_calloc(n, sizeof(struct Node), 0);
-    if (pool == 0) die();
-    // organize into a linked list...
-    struct Node* first = pool[0];
-    for (i = 0; i < n-1; ++i)
-      pool[i]->next = pool[i+1];
-    free(pool);     // Can now free the array (or not, if it is needed later)
-    return first;
-  }
-*/
-void** dlindependent_calloc(size_t, size_t, void**);
-
-/*
-  independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]);
-
-  independent_comalloc allocates, all at once, a set of n_elements
-  chunks with sizes indicated in the "sizes" array.    It returns
-  an array of pointers to these elements, each of which can be
-  independently freed, realloc'ed etc. The elements are guaranteed to
-  be adjacently allocated (this is not guaranteed to occur with
-  multiple callocs or mallocs), which may also improve cache locality
-  in some applications.
-
-  The "chunks" argument is optional (i.e., may be null). If it is null
-  the returned array is itself dynamically allocated and should also
-  be freed when it is no longer needed. Otherwise, the chunks array
-  must be of at least n_elements in length. It is filled in with the
-  pointers to the chunks.
-
-  In either case, independent_comalloc returns this pointer array, or
-  null if the allocation failed.  If n_elements is zero and chunks is
-  null, it returns a chunk representing an array with zero elements
-  (which should be freed if not wanted).
-
-  Each element must be individually freed when it is no longer
-  needed. If you'd like to instead be able to free all at once, you
-  should instead use a single regular malloc, and assign pointers at
-  particular offsets in the aggregate space. (In this case though, you
-  cannot independently free elements.)
-
-  independent_comalloc differs from independent_calloc in that each
-  element may have a different size, and also that it does not
-  automatically clear elements.
-
-  independent_comalloc can be used to speed up allocation in cases
-  where several structs or objects must always be allocated at the
-  same time.  For example:
-
-  struct Head { ... }
-  struct Foot { ... }
-
-  void send_message(char* msg) {
-    int msglen = strlen(msg);
-    size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) };
-    void* chunks[3];
-    if (independent_comalloc(3, sizes, chunks) == 0)
-      die();
-    struct Head* head = (struct Head*)(chunks[0]);
-    char*        body = (char*)(chunks[1]);
-    struct Foot* foot = (struct Foot*)(chunks[2]);
-    // ...
-  }
-
-  In general though, independent_comalloc is worth using only for
-  larger values of n_elements. For small values, you probably won't
-  detect enough difference from a series of malloc calls to bother.
-
-  Overuse of independent_comalloc can increase overall memory usage,
-  since it cannot reuse existing noncontiguous small chunks that
-  might be available for some of the elements.
-*/
-void** dlindependent_comalloc(size_t, size_t*, void**);
-
-
-/*
-  pvalloc(size_t n);
-  Equivalent to valloc(minimum-page-that-holds(n)), that is,
-  round up n to nearest pagesize.
- */
-void*  dlpvalloc(size_t);
-
-/*
-  malloc_trim(size_t pad);
-
-  If possible, gives memory back to the system (via negative arguments
-  to sbrk) if there is unused memory at the `high' end of the malloc
-  pool or in unused MMAP segments. You can call this after freeing
-  large blocks of memory to potentially reduce the system-level memory
-  requirements of a program. However, it cannot guarantee to reduce
-  memory. Under some allocation patterns, some large free blocks of
-  memory will be locked between two used chunks, so they cannot be
-  given back to the system.
-
-  The `pad' argument to malloc_trim represents the amount of free
-  trailing space to leave untrimmed. If this argument is zero, only
-  the minimum amount of memory to maintain internal data structures
-  will be left. Non-zero arguments can be supplied to maintain enough
-  trailing space to service future expected allocations without having
-  to re-obtain memory from the system.
-
-  Malloc_trim returns 1 if it actually released any memory, else 0.
-*/
-int  dlmalloc_trim(size_t);
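-
-/*
-  For example, an illustrative sketch (release_phase() is a
-  hypothetical workload) that returns excess memory after a burst of
-  frees, keeping 1MB of trailing slack for expected reuse:
-
-  release_phase();                 // frees many large blocks
-  if (dlmalloc_trim(1024*1024))
-    ;  // some memory was actually given back to the system
-*/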
-
-/*
-  malloc_stats();
-  Prints on stderr the amount of space obtained from the system (both
-  via sbrk and mmap), the maximum amount (which may be more than
-  current if malloc_trim and/or munmap got called), and the current
-  number of bytes allocated via malloc (or realloc, etc) but not yet
-  freed. Note that this is the number of bytes allocated, not the
-  number requested. It will be larger than the number requested
-  because of alignment and bookkeeping overhead. Because it includes
-  alignment wastage as being in use, this figure may be greater than
-  zero even when no user-level chunks are allocated.
-
-  The reported current and maximum system memory can be inaccurate if
-  a program makes other calls to system memory allocation functions
-  (normally sbrk) outside of malloc.
-
-  malloc_stats prints only the most commonly interesting statistics.
-  More information can be obtained by calling mallinfo.
-*/
-void  dlmalloc_stats(void);
-
-#endif /* ONLY_MSPACES */
-
-/*
-  malloc_usable_size(void* p);
-
-  Returns the number of bytes you can actually use in
-  an allocated chunk, which may be more than you requested (although
-  often not) due to alignment and minimum size constraints.
-  You can use this many bytes without worrying about
-  overwriting other allocated objects. This is not a particularly great
-  programming practice. malloc_usable_size can be more useful in
-  debugging and assertions, for example:
-
-  p = malloc(n);
-  assert(malloc_usable_size(p) >= 256);
-*/
-size_t dlmalloc_usable_size(void*);
-
-
-#if MSPACES
-
-/*
-  mspace is an opaque type representing an independent
-  region of space that supports mspace_malloc, etc.
-*/
-typedef void* mspace;
-
-/*
-  create_mspace creates and returns a new independent space with the
-  given initial capacity, or, if 0, the default granularity size.  It
-  returns null if there is no system memory available to create the
-  space.  If argument locked is non-zero, the space uses a separate
-  lock to control access. The capacity of the space will grow
-  dynamically as needed to service mspace_malloc requests.  You can
-  control the sizes of incremental increases of this space by
-  compiling with a different DEFAULT_GRANULARITY or dynamically
-  setting with mallopt(M_GRANULARITY, value).
-*/
-mspace create_mspace(size_t capacity, int locked);
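-
-/*
-  For example, an illustrative sketch of the typical mspace life
-  cycle: create a locked space, allocate from it, and tear the whole
-  space down at once rather than freeing chunk by chunk.
-
-  mspace msp = create_mspace(0, 1);   // default capacity, locked
-  if (msp != 0) {
-    void* p = mspace_malloc(msp, 128);
-    // ... use p ...
-    destroy_mspace(msp);              // releases p and all other chunks
-  }
-*/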
-
-/*
-  destroy_mspace destroys the given space, and attempts to return all
-  of its memory back to the system, returning the total number of
-  bytes freed. After destruction, the results of access to all memory
-  used by the space become undefined.
-*/
-size_t destroy_mspace(mspace msp);
-
-/*
-  create_mspace_with_base uses the memory supplied as the initial base
-  of a new mspace. Part (less than 128*sizeof(size_t) bytes) of this
-  space is used for bookkeeping, so the capacity must be at least this
-  large. (Otherwise 0 is returned.) When this initial space is
-  exhausted, additional memory will be obtained from the system.
-  Destroying this space will deallocate all additionally allocated
-  space (if possible) but not the initial base.
-*/
-mspace create_mspace_with_base(void* base, size_t capacity, int locked);
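-
-/*
-  For example, an illustrative sketch carving an mspace out of a
-  caller-supplied static buffer. The capacity must exceed the
-  bookkeeping overhead noted above, or 0 is returned.
-
-  static char arena[64*1024];
-  mspace msp = create_mspace_with_base(arena, sizeof(arena), 0);
-  if (msp != 0) {
-    void* p = mspace_malloc(msp, 256);
-    // destroy_mspace releases any extra system memory, but not arena
-    destroy_mspace(msp);
-  }
-*/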
-
-/*
-  mspace_track_large_chunks controls whether requests for large chunks
-  are allocated in their own untracked mmapped regions, separate from
-  others in this mspace. By default large chunks are not tracked,
-  which reduces fragmentation. However, such chunks are not
-  necessarily released to the system upon destroy_mspace.  Enabling
-  tracking by setting to true may increase fragmentation, but avoids
-  leakage when relying on destroy_mspace to release all memory
-  allocated using this space.  The function returns the previous
-  setting.
-*/
-int mspace_track_large_chunks(mspace msp, int enable);
-
-
-/*
-  mspace_malloc behaves as malloc, but operates within
-  the given space.
-*/
-void* mspace_malloc(mspace msp, size_t bytes);
-
-/*
-  mspace_free behaves as free, but operates within
-  the given space.
-
-  If compiled with FOOTERS==1, mspace_free is not actually needed.
-  free may be called instead of mspace_free because freed chunks from
-  any space are handled by their originating spaces.
-*/
-void mspace_free(mspace msp, void* mem);
-
-/*
-  mspace_realloc behaves as realloc, but operates within
-  the given space.
-
-  If compiled with FOOTERS==1, mspace_realloc is not actually
-  needed.  realloc may be called instead of mspace_realloc because
-  realloced chunks from any space are handled by their originating
-  spaces.
-*/
-void* mspace_realloc(mspace msp, void* mem, size_t newsize);
-
-/*
-  mspace_calloc behaves as calloc, but operates within
-  the given space.
-*/
-void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size);
-
-/*
-  mspace_memalign behaves as memalign, but operates within
-  the given space.
-*/
-void* mspace_memalign(mspace msp, size_t alignment, size_t bytes);
-
-/*
-  mspace_independent_calloc behaves as independent_calloc, but
-  operates within the given space.
-*/
-void** mspace_independent_calloc(mspace msp, size_t n_elements,
-                                 size_t elem_size, void* chunks[]);
-
-/*
-  mspace_independent_comalloc behaves as independent_comalloc, but
-  operates within the given space.
-*/
-void** mspace_independent_comalloc(mspace msp, size_t n_elements,
-                                   size_t sizes[], void* chunks[]);
-
-/*
-  mspace_footprint() returns the number of bytes obtained from the
-  system for this space.
-*/
-size_t mspace_footprint(mspace msp);
-
-/*
-  mspace_max_footprint() returns the peak number of bytes obtained from the
-  system for this space.
-*/
-size_t mspace_max_footprint(mspace msp);
-
-
-#if !NO_MALLINFO
-/*
-  mspace_mallinfo behaves as mallinfo, but reports properties of
-  the given space.
-*/
-struct mallinfo mspace_mallinfo(mspace msp);
-#endif /* NO_MALLINFO */
-
-/*
-  mspace_usable_size(void* mem) behaves the same as malloc_usable_size;
-*/
-size_t mspace_usable_size(void* mem);
-
-/*
-  mspace_malloc_stats behaves as malloc_stats, but reports
-  properties of the given space.
-*/
-void mspace_malloc_stats(mspace msp);
-
-/*
-  mspace_trim behaves as malloc_trim, but
-  operates within the given space.
-*/
-int mspace_trim(mspace msp, size_t pad);
-
-/*
-  An alias for mallopt.
-*/
-int mspace_mallopt(int, int);
-
-#endif /* MSPACES */
-
-#ifdef __cplusplus
-}  /* end of extern "C" */
-#endif /* __cplusplus */
-
-/*
-  ========================================================================
-  To make a fully customizable malloc.h header file, cut everything
-  above this line, put into file malloc.h, edit to suit, and #include it
-  on the next line, as well as in programs that use this malloc.
-  ========================================================================
-*/
-
-/* #include "malloc.h" */
-
-/*------------------------------ internal #includes ---------------------- */
-
-#ifdef WIN32
-#pragma warning( disable : 4146 ) /* no "unsigned" warnings */
-#endif /* WIN32 */
-
-#include <stdio.h>       /* for printing in malloc_stats */
-
-#ifndef LACKS_ERRNO_H
-#include <errno.h>       /* for MALLOC_FAILURE_ACTION */
-#endif /* LACKS_ERRNO_H */
-#if FOOTERS || DEBUG
-#include <time.h>        /* for magic initialization */
-#endif /* FOOTERS || DEBUG */
-#ifndef LACKS_STDLIB_H
-#include <stdlib.h>      /* for abort() */
-#endif /* LACKS_STDLIB_H */
-#ifdef DEBUG
-#if ABORT_ON_ASSERT_FAILURE
-#undef assert
-#define assert(x) if(!(x)) ABORT
-#else /* ABORT_ON_ASSERT_FAILURE */
-#include <assert.h>
-#endif /* ABORT_ON_ASSERT_FAILURE */
-#else  /* DEBUG */
-#ifndef assert
-#define assert(x)
-#endif
-#define DEBUG 0
-#endif /* DEBUG */
-#ifndef LACKS_STRING_H
-#include <string.h>      /* for memset etc */
-#endif  /* LACKS_STRING_H */
-#if USE_BUILTIN_FFS
-#ifndef LACKS_STRINGS_H
-#include <strings.h>     /* for ffs */
-#endif /* LACKS_STRINGS_H */
-#endif /* USE_BUILTIN_FFS */
-#if HAVE_MMAP
-#ifndef LACKS_SYS_MMAN_H
-/* On some versions of linux, mremap decl in mman.h needs __USE_GNU set */
-#if (defined(linux) && !defined(__USE_GNU))
-#define __USE_GNU 1
-#include <sys/mman.h>    /* for mmap */
-#undef __USE_GNU
-#else
-#include <sys/mman.h>    /* for mmap */
-#endif /* linux */
-#endif /* LACKS_SYS_MMAN_H */
-#ifndef LACKS_FCNTL_H
-#include <fcntl.h>
-#endif /* LACKS_FCNTL_H */
-#endif /* HAVE_MMAP */
-#ifndef LACKS_UNISTD_H
-#include <unistd.h>     /* for sbrk, sysconf */
-#else /* LACKS_UNISTD_H */
-#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
-extern void*     sbrk(ptrdiff_t);
-#endif /* FreeBSD etc */
-#endif /* LACKS_UNISTD_H */
-
-/* Declarations for locking */
-#if USE_LOCKS
-#ifndef WIN32
-#include <pthread.h>
-#if defined (__SVR4) && defined (__sun)  /* solaris */
-#include <thread.h>
-#endif /* solaris */
-#else
-#ifndef _M_AMD64
-/* These are already defined on AMD64 builds */
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-LONG __cdecl _InterlockedCompareExchange(LONG volatile *Dest, LONG Exchange, LONG Comp);
-LONG __cdecl _InterlockedExchange(LONG volatile *Target, LONG Value);
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-#endif /* _M_AMD64 */
-#pragma intrinsic (_InterlockedCompareExchange)
-#pragma intrinsic (_InterlockedExchange)
-#define interlockedcompareexchange _InterlockedCompareExchange
-#define interlockedexchange _InterlockedExchange
-#endif /* Win32 */
-#endif /* USE_LOCKS */
-
-/* Declarations for bit scanning on win32 */
-#if defined(_MSC_VER) && _MSC_VER>=1300
-#ifndef BitScanForward	/* Try to avoid pulling in WinNT.h */
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-unsigned char _BitScanForward(unsigned long *index, unsigned long mask);
-unsigned char _BitScanReverse(unsigned long *index, unsigned long mask);
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-
-#define BitScanForward _BitScanForward
-#define BitScanReverse _BitScanReverse
-#pragma intrinsic(_BitScanForward)
-#pragma intrinsic(_BitScanReverse)
-#endif /* BitScanForward */
-#endif /* defined(_MSC_VER) && _MSC_VER>=1300 */
-
-#ifndef WIN32
-#ifndef malloc_getpagesize
-#  ifdef _SC_PAGESIZE         /* some SVR4 systems omit an underscore */
-#    ifndef _SC_PAGE_SIZE
-#      define _SC_PAGE_SIZE _SC_PAGESIZE
-#    endif
-#  endif
-#  ifdef _SC_PAGE_SIZE
-#    define malloc_getpagesize sysconf(_SC_PAGE_SIZE)
-#  else
-#    if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE)
-       extern size_t getpagesize();
-#      define malloc_getpagesize getpagesize()
-#    else
-#      ifdef WIN32 /* use supplied emulation of getpagesize */
-#        define malloc_getpagesize getpagesize()
-#      else
-#        ifndef LACKS_SYS_PARAM_H
-#          include <sys/param.h>
-#        endif
-#        ifdef EXEC_PAGESIZE
-#          define malloc_getpagesize EXEC_PAGESIZE
-#        else
-#          ifdef NBPG
-#            ifndef CLSIZE
-#              define malloc_getpagesize NBPG
-#            else
-#              define malloc_getpagesize (NBPG * CLSIZE)
-#            endif
-#          else
-#            ifdef NBPC
-#              define malloc_getpagesize NBPC
-#            else
-#              ifdef PAGESIZE
-#                define malloc_getpagesize PAGESIZE
-#              else /* just guess */
-#                define malloc_getpagesize ((size_t)4096U)
-#              endif
-#            endif
-#          endif
-#        endif
-#      endif
-#    endif
-#  endif
-#endif
-#endif
-
-
-
-/* ------------------- size_t and alignment properties -------------------- */
-
-/* The byte and bit size of a size_t */
-#define SIZE_T_SIZE         (sizeof(size_t))
-#define SIZE_T_BITSIZE      (sizeof(size_t) << 3)
-
-/* Some constants coerced to size_t */
-/* Annoying but necessary to avoid errors on some platforms */
-#define SIZE_T_ZERO         ((size_t)0)
-#define SIZE_T_ONE          ((size_t)1)
-#define SIZE_T_TWO          ((size_t)2)
-#define SIZE_T_FOUR         ((size_t)4)
-#define TWO_SIZE_T_SIZES    (SIZE_T_SIZE<<1)
-#define FOUR_SIZE_T_SIZES   (SIZE_T_SIZE<<2)
-#define SIX_SIZE_T_SIZES    (FOUR_SIZE_T_SIZES+TWO_SIZE_T_SIZES)
-#define HALF_MAX_SIZE_T     (MAX_SIZE_T / 2U)
-
-/* The bit mask value corresponding to MALLOC_ALIGNMENT */
-#define CHUNK_ALIGN_MASK    (MALLOC_ALIGNMENT - SIZE_T_ONE)
-
-/* True if address a has acceptable alignment */
-#define is_aligned(A)       (((size_t)((A)) & (CHUNK_ALIGN_MASK)) == 0)
-
-/* the number of bytes to offset an address to align it */
-#define align_offset(A)\
- ((((size_t)(A) & CHUNK_ALIGN_MASK) == 0)? 0 :\
-  ((MALLOC_ALIGNMENT - ((size_t)(A) & CHUNK_ALIGN_MASK)) & CHUNK_ALIGN_MASK))
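-
-/*
-  Worked example (assuming MALLOC_ALIGNMENT == 16, so
-  CHUNK_ALIGN_MASK == 15): for A == 0x1004,
-    (size_t)A & 15  == 4                    (misaligned by 4)
-    align_offset(A) == (16 - 4) & 15 == 12
-  so A + 12 == 0x1010 is the next aligned address; for an already
-  aligned A the first test yields 0 and no offset is applied.
-*/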
-
-/*
-  malloc_params holds global properties, including those that can be
-  dynamically set using mallopt. There is a single instance, mparams,
-  initialized in init_mparams. Note that the non-zeroness of "magic"
-  also serves as an initialization flag.
-*/
-typedef unsigned int flag_t;
-struct malloc_params {
-  volatile size_t magic;
-  size_t page_size;
-  size_t granularity;
-  size_t mmap_threshold;
-  size_t trim_threshold;
-  flag_t default_mflags;
-};
-
-static struct malloc_params mparams;
-
-/* Ensure mparams initialized */
-#define ensure_initialization() (void)(mparams.magic != 0 || init_mparams())
-
-/* -------------------------- MMAP preliminaries ------------------------- */
-
-/*
-   If HAVE_MORECORE or HAVE_MMAP is false, we just define calls and
-   checks to fail so the compiler optimizer can delete code rather
-   than using so many "#if"s.
-*/
-
-
-/* MORECORE and MMAP must return MFAIL on failure */
-#define MFAIL                ((void*)(MAX_SIZE_T))
-#define CMFAIL               ((char*)(MFAIL)) /* defined for convenience */
-
-#if HAVE_MMAP
-
-#ifndef WIN32
-#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
-#define MAP_ANONYMOUS        MAP_ANON
-#endif /* MAP_ANON */
-#ifdef DEFAULT_GRANULARITY_ALIGNED
-#define MMAP_IMPL mmap_aligned
-static void* lastAlignedmmap; /* Used as a hint */
-static void* mmap_aligned(void *start, size_t length, int prot, int flags, int fd, off_t offset) {
-  void* baseaddress = 0;
-  void* ptr = 0;
-  if(!start) {
-    baseaddress = lastAlignedmmap;
-    for(;;) {
-      if(baseaddress) flags|=MAP_FIXED;
-      ptr = mmap(baseaddress, length, prot, flags, fd, offset);
-      if(!ptr)
-        baseaddress = (void*)((size_t)baseaddress + mparams.granularity);
-      else if((size_t)ptr & (mparams.granularity - SIZE_T_ONE)) {
-        munmap(ptr, length);
-        baseaddress = (void*)(((size_t)ptr + mparams.granularity) & ~(mparams.granularity - SIZE_T_ONE));
-      }
-      else break;
-    }
-  }
-  else ptr = mmap(start, length, prot, flags, fd, offset);
-  if(ptr) lastAlignedmmap = (void*)((size_t) ptr + mparams.granularity);
-  return ptr;
-}
-#else
-#define MMAP_IMPL mmap
-#endif /* DEFAULT_GRANULARITY_ALIGNED */
-#define MUNMAP_DEFAULT(a, s)  munmap((a), (s))
-#define MMAP_PROT            (PROT_READ|PROT_WRITE)
-#ifdef MAP_ANONYMOUS
-#define MMAP_FLAGS           (MAP_PRIVATE|MAP_ANONYMOUS)
-#define MMAP_DEFAULT(s)       MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, -1, 0)
-#else /* MAP_ANONYMOUS */
-/*
-   Nearly all versions of mmap support MAP_ANONYMOUS, so the following
-   is unlikely to be needed, but is supplied just in case.
-*/
-#define MMAP_FLAGS           (MAP_PRIVATE)
-static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */
-#define MMAP_DEFAULT(s) ((dev_zero_fd < 0) ? \
-           (dev_zero_fd = open("/dev/zero", O_RDWR), \
-            MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) : \
-            MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0))
-#endif /* MAP_ANONYMOUS */
-
-#define DIRECT_MMAP_DEFAULT(s) MMAP_DEFAULT(s)
-
-#else /* WIN32 */
-
-/* Win32 MMAP via VirtualAlloc */
-#ifdef DEFAULT_GRANULARITY_ALIGNED
-static void* lastWin32mmap; /* Used as a hint */
-#endif /* DEFAULT_GRANULARITY_ALIGNED */
-#ifdef ENABLE_LARGE_PAGES
-static int largepagesavailable = 1;
-#endif /* ENABLE_LARGE_PAGES */
-static FORCEINLINE void* win32mmap(size_t size) {
-  void* baseaddress = 0;
-  void* ptr = 0;
-#ifdef ENABLE_LARGE_PAGES
-  /* Note that large pages are *always* allocated on a large page boundary.
-  If, however, granularity is small, don't waste a kernel call when size
-  isn't around the size of a large page */
-  if(largepagesavailable && size >= 1*1024*1024) {
-    ptr = VirtualAlloc(baseaddress, size, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
-    if(!ptr && ERROR_PRIVILEGE_NOT_HELD==GetLastError()) largepagesavailable=0;
-  }
-#endif
-  if(!ptr) {
-#ifdef DEFAULT_GRANULARITY_ALIGNED
-    /* We try to avoid overhead by speculatively reserving at aligned
-    addresses until we succeed */
-    baseaddress = lastWin32mmap;
-    for(;;) {
-      void* reserveaddr = VirtualAlloc(baseaddress, size, MEM_RESERVE, PAGE_READWRITE);
-      if(!reserveaddr)
-        baseaddress = (void*)((size_t)baseaddress + mparams.granularity);
-      else if((size_t)reserveaddr & (mparams.granularity - SIZE_T_ONE)) {
-        VirtualFree(reserveaddr, 0, MEM_RELEASE);
-        baseaddress = (void*)(((size_t)reserveaddr + mparams.granularity) & ~(mparams.granularity - SIZE_T_ONE));
-      }
-      else break;
-    }
-#endif
-    if(!ptr) ptr = VirtualAlloc(baseaddress, size, baseaddress ? MEM_COMMIT : MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
-#if DEBUG
-    if(lastWin32mmap && ptr!=lastWin32mmap) printf("Non-contiguous VirtualAlloc between %p and %p\n", ptr, lastWin32mmap);
-#endif
-#ifdef DEFAULT_GRANULARITY_ALIGNED
-    if(ptr) lastWin32mmap = (void*)((size_t) ptr + mparams.granularity);
-#endif
-  }
-#if DEBUG
-#ifdef ENABLE_LARGE_PAGES
-  printf("VirtualAlloc returns %p size %u. LargePagesAvailable=%d\n", ptr, size, largepagesavailable);
-#else
-  printf("VirtualAlloc returns %p size %u\n", ptr, size);
-#endif
-#endif
-  return (ptr != 0)? ptr: MFAIL;
-}
-
-/* For direct MMAP, use MEM_TOP_DOWN to minimize interference */
-static FORCEINLINE void* win32direct_mmap(size_t size) {
-  void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN,
-                           PAGE_READWRITE);
-  return (ptr != 0)? ptr: MFAIL;
-}
-
-/* This function supports releasing coalesced segments */
-static FORCEINLINE int win32munmap(void* ptr, size_t size) {
-  MEMORY_BASIC_INFORMATION minfo;
-  char* cptr = (char*)ptr;
-  while (size) {
-    if (VirtualQuery(cptr, &minfo, sizeof(minfo)) == 0)
-      return -1;
-    if (minfo.BaseAddress != cptr || minfo.AllocationBase != cptr ||
-        minfo.State != MEM_COMMIT || minfo.RegionSize > size)
-      return -1;
-    if (VirtualFree(cptr, 0, MEM_RELEASE) == 0)
-      return -1;
-    cptr += minfo.RegionSize;
-    size -= minfo.RegionSize;
-  }
-  return 0;
-}
-
-#define MMAP_DEFAULT(s)             win32mmap(s)
-#define MUNMAP_DEFAULT(a, s)        win32munmap((a), (s))
-#define DIRECT_MMAP_DEFAULT(s)      win32direct_mmap(s)
-#endif /* WIN32 */
-#endif /* HAVE_MMAP */
-
-#if HAVE_MREMAP
-#ifndef WIN32
-#define MREMAP_DEFAULT(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv))
-#endif /* WIN32 */
-#endif /* HAVE_MREMAP */
-
-
-/**
- * Define CALL_MORECORE
- */
-#if HAVE_MORECORE
-    #ifdef MORECORE
-        #define CALL_MORECORE(S)    MORECORE(S)
-    #else  /* MORECORE */
-        #define CALL_MORECORE(S)    MORECORE_DEFAULT(S)
-    #endif /* MORECORE */
-#else  /* HAVE_MORECORE */
-    #define CALL_MORECORE(S)        MFAIL
-#endif /* HAVE_MORECORE */
-
-/**
- * Define CALL_MMAP/CALL_MUNMAP/CALL_DIRECT_MMAP
- */
-#if HAVE_MMAP
-    #define USE_MMAP_BIT            (SIZE_T_ONE)
-
-    #ifdef MMAP
-        #define CALL_MMAP(s)        MMAP(s)
-    #else /* MMAP */
-        #define CALL_MMAP(s)        MMAP_DEFAULT(s)
-    #endif /* MMAP */
-    #ifdef MUNMAP
-        #define CALL_MUNMAP(a, s)   MUNMAP((a), (s))
-    #else /* MUNMAP */
-        #define CALL_MUNMAP(a, s)   MUNMAP_DEFAULT((a), (s))
-    #endif /* MUNMAP */
-    #ifdef DIRECT_MMAP
-        #define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s)
-    #else /* DIRECT_MMAP */
-        #define CALL_DIRECT_MMAP(s) DIRECT_MMAP_DEFAULT(s)
-    #endif /* DIRECT_MMAP */
-#else  /* HAVE_MMAP */
-    #define USE_MMAP_BIT            (SIZE_T_ZERO)
-
-    #define MMAP(s)                 MFAIL
-    #define MUNMAP(a, s)            (-1)
-    #define DIRECT_MMAP(s)          MFAIL
-    #define CALL_DIRECT_MMAP(s)     DIRECT_MMAP(s)
-    #define CALL_MMAP(s)            MMAP(s)
-    #define CALL_MUNMAP(a, s)       MUNMAP((a), (s))
-#endif /* HAVE_MMAP */
-
-/**
- * Define CALL_MREMAP
- */
-#if HAVE_MMAP && HAVE_MREMAP
-    #ifdef MREMAP
-        #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP((addr), (osz), (nsz), (mv))
-    #else /* MREMAP */
-        #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP_DEFAULT((addr), (osz), (nsz), (mv))
-    #endif /* MREMAP */
-#else  /* HAVE_MMAP && HAVE_MREMAP */
-    #define CALL_MREMAP(addr, osz, nsz, mv)     MFAIL
-#endif /* HAVE_MMAP && HAVE_MREMAP */
-
-/* mstate bit set if contiguous morecore disabled or failed */
-#define USE_NONCONTIGUOUS_BIT (4U)
-
-/* segment bit set in create_mspace_with_base */
-#define EXTERN_BIT            (8U)
-
-
-/* --------------------------- Lock preliminaries ------------------------ */
-
-/*
-  When locks are defined, there is one global lock, plus
-  one per-mspace lock.
-
-  The global lock ensures that mparams.magic and other unique
-  mparams values are initialized only once. It also protects
-  sequences of calls to MORECORE.  In many cases sys_alloc requires
-  two calls that should not be interleaved with calls by other
-  threads.  This does not protect against direct calls to MORECORE
-  by other threads not using this lock, so there is still code to
-  cope as best we can with interference.
-
-  Per-mspace locks surround calls to malloc, free, etc.  To enable use
-  in layered extensions, per-mspace locks are reentrant.
-
-  Because lock-protected regions generally have bounded times, it is
-  OK to use the supplied simple spinlocks in the custom versions for
-  x86. Spinlocks are likely to improve performance for lightly
-  contended applications, but worsen performance under heavy
-  contention.
-
-  If USE_LOCKS is > 1, the definitions of lock routines here are
-  bypassed, in which case you will need to define the type MLOCK_T,
-  and at least INITIAL_LOCK, ACQUIRE_LOCK, RELEASE_LOCK and possibly
-  TRY_LOCK (which is not used in this malloc, but is commonly needed
-  in extensions).  You must also declare a
-    static MLOCK_T malloc_global_mutex = { initialization values };.
-
-*/
-
-#if USE_LOCKS == 1
-
-#if USE_SPIN_LOCKS && SPIN_LOCKS_AVAILABLE
-#ifndef WIN32
-
-/* Custom pthread-style spin locks on x86 and x64 for gcc */
-struct pthread_mlock_t {
-  volatile unsigned int l;
-  char cachelinepadding[64];
-  unsigned int c;
-  pthread_t threadid;
-};
-#define MLOCK_T               struct pthread_mlock_t
-#define CURRENT_THREAD        pthread_self()
-#define INITIAL_LOCK(sl)      ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0)
-#define ACQUIRE_LOCK(sl)      pthread_acquire_lock(sl)
-#define RELEASE_LOCK(sl)      pthread_release_lock(sl)
-#define TRY_LOCK(sl)          pthread_try_lock(sl)
-#define SPINS_PER_YIELD       63
-
-static MLOCK_T malloc_global_mutex = { 0, "", 0, 0};
-
-static FORCEINLINE int pthread_acquire_lock (MLOCK_T *sl) {
-  int spins = 0;
-  volatile unsigned int* lp = &sl->l;
-  for (;;) {
-    if (*lp != 0) {
-      if (sl->threadid == CURRENT_THREAD) {
-        ++sl->c;
-        return 0;
-      }
-    }
-    else {
-      /* place args to cmpxchgl in locals to evade oddities in some gccs */
-      int cmp = 0;
-      int val = 1;
-      int ret;
-      __asm__ __volatile__  ("lock; cmpxchgl %1, %2"
-                             : "=a" (ret)
-                             : "r" (val), "m" (*(lp)), "0"(cmp)
-                             : "memory", "cc");
-      if (!ret) {
-        assert(!sl->threadid);
-        sl->threadid = CURRENT_THREAD;
-        sl->c = 1;
-        return 0;
-      }
-    }
-    if ((++spins & SPINS_PER_YIELD) == 0) {
-#if defined (__SVR4) && defined (__sun) /* solaris */
-      thr_yield();
-#else
-#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)
-      sched_yield();
-#else  /* no-op yield on unknown systems */
-      ;
-#endif /* __linux__ || __FreeBSD__ || __APPLE__ */
-#endif /* solaris */
-    }
-  }
-}
-
-static FORCEINLINE void pthread_release_lock (MLOCK_T *sl) {
-  volatile unsigned int* lp = &sl->l;
-  assert(*lp != 0);
-  assert(sl->threadid == CURRENT_THREAD);
-  if (--sl->c == 0) {
-    sl->threadid = 0;
-    int prev = 0;
-    int ret;
-    __asm__ __volatile__ ("lock; xchgl %0, %1"
-                          : "=r" (ret)
-                          : "m" (*(lp)), "0"(prev)
-                          : "memory");
-  }
-}
-
-static FORCEINLINE int pthread_try_lock (MLOCK_T *sl) {
-  volatile unsigned int* lp = &sl->l;
-  if (*lp != 0) {
-    if (sl->threadid == CURRENT_THREAD) {
-      ++sl->c;
-      return 1;
-    }
-  }
-  else {
-    int cmp = 0;
-    int val = 1;
-    int ret;
-    __asm__ __volatile__  ("lock; cmpxchgl %1, %2"
-                           : "=a" (ret)
-                           : "r" (val), "m" (*(lp)), "0"(cmp)
-                           : "memory", "cc");
-    if (!ret) {
-      assert(!sl->threadid);
-      sl->threadid = CURRENT_THREAD;
-      sl->c = 1;
-      return 1;
-    }
-  }
-  return 0;
-}
-
-
-#else /* WIN32 */
-/* Custom win32-style spin locks on x86 and x64 for MSC */
-struct win32_mlock_t {
-  volatile long l;
-  char cachelinepadding[64];
-  unsigned int c;
-  long threadid;
-};
-
-#define MLOCK_T               struct win32_mlock_t
-#define CURRENT_THREAD        ((long)GetCurrentThreadId())
-#define INITIAL_LOCK(sl)      ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0)
-#define ACQUIRE_LOCK(sl)      win32_acquire_lock(sl)
-#define RELEASE_LOCK(sl)      win32_release_lock(sl)
-#define TRY_LOCK(sl)          win32_try_lock(sl)
-#define SPINS_PER_YIELD       63
-
-static MLOCK_T malloc_global_mutex = { 0, 0, 0};
-
-static FORCEINLINE int win32_acquire_lock (MLOCK_T *sl) {
-  int spins = 0;
-  for (;;) {
-    if (sl->l != 0) {
-      if (sl->threadid == CURRENT_THREAD) {
-        ++sl->c;
-        return 0;
-      }
-    }
-    else {
-      if (!interlockedexchange(&sl->l, 1)) {
-        assert(!sl->threadid);
-        sl->threadid = CURRENT_THREAD;
-        sl->c = 1;
-        return 0;
-      }
-    }
-    if ((++spins & SPINS_PER_YIELD) == 0)
-      SleepEx(0, FALSE);
-  }
-}
-
-static FORCEINLINE void win32_release_lock (MLOCK_T *sl) {
-  assert(sl->threadid == CURRENT_THREAD);
-  assert(sl->l != 0);
-  if (--sl->c == 0) {
-    sl->threadid = 0;
-    interlockedexchange (&sl->l, 0);
-  }
-}
-
-static FORCEINLINE int win32_try_lock (MLOCK_T *sl) {
-  if (sl->l != 0) {
-    if (sl->threadid == CURRENT_THREAD) {
-      ++sl->c;
-      return 1;
-    }
-  }
-  else {
-    if (!interlockedexchange(&sl->l, 1)){
-      assert(!sl->threadid);
-      sl->threadid = CURRENT_THREAD;
-      sl->c = 1;
-      return 1;
-    }
-  }
-  return 0;
-}
-
-#endif /* WIN32 */
-#else /* USE_SPIN_LOCKS */
-
-#ifndef WIN32
-/* pthreads-based locks */
-
-#define MLOCK_T               pthread_mutex_t
-#define CURRENT_THREAD        pthread_self()
-#define INITIAL_LOCK(sl)      pthread_init_lock(sl)
-#define ACQUIRE_LOCK(sl)      pthread_mutex_lock(sl)
-#define RELEASE_LOCK(sl)      pthread_mutex_unlock(sl)
-#define TRY_LOCK(sl)          (!pthread_mutex_trylock(sl))
-
-static MLOCK_T malloc_global_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-/* Cope with old-style linux recursive lock initialization by adding the */
-/* skipped internal declaration from pthread.h */
-#ifdef linux
-#ifndef PTHREAD_MUTEX_RECURSIVE
-extern int pthread_mutexattr_setkind_np __P ((pthread_mutexattr_t *__attr,
-					   int __kind));
-#define PTHREAD_MUTEX_RECURSIVE PTHREAD_MUTEX_RECURSIVE_NP
-#define pthread_mutexattr_settype(x,y) pthread_mutexattr_setkind_np(x,y)
-#endif
-#endif
-
-static int pthread_init_lock (MLOCK_T *sl) {
-  pthread_mutexattr_t attr;
-  if (pthread_mutexattr_init(&attr)) return 1;
-  if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) return 1;
-  if (pthread_mutex_init(sl, &attr)) return 1;
-  if (pthread_mutexattr_destroy(&attr)) return 1;
-  return 0;
-}
-
-#else /* WIN32 */
-/* Win32 critical sections */
-#define MLOCK_T               CRITICAL_SECTION
-#define CURRENT_THREAD        GetCurrentThreadId()
-#define INITIAL_LOCK(s)       (!InitializeCriticalSectionAndSpinCount((s), 0x80000000|4000))
-#define ACQUIRE_LOCK(s)       (EnterCriticalSection(s), 0)
-#define RELEASE_LOCK(s)       LeaveCriticalSection(s)
-#define TRY_LOCK(s)           TryEnterCriticalSection(s)
-#define NEED_GLOBAL_LOCK_INIT
-
-static MLOCK_T malloc_global_mutex;
-static volatile long malloc_global_mutex_status;
-
-/* Use spin loop to initialize global lock */
-static void init_malloc_global_mutex() {
-  for (;;) {
-    long stat = malloc_global_mutex_status;
-    if (stat > 0)
-      return;
-    /* transition to < 0 while initializing, then to > 0 */
-    if (stat == 0 &&
-        interlockedcompareexchange(&malloc_global_mutex_status, -1, 0) == 0) {
-      InitializeCriticalSection(&malloc_global_mutex);
-      interlockedexchange(&malloc_global_mutex_status,1);
-      return;
-    }
-    SleepEx(0, FALSE);
-  }
-}
-
-#endif /* WIN32 */
-#endif /* USE_SPIN_LOCKS */
-#endif /* USE_LOCKS == 1 */
-
-/* -----------------------  User-defined locks ------------------------ */
-
-#if USE_LOCKS > 1
-/* Define your own lock implementation here */
-/* #define INITIAL_LOCK(sl)  ... */
-/* #define ACQUIRE_LOCK(sl)  ... */
-/* #define RELEASE_LOCK(sl)  ... */
-/* #define TRY_LOCK(sl) ... */
-/* static MLOCK_T malloc_global_mutex = ... */
-#endif /* USE_LOCKS > 1 */
-
-/* -----------------------  Lock-based state ------------------------ */
-
-#if USE_LOCKS
-#define USE_LOCK_BIT               (2U)
-#else  /* USE_LOCKS */
-#define USE_LOCK_BIT               (0U)
-#define INITIAL_LOCK(l)
-#endif /* USE_LOCKS */
-
-#if USE_LOCKS
-#ifndef ACQUIRE_MALLOC_GLOBAL_LOCK
-#define ACQUIRE_MALLOC_GLOBAL_LOCK()  ACQUIRE_LOCK(&malloc_global_mutex);
-#endif
-#ifndef RELEASE_MALLOC_GLOBAL_LOCK
-#define RELEASE_MALLOC_GLOBAL_LOCK()  RELEASE_LOCK(&malloc_global_mutex);
-#endif
-#else  /* USE_LOCKS */
-#define ACQUIRE_MALLOC_GLOBAL_LOCK()
-#define RELEASE_MALLOC_GLOBAL_LOCK()
-#endif /* USE_LOCKS */
-
-
-/* -----------------------  Chunk representations ------------------------ */
-
-/*
-  (The following includes lightly edited explanations by Colin Plumb.)
-
-  The malloc_chunk declaration below is misleading (but accurate and
-  necessary).  It declares a "view" into memory allowing access to
-  necessary fields at known offsets from a given base.
-
-  Chunks of memory are maintained using a `boundary tag' method as
-  originally described by Knuth.  (See the paper by Paul Wilson
-  ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a survey of such
-  techniques.)  Sizes of free chunks are stored both in the front of
-  each chunk and at the end.  This makes consolidating fragmented
-  chunks into bigger chunks fast.  The head fields also hold bits
-  representing whether chunks are free or in use.
-
-  Here are some pictures to make it clearer.  They are "exploded" to
-  show that the state of a chunk can be thought of as extending from
-  the high 31 bits of the head field of its header through the
-  prev_foot and PINUSE_BIT bit of the following chunk header.
-
-  A chunk that's in use looks like:
-
-   chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-           | Size of previous chunk (if P = 0)                             |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P|
-         | Size of this chunk                                         1| +-+
-   mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         |                                                               |
-         +-                                                             -+
-         |                                                               |
-         +-                                                             -+
-         |                                                               :
-         +-      size - sizeof(size_t) available payload bytes          -+
-         :                                                               |
- chunk-> +-                                                             -+
-         |                                                               |
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |1|
-       | Size of next chunk (may or may not be in use)               | +-+
- mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-    And if it's free, it looks like this:
-
-   chunk-> +-                                                             -+
-           | User payload (must be in use, or we would have merged!)       |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P|
-         | Size of this chunk                                         0| +-+
-   mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         | Next pointer                                                  |
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         | Prev pointer                                                  |
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         |                                                               :
-         +-      size - sizeof(struct chunk) unused bytes               -+
-         :                                                               |
- chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-         | Size of this chunk                                            |
-         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |0|
-       | Size of next chunk (must be in use, or we would have merged)| +-+
- mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-       |                                                               :
-       +- User payload                                                -+
-       :                                                               |
-       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-                                                                     |0|
-                                                                     +-+
-  Note that since we always merge adjacent free chunks, the chunks
-  adjacent to a free chunk must be in use.
-
-  Given a pointer to a chunk (which can be derived trivially from the
-  payload pointer) we can, in O(1) time, find out whether the adjacent
-  chunks are free, and if so, unlink them from the lists that they
-  are on and merge them with the current chunk.
-
-  Chunks always begin on even word boundaries, so the mem portion
-  (which is returned to the user) is also on an even word boundary, and
-  thus at least double-word aligned.
-
-  The P (PINUSE_BIT) bit, stored in the unused low-order bit of the
-  chunk size (which is always a multiple of two words), is an in-use
-  bit for the *previous* chunk.  If that bit is *clear*, then the
-  word before the current chunk size contains the previous chunk
-  size, and can be used to find the front of the previous chunk.
-  The very first chunk allocated always has this bit set, preventing
-  access to non-existent (or non-owned) memory. If pinuse is set for
-  any given chunk, then you CANNOT determine the size of the
-  previous chunk, and might even get a memory addressing fault when
-  trying to do so.
-
-  The C (CINUSE_BIT) bit, stored in the unused second-lowest bit of
-  the chunk size redundantly records whether the current chunk is
-  inuse (unless the chunk is mmapped). This redundancy enables usage
-  checks within free and realloc, and reduces indirection when freeing
-  and consolidating chunks.
-
-  Each freshly allocated chunk must have both cinuse and pinuse set.
-  That is, each allocated chunk borders either a previously allocated
-  and still in-use chunk, or the base of its memory arena. This is
-  ensured by making all allocations from the `lowest' part of any
-  found chunk.  Further, no free chunk physically borders another one,
-  so each free chunk is known to be preceded and followed by either
-  inuse chunks or the ends of memory.
-
-  Note that the `foot' of the current chunk is actually represented
-  as the prev_foot of the NEXT chunk. This makes it easier to
-  deal with alignments etc but can be very confusing when trying
-  to extend or adapt this code.
-
-  The exceptions to all this are
-
-     1. The special chunk `top' is the top-most available chunk (i.e.,
-        the one bordering the end of available memory). It is treated
-        specially.  Top is never included in any bin, is used only if
-        no other chunk is available, and is released back to the
-        system if it is very large (see M_TRIM_THRESHOLD).  In effect,
-        the top chunk is treated as larger (and thus less well
-        fitting) than any other available chunk.  The top chunk
-        doesn't update its trailing size field since there is no next
-        contiguous chunk that would have to index off it. However,
-        space is still allocated for it (TOP_FOOT_SIZE) to enable
-        separation or merging when space is extended.
-
-     2. Chunks allocated via mmap have both cinuse and pinuse bits
-        cleared in their head fields.  Because they are allocated
-        one-by-one, each must carry its own prev_foot field, which is
-        also used to hold the offset this chunk has within its mmapped
-        region, which is needed to preserve alignment. Each mmapped
-        chunk is trailed by the first two fields of a fake next-chunk
-        for sake of usage checks.
-
-*/
-
-struct malloc_chunk {
-  size_t               prev_foot;  /* Size of previous chunk (if free).  */
-  size_t               head;       /* Size and inuse bits. */
-  struct malloc_chunk* fd;         /* double links -- used only if free. */
-  struct malloc_chunk* bk;
-};
-
-typedef struct malloc_chunk  mchunk;
-typedef struct malloc_chunk* mchunkptr;
-typedef struct malloc_chunk* sbinptr;  /* The type of bins of chunks */
-typedef unsigned int bindex_t;         /* Described below */
-typedef unsigned int binmap_t;         /* Described below */
-
-/* ------------------- Chunks sizes and alignments ----------------------- */
-
-#define MCHUNK_SIZE         (sizeof(mchunk))
-
-#if FOOTERS
-#define CHUNK_OVERHEAD      (TWO_SIZE_T_SIZES)
-#else /* FOOTERS */
-#define CHUNK_OVERHEAD      (SIZE_T_SIZE)
-#endif /* FOOTERS */
-
-/* MMapped chunks need a second word of overhead ... */
-#define MMAP_CHUNK_OVERHEAD (TWO_SIZE_T_SIZES)
-/* ... and additional padding for fake next-chunk at foot */
-#define MMAP_FOOT_PAD       (FOUR_SIZE_T_SIZES)
-
-/* The smallest size we can malloc is an aligned minimal chunk */
-#define MIN_CHUNK_SIZE\
-  ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
-
-/* conversion from malloc headers to user pointers, and back */
-#define chunk2mem(p)        ((void*)((char*)(p)       + TWO_SIZE_T_SIZES))
-#define mem2chunk(mem)      ((mchunkptr)((char*)(mem) - TWO_SIZE_T_SIZES))
-/* chunk associated with aligned address A */
-#define align_as_chunk(A)   (mchunkptr)((A) + align_offset(chunk2mem(A)))
-
-/* Bounds on request (not chunk) sizes. */
-#define MAX_REQUEST         ((-MIN_CHUNK_SIZE) << 2)
-#define MIN_REQUEST         (MIN_CHUNK_SIZE - CHUNK_OVERHEAD - SIZE_T_ONE)
-
-/* pad request bytes into a usable size */
-#define pad_request(req) \
-   (((req) + CHUNK_OVERHEAD + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
-
-/* pad request, checking for minimum (but not maximum) */
-#define request2size(req) \
-  (((req) < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(req))
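-
-/*
-  Worked example (assuming a 32-bit build without FOOTERS:
-  SIZE_T_SIZE == 4, CHUNK_OVERHEAD == 4, MALLOC_ALIGNMENT == 8, so
-  MIN_CHUNK_SIZE == 16 and MIN_REQUEST == 11):
-    request2size(5)  == MIN_CHUNK_SIZE == 16     (below MIN_REQUEST)
-    request2size(20) == (20 + 4 + 7) & ~7 == 24
-  i.e. each request gains the overhead word and is rounded up to the
-  chunk alignment.
-*/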
-
-
-/* ------------------ Operations on head and foot fields ----------------- */
-
-/*
-  The head field of a chunk is or'ed with PINUSE_BIT when the previous
-  adjacent chunk is in use, and or'ed with CINUSE_BIT if this chunk is in
-  use, unless mmapped, in which case both bits are cleared.
-
-  FLAG4_BIT is not used by this malloc, but might be useful in extensions.
-*/
-
-#define PINUSE_BIT          (SIZE_T_ONE)
-#define CINUSE_BIT          (SIZE_T_TWO)
-#define FLAG4_BIT           (SIZE_T_FOUR)
-#define INUSE_BITS          (PINUSE_BIT|CINUSE_BIT)
-#define FLAG_BITS           (PINUSE_BIT|CINUSE_BIT|FLAG4_BIT)
-
-/* Head value for fenceposts */
-#define FENCEPOST_HEAD      (INUSE_BITS|SIZE_T_SIZE)
-
-/* extraction of fields from head words */
-#define cinuse(p)           ((p)->head & CINUSE_BIT)
-#define pinuse(p)           ((p)->head & PINUSE_BIT)
-#define is_inuse(p)         (((p)->head & INUSE_BITS) != PINUSE_BIT)
-#define is_mmapped(p)       (((p)->head & INUSE_BITS) == 0)
-
-#define chunksize(p)        ((p)->head & ~(FLAG_BITS))
-
-#define clear_pinuse(p)     ((p)->head &= ~PINUSE_BIT)
-
-/* Treat space at ptr +/- offset as a chunk */
-#define chunk_plus_offset(p, s)  ((mchunkptr)(((char*)(p)) + (s)))
-#define chunk_minus_offset(p, s) ((mchunkptr)(((char*)(p)) - (s)))
-
-/* Ptr to next or previous physical malloc_chunk. */
-#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->head & ~FLAG_BITS)))
-#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_foot) ))
-
-/* extract next chunk's pinuse bit */
-#define next_pinuse(p)  ((next_chunk(p)->head) & PINUSE_BIT)
-
-/* Get/set size at footer */
-#define get_foot(p, s)  (((mchunkptr)((char*)(p) + (s)))->prev_foot)
-#define set_foot(p, s)  (((mchunkptr)((char*)(p) + (s)))->prev_foot = (s))
-
-/* Set size, pinuse bit, and foot */
-#define set_size_and_pinuse_of_free_chunk(p, s)\
-  ((p)->head = (s|PINUSE_BIT), set_foot(p, s))
-
-/* Set size, pinuse bit, foot, and clear next pinuse */
-#define set_free_with_pinuse(p, s, n)\
-  (clear_pinuse(n), set_size_and_pinuse_of_free_chunk(p, s))
-
-/* Get the internal overhead associated with chunk p */
-#define overhead_for(p)\
- (is_mmapped(p)? MMAP_CHUNK_OVERHEAD : CHUNK_OVERHEAD)
-
-/* Return true if malloced space is not necessarily cleared */
-#if MMAP_CLEARS
-#define calloc_must_clear(p) (!is_mmapped(p))
-#else /* MMAP_CLEARS */
-#define calloc_must_clear(p) (1)
-#endif /* MMAP_CLEARS */
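-
-/*
-  Worked example of the head encoding above: a 48-byte in-use chunk
-  whose predecessor is also in use stores
-    head == 48 | PINUSE_BIT | CINUSE_BIT == 0x33
-  so chunksize(p) == 0x33 & ~FLAG_BITS == 48, cinuse(p) and pinuse(p)
-  are nonzero, and is_mmapped(p) is false (mmapped chunks clear both
-  inuse bits).
-*/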
-
-/* ---------------------- Overlaid data structures ----------------------- */
-
-/*
-  When chunks are not in use, they are treated as nodes of either
-  lists or trees.
-
-  "Small"  chunks are stored in circular doubly-linked lists, and look
-  like this:
-
-    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Size of previous chunk                            |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    `head:' |             Size of chunk, in bytes                         |P|
-      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Forward pointer to next chunk in list             |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Back pointer to previous chunk in list            |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Unused space (may be 0 bytes long)                .
-            .                                                               .
-            .                                                               |
-nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    `foot:' |             Size of chunk, in bytes                           |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-  Larger chunks are kept in a form of bitwise digital trees (aka
-  tries) keyed on chunksizes.  Because malloc_tree_chunks are only for
-  free chunks greater than 256 bytes, their size doesn't impose any
-  constraints on user chunk sizes.  Each node looks like:
-
-    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Size of previous chunk                            |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    `head:' |             Size of chunk, in bytes                         |P|
-      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Forward pointer to next chunk of same size        |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Back pointer to previous chunk of same size       |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Pointer to left child (child[0])                  |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Pointer to right child (child[1])                 |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Pointer to parent                                 |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             bin index of this chunk                           |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-            |             Unused space                                      .
-            .                                                               |
-nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    `foot:' |             Size of chunk, in bytes                           |
-            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-  Each tree holding treenodes is a tree of unique chunk sizes.  Chunks
-  of the same size are arranged in a circularly-linked list, with only
-  the oldest chunk (the next to be used, in our FIFO ordering)
-  actually in the tree.  (Tree members are distinguished by a non-null
-  parent pointer.)  If a chunk with the same size as an existing node
-  is inserted, it is linked off the existing node using pointers that
-  work in the same way as fd/bk pointers of small chunks.
-
-  Each tree contains a power of 2 sized range of chunk sizes (the
-  smallest is 0x100 <= x < 0x180), which is divided in half at each
-  tree level, with the chunks in the smaller half of the range (0x100
-  <= x < 0x140 for the top node) in the left subtree and the larger
-  half (0x140 <= x < 0x180) in the right subtree.  This is, of course,
-  done by inspecting individual bits.
-
-  Using these rules, each node's left subtree contains all smaller
-  sizes than its right subtree.  However, the node at the root of each
-  subtree has no particular ordering relationship to either.  (The
-  dividing line between the subtree sizes is based on trie relation.)
-  If we remove the last chunk of a given size from the interior of the
-  tree, we need to replace it with a leaf node.  The tree ordering
-  rules permit a node to be replaced by any leaf below it.
-
-  The smallest chunk in a tree (a common operation in a best-fit
-  allocator) can be found by walking a path to the leftmost leaf in
-  the tree.  Unlike a usual binary tree, where we follow left child
-  pointers until we reach a null, here we follow the right child
-  pointer any time the left one is null, until we reach a leaf with
-  both child pointers null. The smallest chunk in the tree will be
-  somewhere along that path.
-
-  The worst case number of steps to add, find, or remove a node is
-  bounded by the number of bits differentiating chunks within
-  bins. Under current bin calculations, this ranges from 6 up to 21
-  (for 32 bit sizes) or up to 53 (for 64 bit sizes). The typical case
-  is of course much better.
-*/
-
-struct malloc_tree_chunk {
-  /* The first four fields must be compatible with malloc_chunk */
-  size_t                    prev_foot;
-  size_t                    head;
-  struct malloc_tree_chunk* fd;
-  struct malloc_tree_chunk* bk;
-
-  struct malloc_tree_chunk* child[2];
-  struct malloc_tree_chunk* parent;
-  bindex_t                  index;
-};
-
-typedef struct malloc_tree_chunk  tchunk;
-typedef struct malloc_tree_chunk* tchunkptr;
-typedef struct malloc_tree_chunk* tbinptr; /* The type of bins of trees */
-
-/* A little helper macro for trees */
-#define leftmost_child(t) ((t)->child[0] != 0? (t)->child[0] : (t)->child[1])
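-
-/*
-  Illustrative sketch (not part of the original allocator, excluded
-  from compilation): the leftmost walk described above for locating
-  the smallest chunk in a treebin.  leftmost_child follows the left
-  child when present, else the right, and the minimum is tracked
-  along the path.  The helper name least_fit_in_tree is hypothetical.
-*/
-#if 0
-static tchunkptr least_fit_in_tree(tchunkptr t) {
-  tchunkptr v = t;            /* smallest chunk seen so far */
-  while ((t = leftmost_child(t)) != 0) {
-    if (chunksize(t) < chunksize(v))
-      v = t;                  /* a smaller size found along the path */
-  }
-  return v;
-}
-#endif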
-
-/* ----------------------------- Segments -------------------------------- */
-
-/*
-  Each malloc space may include non-contiguous segments, held in a
-  list headed by an embedded malloc_segment record representing the
-  top-most space. Segments also include flags holding properties of
-  the space. Large chunks that are directly allocated by mmap are not
-  included in this list. They are instead independently created and
-  destroyed without otherwise keeping track of them.
-
-  Segment management mainly comes into play for spaces allocated by
-  MMAP.  Any call to MMAP might or might not return memory that is
-  adjacent to an existing segment.  MORECORE normally contiguously
-  extends the current space, so this space is almost always adjacent,
-  which is simpler and faster to deal with. (This is why MORECORE is
-  used preferentially to MMAP when both are available -- see
-  sys_alloc.)  When allocating using MMAP, we don't use any of the
-  hinting mechanisms (inconsistently) supported in various
-  implementations of unix mmap, or distinguish reserving from
-  committing memory. Instead, we just ask for space, and exploit
-  contiguity when we get it.  It is probably possible to do
-  better than this on some systems, but no general scheme seems
-  to be significantly better.
-
-  Management entails a simpler variant of the consolidation scheme
-  used for chunks to reduce fragmentation -- new adjacent memory is
-  normally prepended or appended to an existing segment. However,
-  there are limitations compared to chunk consolidation that mostly
-  reflect the fact that segment processing is relatively infrequent
-  (occurring only when getting memory from system) and that we
-  don't expect to have huge numbers of segments:
-
-  * Segments are not indexed, so traversal requires linear scans.  (It
-    would be possible to index these, but is not worth the extra
-    overhead and complexity for most programs on most platforms.)
-  * New segments are only appended to old ones when holding top-most
-    memory; if they cannot be prepended to others, they are held in
-    different segments.
-
-  Except for the top-most segment of an mstate, each segment record
-  is kept at the tail of its segment. Segments are added by pushing
-  segment records onto the list headed by &mstate.seg for the
-  containing mstate.
-
-  Segment flags control allocation/merge/deallocation policies:
-  * If EXTERN_BIT set, then we did not allocate this segment,
-    and so should not try to deallocate or merge with others.
-    (This currently holds only for the initial segment passed
-    into create_mspace_with_base.)
-  * If USE_MMAP_BIT set, the segment may be merged with
-    other surrounding mmapped segments and trimmed/de-allocated
-    using munmap.
-  * If neither bit is set, then the segment was obtained using
-    MORECORE so can be merged with surrounding MORECORE'd segments
-    and deallocated/trimmed using MORECORE with negative arguments.
-*/
-
-struct malloc_segment {
-  char*        base;             /* base address */
-  size_t       size;             /* allocated size */
-  struct malloc_segment* next;   /* ptr to next segment */
-  flag_t       sflags;           /* mmap and extern flag */
-};
-
-#define is_mmapped_segment(S)  ((S)->sflags & USE_MMAP_BIT)
-#define is_extern_segment(S)   ((S)->sflags & EXTERN_BIT)
-
-typedef struct malloc_segment  msegment;
-typedef struct malloc_segment* msegmentptr;
-
-/* ---------------------------- malloc_state ----------------------------- */
-
-/*
-   A malloc_state holds all of the bookkeeping for a space.
-   The main fields are:
-
-  Top
-    The topmost chunk of the currently active segment. Its size is
-    cached in topsize.  The actual size of topmost space is
-    topsize+TOP_FOOT_SIZE, which includes space reserved for adding
-    fenceposts and segment records if necessary when getting more
-    space from the system.  The size at which to autotrim top is
-    cached from mparams in trim_check, except that it is disabled if
-    an autotrim fails.
-
-  Designated victim (dv)
-    This is the preferred chunk for servicing small requests that
-    don't have exact fits.  It is normally the chunk split off most
-    recently to service another small request.  Its size is cached in
-    dvsize. The link fields of this chunk are not maintained since it
-    is not kept in a bin.
-
-  SmallBins
-    An array of bin headers for free chunks.  These bins hold chunks
-    with sizes less than MIN_LARGE_SIZE bytes. Each bin contains
-    chunks of all the same size, spaced 8 bytes apart.  To simplify
-    use in double-linked lists, each bin header acts as a malloc_chunk
-    pointing to the real first node, if it exists (else pointing to
-    itself).  This avoids special-casing for headers.  But to avoid
-    waste, we allocate only the fd/bk pointers of bins, and then use
-    repositioning tricks to treat these as the fields of a chunk.
-
-  TreeBins
-    Treebins are pointers to the roots of trees holding a range of
-    sizes. There are 2 equally spaced treebins for each power of two
-    from TREEBIN_SHIFT to TREEBIN_SHIFT+16. The last bin holds anything
-    larger.
-
-  Bin maps
-    There is one bit map for small bins ("smallmap") and one for
-    treebins ("treemap).  Each bin sets its bit when non-empty, and
-    clears the bit when empty.  Bit operations are then used to avoid
-    bin-by-bin searching -- nearly all "search" is done without ever
-    looking at bins that won't be selected.  The bit maps
-    conservatively use 32 bits per map word, even on a 64-bit system.
-    For a good description of some of the bit-based techniques used
-    here, see Henry S. Warren Jr's book "Hacker's Delight" (and
-    supplement at http://hackersdelight.org/). Many of these are
-    intended to reduce the branchiness of paths through malloc etc, as
-    well as to reduce the number of memory locations read or written.
-
-  Segments
-    A list of segments headed by an embedded malloc_segment record
-    representing the initial space.
-
-  Address check support
-    The least_addr field is the least address ever obtained from
-    MORECORE or MMAP. Attempted frees and reallocs of any address less
-    than this are trapped (unless INSECURE is defined).
-
-  Magic tag
-    A cross-check field that should always hold the same value as mparams.magic.
-
-  Flags
-    Bits recording whether to use MMAP, locks, or contiguous MORECORE
-
-  Statistics
-    Each space keeps track of current and maximum system memory
-    obtained via MORECORE or MMAP.
-
-  Trim support
-    Fields holding the amount of unused topmost memory that should trigger
-    trimming, and a counter to force periodic scanning to release unused
-    non-topmost segments.
-
-  Locking
-    If USE_LOCKS is defined, the "mutex" lock is acquired and released
-    around every public call using this mspace.
-
-  Extension support
-    A void* pointer and a size_t field that can be used to help implement
-    extensions to this malloc.
-*/
-
-/* Bin types, widths and sizes */
-#define NSMALLBINS        (32U)
-#define NTREEBINS         (32U)
-#define SMALLBIN_SHIFT    (3U)
-#define SMALLBIN_WIDTH    (SIZE_T_ONE << SMALLBIN_SHIFT)
-#define TREEBIN_SHIFT     (8U)
-#define MIN_LARGE_SIZE    (SIZE_T_ONE << TREEBIN_SHIFT)
-#define MAX_SMALL_SIZE    (MIN_LARGE_SIZE - SIZE_T_ONE)
-#define MAX_SMALL_REQUEST (MAX_SMALL_SIZE - CHUNK_ALIGN_MASK - CHUNK_OVERHEAD)
-
-struct malloc_state {
-  binmap_t   smallmap;
-  binmap_t   treemap;
-  size_t     dvsize;
-  size_t     topsize;
-  char*      least_addr;
-  mchunkptr  dv;
-  mchunkptr  top;
-  size_t     trim_check;
-  size_t     release_checks;
-  size_t     magic;
-  mchunkptr  smallbins[(NSMALLBINS+1)*2];
-  tbinptr    treebins[NTREEBINS];
-  size_t     footprint;
-  size_t     max_footprint;
-  flag_t     mflags;
-  msegment   seg;
-#if USE_LOCKS
-  MLOCK_T    mutex;     /* locate lock among fields that rarely change */
-#endif /* USE_LOCKS */
-  void*      extp;      /* Unused but available for extensions */
-  size_t     exts;
-};
-
-typedef struct malloc_state*    mstate;
-
-/* ------------- Global malloc_state and malloc_params ------------------- */
-
-#if !ONLY_MSPACES
-
-/* The global malloc_state used for all non-"mspace" calls */
-static struct malloc_state _gm_;
-#define gm                 (&_gm_)
-#define is_global(M)       ((M) == &_gm_)
-
-#endif /* !ONLY_MSPACES */
-
-#define is_initialized(M)  ((M)->top != 0)
-
-/* -------------------------- system alloc setup ------------------------- */
-
-/* Operations on mflags */
-
-#define use_lock(M)           ((M)->mflags &   USE_LOCK_BIT)
-#define enable_lock(M)        ((M)->mflags |=  USE_LOCK_BIT)
-#define disable_lock(M)       ((M)->mflags &= ~USE_LOCK_BIT)
-
-#define use_mmap(M)           ((M)->mflags &   USE_MMAP_BIT)
-#define enable_mmap(M)        ((M)->mflags |=  USE_MMAP_BIT)
-#define disable_mmap(M)       ((M)->mflags &= ~USE_MMAP_BIT)
-
-#define use_noncontiguous(M)  ((M)->mflags &   USE_NONCONTIGUOUS_BIT)
-#define disable_contiguous(M) ((M)->mflags |=  USE_NONCONTIGUOUS_BIT)
-
-#define set_lock(M,L)\
- ((M)->mflags = (L)?\
-  ((M)->mflags | USE_LOCK_BIT) :\
-  ((M)->mflags & ~USE_LOCK_BIT))
-
-/* page-align a size */
-#define page_align(S)\
- (((S) + (mparams.page_size - SIZE_T_ONE)) & ~(mparams.page_size - SIZE_T_ONE))
-
-/* granularity-align a size */
-#define granularity_align(S)\
-  (((S) + (mparams.granularity - SIZE_T_ONE))\
-   & ~(mparams.granularity - SIZE_T_ONE))
-
-
-/* For mmap, use granularity alignment on windows, else page-align */
-#ifdef WIN32
-#define mmap_align(S) granularity_align(S)
-#else
-#define mmap_align(S) page_align(S)
-#endif
-
-/* For sys_alloc, enough padding to ensure we can malloc the request on success */
-#define SYS_ALLOC_PADDING (TOP_FOOT_SIZE + MALLOC_ALIGNMENT)
-
-#define is_page_aligned(S)\
-   (((size_t)(S) & (mparams.page_size - SIZE_T_ONE)) == 0)
-#define is_granularity_aligned(S)\
-   (((size_t)(S) & (mparams.granularity - SIZE_T_ONE)) == 0)
-
-/*  True if segment S holds address A */
-#define segment_holds(S, A)\
-  ((char*)(A) >= S->base && (char*)(A) < S->base + S->size)
-
-/* Return segment holding given address */
-static msegmentptr segment_holding(mstate m, char* addr) {
-  msegmentptr sp = &m->seg;
-  for (;;) {
-    if (addr >= sp->base && addr < sp->base + sp->size)
-      return sp;
-    if ((sp = sp->next) == 0)
-      return 0;
-  }
-}
-
-/* Return true if segment contains a segment link */
-static int has_segment_link(mstate m, msegmentptr ss) {
-  msegmentptr sp = &m->seg;
-  for (;;) {
-    if ((char*)sp >= ss->base && (char*)sp < ss->base + ss->size)
-      return 1;
-    if ((sp = sp->next) == 0)
-      return 0;
-  }
-}
-
-#ifndef MORECORE_CANNOT_TRIM
-#define should_trim(M,s)  ((s) > (M)->trim_check)
-#else  /* MORECORE_CANNOT_TRIM */
-#define should_trim(M,s)  (0)
-#endif /* MORECORE_CANNOT_TRIM */
-
-/*
-  TOP_FOOT_SIZE is padding at the end of a segment, including space
-  that may be needed to place segment records and fenceposts when new
-  noncontiguous segments are added.
-*/
-#define TOP_FOOT_SIZE\
-  (align_offset(chunk2mem(0))+pad_request(sizeof(struct malloc_segment))+MIN_CHUNK_SIZE)
-
-
-/* -------------------------------  Hooks -------------------------------- */
-
-/*
-  PREACTION should be defined to return 0 on success, and nonzero on
-  failure. If you are not using locking, you can redefine these to do
-  anything you like.
-*/
-
-#if USE_LOCKS
-
-#define PREACTION(M)  ((use_lock(M))? ACQUIRE_LOCK(&(M)->mutex) : 0)
-#define POSTACTION(M) { if (use_lock(M)) RELEASE_LOCK(&(M)->mutex); }
-#else /* USE_LOCKS */
-
-#ifndef PREACTION
-#define PREACTION(M) (0)
-#endif  /* PREACTION */
-
-#ifndef POSTACTION
-#define POSTACTION(M)
-#endif  /* POSTACTION */
-
-#endif /* USE_LOCKS */
-
-/*
-  CORRUPTION_ERROR_ACTION is triggered upon detected bad addresses.
-  USAGE_ERROR_ACTION is triggered on detected bad frees and
-  reallocs. The argument p is an address that might have triggered the
-  fault. It is ignored by the two predefined actions, but might be
-  useful in custom actions that try to help diagnose errors.
-*/
-
-#if PROCEED_ON_ERROR
-
-/* A count of the number of corruption errors causing resets */
-int malloc_corruption_error_count;
-
-/* default corruption action */
-static void reset_on_error(mstate m);
-
-#define CORRUPTION_ERROR_ACTION(m)  reset_on_error(m)
-#define USAGE_ERROR_ACTION(m, p)
-
-#else /* PROCEED_ON_ERROR */
-
-#ifndef CORRUPTION_ERROR_ACTION
-#define CORRUPTION_ERROR_ACTION(m) ABORT
-#endif /* CORRUPTION_ERROR_ACTION */
-
-#ifndef USAGE_ERROR_ACTION
-#define USAGE_ERROR_ACTION(m,p) ABORT
-#endif /* USAGE_ERROR_ACTION */
-
-#endif /* PROCEED_ON_ERROR */
-
-/* -------------------------- Debugging setup ---------------------------- */
-
-#if ! DEBUG
-
-#define check_free_chunk(M,P)
-#define check_inuse_chunk(M,P)
-#define check_malloced_chunk(M,P,N)
-#define check_mmapped_chunk(M,P)
-#define check_malloc_state(M)
-#define check_top_chunk(M,P)
-
-#else /* DEBUG */
-#define check_free_chunk(M,P)       do_check_free_chunk(M,P)
-#define check_inuse_chunk(M,P)      do_check_inuse_chunk(M,P)
-#define check_top_chunk(M,P)        do_check_top_chunk(M,P)
-#define check_malloced_chunk(M,P,N) do_check_malloced_chunk(M,P,N)
-#define check_mmapped_chunk(M,P)    do_check_mmapped_chunk(M,P)
-#define check_malloc_state(M)       do_check_malloc_state(M)
-
-static void   do_check_any_chunk(mstate m, mchunkptr p);
-static void   do_check_top_chunk(mstate m, mchunkptr p);
-static void   do_check_mmapped_chunk(mstate m, mchunkptr p);
-static void   do_check_inuse_chunk(mstate m, mchunkptr p);
-static void   do_check_free_chunk(mstate m, mchunkptr p);
-static void   do_check_malloced_chunk(mstate m, void* mem, size_t s);
-static void   do_check_tree(mstate m, tchunkptr t);
-static void   do_check_treebin(mstate m, bindex_t i);
-static void   do_check_smallbin(mstate m, bindex_t i);
-static void   do_check_malloc_state(mstate m);
-static int    bin_find(mstate m, mchunkptr x);
-static size_t traverse_and_check(mstate m);
-#endif /* DEBUG */
-
-/* ---------------------------- Indexing Bins ---------------------------- */
-
-#define is_small(s)         (((s) >> SMALLBIN_SHIFT) < NSMALLBINS)
-#define small_index(s)      (bindex_t)((s)  >> SMALLBIN_SHIFT)
-#define small_index2size(i) ((i)  << SMALLBIN_SHIFT)
-#define MIN_SMALL_INDEX     (small_index(MIN_CHUNK_SIZE))
-
-/* addressing by index. See above about smallbin repositioning */
-#define smallbin_at(M, i)   ((sbinptr)((char*)&((M)->smallbins[(i)<<1])))
-#define treebin_at(M,i)     (&((M)->treebins[i]))
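-
-/*
-  Illustrative sketch (not in the original source, excluded from
-  compilation): a worked example of the smallbin indexing above with
-  SMALLBIN_SHIFT == 3, so bins are spaced 8 bytes apart.  A free chunk
-  of size 40 lands in bin small_index(40) == 40 >> 3 == 5, and
-  small_index2size(5) == 40 recovers the bin's chunk size.  The cast in
-  smallbin_at() makes the two pointers stored at smallbins[2*i] act as
-  the fd/bk fields of a header chunk whose other fields are never
-  touched -- the repositioning trick mentioned above.
-*/
-#if 0
-static void smallbin_index_example(mstate m) {
-  size_t sz = 40;                    /* a small chunk size */
-  bindex_t i = small_index(sz);      /* 40 >> 3 == 5 */
-  sbinptr b = smallbin_at(m, i);     /* header overlaying smallbins[10..11] */
-  assert(small_index2size(i) == sz);
-  assert(b->fd == b || chunksize(b->fd) == sz); /* empty, or holds 40-byte chunks */
-}
-#endif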
-
-/* assign tree index for size S to variable I. Use x86 asm if possible  */
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-#define compute_tree_index(S, I)\
-{\
-  unsigned int X = S >> TREEBIN_SHIFT;\
-  if (X == 0)\
-    I = 0;\
-  else if (X > 0xFFFF)\
-    I = NTREEBINS-1;\
-  else {\
-    unsigned int K;\
-    __asm__("bsrl\t%1, %0\n\t" : "=r" (K) : "g"  (X));\
-    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
-  }\
-}
-
-#elif defined (__INTEL_COMPILER)
-#define compute_tree_index(S, I)\
-{\
-  size_t X = S >> TREEBIN_SHIFT;\
-  if (X == 0)\
-    I = 0;\
-  else if (X > 0xFFFF)\
-    I = NTREEBINS-1;\
-  else {\
-    unsigned int K = _bit_scan_reverse (X); \
-    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
-  }\
-}
-
-#elif defined(_MSC_VER) && _MSC_VER>=1300
-#define compute_tree_index(S, I)\
-{\
-  size_t X = S >> TREEBIN_SHIFT;\
-  if (X == 0)\
-    I = 0;\
-  else if (X > 0xFFFF)\
-    I = NTREEBINS-1;\
-  else {\
-    unsigned int K;\
-    _BitScanReverse((DWORD *) &K, (DWORD) X);\
-    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
-  }\
-}
-
-#else /* GNUC */
-#define compute_tree_index(S, I)\
-{\
-  size_t X = S >> TREEBIN_SHIFT;\
-  if (X == 0)\
-    I = 0;\
-  else if (X > 0xFFFF)\
-    I = NTREEBINS-1;\
-  else {\
-    unsigned int Y = (unsigned int)X;\
-    unsigned int N = ((Y - 0x100) >> 16) & 8;\
-    unsigned int K = (((Y <<= N) - 0x1000) >> 16) & 4;\
-    N += K;\
-    N += K = (((Y <<= K) - 0x4000) >> 16) & 2;\
-    K = 14 - N + ((Y <<= K) >> 15);\
-    I = (K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1));\
-  }\
-}
-#endif /* GNUC */
-
-/* Bit representing maximum resolved size in a treebin at i */
-#define bit_for_tree_index(i) \
-   (i == NTREEBINS-1)? (SIZE_T_BITSIZE-1) : (((i) >> 1) + TREEBIN_SHIFT - 2)
-
-/* Shift placing maximum resolved bit in a treebin at i as sign bit */
-#define leftshift_for_tree_index(i) \
-   ((i == NTREEBINS-1)? 0 : \
-    ((SIZE_T_BITSIZE-SIZE_T_ONE) - (((i) >> 1) + TREEBIN_SHIFT - 2)))
-
-/* The size of the smallest chunk held in bin with index i */
-#define minsize_for_tree_index(i) \
-   ((SIZE_T_ONE << (((i) >> 1) + TREEBIN_SHIFT)) |  \
-   (((size_t)((i) & SIZE_T_ONE)) << (((i) >> 1) + TREEBIN_SHIFT - 1)))
-
-
-/* ------------------------ Operations on bin maps ----------------------- */
-
-/* bit corresponding to given index */
-#define idx2bit(i)              ((binmap_t)(1) << (i))
-
-/* Mark/Clear bits with given index */
-#define mark_smallmap(M,i)      ((M)->smallmap |=  idx2bit(i))
-#define clear_smallmap(M,i)     ((M)->smallmap &= ~idx2bit(i))
-#define smallmap_is_marked(M,i) ((M)->smallmap &   idx2bit(i))
-
-#define mark_treemap(M,i)       ((M)->treemap  |=  idx2bit(i))
-#define clear_treemap(M,i)      ((M)->treemap  &= ~idx2bit(i))
-#define treemap_is_marked(M,i)  ((M)->treemap  &   idx2bit(i))
-
-/* isolate the least set bit of a bitmap */
-#define least_bit(x)         ((x) & -(x))
-
-/* mask with all bits to left of least bit of x on */
-#define left_bits(x)         ((x<<1) | -(x<<1))
-
-/* mask with all bits to left of or equal to least bit of x on */
-#define same_or_left_bits(x) ((x) | -(x))
-
-/* index corresponding to given bit. Use x86 asm if possible */
-
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int J;\
-  __asm__("bsfl\t%1, %0\n\t" : "=r" (J) : "g" (X));\
-  I = (bindex_t)J;\
-}
-
-#elif defined (__INTEL_COMPILER)
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int J;\
-  J = _bit_scan_forward (X); \
-  I = (bindex_t)J;\
-}
-
-#elif defined(_MSC_VER) && _MSC_VER>=1300
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int J;\
-  _BitScanForward((DWORD *) &J, X);\
-  I = (bindex_t)J;\
-}
-
-#elif USE_BUILTIN_FFS
-#define compute_bit2idx(X, I) I = ffs(X)-1
-
-#else
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int Y = X - 1;\
-  unsigned int K = Y >> (16-4) & 16;\
-  unsigned int N = K;        Y >>= K;\
-  N += K = Y >> (8-3) &  8;  Y >>= K;\
-  N += K = Y >> (4-2) &  4;  Y >>= K;\
-  N += K = Y >> (2-1) &  2;  Y >>= K;\
-  N += K = Y >> (1-0) &  1;  Y >>= K;\
-  I = (bindex_t)(N + Y);\
-}
-#endif /* GNUC */
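-
-/*
-  Illustrative sketch (not in the original source, excluded from
-  compilation): how the bitmap helpers above combine to find the
-  nearest non-empty smallbin with index >= i, the way the allocator
-  services a small request that has no exact fit.  same_or_left_bits()
-  masks away bins smaller than i, least_bit() isolates the first
-  remaining one, and compute_bit2idx() converts that bit back into a
-  bin index.  The helper name next_nonempty_smallbin is hypothetical;
-  callers must first check that the masked map is nonzero.
-*/
-#if 0
-static bindex_t next_nonempty_smallbin(mstate m, bindex_t i) {
-  binmap_t candidates = m->smallmap & same_or_left_bits(idx2bit(i));
-  binmap_t b = least_bit(candidates);  /* lowest qualifying non-empty bin */
-  bindex_t j;
-  compute_bit2idx(b, j);
-  return j;
-}
-#endif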
-
-
-/* ----------------------- Runtime Check Support ------------------------- */
-
-/*
-  For security, the main invariant is that malloc/free/etc never
-  writes to a static address other than malloc_state, unless static
-  malloc_state itself has been corrupted, which cannot occur via
-  malloc (because of these checks). In essence this means that we
-  believe all pointers, sizes, maps etc held in malloc_state, but
-  check all of those linked or offsetted from other embedded data
-  structures.  These checks are interspersed with main code in a way
-  that tends to minimize their run-time cost.
-
-  When FOOTERS is defined, in addition to range checking, we also
-  verify footer fields of inuse chunks, which can be used to guarantee
-  that the mstate controlling malloc/free is intact.  This is a
-  streamlined version of the approach described by William Robertson
-  et al in "Run-time Detection of Heap-based Overflows" LISA'03
-  http://www.usenix.org/events/lisa03/tech/robertson.html The footer
-  of an inuse chunk holds the xor of its mstate and a random seed,
-  that is checked upon calls to free() and realloc().  This is
-  (probabilistically) unguessable from outside the program, but can be
-  computed by any code successfully malloc'ing any chunk, so does not
-  itself provide protection against code that has already broken
-  security through some other means.  Unlike Robertson et al, we
-  always dynamically check addresses of all offset chunks (previous,
-  next, etc). This turns out to be cheaper than relying on hashes.
-*/
-
-#if !INSECURE
-/* Check if address a is at least as high as any from MORECORE or MMAP */
-#define ok_address(M, a) ((char*)(a) >= (M)->least_addr)
-/* Check if address of next chunk n is higher than base chunk p */
-#define ok_next(p, n)    ((char*)(p) < (char*)(n))
-/* Check if p has inuse status */
-#define ok_inuse(p)     is_inuse(p)
-/* Check if p has its pinuse bit on */
-#define ok_pinuse(p)     pinuse(p)
-
-#else /* !INSECURE */
-#define ok_address(M, a) (1)
-#define ok_next(b, n)    (1)
-#define ok_inuse(p)      (1)
-#define ok_pinuse(p)     (1)
-#endif /* !INSECURE */
-
-#if (FOOTERS && !INSECURE)
-/* Check if (alleged) mstate m has expected magic field */
-#define ok_magic(M)      ((M)->magic == mparams.magic)
-#else  /* (FOOTERS && !INSECURE) */
-#define ok_magic(M)      (1)
-#endif /* (FOOTERS && !INSECURE) */
-
-
-/* In gcc, use __builtin_expect to minimize impact of checks */
-#if !INSECURE
-#if defined(__GNUC__) && __GNUC__ >= 3
-#define RTCHECK(e)  __builtin_expect(e, 1)
-#else /* GNUC */
-#define RTCHECK(e)  (e)
-#endif /* GNUC */
-#else /* !INSECURE */
-#define RTCHECK(e)  (1)
-#endif /* !INSECURE */
-
-/* macros to set up inuse chunks with or without footers */
-
-#if !FOOTERS
-
-#define mark_inuse_foot(M,p,s)
-
-/* Macros for setting head/foot of non-mmapped chunks */
-
-/* Set cinuse bit and pinuse bit of next chunk */
-#define set_inuse(M,p,s)\
-  ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\
-  ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT)
-
-/* Set cinuse and pinuse of this chunk and pinuse of next chunk */
-#define set_inuse_and_pinuse(M,p,s)\
-  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
-  ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT)
-
-/* Set size, cinuse and pinuse bit of this chunk */
-#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
-  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT))
-
-#else /* FOOTERS */
-
-/* Set foot of inuse chunk to be xor of mstate and seed */
-#define mark_inuse_foot(M,p,s)\
-  (((mchunkptr)((char*)(p) + (s)))->prev_foot = ((size_t)(M) ^ mparams.magic))
-
-#define get_mstate_for(p)\
-  ((mstate)(((mchunkptr)((char*)(p) +\
-    (chunksize(p))))->prev_foot ^ mparams.magic))
-
-#define set_inuse(M,p,s)\
-  ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\
-  (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT), \
-  mark_inuse_foot(M,p,s))
-
-#define set_inuse_and_pinuse(M,p,s)\
-  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
-  (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT),\
- mark_inuse_foot(M,p,s))
-
-#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
-  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
-  mark_inuse_foot(M, p, s))
-
-#endif /* !FOOTERS */
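-
-/*
-  Illustrative sketch (not in the original source, excluded from
-  compilation): under FOOTERS, free() recovers the owning mstate from
-  the chunk footer and cross-checks it before trusting it, along the
-  lines of this condensed version of the dlfree prologue.
-*/
-#if 0
-static void footer_check_example(void* mem) {
-  mchunkptr p = mem2chunk(mem);
-  mstate fm = get_mstate_for(p);   /* footer xor mparams.magic yields the mstate */
-  if (!ok_magic(fm)) {             /* corrupt chunk or foreign pointer */
-    USAGE_ERROR_ACTION(fm, p);
-    return;
-  }
-  /* ... proceed to free p from space fm ... */
-}
-#endif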
-
-/* ---------------------------- setting mparams -------------------------- */
-
-#ifdef ENABLE_LARGE_PAGES
-typedef size_t (WINAPI *GetLargePageMinimum_t)(void);
-#endif
-
-/* Initialize mparams */
-static int init_mparams(void) {
-#ifdef NEED_GLOBAL_LOCK_INIT
-  if (malloc_global_mutex_status <= 0)
-    init_malloc_global_mutex();
-#endif
-
-  ACQUIRE_MALLOC_GLOBAL_LOCK();
-  if (mparams.magic == 0) {
-    size_t magic;
-    size_t psize;
-    size_t gsize;
-
-#ifndef WIN32
-    psize = malloc_getpagesize;
-    gsize = ((DEFAULT_GRANULARITY != 0)? DEFAULT_GRANULARITY : psize);
-#else /* WIN32 */
-    {
-      SYSTEM_INFO system_info;
-      GetSystemInfo(&system_info);
-      psize = system_info.dwPageSize;
-      gsize = ((DEFAULT_GRANULARITY != 0)?
-               DEFAULT_GRANULARITY : system_info.dwAllocationGranularity);
-#ifdef ENABLE_LARGE_PAGES
-      { 
-          GetLargePageMinimum_t GetLargePageMinimum_ = (GetLargePageMinimum_t) GetProcAddress(GetModuleHandle(__T("kernel32.dll")), "GetLargePageMinimum");
-          if(GetLargePageMinimum_) {
-              size_t largepagesize = GetLargePageMinimum_();
-              if(largepagesize) {
-                  psize = largepagesize;
-                  gsize = ((DEFAULT_GRANULARITY != 0)?
-                           DEFAULT_GRANULARITY : largepagesize);
-                  if(gsize < largepagesize) gsize = largepagesize;
-              }
-          }
-      }
-#endif
-    }
-#endif /* WIN32 */
-
-    /* Sanity-check configuration:
-       size_t must be unsigned and as wide as pointer type.
-       ints must be at least 4 bytes.
-       alignment must be at least 8.
-       Alignment, min chunk size, and page size must all be powers of 2.
-    */
-    if ((sizeof(size_t) != sizeof(char*)) ||
-        (MAX_SIZE_T < MIN_CHUNK_SIZE)  ||
-        (sizeof(int) < 4)  ||
-        (MALLOC_ALIGNMENT < (size_t)8U) ||
-        ((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-SIZE_T_ONE)) != 0) ||
-        ((MCHUNK_SIZE      & (MCHUNK_SIZE-SIZE_T_ONE))      != 0) ||
-        ((gsize            & (gsize-SIZE_T_ONE))            != 0) ||
-        ((psize            & (psize-SIZE_T_ONE))            != 0))
-      ABORT;
-
-    mparams.granularity = gsize;
-    mparams.page_size = psize;
-    mparams.mmap_threshold = DEFAULT_MMAP_THRESHOLD;
-    mparams.trim_threshold = DEFAULT_TRIM_THRESHOLD;
-#if MORECORE_CONTIGUOUS
-    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT;
-#else  /* MORECORE_CONTIGUOUS */
-    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT|USE_NONCONTIGUOUS_BIT;
-#endif /* MORECORE_CONTIGUOUS */
-
-#if !ONLY_MSPACES
-    /* Set up lock for main malloc area */
-    gm->mflags = mparams.default_mflags;
-    INITIAL_LOCK(&gm->mutex);
-#endif
-
-    {
-#if USE_DEV_RANDOM
-      int fd;
-      unsigned char buf[sizeof(size_t)];
-      /* Try to use /dev/urandom, else fall back on using time */
-      if ((fd = open("/dev/urandom", O_RDONLY)) >= 0 &&
-          read(fd, buf, sizeof(buf)) == sizeof(buf)) {
-        magic = *((size_t *) buf);
-        close(fd);
-      }
-      else
-#endif /* USE_DEV_RANDOM */
-#ifdef WIN32
-        magic = (size_t)(GetTickCount() ^ (size_t)0x55555555U);
-#else
-        magic = (size_t)(time(0) ^ (size_t)0x55555555U);
-#endif
-      magic |= (size_t)8U;    /* ensure nonzero */
-      magic &= ~(size_t)7U;   /* improve chances of fault for bad values */
-      mparams.magic = magic;
-    }
-  }
-
-  RELEASE_MALLOC_GLOBAL_LOCK();
-  return 1;
-}
-
-/* support for mallopt */
-static int change_mparam(int param_number, int value) {
-  size_t val;
-  ensure_initialization();
-  val = (value == -1)? MAX_SIZE_T : (size_t)value;
-  switch(param_number) {
-  case M_TRIM_THRESHOLD:
-    mparams.trim_threshold = val;
-    return 1;
-  case M_GRANULARITY:
-    if (val >= mparams.page_size && ((val & (val-1)) == 0)) {
-      mparams.granularity = val;
-      return 1;
-    }
-    else
-      return 0;
-  case M_MMAP_THRESHOLD:
-    mparams.mmap_threshold = val;
-    return 1;
-  default:
-    return 0;
-  }
-}
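-
-/*
-  Illustrative sketch (not in the original source, excluded from
-  compilation): change_mparam backs the public mallopt entry point
-  (dlmallopt when USE_DL_PREFIX is defined).  A value of -1 maps to
-  MAX_SIZE_T; other values are taken as byte counts, and M_GRANULARITY
-  must be a power of two no smaller than the page size.
-*/
-#if 0
-static void mallopt_example(void) {
-  dlmallopt(M_MMAP_THRESHOLD, 1024 * 1024); /* mmap only requests >= 1 MiB */
-  dlmallopt(M_GRANULARITY, 64 * 1024);      /* get system memory in 64 KiB units */
-}
-#endif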
-
-#if DEBUG
-/* ------------------------- Debugging Support --------------------------- */
-
-/* Check properties of any chunk, whether free, inuse, mmapped etc  */
-static void do_check_any_chunk(mstate m, mchunkptr p) {
-  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
-  assert(ok_address(m, p));
-}
-
-/* Check properties of top chunk */
-static void do_check_top_chunk(mstate m, mchunkptr p) {
-  msegmentptr sp = segment_holding(m, (char*)p);
-  size_t  sz = p->head & ~INUSE_BITS; /* third-lowest bit can be set! */
-  assert(sp != 0);
-  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
-  assert(ok_address(m, p));
-  assert(sz == m->topsize);
-  assert(sz > 0);
-  assert(sz == ((sp->base + sp->size) - (char*)p) - TOP_FOOT_SIZE);
-  assert(pinuse(p));
-  assert(!pinuse(chunk_plus_offset(p, sz)));
-}
-
-/* Check properties of (inuse) mmapped chunks */
-static void do_check_mmapped_chunk(mstate m, mchunkptr p) {
-  size_t  sz = chunksize(p);
-  size_t len = (sz + (p->prev_foot) + MMAP_FOOT_PAD);
-  assert(is_mmapped(p));
-  assert(use_mmap(m));
-  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
-  assert(ok_address(m, p));
-  assert(!is_small(sz));
-  assert((len & (mparams.page_size-SIZE_T_ONE)) == 0);
-  assert(chunk_plus_offset(p, sz)->head == FENCEPOST_HEAD);
-  assert(chunk_plus_offset(p, sz+SIZE_T_SIZE)->head == 0);
-}
-
-/* Check properties of inuse chunks */
-static void do_check_inuse_chunk(mstate m, mchunkptr p) {
-  do_check_any_chunk(m, p);
-  assert(is_inuse(p));
-  assert(next_pinuse(p));
-  /* If not pinuse and not mmapped, previous chunk has OK offset */
-  assert(is_mmapped(p) || pinuse(p) || next_chunk(prev_chunk(p)) == p);
-  if (is_mmapped(p))
-    do_check_mmapped_chunk(m, p);
-}
-
-/* Check properties of free chunks */
-static void do_check_free_chunk(mstate m, mchunkptr p) {
-  size_t sz = chunksize(p);
-  mchunkptr next = chunk_plus_offset(p, sz);
-  do_check_any_chunk(m, p);
-  assert(!is_inuse(p));
-  assert(!next_pinuse(p));
-  assert (!is_mmapped(p));
-  if (p != m->dv && p != m->top) {
-    if (sz >= MIN_CHUNK_SIZE) {
-      assert((sz & CHUNK_ALIGN_MASK) == 0);
-      assert(is_aligned(chunk2mem(p)));
-      assert(next->prev_foot == sz);
-      assert(pinuse(p));
-      assert (next == m->top || is_inuse(next));
-      assert(p->fd->bk == p);
-      assert(p->bk->fd == p);
-    }
-    else  /* markers are always of size SIZE_T_SIZE */
-      assert(sz == SIZE_T_SIZE);
-  }
-}
-
-/* Check properties of malloced chunks at the point they are malloced */
-static void do_check_malloced_chunk(mstate m, void* mem, size_t s) {
-  if (mem != 0) {
-    mchunkptr p = mem2chunk(mem);
-    size_t sz = p->head & ~INUSE_BITS;
-    do_check_inuse_chunk(m, p);
-    assert((sz & CHUNK_ALIGN_MASK) == 0);
-    assert(sz >= MIN_CHUNK_SIZE);
-    assert(sz >= s);
-    /* unless mmapped, size is less than MIN_CHUNK_SIZE more than request */
-    assert(is_mmapped(p) || sz < (s + MIN_CHUNK_SIZE));
-  }
-}
-
-/* Check a tree and its subtrees.  */
-static void do_check_tree(mstate m, tchunkptr t) {
-  tchunkptr head = 0;
-  tchunkptr u = t;
-  bindex_t tindex = t->index;
-  size_t tsize = chunksize(t);
-  bindex_t idx;
-  compute_tree_index(tsize, idx);
-  assert(tindex == idx);
-  assert(tsize >= MIN_LARGE_SIZE);
-  assert(tsize >= minsize_for_tree_index(idx));
-  assert((idx == NTREEBINS-1) || (tsize < minsize_for_tree_index((idx+1))));
-
-  do { /* traverse through chain of same-sized nodes */
-    do_check_any_chunk(m, ((mchunkptr)u));
-    assert(u->index == tindex);
-    assert(chunksize(u) == tsize);
-    assert(!is_inuse(u));
-    assert(!next_pinuse(u));
-    assert(u->fd->bk == u);
-    assert(u->bk->fd == u);
-    if (u->parent == 0) {
-      assert(u->child[0] == 0);
-      assert(u->child[1] == 0);
-    }
-    else {
-      assert(head == 0); /* only one node on chain has parent */
-      head = u;
-      assert(u->parent != u);
-      assert (u->parent->child[0] == u ||
-              u->parent->child[1] == u ||
-              *((tbinptr*)(u->parent)) == u);
-      if (u->child[0] != 0) {
-        assert(u->child[0]->parent == u);
-        assert(u->child[0] != u);
-        do_check_tree(m, u->child[0]);
-      }
-      if (u->child[1] != 0) {
-        assert(u->child[1]->parent == u);
-        assert(u->child[1] != u);
-        do_check_tree(m, u->child[1]);
-      }
-      if (u->child[0] != 0 && u->child[1] != 0) {
-        assert(chunksize(u->child[0]) < chunksize(u->child[1]));
-      }
-    }
-    u = u->fd;
-  } while (u != t);
-  assert(head != 0);
-}
-
-/*  Check all the chunks in a treebin.  */
-static void do_check_treebin(mstate m, bindex_t i) {
-  tbinptr* tb = treebin_at(m, i);
-  tchunkptr t = *tb;
-  int empty = (m->treemap & (1U << i)) == 0;
-  if (t == 0)
-    assert(empty);
-  if (!empty)
-    do_check_tree(m, t);
-}
-
-/*  Check all the chunks in a smallbin.  */
-static void do_check_smallbin(mstate m, bindex_t i) {
-  sbinptr b = smallbin_at(m, i);
-  mchunkptr p = b->bk;
-  unsigned int empty = (m->smallmap & (1U << i)) == 0;
-  if (p == b)
-    assert(empty);
-  if (!empty) {
-    for (; p != b; p = p->bk) {
-      size_t size = chunksize(p);
-      mchunkptr q;
-      /* each chunk claims to be free */
-      do_check_free_chunk(m, p);
-      /* chunk belongs in bin */
-      assert(small_index(size) == i);
-      assert(p->bk == b || chunksize(p->bk) == chunksize(p));
-      /* chunk is followed by an inuse chunk */
-      q = next_chunk(p);
-      if (q->head != FENCEPOST_HEAD)
-        do_check_inuse_chunk(m, q);
-    }
-  }
-}
-
-/* Find x in a bin. Used in other check functions. */
-static int bin_find(mstate m, mchunkptr x) {
-  size_t size = chunksize(x);
-  if (is_small(size)) {
-    bindex_t sidx = small_index(size);
-    sbinptr b = smallbin_at(m, sidx);
-    if (smallmap_is_marked(m, sidx)) {
-      mchunkptr p = b;
-      do {
-        if (p == x)
-          return 1;
-      } while ((p = p->fd) != b);
-    }
-  }
-  else {
-    bindex_t tidx;
-    compute_tree_index(size, tidx);
-    if (treemap_is_marked(m, tidx)) {
-      tchunkptr t = *treebin_at(m, tidx);
-      size_t sizebits = size << leftshift_for_tree_index(tidx);
-      while (t != 0 && chunksize(t) != size) {
-        t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
-        sizebits <<= 1;
-      }
-      if (t != 0) {
-        tchunkptr u = t;
-        do {
-          if (u == (tchunkptr)x)
-            return 1;
-        } while ((u = u->fd) != t);
-      }
-    }
-  }
-  return 0;
-}
-
-/* Traverse each chunk and check it; return total */
-static size_t traverse_and_check(mstate m) {
-  size_t sum = 0;
-  if (is_initialized(m)) {
-    msegmentptr s = &m->seg;
-    sum += m->topsize + TOP_FOOT_SIZE;
-    while (s != 0) {
-      mchunkptr q = align_as_chunk(s->base);
-      mchunkptr lastq = 0;
-      assert(pinuse(q));
-      while (segment_holds(s, q) &&
-             q != m->top && q->head != FENCEPOST_HEAD) {
-        sum += chunksize(q);
-        if (is_inuse(q)) {
-          assert(!bin_find(m, q));
-          do_check_inuse_chunk(m, q);
-        }
-        else {
-          assert(q == m->dv || bin_find(m, q));
-          assert(lastq == 0 || is_inuse(lastq)); /* Not 2 consecutive free */
-          do_check_free_chunk(m, q);
-        }
-        lastq = q;
-        q = next_chunk(q);
-      }
-      s = s->next;
-    }
-  }
-  return sum;
-}
-
-/* Check all properties of malloc_state. */
-static void do_check_malloc_state(mstate m) {
-  bindex_t i;
-  size_t total;
-  /* check bins */
-  for (i = 0; i < NSMALLBINS; ++i)
-    do_check_smallbin(m, i);
-  for (i = 0; i < NTREEBINS; ++i)
-    do_check_treebin(m, i);
-
-  if (m->dvsize != 0) { /* check dv chunk */
-    do_check_any_chunk(m, m->dv);
-    assert(m->dvsize == chunksize(m->dv));
-    assert(m->dvsize >= MIN_CHUNK_SIZE);
-    assert(bin_find(m, m->dv) == 0);
-  }
-
-  if (m->top != 0) {   /* check top chunk */
-    do_check_top_chunk(m, m->top);
-    /*assert(m->topsize == chunksize(m->top)); redundant */
-    assert(m->topsize > 0);
-    assert(bin_find(m, m->top) == 0);
-  }
-
-  total = traverse_and_check(m);
-  assert(total <= m->footprint);
-  assert(m->footprint <= m->max_footprint);
-}
-#endif /* DEBUG */
-
-/* ----------------------------- statistics ------------------------------ */
-
-#if !NO_MALLINFO
-static struct mallinfo internal_mallinfo(mstate m) {
-  struct mallinfo nm = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-  ensure_initialization();
-  if (!PREACTION(m)) {
-    check_malloc_state(m);
-    if (is_initialized(m)) {
-      size_t nfree = SIZE_T_ONE; /* top always free */
-      size_t mfree = m->topsize + TOP_FOOT_SIZE;
-      size_t sum = mfree;
-      msegmentptr s = &m->seg;
-      while (s != 0) {
-        mchunkptr q = align_as_chunk(s->base);
-        while (segment_holds(s, q) &&
-               q != m->top && q->head != FENCEPOST_HEAD) {
-          size_t sz = chunksize(q);
-          sum += sz;
-          if (!is_inuse(q)) {
-            mfree += sz;
-            ++nfree;
-          }
-          q = next_chunk(q);
-        }
-        s = s->next;
-      }
-
-      nm.arena    = sum;
-      nm.ordblks  = nfree;
-      nm.hblkhd   = m->footprint - sum;
-      nm.usmblks  = m->max_footprint;
-      nm.uordblks = m->footprint - mfree;
-      nm.fordblks = mfree;
-      nm.keepcost = m->topsize;
-    }
-
-    POSTACTION(m);
-  }
-  return nm;
-}
-#endif /* !NO_MALLINFO */
-
-static void internal_malloc_stats(mstate m) {
-  ensure_initialization();
-  if (!PREACTION(m)) {
-    size_t maxfp = 0;
-    size_t fp = 0;
-    size_t used = 0;
-    check_malloc_state(m);
-    if (is_initialized(m)) {
-      msegmentptr s = &m->seg;
-      maxfp = m->max_footprint;
-      fp = m->footprint;
-      used = fp - (m->topsize + TOP_FOOT_SIZE);
-
-      while (s != 0) {
-        mchunkptr q = align_as_chunk(s->base);
-        while (segment_holds(s, q) &&
-               q != m->top && q->head != FENCEPOST_HEAD) {
-          if (!is_inuse(q))
-            used -= chunksize(q);
-          q = next_chunk(q);
-        }
-        s = s->next;
-      }
-    }
-
-    fprintf(stderr, "max system bytes = %10lu\n", (unsigned long)(maxfp));
-    fprintf(stderr, "system bytes     = %10lu\n", (unsigned long)(fp));
-    fprintf(stderr, "in use bytes     = %10lu\n", (unsigned long)(used));
-
-    POSTACTION(m);
-  }
-}
-
-/* ----------------------- Operations on smallbins ----------------------- */
-
-/*
-  Various forms of linking and unlinking are defined as macros.  Even
-  the ones for trees, which are very long but have very short typical
-  paths.  This is ugly but reduces reliance on inlining support of
-  compilers.
-*/
-
-/* Link a free chunk into a smallbin  */
-#define insert_small_chunk(M, P, S) {\
-  bindex_t I  = small_index(S);\
-  mchunkptr B = smallbin_at(M, I);\
-  mchunkptr F = B;\
-  assert(S >= MIN_CHUNK_SIZE);\
-  if (!smallmap_is_marked(M, I))\
-    mark_smallmap(M, I);\
-  else if (RTCHECK(ok_address(M, B->fd)))\
-    F = B->fd;\
-  else {\
-    CORRUPTION_ERROR_ACTION(M);\
-  }\
-  B->fd = P;\
-  F->bk = P;\
-  P->fd = F;\
-  P->bk = B;\
-}
-
-/* Unlink a chunk from a smallbin  */
-#define unlink_small_chunk(M, P, S) {\
-  mchunkptr F = P->fd;\
-  mchunkptr B = P->bk;\
-  bindex_t I = small_index(S);\
-  assert(P != B);\
-  assert(P != F);\
-  assert(chunksize(P) == small_index2size(I));\
-  if (F == B)\
-    clear_smallmap(M, I);\
-  else if (RTCHECK((F == smallbin_at(M,I) || ok_address(M, F)) &&\
-                   (B == smallbin_at(M,I) || ok_address(M, B)))) {\
-    F->bk = B;\
-    B->fd = F;\
-  }\
-  else {\
-    CORRUPTION_ERROR_ACTION(M);\
-  }\
-}
-
-/* Unlink the first chunk from a smallbin */
-#define unlink_first_small_chunk(M, B, P, I) {\
-  mchunkptr F = P->fd;\
-  assert(P != B);\
-  assert(P != F);\
-  assert(chunksize(P) == small_index2size(I));\
-  if (B == F)\
-    clear_smallmap(M, I);\
-  else if (RTCHECK(ok_address(M, F))) {\
-    B->fd = F;\
-    F->bk = B;\
-  }\
-  else {\
-    CORRUPTION_ERROR_ACTION(M);\
-  }\
-}
-
-
-
-/* Replace dv node, binning the old one */
-/* Used only when dvsize known to be small */
-#define replace_dv(M, P, S) {\
-  size_t DVS = M->dvsize;\
-  if (DVS != 0) {\
-    mchunkptr DV = M->dv;\
-    assert(is_small(DVS));\
-    insert_small_chunk(M, DV, DVS);\
-  }\
-  M->dvsize = S;\
-  M->dv = P;\
-}
-
-/* ------------------------- Operations on trees ------------------------- */
-
-/* Insert chunk into tree */
-#define insert_large_chunk(M, X, S) {\
-  tbinptr* H;\
-  bindex_t I;\
-  compute_tree_index(S, I);\
-  H = treebin_at(M, I);\
-  X->index = I;\
-  X->child[0] = X->child[1] = 0;\
-  if (!treemap_is_marked(M, I)) {\
-    mark_treemap(M, I);\
-    *H = X;\
-    X->parent = (tchunkptr)H;\
-    X->fd = X->bk = X;\
-  }\
-  else {\
-    tchunkptr T = *H;\
-    size_t K = S << leftshift_for_tree_index(I);\
-    for (;;) {\
-      if (chunksize(T) != S) {\
-        tchunkptr* C = &(T->child[(K >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]);\
-        K <<= 1;\
-        if (*C != 0)\
-          T = *C;\
-        else if (RTCHECK(ok_address(M, C))) {\
-          *C = X;\
-          X->parent = T;\
-          X->fd = X->bk = X;\
-          break;\
-        }\
-        else {\
-          CORRUPTION_ERROR_ACTION(M);\
-          break;\
-        }\
-      }\
-      else {\
-        tchunkptr F = T->fd;\
-        if (RTCHECK(ok_address(M, T) && ok_address(M, F))) {\
-          T->fd = F->bk = X;\
-          X->fd = F;\
-          X->bk = T;\
-          X->parent = 0;\
-          break;\
-        }\
-        else {\
-          CORRUPTION_ERROR_ACTION(M);\
-          break;\
-        }\
-      }\
-    }\
-  }\
-}
-
-/*
-  Unlink steps:
-
-  1. If x is a chained node, unlink it from its same-sized fd/bk links
-     and choose its bk node as its replacement.
-  2. If x was the last node of its size, but not a leaf node, it must
-     be replaced with a leaf node (not merely one with an open left or
-     right), to make sure that lefts and rights of descendants
-     correspond properly to bit masks.  We use the rightmost descendant
-     of x.  We could use any other leaf, but this is easy to locate and
-     tends to counteract removal of leftmosts elsewhere, and so keeps
-     paths shorter than minimally guaranteed.  This doesn't loop much
-     because on average a node in a tree is near the bottom.
-  3. If x is the base of a chain (i.e., has parent links) relink
-     x's parent and children to x's replacement (or null if none).
-*/
-
-#define unlink_large_chunk(M, X) {\
-  tchunkptr XP = X->parent;\
-  tchunkptr R;\
-  if (X->bk != X) {\
-    tchunkptr F = X->fd;\
-    R = X->bk;\
-    if (RTCHECK(ok_address(M, F))) {\
-      F->bk = R;\
-      R->fd = F;\
-    }\
-    else {\
-      CORRUPTION_ERROR_ACTION(M);\
-    }\
-  }\
-  else {\
-    tchunkptr* RP;\
-    if (((R = *(RP = &(X->child[1]))) != 0) ||\
-        ((R = *(RP = &(X->child[0]))) != 0)) {\
-      tchunkptr* CP;\
-      while ((*(CP = &(R->child[1])) != 0) ||\
-             (*(CP = &(R->child[0])) != 0)) {\
-        R = *(RP = CP);\
-      }\
-      if (RTCHECK(ok_address(M, RP)))\
-        *RP = 0;\
-      else {\
-        CORRUPTION_ERROR_ACTION(M);\
-      }\
-    }\
-  }\
-  if (XP != 0) {\
-    tbinptr* H = treebin_at(M, X->index);\
-    if (X == *H) {\
-      if ((*H = R) == 0) \
-        clear_treemap(M, X->index);\
-    }\
-    else if (RTCHECK(ok_address(M, XP))) {\
-      if (XP->child[0] == X) \
-        XP->child[0] = R;\
-      else \
-        XP->child[1] = R;\
-    }\
-    else\
-      CORRUPTION_ERROR_ACTION(M);\
-    if (R != 0) {\
-      if (RTCHECK(ok_address(M, R))) {\
-        tchunkptr C0, C1;\
-        R->parent = XP;\
-        if ((C0 = X->child[0]) != 0) {\
-          if (RTCHECK(ok_address(M, C0))) {\
-            R->child[0] = C0;\
-            C0->parent = R;\
-          }\
-          else\
-            CORRUPTION_ERROR_ACTION(M);\
-        }\
-        if ((C1 = X->child[1]) != 0) {\
-          if (RTCHECK(ok_address(M, C1))) {\
-            R->child[1] = C1;\
-            C1->parent = R;\
-          }\
-          else\
-            CORRUPTION_ERROR_ACTION(M);\
-        }\
-      }\
-      else\
-        CORRUPTION_ERROR_ACTION(M);\
-    }\
-  }\
-}
-
-/* Relays to large vs small bin operations */
-
-#define insert_chunk(M, P, S)\
-  if (is_small(S)) insert_small_chunk(M, P, S)\
-  else { tchunkptr TP = (tchunkptr)(P); insert_large_chunk(M, TP, S); }
-
-#define unlink_chunk(M, P, S)\
-  if (is_small(S)) unlink_small_chunk(M, P, S)\
-  else { tchunkptr TP = (tchunkptr)(P); unlink_large_chunk(M, TP); }
-
-
-/* Relays to internal calls to malloc/free from realloc, memalign etc */
-
-#if ONLY_MSPACES
-#define internal_malloc(m, b) mspace_malloc(m, b)
-#define internal_free(m, mem) mspace_free(m,mem);
-#else /* ONLY_MSPACES */
-#if MSPACES
-#define internal_malloc(m, b)\
-   (m == gm)? dlmalloc(b) : mspace_malloc(m, b)
-#define internal_free(m, mem)\
-   if (m == gm) dlfree(mem); else mspace_free(m,mem);
-#else /* MSPACES */
-#define internal_malloc(m, b) dlmalloc(b)
-#define internal_free(m, mem) dlfree(mem)
-#endif /* MSPACES */
-#endif /* ONLY_MSPACES */
-
-/* -----------------------  Direct-mmapping chunks ----------------------- */
-
-/*
-  Directly mmapped chunks are set up with an offset to the start of
-  the mmapped region stored in the prev_foot field of the chunk. This
-  allows reconstruction of the required argument to MUNMAP when freed,
-  and also allows adjustment of the returned chunk to meet alignment
-  requirements (especially in memalign).
-*/
-
-/* Malloc using mmap */
-static void* mmap_alloc(mstate m, size_t nb) {
-  size_t mmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
-  if (mmsize > nb) {     /* Check for wrap around 0 */
-    char* mm = (char*)(CALL_DIRECT_MMAP(mmsize));
-    if (mm != CMFAIL) {
-      size_t offset = align_offset(chunk2mem(mm));
-      size_t psize = mmsize - offset - MMAP_FOOT_PAD;
-      mchunkptr p = (mchunkptr)(mm + offset);
-      p->prev_foot = offset;
-      p->head = psize;
-      mark_inuse_foot(m, p, psize);
-      chunk_plus_offset(p, psize)->head = FENCEPOST_HEAD;
-      chunk_plus_offset(p, psize+SIZE_T_SIZE)->head = 0;
-
-      if (m->least_addr == 0 || mm < m->least_addr)
-        m->least_addr = mm;
-      if ((m->footprint += mmsize) > m->max_footprint)
-        m->max_footprint = m->footprint;
-      assert(is_aligned(chunk2mem(p)));
-      check_mmapped_chunk(m, p);
-      return chunk2mem(p);
-    }
-  }
-  return 0;
-}
-
-/* Realloc using mmap */
-static mchunkptr mmap_resize(mstate m, mchunkptr oldp, size_t nb) {
-  size_t oldsize = chunksize(oldp);
-  if (is_small(nb)) /* Can't shrink mmap regions below small size */
-    return 0;
-  /* Keep old chunk if big enough but not too big */
-  if (oldsize >= nb + SIZE_T_SIZE &&
-      (oldsize - nb) <= (mparams.granularity << 1))
-    return oldp;
-  else {
-    size_t offset = oldp->prev_foot;
-    size_t oldmmsize = oldsize + offset + MMAP_FOOT_PAD;
-    size_t newmmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
-    char* cp = (char*)CALL_MREMAP((char*)oldp - offset,
-                                  oldmmsize, newmmsize, 1);
-    if (cp != CMFAIL) {
-      mchunkptr newp = (mchunkptr)(cp + offset);
-      size_t psize = newmmsize - offset - MMAP_FOOT_PAD;
-      newp->head = psize;
-      mark_inuse_foot(m, newp, psize);
-      chunk_plus_offset(newp, psize)->head = FENCEPOST_HEAD;
-      chunk_plus_offset(newp, psize+SIZE_T_SIZE)->head = 0;
-
-      if (cp < m->least_addr)
-        m->least_addr = cp;
-      if ((m->footprint += newmmsize - oldmmsize) > m->max_footprint)
-        m->max_footprint = m->footprint;
-      check_mmapped_chunk(m, newp);
-      return newp;
-    }
-  }
-  return 0;
-}
-
-/* -------------------------- mspace management -------------------------- */
-
-/* Initialize top chunk and its size */
-static void init_top(mstate m, mchunkptr p, size_t psize) {
-  /* Ensure alignment */
-  size_t offset = align_offset(chunk2mem(p));
-  p = (mchunkptr)((char*)p + offset);
-  psize -= offset;
-
-  m->top = p;
-  m->topsize = psize;
-  p->head = psize | PINUSE_BIT;
-  /* set size of fake trailing chunk holding overhead space only once */
-  chunk_plus_offset(p, psize)->head = TOP_FOOT_SIZE;
-  m->trim_check = mparams.trim_threshold; /* reset on each update */
-}
-
-/* Initialize bins for a new mstate that is otherwise zeroed out */
-static void init_bins(mstate m) {
-  /* Establish circular links for smallbins */
-  bindex_t i;
-  for (i = 0; i < NSMALLBINS; ++i) {
-    sbinptr bin = smallbin_at(m,i);
-    bin->fd = bin->bk = bin;
-  }
-}
-
-#if PROCEED_ON_ERROR
-
-/* default corruption action */
-static void reset_on_error(mstate m) {
-  int i;
-  ++malloc_corruption_error_count;
-  /* Reinitialize fields to forget about all memory */
-  m->smallbins = m->treebins = 0;
-  m->dvsize = m->topsize = 0;
-  m->seg.base = 0;
-  m->seg.size = 0;
-  m->seg.next = 0;
-  m->top = m->dv = 0;
-  for (i = 0; i < NTREEBINS; ++i)
-    *treebin_at(m, i) = 0;
-  init_bins(m);
-}
-#endif /* PROCEED_ON_ERROR */
-
-/* Allocate a chunk from newbase, consolidating the remainder with the
-   first chunk of the old (successor) base. */
-static void* prepend_alloc(mstate m, char* newbase, char* oldbase,
-                           size_t nb) {
-  mchunkptr p = align_as_chunk(newbase);
-  mchunkptr oldfirst = align_as_chunk(oldbase);
-  size_t psize = (char*)oldfirst - (char*)p;
-  mchunkptr q = chunk_plus_offset(p, nb);
-  size_t qsize = psize - nb;
-  set_size_and_pinuse_of_inuse_chunk(m, p, nb);
-
-  assert((char*)oldfirst > (char*)q);
-  assert(pinuse(oldfirst));
-  assert(qsize >= MIN_CHUNK_SIZE);
-
-  /* consolidate remainder with first chunk of old base */
-  if (oldfirst == m->top) {
-    size_t tsize = m->topsize += qsize;
-    m->top = q;
-    q->head = tsize | PINUSE_BIT;
-    check_top_chunk(m, q);
-  }
-  else if (oldfirst == m->dv) {
-    size_t dsize = m->dvsize += qsize;
-    m->dv = q;
-    set_size_and_pinuse_of_free_chunk(q, dsize);
-  }
-  else {
-    if (!is_inuse(oldfirst)) {
-      size_t nsize = chunksize(oldfirst);
-      unlink_chunk(m, oldfirst, nsize);
-      oldfirst = chunk_plus_offset(oldfirst, nsize);
-      qsize += nsize;
-    }
-    set_free_with_pinuse(q, qsize, oldfirst);
-    insert_chunk(m, q, qsize);
-    check_free_chunk(m, q);
-  }
-
-  check_malloced_chunk(m, chunk2mem(p), nb);
-  return chunk2mem(p);
-}
-
-/* Add a segment to hold a new noncontiguous region */
-static void add_segment(mstate m, char* tbase, size_t tsize, flag_t mmapped) {
-  /* Determine locations and sizes of segment, fenceposts, old top */
-  char* old_top = (char*)m->top;
-  msegmentptr oldsp = segment_holding(m, old_top);
-  char* old_end = oldsp->base + oldsp->size;
-  size_t ssize = pad_request(sizeof(struct malloc_segment));
-  char* rawsp = old_end - (ssize + FOUR_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
-  size_t offset = align_offset(chunk2mem(rawsp));
-  char* asp = rawsp + offset;
-  char* csp = (asp < (old_top + MIN_CHUNK_SIZE))? old_top : asp;
-  mchunkptr sp = (mchunkptr)csp;
-  msegmentptr ss = (msegmentptr)(chunk2mem(sp));
-  mchunkptr tnext = chunk_plus_offset(sp, ssize);
-  mchunkptr p = tnext;
-  int nfences = 0;
-
-  /* reset top to new space */
-  init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
-
-  /* Set up segment record */
-  assert(is_aligned(ss));
-  set_size_and_pinuse_of_inuse_chunk(m, sp, ssize);
-  *ss = m->seg; /* Push current record */
-  m->seg.base = tbase;
-  m->seg.size = tsize;
-  m->seg.sflags = mmapped;
-  m->seg.next = ss;
-
-  /* Insert trailing fenceposts */
-  for (;;) {
-    mchunkptr nextp = chunk_plus_offset(p, SIZE_T_SIZE);
-    p->head = FENCEPOST_HEAD;
-    ++nfences;
-    if ((char*)(&(nextp->head)) < old_end)
-      p = nextp;
-    else
-      break;
-  }
-  assert(nfences >= 2);
-
-  /* Insert the rest of old top into a bin as an ordinary free chunk */
-  if (csp != old_top) {
-    mchunkptr q = (mchunkptr)old_top;
-    size_t psize = csp - old_top;
-    mchunkptr tn = chunk_plus_offset(q, psize);
-    set_free_with_pinuse(q, psize, tn);
-    insert_chunk(m, q, psize);
-  }
-
-  check_top_chunk(m, m->top);
-}
-
-/* -------------------------- System allocation -------------------------- */
-
-/* Get memory from system using MORECORE or MMAP */
-static void* sys_alloc(mstate m, size_t nb) {
-  char* tbase = CMFAIL;
-  size_t tsize = 0;
-  flag_t mmap_flag = 0;
-
-  ensure_initialization();
-
-  /* Directly map large chunks, but only if already initialized */
-  if (use_mmap(m) && nb >= mparams.mmap_threshold && m->topsize != 0) {
-    void* mem = mmap_alloc(m, nb);
-    if (mem != 0)
-      return mem;
-  }
-
-  /*
-    Try getting memory in any of three ways (in most-preferred to
-    least-preferred order):
-    1. A call to MORECORE that can normally contiguously extend memory.
-       (disabled if not MORECORE_CONTIGUOUS or not HAVE_MORECORE
-       or main space is mmapped or a previous contiguous call failed)
-    2. A call to MMAP new space (disabled if not HAVE_MMAP).
-       Note that under the default settings, if MORECORE is unable to
-       fulfill a request, and HAVE_MMAP is true, then mmap is
-       used as a noncontiguous system allocator. This is a useful backup
-       strategy for systems with holes in address spaces -- in this case
-       sbrk cannot contiguously expand the heap, but mmap may be able to
-       find space.
-    3. A call to MORECORE that cannot usually contiguously extend memory.
-       (disabled if not HAVE_MORECORE)
-
-   In all cases, we need to request enough bytes from system to ensure
-   we can malloc nb bytes upon success, so pad with enough space for
-   top_foot, plus alignment-pad to make sure we don't lose bytes if
-   not on boundary, and round this up to a granularity unit.
-  */
-
-  if (MORECORE_CONTIGUOUS && !use_noncontiguous(m)) {
-    char* br = CMFAIL;
-    msegmentptr ss = (m->top == 0)? 0 : segment_holding(m, (char*)m->top);
-    size_t asize = 0;
-    ACQUIRE_MALLOC_GLOBAL_LOCK();
-
-    if (ss == 0) {  /* First time through or recovery */
-      char* base = (char*)CALL_MORECORE(0);
-      if (base != CMFAIL) {
-        asize = granularity_align(nb + SYS_ALLOC_PADDING);
-        /* Adjust to end on a page boundary */
-        if (!is_page_aligned(base))
-          asize += (page_align((size_t)base) - (size_t)base);
-        /* Can't call MORECORE if size is negative when treated as signed */
-        if (asize < HALF_MAX_SIZE_T &&
-            (br = (char*)(CALL_MORECORE(asize))) == base) {
-          tbase = base;
-          tsize = asize;
-        }
-      }
-    }
-    else {
-      /* Subtract out existing available top space from MORECORE request. */
-      asize = granularity_align(nb - m->topsize + SYS_ALLOC_PADDING);
-      /* Use mem here only if it did contiguously extend old space */
-      if (asize < HALF_MAX_SIZE_T &&
-          (br = (char*)(CALL_MORECORE(asize))) == ss->base+ss->size) {
-        tbase = br;
-        tsize = asize;
-      }
-    }
-
-    if (tbase == CMFAIL) {    /* Cope with partial failure */
-      if (br != CMFAIL) {    /* Try to use/extend the space we did get */
-        if (asize < HALF_MAX_SIZE_T &&
-            asize < nb + SYS_ALLOC_PADDING) {
-          size_t esize = granularity_align(nb + SYS_ALLOC_PADDING - asize);
-          if (esize < HALF_MAX_SIZE_T) {
-            char* end = (char*)CALL_MORECORE(esize);
-            if (end != CMFAIL)
-              asize += esize;
-            else {            /* Can't use; try to release */
-              (void) CALL_MORECORE(-asize);
-              br = CMFAIL;
-            }
-          }
-        }
-      }
-      if (br != CMFAIL) {    /* Use the space we did get */
-        tbase = br;
-        tsize = asize;
-      }
-      else
-        disable_contiguous(m); /* Don't try contiguous path in the future */
-    }
-
-    RELEASE_MALLOC_GLOBAL_LOCK();
-  }
-
-  if (HAVE_MMAP && tbase == CMFAIL) {  /* Try MMAP */
-    size_t rsize = granularity_align(nb + SYS_ALLOC_PADDING);
-    if (rsize > nb) { /* Fail if wraps around zero */
-      char* mp = (char*)(CALL_MMAP(rsize));
-      if (mp != CMFAIL) {
-        tbase = mp;
-        tsize = rsize;
-        mmap_flag = USE_MMAP_BIT;
-      }
-    }
-  }
-
-  if (HAVE_MORECORE && tbase == CMFAIL) { /* Try noncontiguous MORECORE */
-    size_t asize = granularity_align(nb + SYS_ALLOC_PADDING);
-    if (asize < HALF_MAX_SIZE_T) {
-      char* br = CMFAIL;
-      char* end = CMFAIL;
-      ACQUIRE_MALLOC_GLOBAL_LOCK();
-      br = (char*)(CALL_MORECORE(asize));
-      end = (char*)(CALL_MORECORE(0));
-      RELEASE_MALLOC_GLOBAL_LOCK();
-      if (br != CMFAIL && end != CMFAIL && br < end) {
-        size_t ssize = end - br;
-        if (ssize > nb + TOP_FOOT_SIZE) {
-          tbase = br;
-          tsize = ssize;
-        }
-      }
-    }
-  }
-
-  if (tbase != CMFAIL) {
-
-    if ((m->footprint += tsize) > m->max_footprint)
-      m->max_footprint = m->footprint;
-
-    if (!is_initialized(m)) { /* first-time initialization */
-      if (m->least_addr == 0 || tbase < m->least_addr)
-        m->least_addr = tbase;
-      m->seg.base = tbase;
-      m->seg.size = tsize;
-      m->seg.sflags = mmap_flag;
-      m->magic = mparams.magic;
-      m->release_checks = MAX_RELEASE_CHECK_RATE;
-      init_bins(m);
-#if !ONLY_MSPACES
-      if (is_global(m))
-        init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
-      else
-#endif
-      {
-        /* Offset top by embedded malloc_state */
-        mchunkptr mn = next_chunk(mem2chunk(m));
-        init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) -TOP_FOOT_SIZE);
-      }
-    }
-
-    else {
-      /* Try to merge with an existing segment */
-      msegmentptr sp = &m->seg;
-      /* Only consider most recent segment if traversal suppressed */
-      while (sp != 0 && tbase != sp->base + sp->size)
-        sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
-      if (sp != 0 &&
-          !is_extern_segment(sp) &&
-          (sp->sflags & USE_MMAP_BIT) == mmap_flag &&
-          segment_holds(sp, m->top)) { /* append */
-        sp->size += tsize;
-        init_top(m, m->top, m->topsize + tsize);
-      }
-      else {
-        if (tbase < m->least_addr)
-          m->least_addr = tbase;
-        sp = &m->seg;
-        while (sp != 0 && sp->base != tbase + tsize)
-          sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
-        if (sp != 0 &&
-            !is_extern_segment(sp) &&
-            (sp->sflags & USE_MMAP_BIT) == mmap_flag) {
-          char* oldbase = sp->base;
-          sp->base = tbase;
-          sp->size += tsize;
-          return prepend_alloc(m, tbase, oldbase, nb);
-        }
-        else
-          add_segment(m, tbase, tsize, mmap_flag);
-      }
-    }
-
-    if (nb < m->topsize) { /* Allocate from new or extended top space */
-      size_t rsize = m->topsize -= nb;
-      mchunkptr p = m->top;
-      mchunkptr r = m->top = chunk_plus_offset(p, nb);
-      r->head = rsize | PINUSE_BIT;
-      set_size_and_pinuse_of_inuse_chunk(m, p, nb);
-      check_top_chunk(m, m->top);
-      check_malloced_chunk(m, chunk2mem(p), nb);
-      return chunk2mem(p);
-    }
-  }
-
-  MALLOC_FAILURE_ACTION;
-  return 0;
-}
-
-/* -----------------------  system deallocation -------------------------- */
-
-/* Unmap and unlink any mmapped segments that don't contain used chunks */
-static size_t release_unused_segments(mstate m) {
-  size_t released = 0;
-  int nsegs = 0;
-  msegmentptr pred = &m->seg;
-  msegmentptr sp = pred->next;
-  while (sp != 0) {
-    char* base = sp->base;
-    size_t size = sp->size;
-    msegmentptr next = sp->next;
-    ++nsegs;
-    if (is_mmapped_segment(sp) && !is_extern_segment(sp)) {
-      mchunkptr p = align_as_chunk(base);
-      size_t psize = chunksize(p);
-      /* Can unmap if first chunk holds entire segment and not pinned */
-      if (!is_inuse(p) && (char*)p + psize >= base + size - TOP_FOOT_SIZE) {
-        tchunkptr tp = (tchunkptr)p;
-        assert(segment_holds(sp, (char*)sp));
-        if (p == m->dv) {
-          m->dv = 0;
-          m->dvsize = 0;
-        }
-        else {
-          unlink_large_chunk(m, tp);
-        }
-        if (CALL_MUNMAP(base, size) == 0) {
-          released += size;
-          m->footprint -= size;
-          /* unlink obsoleted record */
-          sp = pred;
-          sp->next = next;
-        }
-        else { /* back out if cannot unmap */
-          insert_large_chunk(m, tp, psize);
-        }
-      }
-    }
-    if (NO_SEGMENT_TRAVERSAL) /* scan only first segment */
-      break;
-    pred = sp;
-    sp = next;
-  }
-  /* Reset check counter */
-  m->release_checks = ((nsegs > MAX_RELEASE_CHECK_RATE)?
-                       nsegs : MAX_RELEASE_CHECK_RATE);
-  return released;
-}
-
-static int sys_trim(mstate m, size_t pad) {
-  size_t released = 0;
-  ensure_initialization();
-  if (pad < MAX_REQUEST && is_initialized(m)) {
-    pad += TOP_FOOT_SIZE; /* ensure enough room for segment overhead */
-
-    if (m->topsize > pad) {
-      /* Shrink top space in granularity-size units, keeping at least one */
-      size_t unit = mparams.granularity;
-      size_t extra = ((m->topsize - pad + (unit - SIZE_T_ONE)) / unit -
-                      SIZE_T_ONE) * unit;
-      msegmentptr sp = segment_holding(m, (char*)m->top);
-
-      if (!is_extern_segment(sp)) {
-        if (is_mmapped_segment(sp)) {
-          if (HAVE_MMAP &&
-              sp->size >= extra &&
-              !has_segment_link(m, sp)) { /* can't shrink if pinned */
-            size_t newsize = sp->size - extra;
-            /* Prefer mremap, fall back to munmap */
-            if ((CALL_MREMAP(sp->base, sp->size, newsize, 0) != MFAIL) ||
-                (CALL_MUNMAP(sp->base + newsize, extra) == 0)) {
-              released = extra;
-            }
-          }
-        }
-        else if (HAVE_MORECORE) {
-          if (extra >= HALF_MAX_SIZE_T) /* Avoid wrapping negative */
-            extra = (HALF_MAX_SIZE_T) + SIZE_T_ONE - unit;
-          ACQUIRE_MALLOC_GLOBAL_LOCK();
-          {
-            /* Make sure end of memory is where we last set it. */
-            char* old_br = (char*)(CALL_MORECORE(0));
-            if (old_br == sp->base + sp->size) {
-              char* rel_br = (char*)(CALL_MORECORE(-extra));
-              char* new_br = (char*)(CALL_MORECORE(0));
-              if (rel_br != CMFAIL && new_br < old_br)
-                released = old_br - new_br;
-            }
-          }
-          RELEASE_MALLOC_GLOBAL_LOCK();
-        }
-      }
-
-      if (released != 0) {
-        sp->size -= released;
-        m->footprint -= released;
-        init_top(m, m->top, m->topsize - released);
-        check_top_chunk(m, m->top);
-      }
-    }
-
-    /* Unmap any unused mmapped segments */
-    if (HAVE_MMAP)
-      released += release_unused_segments(m);
-
-    /* On failure, disable autotrim to avoid repeated failed future calls */
-    if (released == 0 && m->topsize > m->trim_check)
-      m->trim_check = MAX_SIZE_T;
-  }
-
-  return (released != 0)? 1 : 0;
-}
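-
-/*
-  Worked example for the shrink computation above (ours, for
-  illustration, ignoring the small TOP_FOOT_SIZE adjustment): with a
-  64KB granularity unit and a 256KB top chunk, sys_trim(m, 0) computes
-  extra = ((256K + 64K - 1) / 64K - 1) * 64K = 192K, releasing three
-  units while always keeping at least one.
-*/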
-
-
-/* ---------------------------- malloc support --------------------------- */
-
-/* allocate a large request from the best fitting chunk in a treebin */
-static void* tmalloc_large(mstate m, size_t nb) {
-  tchunkptr v = 0;
-  size_t rsize = -nb; /* Unsigned negation */
-  tchunkptr t;
-  bindex_t idx;
-  compute_tree_index(nb, idx);
-  if ((t = *treebin_at(m, idx)) != 0) {
-    /* Traverse tree for this bin looking for node with size == nb */
-    size_t sizebits = nb << leftshift_for_tree_index(idx);
-    tchunkptr rst = 0;  /* The deepest untaken right subtree */
-    for (;;) {
-      tchunkptr rt;
-      size_t trem = chunksize(t) - nb;
-      if (trem < rsize) {
-        v = t;
-        if ((rsize = trem) == 0)
-          break;
-      }
-      rt = t->child[1];
-      t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
-      if (rt != 0 && rt != t)
-        rst = rt;
-      if (t == 0) {
-        t = rst; /* set t to least subtree holding sizes > nb */
-        break;
-      }
-      sizebits <<= 1;
-    }
-  }
-  if (t == 0 && v == 0) { /* set t to root of next non-empty treebin */
-    binmap_t leftbits = left_bits(idx2bit(idx)) & m->treemap;
-    if (leftbits != 0) {
-      bindex_t i;
-      binmap_t leastbit = least_bit(leftbits);
-      compute_bit2idx(leastbit, i);
-      t = *treebin_at(m, i);
-    }
-  }
-
-  while (t != 0) { /* find smallest of tree or subtree */
-    size_t trem = chunksize(t) - nb;
-    if (trem < rsize) {
-      rsize = trem;
-      v = t;
-    }
-    t = leftmost_child(t);
-  }
-
-  /*  If dv is a better fit, return 0 so malloc will use it */
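-  /* (If m->dvsize < nb, the unsigned subtraction below wraps to a huge
-     value, so the comparison correctly selects v whenever dv is too
-     small to hold the request at all.) */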
-  if (v != 0 && rsize < (size_t)(m->dvsize - nb)) {
-    if (RTCHECK(ok_address(m, v))) { /* split */
-      mchunkptr r = chunk_plus_offset(v, nb);
-      assert(chunksize(v) == rsize + nb);
-      if (RTCHECK(ok_next(v, r))) {
-        unlink_large_chunk(m, v);
-        if (rsize < MIN_CHUNK_SIZE)
-          set_inuse_and_pinuse(m, v, (rsize + nb));
-        else {
-          set_size_and_pinuse_of_inuse_chunk(m, v, nb);
-          set_size_and_pinuse_of_free_chunk(r, rsize);
-          insert_chunk(m, r, rsize);
-        }
-        return chunk2mem(v);
-      }
-    }
-    CORRUPTION_ERROR_ACTION(m);
-  }
-  return 0;
-}
-
-/* allocate a small request from the best fitting chunk in a treebin */
-static void* tmalloc_small(mstate m, size_t nb) {
-  tchunkptr t, v;
-  size_t rsize;
-  bindex_t i;
-  binmap_t leastbit = least_bit(m->treemap);
-  compute_bit2idx(leastbit, i);
-  v = t = *treebin_at(m, i);
-  rsize = chunksize(t) - nb;
-
-  while ((t = leftmost_child(t)) != 0) {
-    size_t trem = chunksize(t) - nb;
-    if (trem < rsize) {
-      rsize = trem;
-      v = t;
-    }
-  }
-
-  if (RTCHECK(ok_address(m, v))) {
-    mchunkptr r = chunk_plus_offset(v, nb);
-    assert(chunksize(v) == rsize + nb);
-    if (RTCHECK(ok_next(v, r))) {
-      unlink_large_chunk(m, v);
-      if (rsize < MIN_CHUNK_SIZE)
-        set_inuse_and_pinuse(m, v, (rsize + nb));
-      else {
-        set_size_and_pinuse_of_inuse_chunk(m, v, nb);
-        set_size_and_pinuse_of_free_chunk(r, rsize);
-        replace_dv(m, r, rsize);
-      }
-      return chunk2mem(v);
-    }
-  }
-
-  CORRUPTION_ERROR_ACTION(m);
-  return 0;
-}
-
-/* --------------------------- realloc support --------------------------- */
-
-static void* internal_realloc(mstate m, void* oldmem, size_t bytes) {
-  if (bytes >= MAX_REQUEST) {
-    MALLOC_FAILURE_ACTION;
-    return 0;
-  }
-  if (!PREACTION(m)) {
-    mchunkptr oldp = mem2chunk(oldmem);
-    size_t oldsize = chunksize(oldp);
-    mchunkptr next = chunk_plus_offset(oldp, oldsize);
-    mchunkptr newp = 0;
-    void* extra = 0;
-
-    /* Try to either shrink or extend into top. Else malloc-copy-free */
-
-    if (RTCHECK(ok_address(m, oldp) && ok_inuse(oldp) &&
-                ok_next(oldp, next) && ok_pinuse(next))) {
-      size_t nb = request2size(bytes);
-      if (is_mmapped(oldp))
-        newp = mmap_resize(m, oldp, nb);
-      else if (oldsize >= nb) { /* already big enough */
-        size_t rsize = oldsize - nb;
-        newp = oldp;
-        if (rsize >= MIN_CHUNK_SIZE) {
-          mchunkptr remainder = chunk_plus_offset(newp, nb);
-          set_inuse(m, newp, nb);
-          set_inuse_and_pinuse(m, remainder, rsize);
-          extra = chunk2mem(remainder);
-        }
-      }
-      else if (next == m->top && oldsize + m->topsize > nb) {
-        /* Expand into top */
-        size_t newsize = oldsize + m->topsize;
-        size_t newtopsize = newsize - nb;
-        mchunkptr newtop = chunk_plus_offset(oldp, nb);
-        set_inuse(m, oldp, nb);
-        newtop->head = newtopsize |PINUSE_BIT;
-        m->top = newtop;
-        m->topsize = newtopsize;
-        newp = oldp;
-      }
-    }
-    else {
-      USAGE_ERROR_ACTION(m, oldmem);
-      POSTACTION(m);
-      return 0;
-    }
-#if DEBUG
-    if (newp != 0) {
-      check_inuse_chunk(m, newp); /* Check requires lock */
-    }
-#endif
-
-    POSTACTION(m);
-
-    if (newp != 0) {
-      if (extra != 0) {
-        internal_free(m, extra);
-      }
-      return chunk2mem(newp);
-    }
-    else {
-      void* newmem = internal_malloc(m, bytes);
-      if (newmem != 0) {
-        size_t oc = oldsize - overhead_for(oldp);
-        memcpy(newmem, oldmem, (oc < bytes)? oc : bytes);
-        internal_free(m, oldmem);
-      }
-      return newmem;
-    }
-  }
-  return 0;
-}
-
-/* --------------------------- memalign support -------------------------- */
-
-static void* internal_memalign(mstate m, size_t alignment, size_t bytes) {
-  if (alignment <= MALLOC_ALIGNMENT)    /* Can just use malloc */
-    return internal_malloc(m, bytes);
-  if (alignment <  MIN_CHUNK_SIZE) /* must be at least a minimum chunk size */
-    alignment = MIN_CHUNK_SIZE;
-  if ((alignment & (alignment-SIZE_T_ONE)) != 0) {/* Ensure a power of 2 */
-    size_t a = MALLOC_ALIGNMENT << 1;
-    while (a < alignment) a <<= 1;
-    alignment = a;
-  }
-
-  if (bytes >= MAX_REQUEST - alignment) {
-    if (m != 0)  { /* Test isn't needed but avoids compiler warning */
-      MALLOC_FAILURE_ACTION;
-    }
-  }
-  else {
-    size_t nb = request2size(bytes);
-    size_t req = nb + alignment + MIN_CHUNK_SIZE - CHUNK_OVERHEAD;
-    char* mem = (char*)internal_malloc(m, req);
-    if (mem != 0) {
-      void* leader = 0;
-      void* trailer = 0;
-      mchunkptr p = mem2chunk(mem);
-
-      if (PREACTION(m)) return 0;
-      if ((((size_t)(mem)) % alignment) != 0) { /* misaligned */
-        /*
-          Find an aligned spot inside chunk.  Since we need to give
-          back leading space in a chunk of at least MIN_CHUNK_SIZE, if
-          the first calculation places us at a spot with less than
-          MIN_CHUNK_SIZE leader, we can move to the next aligned spot.
-          We've allocated enough total room so that this is always
-          possible.
-        */
-        char* br = (char*)mem2chunk((size_t)(((size_t)(mem +
-                                                       alignment -
-                                                       SIZE_T_ONE)) &
-                                             -alignment));
-        char* pos = ((size_t)(br - (char*)(p)) >= MIN_CHUNK_SIZE)?
-          br : br+alignment;
-        mchunkptr newp = (mchunkptr)pos;
-        size_t leadsize = pos - (char*)(p);
-        size_t newsize = chunksize(p) - leadsize;
-
-        if (is_mmapped(p)) { /* For mmapped chunks, just adjust offset */
-          newp->prev_foot = p->prev_foot + leadsize;
-          newp->head = newsize;
-        }
-        else { /* Otherwise, give back leader, use the rest */
-          set_inuse(m, newp, newsize);
-          set_inuse(m, p, leadsize);
-          leader = chunk2mem(p);
-        }
-        p = newp;
-      }
-
-      /* Give back spare room at the end */
-      if (!is_mmapped(p)) {
-        size_t size = chunksize(p);
-        if (size > nb + MIN_CHUNK_SIZE) {
-          size_t remainder_size = size - nb;
-          mchunkptr remainder = chunk_plus_offset(p, nb);
-          set_inuse(m, p, nb);
-          set_inuse(m, remainder, remainder_size);
-          trailer = chunk2mem(remainder);
-        }
-      }
-
-      assert (chunksize(p) >= nb);
-      assert((((size_t)(chunk2mem(p))) % alignment) == 0);
-      check_inuse_chunk(m, p);
-      POSTACTION(m);
-      if (leader != 0) {
-        internal_free(m, leader);
-      }
-      if (trailer != 0) {
-        internal_free(m, trailer);
-      }
-      return chunk2mem(p);
-    }
-  }
-  return 0;
-}
-
-/* ------------------------ comalloc/coalloc support --------------------- */
-
-static void** ialloc(mstate m,
-                     size_t n_elements,
-                     size_t* sizes,
-                     int opts,
-                     void* chunks[]) {
-  /*
-    This provides common support for independent_X routines, handling
-    all of the combinations that can result.
-
-    The opts arg has:
-    bit 0 set if all elements are same size (using sizes[0])
-    bit 1 set if elements should be zeroed
-  */
-
-  size_t    element_size;   /* chunksize of each element, if all same */
-  size_t    contents_size;  /* total size of elements */
-  size_t    array_size;     /* request size of pointer array */
-  void*     mem;            /* malloced aggregate space */
-  mchunkptr p;              /* corresponding chunk */
-  size_t    remainder_size; /* remaining bytes while splitting */
-  void**    marray;         /* either "chunks" or malloced ptr array */
-  mchunkptr array_chunk;    /* chunk for malloced ptr array */
-  flag_t    was_enabled;    /* to disable mmap */
-  size_t    size;
-  size_t    i;
-
-  ensure_initialization();
-  /* compute array length, if needed */
-  if (chunks != 0) {
-    if (n_elements == 0)
-      return chunks; /* nothing to do */
-    marray = chunks;
-    array_size = 0;
-  }
-  else {
-    /* if empty req, must still return chunk representing empty array */
-    if (n_elements == 0)
-      return (void**)internal_malloc(m, 0);
-    marray = 0;
-    array_size = request2size(n_elements * (sizeof(void*)));
-  }
-
-  /* compute total element size */
-  if (opts & 0x1) { /* all-same-size */
-    element_size = request2size(*sizes);
-    contents_size = n_elements * element_size;
-  }
-  else { /* add up all the sizes */
-    element_size = 0;
-    contents_size = 0;
-    for (i = 0; i != n_elements; ++i)
-      contents_size += request2size(sizes[i]);
-  }
-
-  size = contents_size + array_size;
-
-  /*
-     Allocate the aggregate chunk.  First disable direct-mmapping so
-     malloc won't use it, since we would not be able to later
-     free/realloc space internal to a segregated mmap region.
-  */
-  was_enabled = use_mmap(m);
-  disable_mmap(m);
-  mem = internal_malloc(m, size - CHUNK_OVERHEAD);
-  if (was_enabled)
-    enable_mmap(m);
-  if (mem == 0)
-    return 0;
-
-  if (PREACTION(m)) return 0;
-  p = mem2chunk(mem);
-  remainder_size = chunksize(p);
-
-  assert(!is_mmapped(p));
-
-  if (opts & 0x2) {       /* optionally clear the elements */
-    memset((size_t*)mem, 0, remainder_size - SIZE_T_SIZE - array_size);
-  }
-
-  /* If not provided, allocate the pointer array as final part of chunk */
-  if (marray == 0) {
-    size_t  array_chunk_size;
-    array_chunk = chunk_plus_offset(p, contents_size);
-    array_chunk_size = remainder_size - contents_size;
-    marray = (void**) (chunk2mem(array_chunk));
-    set_size_and_pinuse_of_inuse_chunk(m, array_chunk, array_chunk_size);
-    remainder_size = contents_size;
-  }
-
-  /* split out elements */
-  for (i = 0; ; ++i) {
-    marray[i] = chunk2mem(p);
-    if (i != n_elements-1) {
-      if (element_size != 0)
-        size = element_size;
-      else
-        size = request2size(sizes[i]);
-      remainder_size -= size;
-      set_size_and_pinuse_of_inuse_chunk(m, p, size);
-      p = chunk_plus_offset(p, size);
-    }
-    else { /* the final element absorbs any overallocation slop */
-      set_size_and_pinuse_of_inuse_chunk(m, p, remainder_size);
-      break;
-    }
-  }
-
-#if DEBUG
-  if (marray != chunks) {
-    /* final element must have exactly exhausted chunk */
-    if (element_size != 0) {
-      assert(remainder_size == element_size);
-    }
-    else {
-      assert(remainder_size == request2size(sizes[i]));
-    }
-    check_inuse_chunk(m, mem2chunk(marray));
-  }
-  for (i = 0; i != n_elements; ++i)
-    check_inuse_chunk(m, mem2chunk(marray[i]));
-
-#endif /* DEBUG */
-
-  POSTACTION(m);
-  return marray;
-}
-
-
-/* -------------------------- public routines ---------------------------- */
-
-#if !ONLY_MSPACES
-
-void* dlmalloc(size_t bytes) {
-  /*
-     Basic algorithm:
-     If a small request (< 256 bytes minus per-chunk overhead):
-       1. If one exists, use a remainderless chunk in associated smallbin.
-          (Remainderless means that there are too few excess bytes to
-          represent as a chunk.)
-       2. If it is big enough, use the dv chunk, which is normally the
-          chunk adjacent to the one used for the most recent small request.
-       3. If one exists, split the smallest available chunk in a bin,
-          saving remainder in dv.
-       4. If it is big enough, use the top chunk.
-       5. If available, get memory from system and use it
-     Otherwise, for a large request:
-       1. Find the smallest available binned chunk that fits, and use it
-          if it is better fitting than dv chunk, splitting if necessary.
-       2. If better fitting than any binned chunk, use the dv chunk.
-       3. If it is big enough, use the top chunk.
-       4. If request size >= mmap threshold, try to directly mmap this chunk.
-       5. If available, get memory from system and use it
-
-     The ugly goto's here ensure that postaction occurs along all paths.
-  */
-
-#if USE_LOCKS
-  ensure_initialization(); /* initialize in sys_alloc if not using locks */
-#endif
-
-  if (!PREACTION(gm)) {
-    void* mem;
-    size_t nb;
-    if (bytes <= MAX_SMALL_REQUEST) {
-      bindex_t idx;
-      binmap_t smallbits;
-      nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
-      idx = small_index(nb);
-      smallbits = gm->smallmap >> idx;
-
-      if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
-        mchunkptr b, p;
-        idx += ~smallbits & 1;       /* Uses next bin if idx empty */
-        b = smallbin_at(gm, idx);
-        p = b->fd;
-        assert(chunksize(p) == small_index2size(idx));
-        unlink_first_small_chunk(gm, b, p, idx);
-        set_inuse_and_pinuse(gm, p, small_index2size(idx));
-        mem = chunk2mem(p);
-        check_malloced_chunk(gm, mem, nb);
-        goto postaction;
-      }
-
-      else if (nb > gm->dvsize) {
-        if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
-          mchunkptr b, p, r;
-          size_t rsize;
-          bindex_t i;
-          binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
-          binmap_t leastbit = least_bit(leftbits);
-          compute_bit2idx(leastbit, i);
-          b = smallbin_at(gm, i);
-          p = b->fd;
-          assert(chunksize(p) == small_index2size(i));
-          unlink_first_small_chunk(gm, b, p, i);
-          rsize = small_index2size(i) - nb;
-          /* Fit here cannot be remainderless if 4byte sizes */
-          if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
-            set_inuse_and_pinuse(gm, p, small_index2size(i));
-          else {
-            set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
-            r = chunk_plus_offset(p, nb);
-            set_size_and_pinuse_of_free_chunk(r, rsize);
-            replace_dv(gm, r, rsize);
-          }
-          mem = chunk2mem(p);
-          check_malloced_chunk(gm, mem, nb);
-          goto postaction;
-        }
-
-        else if (gm->treemap != 0 && (mem = tmalloc_small(gm, nb)) != 0) {
-          check_malloced_chunk(gm, mem, nb);
-          goto postaction;
-        }
-      }
-    }
-    else if (bytes >= MAX_REQUEST)
-      nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
-    else {
-      nb = pad_request(bytes);
-      if (gm->treemap != 0 && (mem = tmalloc_large(gm, nb)) != 0) {
-        check_malloced_chunk(gm, mem, nb);
-        goto postaction;
-      }
-    }
-
-    if (nb <= gm->dvsize) {
-      size_t rsize = gm->dvsize - nb;
-      mchunkptr p = gm->dv;
-      if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
-        mchunkptr r = gm->dv = chunk_plus_offset(p, nb);
-        gm->dvsize = rsize;
-        set_size_and_pinuse_of_free_chunk(r, rsize);
-        set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
-      }
-      else { /* exhaust dv */
-        size_t dvs = gm->dvsize;
-        gm->dvsize = 0;
-        gm->dv = 0;
-        set_inuse_and_pinuse(gm, p, dvs);
-      }
-      mem = chunk2mem(p);
-      check_malloced_chunk(gm, mem, nb);
-      goto postaction;
-    }
-
-    else if (nb < gm->topsize) { /* Split top */
-      size_t rsize = gm->topsize -= nb;
-      mchunkptr p = gm->top;
-      mchunkptr r = gm->top = chunk_plus_offset(p, nb);
-      r->head = rsize | PINUSE_BIT;
-      set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
-      mem = chunk2mem(p);
-      check_top_chunk(gm, gm->top);
-      check_malloced_chunk(gm, mem, nb);
-      goto postaction;
-    }
-
-    mem = sys_alloc(gm, nb);
-
-  postaction:
-    POSTACTION(gm);
-    return mem;
-  }
-
-  return 0;
-}
-
-void dlfree(void* mem) {
-  /*
-     Consolidate freed chunks with preceding or succeeding bordering
-     free chunks, if they exist, and then place in a bin.  Intermixed
-     with special cases for top, dv, mmapped chunks, and usage errors.
-  */
-
-  if (mem != 0) {
-    mchunkptr p  = mem2chunk(mem);
-#if FOOTERS
-    mstate fm = get_mstate_for(p);
-    if (!ok_magic(fm)) {
-      USAGE_ERROR_ACTION(fm, p);
-      return;
-    }
-#else /* FOOTERS */
-#define fm gm
-#endif /* FOOTERS */
-    if (!PREACTION(fm)) {
-      check_inuse_chunk(fm, p);
-      if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {
-        size_t psize = chunksize(p);
-        mchunkptr next = chunk_plus_offset(p, psize);
-        if (!pinuse(p)) {
-          size_t prevsize = p->prev_foot;
-          if (is_mmapped(p)) {
-            psize += prevsize + MMAP_FOOT_PAD;
-            if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
-              fm->footprint -= psize;
-            goto postaction;
-          }
-          else {
-            mchunkptr prev = chunk_minus_offset(p, prevsize);
-            psize += prevsize;
-            p = prev;
-            if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
-              if (p != fm->dv) {
-                unlink_chunk(fm, p, prevsize);
-              }
-              else if ((next->head & INUSE_BITS) == INUSE_BITS) {
-                fm->dvsize = psize;
-                set_free_with_pinuse(p, psize, next);
-                goto postaction;
-              }
-            }
-            else
-              goto erroraction;
-          }
-        }
-
-        if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
-          if (!cinuse(next)) {  /* consolidate forward */
-            if (next == fm->top) {
-              size_t tsize = fm->topsize += psize;
-              fm->top = p;
-              p->head = tsize | PINUSE_BIT;
-              if (p == fm->dv) {
-                fm->dv = 0;
-                fm->dvsize = 0;
-              }
-              if (should_trim(fm, tsize))
-                sys_trim(fm, 0);
-              goto postaction;
-            }
-            else if (next == fm->dv) {
-              size_t dsize = fm->dvsize += psize;
-              fm->dv = p;
-              set_size_and_pinuse_of_free_chunk(p, dsize);
-              goto postaction;
-            }
-            else {
-              size_t nsize = chunksize(next);
-              psize += nsize;
-              unlink_chunk(fm, next, nsize);
-              set_size_and_pinuse_of_free_chunk(p, psize);
-              if (p == fm->dv) {
-                fm->dvsize = psize;
-                goto postaction;
-              }
-            }
-          }
-          else
-            set_free_with_pinuse(p, psize, next);
-
-          if (is_small(psize)) {
-            insert_small_chunk(fm, p, psize);
-            check_free_chunk(fm, p);
-          }
-          else {
-            tchunkptr tp = (tchunkptr)p;
-            insert_large_chunk(fm, tp, psize);
-            check_free_chunk(fm, p);
-            if (--fm->release_checks == 0)
-              release_unused_segments(fm);
-          }
-          goto postaction;
-        }
-      }
-    erroraction:
-      USAGE_ERROR_ACTION(fm, p);
-    postaction:
-      POSTACTION(fm);
-    }
-  }
-#if !FOOTERS
-#undef fm
-#endif /* FOOTERS */
-}
-
-void* dlcalloc(size_t n_elements, size_t elem_size) {
-  void* mem;
-  size_t req = 0;
-  if (n_elements != 0) {
-    req = n_elements * elem_size;
-    if (((n_elements | elem_size) & ~(size_t)0xffff) &&
-        (req / n_elements != elem_size))
-      req = MAX_SIZE_T; /* force downstream failure on overflow */
-  }
-  mem = dlmalloc(req);
-  if (mem != 0 && calloc_must_clear(mem2chunk(mem)))
-    memset(mem, 0, req);
-  return mem;
-}
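-
-/*
-  Illustration of the overflow guard above (ours): on a 32-bit system,
-  dlcalloc(0x10000, 0x10000) computes req = 2^32, which wraps to 0.
-  Because both arguments exceed 0xffff, the division check fires
-  (0 / 0x10000 != 0x10000), req is forced to MAX_SIZE_T, and the
-  allocation fails instead of returning an undersized block.
-*/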
-
-void* dlrealloc(void* oldmem, size_t bytes) {
-  if (oldmem == 0)
-    return dlmalloc(bytes);
-#ifdef REALLOC_ZERO_BYTES_FREES
-  if (bytes == 0) {
-    dlfree(oldmem);
-    return 0;
-  }
-#endif /* REALLOC_ZERO_BYTES_FREES */
-  else {
-#if ! FOOTERS
-    mstate m = gm;
-#else /* FOOTERS */
-    mstate m = get_mstate_for(mem2chunk(oldmem));
-    if (!ok_magic(m)) {
-      USAGE_ERROR_ACTION(m, oldmem);
-      return 0;
-    }
-#endif /* FOOTERS */
-    return internal_realloc(m, oldmem, bytes);
-  }
-}
-
-void* dlmemalign(size_t alignment, size_t bytes) {
-  return internal_memalign(gm, alignment, bytes);
-}
-
-void** dlindependent_calloc(size_t n_elements, size_t elem_size,
-                                 void* chunks[]) {
-  size_t sz = elem_size; /* serves as 1-element array */
-  return ialloc(gm, n_elements, &sz, 3, chunks);
-}
-
-void** dlindependent_comalloc(size_t n_elements, size_t sizes[],
-                                   void* chunks[]) {
-  return ialloc(gm, n_elements, sizes, 0, chunks);
-}
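-
-/*
-  Usage sketch for the two entry points above (ours, illustrative
-  only). dlindependent_calloc returns n_elements equal-size zeroed
-  blocks; dlindependent_comalloc carves one region into differently
-  sized blocks. Each block, and the returned pointer array itself,
-  may later be passed to dlfree individually:
-
-    size_t sizes[3] = { 16, 64, 256 };
-    void** same  = dlindependent_calloc(8, 32, 0);    // 8 cleared 32-byte blocks
-    void** mixed = dlindependent_comalloc(3, sizes, 0);
-*/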
-
-void* dlvalloc(size_t bytes) {
-  size_t pagesz;
-  ensure_initialization();
-  pagesz = mparams.page_size;
-  return dlmemalign(pagesz, bytes);
-}
-
-void* dlpvalloc(size_t bytes) {
-  size_t pagesz;
-  ensure_initialization();
-  pagesz = mparams.page_size;
-  return dlmemalign(pagesz, (bytes + pagesz - SIZE_T_ONE) & ~(pagesz - SIZE_T_ONE));
-}
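-
-/*
-  Usage sketch for the alignment entry points above (ours, illustrative
-  only):
-
-    void* p = dlmemalign(64, 1000);  // 1000 bytes at a 64-byte boundary
-    void* q = dlvalloc(100);         // 100 bytes, page-aligned
-    void* r = dlpvalloc(100);        // page-aligned, size rounded up to
-                                     // a whole number of pages
-    dlfree(p); dlfree(q); dlfree(r);
-*/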
-
-int dlmalloc_trim(size_t pad) {
-  int result = 0;
-  ensure_initialization();
-  if (!PREACTION(gm)) {
-    result = sys_trim(gm, pad);
-    POSTACTION(gm);
-  }
-  return result;
-}
-
-size_t dlmalloc_footprint(void) {
-  return gm->footprint;
-}
-
-size_t dlmalloc_max_footprint(void) {
-  return gm->max_footprint;
-}
-
-#if !NO_MALLINFO
-struct mallinfo dlmallinfo(void) {
-  return internal_mallinfo(gm);
-}
-#endif /* NO_MALLINFO */
-
-void dlmalloc_stats() {
-  internal_malloc_stats(gm);
-}
-
-int dlmallopt(int param_number, int value) {
-  return change_mparam(param_number, value);
-}
-
-#endif /* !ONLY_MSPACES */
-
-size_t dlmalloc_usable_size(void* mem) {
-  if (mem != 0) {
-    mchunkptr p = mem2chunk(mem);
-    if (is_inuse(p))
-      return chunksize(p) - overhead_for(p);
-  }
-  return 0;
-}
-
-/* ----------------------------- user mspaces ---------------------------- */
-
-#if MSPACES
-
-static mstate init_user_mstate(char* tbase, size_t tsize) {
-  size_t msize = pad_request(sizeof(struct malloc_state));
-  mchunkptr mn;
-  mchunkptr msp = align_as_chunk(tbase);
-  mstate m = (mstate)(chunk2mem(msp));
-  memset(m, 0, msize);
-  INITIAL_LOCK(&m->mutex);
-  msp->head = (msize|INUSE_BITS);
-  m->seg.base = m->least_addr = tbase;
-  m->seg.size = m->footprint = m->max_footprint = tsize;
-  m->magic = mparams.magic;
-  m->release_checks = MAX_RELEASE_CHECK_RATE;
-  m->mflags = mparams.default_mflags;
-  m->extp = 0;
-  m->exts = 0;
-  disable_contiguous(m);
-  init_bins(m);
-  mn = next_chunk(mem2chunk(m));
-  init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) - TOP_FOOT_SIZE);
-  check_top_chunk(m, m->top);
-  return m;
-}
-
-mspace create_mspace(size_t capacity, int locked) {
-  mstate m = 0;
-  size_t msize;
-  ensure_initialization();
-  msize = pad_request(sizeof(struct malloc_state));
-  if (capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
-    size_t rs = ((capacity == 0)? mparams.granularity :
-                 (capacity + TOP_FOOT_SIZE + msize));
-    size_t tsize = granularity_align(rs);
-    char* tbase = (char*)(CALL_MMAP(tsize));
-    if (tbase != CMFAIL) {
-      m = init_user_mstate(tbase, tsize);
-      m->seg.sflags = USE_MMAP_BIT;
-      set_lock(m, locked);
-    }
-  }
-  return (mspace)m;
-}
-
-mspace create_mspace_with_base(void* base, size_t capacity, int locked) {
-  mstate m = 0;
-  size_t msize;
-  ensure_initialization();
-  msize = pad_request(sizeof(struct malloc_state));
-  if (capacity > msize + TOP_FOOT_SIZE &&
-      capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
-    m = init_user_mstate((char*)base, capacity);
-    m->seg.sflags = EXTERN_BIT;
-    set_lock(m, locked);
-  }
-  return (mspace)m;
-}
-
-int mspace_track_large_chunks(mspace msp, int enable) {
-  int ret = 0;
-  mstate ms = (mstate)msp;
-  if (!PREACTION(ms)) {
-    if (!use_mmap(ms))
-      ret = 1;
-    if (!enable)
-      enable_mmap(ms);
-    else
-      disable_mmap(ms);
-    POSTACTION(ms);
-  }
-  return ret;
-}
-
-size_t destroy_mspace(mspace msp) {
-  size_t freed = 0;
-  mstate ms = (mstate)msp;
-  if (ok_magic(ms)) {
-    msegmentptr sp = &ms->seg;
-    while (sp != 0) {
-      char* base = sp->base;
-      size_t size = sp->size;
-      flag_t flag = sp->sflags;
-      sp = sp->next;
-      if ((flag & USE_MMAP_BIT) && !(flag & EXTERN_BIT) &&
-          CALL_MUNMAP(base, size) == 0)
-        freed += size;
-    }
-  }
-  else {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-  return freed;
-}
-
-/*
-  mspace versions of routines are near-clones of the global
-  versions. This is not so nice but better than the alternatives.
-*/
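-
-/*
-  Brief usage sketch (ours, illustrative only). An mspace is an
-  independent heap with its own lock and statistics, so a subsystem
-  can release everything it allocated in one call:
-
-    mspace arena = create_mspace(0, 1);   // default capacity, locked
-    if (arena != 0) {
-      void* a = mspace_malloc(arena, 128);
-      mspace_free(arena, a);              // individual frees also work
-      destroy_mspace(arena);              // releases all remaining space
-    }
-*/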
-
-
-void* mspace_malloc(mspace msp, size_t bytes) {
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-    return 0;
-  }
-  if (!PREACTION(ms)) {
-    void* mem;
-    size_t nb;
-    if (bytes <= MAX_SMALL_REQUEST) {
-      bindex_t idx;
-      binmap_t smallbits;
-      nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
-      idx = small_index(nb);
-      smallbits = ms->smallmap >> idx;
-
-      if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
-        mchunkptr b, p;
-        idx += ~smallbits & 1;       /* Uses next bin if idx empty */
-        b = smallbin_at(ms, idx);
-        p = b->fd;
-        assert(chunksize(p) == small_index2size(idx));
-        unlink_first_small_chunk(ms, b, p, idx);
-        set_inuse_and_pinuse(ms, p, small_index2size(idx));
-        mem = chunk2mem(p);
-        check_malloced_chunk(ms, mem, nb);
-        goto postaction;
-      }
-
-      else if (nb > ms->dvsize) {
-        if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
-          mchunkptr b, p, r;
-          size_t rsize;
-          bindex_t i;
-          binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
-          binmap_t leastbit = least_bit(leftbits);
-          compute_bit2idx(leastbit, i);
-          b = smallbin_at(ms, i);
-          p = b->fd;
-          assert(chunksize(p) == small_index2size(i));
-          unlink_first_small_chunk(ms, b, p, i);
-          rsize = small_index2size(i) - nb;
-          /* Fit here cannot be remainderless if 4byte sizes */
-          if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
-            set_inuse_and_pinuse(ms, p, small_index2size(i));
-          else {
-            set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
-            r = chunk_plus_offset(p, nb);
-            set_size_and_pinuse_of_free_chunk(r, rsize);
-            replace_dv(ms, r, rsize);
-          }
-          mem = chunk2mem(p);
-          check_malloced_chunk(ms, mem, nb);
-          goto postaction;
-        }
-
-        else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) {
-          check_malloced_chunk(ms, mem, nb);
-          goto postaction;
-        }
-      }
-    }
-    else if (bytes >= MAX_REQUEST)
-      nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
-    else {
-      nb = pad_request(bytes);
-      if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) {
-        check_malloced_chunk(ms, mem, nb);
-        goto postaction;
-      }
-    }
-
-    if (nb <= ms->dvsize) {
-      size_t rsize = ms->dvsize - nb;
-      mchunkptr p = ms->dv;
-      if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
-        mchunkptr r = ms->dv = chunk_plus_offset(p, nb);
-        ms->dvsize = rsize;
-        set_size_and_pinuse_of_free_chunk(r, rsize);
-        set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
-      }
-      else { /* exhaust dv */
-        size_t dvs = ms->dvsize;
-        ms->dvsize = 0;
-        ms->dv = 0;
-        set_inuse_and_pinuse(ms, p, dvs);
-      }
-      mem = chunk2mem(p);
-      check_malloced_chunk(ms, mem, nb);
-      goto postaction;
-    }
-
-    else if (nb < ms->topsize) { /* Split top */
-      size_t rsize = ms->topsize -= nb;
-      mchunkptr p = ms->top;
-      mchunkptr r = ms->top = chunk_plus_offset(p, nb);
-      r->head = rsize | PINUSE_BIT;
-      set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
-      mem = chunk2mem(p);
-      check_top_chunk(ms, ms->top);
-      check_malloced_chunk(ms, mem, nb);
-      goto postaction;
-    }
-
-    mem = sys_alloc(ms, nb);
-
-  postaction:
-    POSTACTION(ms);
-    return mem;
-  }
-
-  return 0;
-}
-
-void mspace_free(mspace msp, void* mem) {
-  if (mem != 0) {
-    mchunkptr p  = mem2chunk(mem);
-#if FOOTERS
-    mstate fm = get_mstate_for(p);
-    msp = msp; /* placate people compiling -Wunused */
-#else /* FOOTERS */
-    mstate fm = (mstate)msp;
-#endif /* FOOTERS */
-    if (!ok_magic(fm)) {
-      USAGE_ERROR_ACTION(fm, p);
-      return;
-    }
-    if (!PREACTION(fm)) {
-      check_inuse_chunk(fm, p);
-      if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {
-        size_t psize = chunksize(p);
-        mchunkptr next = chunk_plus_offset(p, psize);
-        if (!pinuse(p)) {
-          size_t prevsize = p->prev_foot;
-          if (is_mmapped(p)) {
-            psize += prevsize + MMAP_FOOT_PAD;
-            if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
-              fm->footprint -= psize;
-            goto postaction;
-          }
-          else {
-            mchunkptr prev = chunk_minus_offset(p, prevsize);
-            psize += prevsize;
-            p = prev;
-            if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
-              if (p != fm->dv) {
-                unlink_chunk(fm, p, prevsize);
-              }
-              else if ((next->head & INUSE_BITS) == INUSE_BITS) {
-                fm->dvsize = psize;
-                set_free_with_pinuse(p, psize, next);
-                goto postaction;
-              }
-            }
-            else
-              goto erroraction;
-          }
-        }
-
-        if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
-          if (!cinuse(next)) {  /* consolidate forward */
-            if (next == fm->top) {
-              size_t tsize = fm->topsize += psize;
-              fm->top = p;
-              p->head = tsize | PINUSE_BIT;
-              if (p == fm->dv) {
-                fm->dv = 0;
-                fm->dvsize = 0;
-              }
-              if (should_trim(fm, tsize))
-                sys_trim(fm, 0);
-              goto postaction;
-            }
-            else if (next == fm->dv) {
-              size_t dsize = fm->dvsize += psize;
-              fm->dv = p;
-              set_size_and_pinuse_of_free_chunk(p, dsize);
-              goto postaction;
-            }
-            else {
-              size_t nsize = chunksize(next);
-              psize += nsize;
-              unlink_chunk(fm, next, nsize);
-              set_size_and_pinuse_of_free_chunk(p, psize);
-              if (p == fm->dv) {
-                fm->dvsize = psize;
-                goto postaction;
-              }
-            }
-          }
-          else
-            set_free_with_pinuse(p, psize, next);
-
-          if (is_small(psize)) {
-            insert_small_chunk(fm, p, psize);
-            check_free_chunk(fm, p);
-          }
-          else {
-            tchunkptr tp = (tchunkptr)p;
-            insert_large_chunk(fm, tp, psize);
-            check_free_chunk(fm, p);
-            if (--fm->release_checks == 0)
-              release_unused_segments(fm);
-          }
-          goto postaction;
-        }
-      }
-    erroraction:
-      USAGE_ERROR_ACTION(fm, p);
-    postaction:
-      POSTACTION(fm);
-    }
-  }
-}
-
-void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size) {
-  void* mem;
-  size_t req = 0;
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-    return 0;
-  }
-  if (n_elements != 0) {
-    req = n_elements * elem_size;
-    if (((n_elements | elem_size) & ~(size_t)0xffff) &&
-        (req / n_elements != elem_size))
-      req = MAX_SIZE_T; /* force downstream failure on overflow */
-  }
-  mem = internal_malloc(ms, req);
-  if (mem != 0 && calloc_must_clear(mem2chunk(mem)))
-    memset(mem, 0, req);
-  return mem;
-}
-
-void* mspace_realloc(mspace msp, void* oldmem, size_t bytes) {
-  if (oldmem == 0)
-    return mspace_malloc(msp, bytes);
-#ifdef REALLOC_ZERO_BYTES_FREES
-  if (bytes == 0) {
-    mspace_free(msp, oldmem);
-    return 0;
-  }
-#endif /* REALLOC_ZERO_BYTES_FREES */
-  else {
-#if FOOTERS
-    mchunkptr p  = mem2chunk(oldmem);
-    mstate ms = get_mstate_for(p);
-#else /* FOOTERS */
-    mstate ms = (mstate)msp;
-#endif /* FOOTERS */
-    if (!ok_magic(ms)) {
-      USAGE_ERROR_ACTION(ms,ms);
-      return 0;
-    }
-    return internal_realloc(ms, oldmem, bytes);
-  }
-}
-
-void* mspace_memalign(mspace msp, size_t alignment, size_t bytes) {
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-    return 0;
-  }
-  return internal_memalign(ms, alignment, bytes);
-}
-
-void** mspace_independent_calloc(mspace msp, size_t n_elements,
-                                 size_t elem_size, void* chunks[]) {
-  size_t sz = elem_size; /* serves as 1-element array */
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-    return 0;
-  }
-  return ialloc(ms, n_elements, &sz, 3, chunks);
-}
-
-void** mspace_independent_comalloc(mspace msp, size_t n_elements,
-                                   size_t sizes[], void* chunks[]) {
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-    return 0;
-  }
-  return ialloc(ms, n_elements, sizes, 0, chunks);
-}
-
-int mspace_trim(mspace msp, size_t pad) {
-  int result = 0;
-  mstate ms = (mstate)msp;
-  if (ok_magic(ms)) {
-    if (!PREACTION(ms)) {
-      result = sys_trim(ms, pad);
-      POSTACTION(ms);
-    }
-  }
-  else {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-  return result;
-}
-
-void mspace_malloc_stats(mspace msp) {
-  mstate ms = (mstate)msp;
-  if (ok_magic(ms)) {
-    internal_malloc_stats(ms);
-  }
-  else {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-}
-
-size_t mspace_footprint(mspace msp) {
-  size_t result = 0;
-  mstate ms = (mstate)msp;
-  if (ok_magic(ms)) {
-    result = ms->footprint;
-  }
-  else {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-  return result;
-}
-
-
-size_t mspace_max_footprint(mspace msp) {
-  size_t result = 0;
-  mstate ms = (mstate)msp;
-  if (ok_magic(ms)) {
-    result = ms->max_footprint;
-  }
-  else {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-  return result;
-}
-
-
-#if !NO_MALLINFO
-struct mallinfo mspace_mallinfo(mspace msp) {
-  mstate ms = (mstate)msp;
-  if (!ok_magic(ms)) {
-    USAGE_ERROR_ACTION(ms,ms);
-  }
-  return internal_mallinfo(ms);
-}
-#endif /* NO_MALLINFO */
-
-size_t mspace_usable_size(void* mem) {
-  if (mem != 0) {
-    mchunkptr p = mem2chunk(mem);
-    if (is_inuse(p))
-      return chunksize(p) - overhead_for(p);
-  }
-  return 0;
-}
-
-int mspace_mallopt(int param_number, int value) {
-  return change_mparam(param_number, value);
-}
-
-#endif /* MSPACES */
-
-
-/* -------------------- Alternative MORECORE functions ------------------- */
-
-/*
-  Guidelines for creating a custom version of MORECORE:
-
-  * For best performance, MORECORE should allocate in multiples of pagesize.
-  * MORECORE may allocate more memory than requested. (Or even less,
-      but this will usually result in a malloc failure.)
-  * MORECORE must not allocate memory when given argument zero, but
-      instead return one past the end address of memory from previous
-      nonzero call.
-  * For best performance, consecutive calls to MORECORE with positive
-      arguments should return increasing addresses, indicating that
-      space has been contiguously extended.
-  * Even though consecutive calls to MORECORE need not return contiguous
-      addresses, it must be OK for malloc'ed chunks to span multiple
-      regions in those cases where they do happen to be contiguous.
-  * MORECORE need not handle negative arguments -- it may instead
-      just return MFAIL when given negative arguments.
-      Negative arguments are always multiples of pagesize. MORECORE
-      must not misinterpret negative args as large positive unsigned
-      args. You can suppress all such calls from even occurring by defining
-      MORECORE_CANNOT_TRIM.
-
-  As an example alternative MORECORE, here is a custom allocator
-  kindly contributed for pre-OSX macOS.  It uses virtually but not
-  necessarily physically contiguous non-paged memory (locked in,
-  present and won't get swapped out).  You can use it by uncommenting
-  this section, adding some #includes, and setting up the appropriate
-  defines above:
-
-      #define MORECORE osMoreCore
-
-  There is also a shutdown routine that should somehow be called for
-  cleanup upon program exit.
-
-  #define MAX_POOL_ENTRIES 100
-  #define MINIMUM_MORECORE_SIZE  (64 * 1024U)
-  static int next_os_pool;
-  void *our_os_pools[MAX_POOL_ENTRIES];
-
-  void *osMoreCore(int size)
-  {
-    void *ptr = 0;
-    static void *sbrk_top = 0;
-
-    if (size > 0)
-    {
-      if (size < MINIMUM_MORECORE_SIZE)
-         size = MINIMUM_MORECORE_SIZE;
-      if (CurrentExecutionLevel() == kTaskLevel)
-         ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0);
-      if (ptr == 0)
-      {
-        return (void *) MFAIL;
-      }
-      // save ptrs so they can be freed during cleanup
-      our_os_pools[next_os_pool] = ptr;
-      next_os_pool++;
-      ptr = (void *) ((((size_t) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK);
-      sbrk_top = (char *) ptr + size;
-      return ptr;
-    }
-    else if (size < 0)
-    {
-      // we don't currently support shrink behavior
-      return (void *) MFAIL;
-    }
-    else
-    {
-      return sbrk_top;
-    }
-  }
-
-  // cleanup any allocated memory pools
-  // called as last thing before shutting down driver
-
-  void osCleanupMem(void)
-  {
-    void **ptr;
-
-    for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++)
-      if (*ptr)
-      {
-         PoolDeallocate(*ptr);
-         *ptr = 0;
-      }
-  }
-
-*/
-
-
-/* -----------------------------------------------------------------------
-History:
-    V2.8.4 Wed May 27 09:56:23 2009  Doug Lea  (dl at gee)
-      * Use zeros instead of prev foot for is_mmapped
-      * Add mspace_track_large_chunks; thanks to Jean Brouwers
-      * Fix set_inuse in internal_realloc; thanks to Jean Brouwers
-      * Fix insufficient sys_alloc padding when using 16byte alignment
-      * Fix bad error check in mspace_footprint
-      * Adaptations for ptmalloc; thanks to Wolfram Gloger.
-      * Reentrant spin locks; thanks to Earl Chew and others
-      * Win32 improvements; thanks to Niall Douglas and Earl Chew
-      * Add NO_SEGMENT_TRAVERSAL and MAX_RELEASE_CHECK_RATE options
-      * Extension hook in malloc_state
-      * Various small adjustments to reduce warnings on some compilers
-      * Various configuration extensions/changes for more platforms. Thanks
-         to all who contributed these.
-
-    V2.8.3 Thu Sep 22 11:16:32 2005  Doug Lea  (dl at gee)
-      * Add max_footprint functions
-      * Ensure all appropriate literals are size_t
-      * Fix conditional compilation problem for some #define settings
-      * Avoid concatenating segments with the one provided
-        in create_mspace_with_base
-      * Rename some variables to avoid compiler shadowing warnings
-      * Use explicit lock initialization.
-      * Better handling of sbrk interference.
-      * Simplify and fix segment insertion, trimming and mspace_destroy
-      * Reinstate REALLOC_ZERO_BYTES_FREES option from 2.7.x
-      * Thanks especially to Dennis Flanagan for help on these.
-
-    V2.8.2 Sun Jun 12 16:01:10 2005  Doug Lea  (dl at gee)
-      * Fix memalign brace error.
-
-    V2.8.1 Wed Jun  8 16:11:46 2005  Doug Lea  (dl at gee)
-      * Fix improper #endif nesting in C++
-      * Add explicit casts needed for C++
-
-    V2.8.0 Mon May 30 14:09:02 2005  Doug Lea  (dl at gee)
-      * Use trees for large bins
-      * Support mspaces
-      * Use segments to unify sbrk-based and mmap-based system allocation,
-        removing need for emulation on most platforms without sbrk.
-      * Default safety checks
-      * Optional footer checks. Thanks to William Robertson for the idea.
-      * Internal code refactoring
-      * Incorporate suggestions and platform-specific changes.
-        Thanks to Dennis Flanagan, Colin Plumb, Niall Douglas,
-        Aaron Bachmann,  Emery Berger, and others.
-      * Speed up non-fastbin processing enough to remove fastbins.
-      * Remove useless cfree() to avoid conflicts with other apps.
-      * Remove internal memcpy, memset. Compilers handle builtins better.
-      * Remove some options that no one ever used and rename others.
-
-    V2.7.2 Sat Aug 17 09:07:30 2002  Doug Lea  (dl at gee)
-      * Fix malloc_state bitmap array misdeclaration
-
-    V2.7.1 Thu Jul 25 10:58:03 2002  Doug Lea  (dl at gee)
-      * Allow tuning of FIRST_SORTED_BIN_SIZE
-      * Use PTR_UINT as type for all ptr->int casts. Thanks to John Belmonte.
-      * Better detection and support for non-contiguousness of MORECORE.
-        Thanks to Andreas Mueller, Conal Walsh, and Wolfram Gloger
-      * Bypass most of malloc if no frees. Thanks To Emery Berger.
-      * Fix freeing of old top non-contiguous chunk in sysmalloc.
-      * Raised default trim and map thresholds to 256K.
-      * Fix mmap-related #defines. Thanks to Lubos Lunak.
-      * Fix copy macros; added LACKS_FCNTL_H. Thanks to Neal Walfield.
-      * Branch-free bin calculation
-      * Default trim and mmap thresholds now 256K.
-
-    V2.7.0 Sun Mar 11 14:14:06 2001  Doug Lea  (dl at gee)
-      * Introduce independent_comalloc and independent_calloc.
-        Thanks to Michael Pachos for motivation and help.
-      * Make optional .h file available
-      * Allow > 2GB requests on 32bit systems.
-      * new WIN32 sbrk, mmap, munmap, lock code from <[email protected]>.
-        Thanks also to Andreas Mueller <a.mueller at paradatec.de>,
-        and Anonymous.
-      * Allow override of MALLOC_ALIGNMENT (Thanks to Ruud Waij for
-        helping test this.)
-      * memalign: check alignment arg
-      * realloc: don't try to shift chunks backwards, since this
-        leads to  more fragmentation in some programs and doesn't
-        seem to help in any others.
-      * Collect all cases in malloc requiring system memory into sysmalloc
-      * Use mmap as backup to sbrk
-      * Place all internal state in malloc_state
-      * Introduce fastbins (although similar to 2.5.1)
-      * Many minor tunings and cosmetic improvements
-      * Introduce USE_PUBLIC_MALLOC_WRAPPERS, USE_MALLOC_LOCK
-      * Introduce MALLOC_FAILURE_ACTION, MORECORE_CONTIGUOUS
-        Thanks to Tony E. Bennett <[email protected]> and others.
-      * Include errno.h to support default failure action.
-
-    V2.6.6 Sun Dec  5 07:42:19 1999  Doug Lea  (dl at gee)
-      * return null for negative arguments
-      * Added several WIN32 cleanups from Martin C. Fong <mcfong at yahoo.com>
-         * Add 'LACKS_SYS_PARAM_H' for those systems without 'sys/param.h'
-          (e.g. WIN32 platforms)
-         * Cleanup header file inclusion for WIN32 platforms
-         * Cleanup code to avoid Microsoft Visual C++ compiler complaints
-         * Add 'USE_DL_PREFIX' to quickly allow co-existence with existing
-           memory allocation routines
-         * Set 'malloc_getpagesize' for WIN32 platforms (needs more work)
-         * Use 'assert' rather than 'ASSERT' in WIN32 code to conform to
-           usage of 'assert' in non-WIN32 code
-         * Improve WIN32 'sbrk()' emulation's 'findRegion()' routine to
-           avoid infinite loop
-      * Always call 'fREe()' rather than 'free()'
-
-    V2.6.5 Wed Jun 17 15:57:31 1998  Doug Lea  (dl at gee)
-      * Fixed ordering problem with boundary-stamping
-
-    V2.6.3 Sun May 19 08:17:58 1996  Doug Lea  (dl at gee)
-      * Added pvalloc, as recommended by H.J. Liu
-      * Added 64bit pointer support mainly from Wolfram Gloger
-      * Added anonymously donated WIN32 sbrk emulation
-      * Malloc, calloc, getpagesize: add optimizations from Raymond Nijssen
-      * malloc_extend_top: fix mask error that caused wastage after
-        foreign sbrks
-      * Add linux mremap support code from HJ Liu
-
-    V2.6.2 Tue Dec  5 06:52:55 1995  Doug Lea  (dl at gee)
-      * Integrated most documentation with the code.
-      * Add support for mmap, with help from
-        Wolfram Gloger ([email protected]).
-      * Use last_remainder in more cases.
-      * Pack bins using idea from  [email protected]
-      * Use ordered bins instead of best-fit threshold
-      * Eliminate block-local decls to simplify tracing and debugging.
-      * Support another case of realloc via move into top
-      * Fix error occurring when initial sbrk_base not word-aligned.
-      * Rely on page size for units instead of SBRK_UNIT to
-        avoid surprises about sbrk alignment conventions.
-      * Add mallinfo, mallopt. Thanks to Raymond Nijssen
-        ([email protected]) for the suggestion.
-      * Add `pad' argument to malloc_trim and top_pad mallopt parameter.
-      * More precautions for cases where other routines call sbrk,
-        courtesy of Wolfram Gloger ([email protected]).
-      * Added macros etc., allowing use in linux libc from
-        H.J. Lu ([email protected])
-      * Inverted this history list
-
-    V2.6.1 Sat Dec  2 14:10:57 1995  Doug Lea  (dl at gee)
-      * Re-tuned and fixed to behave more nicely with V2.6.0 changes.
-      * Removed all preallocation code since under current scheme
-        the work required to undo bad preallocations exceeds
-        the work saved in good cases for most test programs.
-      * No longer use return list or unconsolidated bins since
-        no scheme using them consistently outperforms those that don't
-        given above changes.
-      * Use best fit for very large chunks to prevent some worst-cases.
-      * Added some support for debugging
-
-    V2.6.0 Sat Nov  4 07:05:23 1995  Doug Lea  (dl at gee)
-      * Removed footers when chunks are in use. Thanks to
-        Paul Wilson ([email protected]) for the suggestion.
-
-    V2.5.4 Wed Nov  1 07:54:51 1995  Doug Lea  (dl at gee)
-      * Added malloc_trim, with help from Wolfram Gloger
-        ([email protected]).
-
-    V2.5.3 Tue Apr 26 10:16:01 1994  Doug Lea  (dl at g)
-
-    V2.5.2 Tue Apr  5 16:20:40 1994  Doug Lea  (dl at g)
-      * realloc: try to expand in both directions
-      * malloc: swap order of clean-bin strategy;
-      * realloc: only conditionally expand backwards
-      * Try not to scavenge used bins
-      * Use bin counts as a guide to preallocation
-      * Occasionally bin return list chunks in first scan
-      * Add a few optimizations from [email protected]
-
-    V2.5.1 Sat Aug 14 15:40:43 1993  Doug Lea  (dl at g)
-      * faster bin computation & slightly different binning
-      * merged all consolidations to one part of malloc proper
-         (eliminating old malloc_find_space & malloc_clean_bin)
-      * Scan 2 returns chunks (not just 1)
-      * Propagate failure in realloc if malloc returns 0
-      * Add stuff to allow compilation on non-ANSI compilers
-          from [email protected]
-
-    V2.5 Sat Aug  7 07:41:59 1993  Doug Lea  (dl at g.oswego.edu)
-      * removed potential for odd address access in prev_chunk
-      * removed dependency on getpagesize.h
-      * misc cosmetics and a bit more internal documentation
-      * anticosmetics: mangled names in macros to evade debugger strangeness
-      * tested on sparc, hp-700, dec-mips, rs6000
-          with gcc & native cc (hp, dec only) allowing
-          Detlefs & Zorn comparison study (in SIGPLAN Notices.)
-
-    Trial version Fri Aug 28 13:14:29 1992  Doug Lea  (dl at g.oswego.edu)
-      * Based loosely on libg++-1.2X malloc. (It retains some of the overall
-         structure of old version,  but most details differ.)
-
-*/
-
-#endif
+#ifdef NEDMALLOC_ENABLED
+/*
+  This is a version (aka dlmalloc) of malloc/free/realloc written by
+  Doug Lea and released to the public domain, as explained at
+  http://creativecommons.org/licenses/publicdomain.  Send questions,
+  comments, complaints, performance data, etc to [email protected]
+
+* Version 2.8.4 Wed May 27 09:56:23 2009  Doug Lea  (dl at gee)
+
+   Note: There may be an updated version of this malloc obtainable at
+           ftp://gee.cs.oswego.edu/pub/misc/malloc.c
+         Check before installing!
+
+* Quickstart
+
+  This library is all in one file to simplify the most common usage:
+  ftp it, compile it (-O3), and link it into another program. All of
+  the compile-time options default to reasonable values for use on
+  most platforms.  You might later want to step through various
+  compile-time and dynamic tuning options.
+
+  For convenience, an include file for code using this malloc is at:
+     ftp://gee.cs.oswego.edu/pub/misc/malloc-2.8.4.h
+  You don't really need this .h file unless you call functions not
+  defined in your system include files.  The .h file contains only the
+  excerpts from this file needed for using this malloc on ANSI C/C++
+  systems, so long as you haven't changed compile-time options about
+  naming and tuning parameters.  If you do, then you can create your
+  own malloc.h that does include all settings by cutting at the point
+  indicated below. Note that you may already, by default, be using a C
+  library containing a malloc that is based on some version of this
+  malloc (for example, on linux). You might still want to use the one
+  in this file to customize settings or to avoid overheads associated
+  with library versions.
+
+* Vital statistics:
+
+  Supported pointer/size_t representation:       4 or 8 bytes
+       size_t MUST be an unsigned type of the same width as
+       pointers. (If you are using an ancient system that declares
+       size_t as a signed type, or need it to be a different width
+       than pointers, you can use a previous release of this malloc
+       (e.g. 2.7.2) supporting these.)
+
+  Alignment:                                     8 bytes (default)
+       This suffices for nearly all current machines and C compilers.
+       However, you can define MALLOC_ALIGNMENT to be wider than this
+       if necessary (up to 128 bytes), at the expense of using more space.
+
+  Minimum overhead per allocated chunk:   4 or  8 bytes (if 4byte sizes)
+                                          8 or 16 bytes (if 8byte sizes)
+       Each malloced chunk has a hidden word of overhead holding size
+       and status information, and additional cross-check word
+       if FOOTERS is defined.
+
+  Minimum allocated size: 4-byte ptrs:  16 bytes    (including overhead)
+                          8-byte ptrs:  32 bytes    (including overhead)
+
+       Even a request for zero bytes (i.e., malloc(0)) returns a
+       pointer to something of the minimum allocatable size.
+       The maximum overhead wastage (i.e., the number of extra bytes
+       allocated beyond what was requested in malloc) is less than or equal
+       to the minimum size, except for requests >= mmap_threshold that
+       are serviced via mmap(), where the worst case wastage is about
+       32 bytes plus the remainder from a system page (the minimal
+       mmap unit); typically 4096 or 8192 bytes.
+
+  Security: static-safe; optionally more or less
+       The "security" of malloc refers to the ability of malicious
+       code to accentuate the effects of errors (for example, freeing
+       space that is not currently malloc'ed or overwriting past the
+       ends of chunks) in code that calls malloc.  This malloc
+       guarantees not to modify any memory locations below the base of
+       heap, i.e., static variables, even in the presence of usage
+       errors.  The routines additionally detect most improper frees
+       and reallocs.  All this holds as long as the static bookkeeping
+       for malloc itself is not corrupted by some other means.  This
+       is only one aspect of security -- these checks do not, and
+       cannot, detect all possible programming errors.
+
+       If FOOTERS is defined nonzero, then each allocated chunk
+       carries an additional check word to verify that it was malloced
+       from its space.  These check words are the same within each
+       execution of a program using malloc, but differ across
+       executions, so externally crafted fake chunks cannot be
+       freed. This improves security by rejecting frees/reallocs that
+       could corrupt heap memory, in addition to the checks preventing
+       writes to statics that are always on.  This may further improve
+       security at the expense of time and space overhead.  (Note that
+       FOOTERS may also be worth using with MSPACES.)
+
+       By default detected errors cause the program to abort (calling
+       "abort()"). You can override this to instead proceed past
+       errors by defining PROCEED_ON_ERROR.  In this case, a bad free
+       has no effect, and a malloc that encounters a bad address
+       caused by user overwrites will ignore the bad address by
+       dropping pointers and indices to all known memory. This may
+       be appropriate for programs that should continue if at all
+       possible in the face of programming errors, although they may
+       run out of memory because dropped memory is never reclaimed.
+
+       If you don't like either of these options, you can define
+       CORRUPTION_ERROR_ACTION and USAGE_ERROR_ACTION to do anything
+       else. And if you are sure that your program using malloc has
+       no errors or vulnerabilities, you can define INSECURE to 1,
+       which might (or might not) provide a small performance improvement.
+
+  Thread-safety: NOT thread-safe unless USE_LOCKS defined
+       When USE_LOCKS is defined, each public call to malloc, free,
+       etc is surrounded with either a pthread mutex or a win32
+       spinlock (depending on WIN32). This is not especially fast, and
+       can be a major bottleneck.  It is designed only to provide
+       minimal protection in concurrent environments, and to provide a
+       basis for extensions.  If you are using malloc in a concurrent
+       program, consider instead using nedmalloc
+       (http://www.nedprod.com/programs/portable/nedmalloc/) or
+       ptmalloc (See http://www.malloc.de), which are derived
+       from versions of this malloc.
+
+  System requirements: Any combination of MORECORE and/or MMAP/MUNMAP
+       This malloc can use unix sbrk or any emulation (invoked using
+       the CALL_MORECORE macro) and/or mmap/munmap or any emulation
+       (invoked using CALL_MMAP/CALL_MUNMAP) to get and release system
+       memory.  On most unix systems, it tends to work best if both
+       MORECORE and MMAP are enabled.  On Win32, it uses emulations
+       based on VirtualAlloc. It also uses common C library functions
+       like memset.
+
+  Compliance: I believe it is compliant with the Single Unix Specification
+       (See http://www.unix.org). Also SVID/XPG, ANSI C, and probably
+       others as well.
+
+* Overview of algorithms
+
+  This is not the fastest, most space-conserving, most portable, or
+  most tunable malloc ever written. However it is among the fastest
+  while also being among the most space-conserving, portable and
+  tunable.  Consistent balance across these factors results in a good
+  general-purpose allocator for malloc-intensive programs.
+
+  In most ways, this malloc is a best-fit allocator. Generally, it
+  chooses the best-fitting existing chunk for a request, with ties
+  broken in approximately least-recently-used order. (This strategy
+  normally maintains low fragmentation.) However, for requests less
+  than 256 bytes, it deviates from best-fit when there is not an
+  exactly fitting available chunk by preferring to use space adjacent
+  to that used for the previous small request, as well as by breaking
+  ties in approximately most-recently-used order. (These enhance
+  locality of series of small allocations.)  And for very large requests
+  (>= 256Kb by default), it relies on system memory mapping
+  facilities, if supported.  (This helps avoid carrying around and
+  possibly fragmenting memory used only for large chunks.)
+
+  All operations (except malloc_stats and mallinfo) have execution
+  times that are bounded by a constant factor of the number of bits in
+  a size_t, not counting any clearing in calloc or copying in realloc,
+  or actions surrounding MORECORE and MMAP that have times
+  proportional to the number of non-contiguous regions returned by
+  system allocation routines, which is often just 1. In real-time
+  applications, you can optionally suppress segment traversals using
+  NO_SEGMENT_TRAVERSAL, which assures bounded execution even when
+  system allocators return non-contiguous spaces, at the typical
+  expense of carrying around more memory and increased fragmentation.
+
+  The implementation is not very modular and seriously overuses
+  macros. Perhaps someday all C compilers will do as good a job
+  inlining modular code as can now be done by brute-force expansion,
+  but now, enough of them seem not to.
+
+  Some compilers issue a lot of warnings about code that is
+  dead/unreachable only on some platforms, and also about intentional
+  uses of negation on unsigned types. All known cases of each can be
+  ignored.
+
+  For a longer but out of date high-level description, see
+     http://gee.cs.oswego.edu/dl/html/malloc.html
+
+* MSPACES
+  If MSPACES is defined, then in addition to malloc, free, etc.,
+  this file also defines mspace_malloc, mspace_free, etc. These
+  are versions of malloc routines that take an "mspace" argument
+  obtained using create_mspace, to control all internal bookkeeping.
+  If ONLY_MSPACES is defined, only these versions are compiled.
+  So if you would like to use this allocator for only some allocations,
+  and your system malloc for others, you can compile with
+  ONLY_MSPACES and then do something like...
+    static mspace mymspace = create_mspace(0,0); // for example
+    #define mymalloc(bytes)  mspace_malloc(mymspace, bytes)
+
+  (Note: If you only need one instance of an mspace, you can instead
+  use "USE_DL_PREFIX" to relabel the global malloc.)
+
+  You can similarly create thread-local allocators by storing
+  mspaces as thread-locals. For example:
+    static __thread mspace tlms = 0;
+    void*  tlmalloc(size_t bytes) {
+      if (tlms == 0) tlms = create_mspace(0, 0);
+      return mspace_malloc(tlms, bytes);
+    }
+    void  tlfree(void* mem) { mspace_free(tlms, mem); }
+
+  Unless FOOTERS is defined, each mspace is completely independent.
+  You cannot allocate from one and free to another (although
+  conformance is only weakly checked, so usage errors are not always
+  caught). If FOOTERS is defined, then each chunk carries around a tag
+  indicating its originating mspace, and frees are directed to their
+  originating spaces.
+
+ -------------------------  Compile-time options ---------------------------
+
+Be careful in setting #define values for numerical constants of type
+size_t. On some systems, literal values are not automatically extended
+to size_t precision unless they are explicitly cast. You can also
+use the symbolic values MAX_SIZE_T, SIZE_T_ONE, etc below.
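+
+For instance, a minimal sketch of defining such a constant with explicit
+size_t casts (MY_THRESHOLD is a hypothetical name; the pattern mirrors
+the defaults used later in this file):
+
+    #define MY_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U)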
+
+WIN32                    default: defined if _WIN32 defined
+  Defining WIN32 sets up defaults for MS environment and compilers.
+  Otherwise defaults are for unix. Beware that there seem to be some
+  cases where this malloc might not be a pure drop-in replacement for
+  Win32 malloc: Random-looking failures from Win32 GDI APIs (e.g.,
+  SetDIBits()) may be due to bugs in some video driver implementations
+  when pixel buffers are malloc()ed, and the region spans more than
+  one VirtualAlloc()ed region. Because dlmalloc uses a small (64Kb)
+  default granularity, pixel buffers may straddle virtual allocation
+  regions more often than when using the Microsoft allocator.  You can
+  avoid this by using VirtualAlloc() and VirtualFree() for all pixel
+  buffers rather than using malloc().  If this is not possible,
+  recompile this malloc with a larger DEFAULT_GRANULARITY.
+
+MALLOC_ALIGNMENT         default: (size_t)8
+  Controls the minimum alignment for malloc'ed chunks.  It must be a
+  power of two and at least 8, even on machines for which smaller
+  alignments would suffice. It may be defined as larger than this
+  though. Note however that code and data structures are optimized for
+  the case of 8-byte alignment.
+
+MSPACES                  default: 0 (false)
+  If true, compile in support for independent allocation spaces.
+  This is only supported if HAVE_MMAP is true.
+
+ONLY_MSPACES             default: 0 (false)
+  If true, only compile in mspace versions, not regular versions.
+
+USE_LOCKS                default: 0 (false)
+  Causes each call to each public routine to be surrounded with
+  pthread or WIN32 mutex lock/unlock. (If set true, this can be
+  overridden on a per-mspace basis for mspace versions.) If set to a
+  non-zero value other than 1, locks are used, but their
+  implementation is left out, so lock functions must be supplied manually,
+  as described below.
+
+USE_SPIN_LOCKS           default: 1 iff USE_LOCKS and on x86 using gcc or MSC
+  If true, uses custom spin locks for locking. This is currently
+  supported only for x86 platforms using gcc or recent MS compilers.
+  Otherwise, posix locks or win32 critical sections are used.
+
+FOOTERS                  default: 0
+  If true, provide extra checking and dispatching by placing
+  information in the footers of allocated chunks. This adds
+  space and time overhead.
+
+INSECURE                 default: 0
+  If true, omit checks for usage errors and heap space overwrites.
+
+USE_DL_PREFIX            default: NOT defined
+  Causes compiler to prefix all public routines with the string 'dl'.
+  This can be useful when you only want to use this malloc in one part
+  of a program, using your regular system malloc elsewhere.
+
+ABORT                    default: defined as abort()
+  Defines how to abort on failed checks.  On most systems, a failed
+  check cannot die with an "assert" or even print an informative
+  message, because the underlying print routines in turn call malloc,
+  which will fail again.  Generally, the best policy is to simply call
+  abort(). It's not very useful to do more than this because many
+  errors due to overwriting will show up as address faults (null, odd
+  addresses etc) rather than malloc-triggered checks, so will also
+  abort.  Also, most compilers know that abort() does not return, so
+  can better optimize code conditionally calling it.
+
+PROCEED_ON_ERROR           default: defined as 0 (false)
+  Controls whether detected bad addresses cause them to be bypassed
+  rather than aborting. If set, detected bad arguments to free and
+  realloc are ignored. And all bookkeeping information is zeroed out
+  upon a detected overwrite of freed heap space, thus losing the
+  ability to ever return it from malloc again, but enabling the
+  application to proceed. If PROCEED_ON_ERROR is defined, the
+  static variable malloc_corruption_error_count is compiled in
+  and can be examined to see if errors have occurred. This option
+  generates slower code than the default abort policy.
+
+DEBUG                    default: NOT defined
+  The DEBUG setting is mainly intended for people trying to modify
+  this code or diagnose problems when porting to new platforms.
+  However, it may also be able to better isolate user errors than just
+  using runtime checks.  The assertions in the check routines spell
+  out in more detail the assumptions and invariants underlying the
+  algorithms.  The checking is fairly extensive, and will slow down
+  execution noticeably. Calling malloc_stats or mallinfo with DEBUG
+  set will attempt to check every non-mmapped allocated and free chunk
+  in the course of computing the summaries.
+
+ABORT_ON_ASSERT_FAILURE   default: defined as 1 (true)
+  Debugging assertion failures can be nearly impossible if your
+  version of the assert macro causes malloc to be called, which will
+  lead to a cascade of further failures, blowing the runtime stack.
+  ABORT_ON_ASSERT_FAILURE causes assertion failures to call abort(),
+  which will usually make debugging easier.
+
+MALLOC_FAILURE_ACTION     default: sets errno to ENOMEM, or no-op on win32
+  The action to take before "return 0" when malloc fails because
+  no memory is available.
+
+HAVE_MORECORE             default: 1 (true) unless win32 or ONLY_MSPACES
+  True if this system supports sbrk or an emulation of it.
+
+MORECORE                  default: sbrk
+  The name of the sbrk-style system routine to call to obtain more
+  memory.  See below for guidance on writing custom MORECORE
+  functions. The type of the argument to sbrk/MORECORE varies across
+  systems.  It cannot be size_t, because it supports negative
+  arguments, so it is normally the signed type of the same width as
+  size_t (sometimes declared as "intptr_t").  It doesn't much matter
+  though. Internally, we only call it with arguments less than half
+  the max value of a size_t, which should work across all reasonable
+  possibilities, although sometimes generating compiler warnings.
+
+MORECORE_CONTIGUOUS       default: 1 (true) if HAVE_MORECORE
+  If true, take advantage of fact that consecutive calls to MORECORE
+  with positive arguments always return contiguous increasing
+  addresses.  This is true of unix sbrk. It does not hurt too much to
+  set it true anyway, since malloc copes with non-contiguities.
+  Setting it false when MORECORE is definitely non-contiguous saves the
+  time and possibly wasted space it would otherwise take to discover
+  this, though.
+
+MORECORE_CANNOT_TRIM      default: NOT defined
+  True if MORECORE cannot release space back to the system when given
+  negative arguments. This is generally necessary only if you are
+  using a hand-crafted MORECORE function that cannot handle negative
+  arguments.
+
+NO_SEGMENT_TRAVERSAL       default: 0
+  If non-zero, suppresses traversals of memory segments
+  returned by either MORECORE or CALL_MMAP. This disables
+  merging of segments that are contiguous, and selectively
+  releasing them to the OS if unused, but bounds execution times.
+
+HAVE_MMAP                 default: 1 (true)
+  True if this system supports mmap or an emulation of it.  If so, and
+  HAVE_MORECORE is not true, MMAP is used for all system
+  allocation. If set and HAVE_MORECORE is true as well, MMAP is
+  primarily used to directly allocate very large blocks. It is also
+  used as a backup strategy in cases where MORECORE fails to provide
+  space from system. Note: A single call to MUNMAP is assumed to be
+  able to unmap memory that may have been allocated using multiple calls
+  to MMAP, so long as they are adjacent.
+
+HAVE_MREMAP               default: 1 on linux, else 0
+  If true realloc() uses mremap() to re-allocate large blocks and
+  extend or shrink allocation spaces.
+
+MMAP_CLEARS               default: 1 except on WINCE.
+  True if mmap clears memory so calloc doesn't need to. This is true
+  for standard unix mmap using /dev/zero and on WIN32 except for WINCE.
+
+USE_BUILTIN_FFS            default: 0 (i.e., not used)
+  Causes malloc to use the builtin ffs() function to compute indices.
+  Some compilers may recognize and intrinsify ffs to be faster than the
+  supplied C version. Also, the case of x86 using gcc is special-cased
+  to an asm instruction, so is already as fast as it can be, and so
+  this setting has no effect. Similarly for Win32 under recent MS compilers.
+  (On most x86s, the asm version is only slightly faster than the C version.)
+
+malloc_getpagesize         default: derive from system includes, or 4096.
+  The system page size. To the extent possible, this malloc manages
+  memory from the system in page-size units.  This may be (and
+  usually is) a function rather than a constant. This is ignored
+  if WIN32, where page size is determined using GetSystemInfo during
+  initialization. This may be several megabytes if ENABLE_LARGE_PAGES
+  is enabled.
+
+ENABLE_LARGE_PAGES         default: NOT defined
+  Causes the system page size to be the value of GetLargePageMinimum()
+  if that function is available (Windows Server 2003/Vista or later).
+  This allows the use of large page entries in the MMU which can
+  significantly improve performance in large working set applications
+  as TLB cache load is reduced by a factor of three. Note that enabling
+  this option is equivalent to locking the process's memory in current
+  implementations of Windows and requires the SE_LOCK_MEMORY_PRIVILEGE
+  to be held by the process in order to succeed.
+
+USE_DEV_RANDOM             default: 0 (i.e., not used)
+  Causes malloc to use /dev/random to initialize secure magic seed for
+  stamping footers. Otherwise, the current time is used.
+
+NO_MALLINFO                default: 0
+  If defined, don't compile "mallinfo". This can be a simple way
+  of dealing with mismatches between system declarations and
+  those in this file.
+
+MALLINFO_FIELD_TYPE        default: size_t
+  The type of the fields in the mallinfo struct. This was originally
+  defined as "int" in SVID etc, but is more usefully defined as
+  size_t. The value is used only if HAVE_USR_INCLUDE_MALLOC_H is not set.
+
+REALLOC_ZERO_BYTES_FREES    default: not defined
+  This should be set if a call to realloc with zero bytes should
+  be the same as a call to free. Some people think it should. Otherwise,
+  since this malloc returns a unique pointer for malloc(0), so does
+  realloc(p, 0).
+
+LACKS_UNISTD_H, LACKS_FCNTL_H, LACKS_SYS_PARAM_H, LACKS_SYS_MMAN_H
+LACKS_STRINGS_H, LACKS_STRING_H, LACKS_SYS_TYPES_H,  LACKS_ERRNO_H
+LACKS_STDLIB_H                default: NOT defined unless on WIN32
+  Define these if your system does not have these header files.
+  You might need to manually insert some of the declarations they provide.
+
+DEFAULT_GRANULARITY        default: page size if MORECORE_CONTIGUOUS,
+                                system_info.dwAllocationGranularity in WIN32,
+                                GetLargePageMinimum() if ENABLE_LARGE_PAGES,
+                                otherwise 64K.
+      Also settable using mallopt(M_GRANULARITY, x)
+  The unit for allocating and deallocating memory from the system.  On
+  most systems with contiguous MORECORE, there is no reason to
+  make this more than a page. However, systems with MMAP tend to
+  either require or encourage larger granularities.  You can increase
+  this value to prevent system allocation functions from being called so
+  often, especially if they are slow.  The value must be at least one
+  page and must be a power of two.  Setting to 0 causes initialization
+  to either page size or win32 region size.  (Note: In previous
+  versions of malloc, the equivalent of this option was called
+  "TOP_PAD")
+
+DEFAULT_GRANULARITY_ALIGNED default: undefined (which means page size)
+  Whether to enforce alignment when allocating and deallocating memory
+  from the system i.e. the base address of all allocations will be
+  aligned to DEFAULT_GRANULARITY if it is set. Note that enabling this carries
+  some overhead as multiple calls must now be made when probing for a valid
+  aligned value; however, it greatly eases checking whether
+  a given memory pointer was allocated by this allocator rather than
+  some other.
+
+DEFAULT_TRIM_THRESHOLD    default: 2MB
+      Also settable using mallopt(M_TRIM_THRESHOLD, x)
+  The maximum amount of unused top-most memory to keep before
+  releasing via malloc_trim in free().  Automatic trimming is mainly
+  useful in long-lived programs using contiguous MORECORE.  Because
+  trimming via sbrk can be slow on some systems, and can sometimes be
+  wasteful (in cases where programs immediately afterward allocate
+  more large chunks) the value should be high enough so that your
+  overall system performance would improve by releasing this much
+  memory.  As a rough guide, you might set to a value close to the
+  average size of a process (program) running on your system.
+  Releasing this much memory would allow such a process to run in
+  memory.  Generally, it is worth tuning trim thresholds when a
+  program undergoes phases where several large chunks are allocated
+  and released in ways that can reuse each other's storage, perhaps
+  mixed with phases where there are no such chunks at all. The trim
+  value must be greater than page size to have any useful effect.  To
+  disable trimming completely, you can set to MAX_SIZE_T. Note that the trick
+  some people use of mallocing a huge space and then freeing it at
+  program startup, in an attempt to reserve system memory, doesn't
+  have the intended effect under automatic trimming, since that memory
+  will immediately be returned to the system.
+
+DEFAULT_MMAP_THRESHOLD       default: 256K
+      Also settable using mallopt(M_MMAP_THRESHOLD, x)
+  The request size threshold for using MMAP to directly service a
+  request. Requests of at least this size that cannot be allocated
+  using already-existing space will be serviced via mmap.  (If enough
+  normal freed space already exists it is used instead.)  Using mmap
+  segregates relatively large chunks of memory so that they can be
+  individually obtained and released from the host system. A request
+  serviced through mmap is never reused by any other request (at least
+  not directly; the system may just so happen to remap successive
+  requests to the same locations).  Segregating space in this way has
+  the benefits that: Mmapped space can always be individually released
+  back to the system, which helps keep the system level memory demands
+  of a long-lived program low.  Also, mapped memory doesn't become
+  `locked' between other chunks, as can happen with normally allocated
+  chunks, which means that even trimming via malloc_trim would not
+  release them.  However, it has the disadvantage that the space
+  cannot be reclaimed, consolidated, and then used to service later
+  requests, as happens with normal chunks.  The advantages of mmap
+  nearly always outweigh disadvantages for "large" chunks, but the
+  value of "large" may vary across systems.  The default is an
+  empirically derived value that works well in most systems. You can
+  disable mmap by setting to MAX_SIZE_T.
+
+MAX_RELEASE_CHECK_RATE   default: 4095 unless not HAVE_MMAP
+  The number of consolidated frees between checks to release
+  unused segments when freeing. When using non-contiguous segments,
+  especially with multiple mspaces, checking only for topmost space
+  doesn't always suffice to trigger trimming. To compensate for this,
+  free() will, with a period of MAX_RELEASE_CHECK_RATE (or the
+  current number of segments, if greater) try to release unused
+  segments to the OS when freeing chunks that result in
+  consolidation. The best value for this parameter is a compromise
+  between slowing down frees with relatively costly checks that
+  rarely trigger versus holding on to unused memory. To effectively
+  disable, set to MAX_SIZE_T. This may lead to a very slight speed
+  improvement at the expense of carrying around more memory.
+*/
+
+/* Version identifier to allow people to support multiple versions */
+#ifndef DLMALLOC_VERSION
+#define DLMALLOC_VERSION 20804
+#endif /* DLMALLOC_VERSION */
+
+#ifndef WIN32
+#ifdef _WIN32
+#define WIN32 1
+#endif  /* _WIN32 */
+#ifdef _WIN32_WCE
+#define LACKS_FCNTL_H
+#define WIN32 1
+#endif /* _WIN32_WCE */
+#endif  /* WIN32 */
+#ifdef WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <tchar.h>
+#define HAVE_MMAP 1
+#define HAVE_MORECORE 0
+#define LACKS_UNISTD_H
+#define LACKS_SYS_PARAM_H
+#define LACKS_SYS_MMAN_H
+#define LACKS_STRING_H
+#define LACKS_STRINGS_H
+#define LACKS_SYS_TYPES_H
+#define LACKS_ERRNO_H
+#ifndef MALLOC_FAILURE_ACTION
+#define MALLOC_FAILURE_ACTION
+#endif /* MALLOC_FAILURE_ACTION */
+#ifdef _WIN32_WCE /* WINCE reportedly does not clear */
+#define MMAP_CLEARS 0
+#else
+#define MMAP_CLEARS 1
+#endif /* _WIN32_WCE */
+#endif  /* WIN32 */
+
+#if defined(DARWIN) || defined(_DARWIN)
+/* Mac OSX docs advise not to use sbrk; it seems better to use mmap */
+#ifndef HAVE_MORECORE
+#define HAVE_MORECORE 0
+#define HAVE_MMAP 1
+/* OSX allocators provide 16 byte alignment */
+#ifndef MALLOC_ALIGNMENT
+#define MALLOC_ALIGNMENT ((size_t)16U)
+#endif
+#endif  /* HAVE_MORECORE */
+#endif  /* DARWIN */
+
+#ifndef LACKS_SYS_TYPES_H
+#include <sys/types.h>  /* For size_t */
+#endif  /* LACKS_SYS_TYPES_H */
+
+#if (defined(__GNUC__) && ((defined(__i386__) || defined(__x86_64__)))) || (defined(_MSC_VER) && _MSC_VER>=1310)
+#define SPIN_LOCKS_AVAILABLE 1
+#else
+#define SPIN_LOCKS_AVAILABLE 0
+#endif
+
+/* The maximum possible size_t value has all bits set */
+#define MAX_SIZE_T           (~(size_t)0)
+
+#ifndef ONLY_MSPACES
+#define ONLY_MSPACES 0     /* define to a value */
+#else
+#define ONLY_MSPACES 1
+#endif  /* ONLY_MSPACES */
+#ifndef MSPACES
+#if ONLY_MSPACES
+#define MSPACES 1
+#else   /* ONLY_MSPACES */
+#define MSPACES 0
+#endif  /* ONLY_MSPACES */
+#endif  /* MSPACES */
+#ifndef MALLOC_ALIGNMENT
+#define MALLOC_ALIGNMENT ((size_t)8U)
+#endif  /* MALLOC_ALIGNMENT */
+#ifndef FOOTERS
+#define FOOTERS 0
+#endif  /* FOOTERS */
+#ifndef ABORT
+#define ABORT  abort()
+#endif  /* ABORT */
+#ifndef ABORT_ON_ASSERT_FAILURE
+#define ABORT_ON_ASSERT_FAILURE 1
+#endif  /* ABORT_ON_ASSERT_FAILURE */
+#ifndef PROCEED_ON_ERROR
+#define PROCEED_ON_ERROR 0
+#endif  /* PROCEED_ON_ERROR */
+#ifndef USE_LOCKS
+#define USE_LOCKS 0
+#endif  /* USE_LOCKS */
+#ifndef USE_SPIN_LOCKS
+#if USE_LOCKS && SPIN_LOCKS_AVAILABLE
+#define USE_SPIN_LOCKS 1
+#else
+#define USE_SPIN_LOCKS 0
+#endif /* USE_LOCKS && SPIN_LOCKS_AVAILABLE. */
+#endif /* USE_SPIN_LOCKS */
+#ifndef INSECURE
+#define INSECURE 0
+#endif  /* INSECURE */
+#ifndef HAVE_MMAP
+#define HAVE_MMAP 1
+#endif  /* HAVE_MMAP */
+#ifndef MMAP_CLEARS
+#define MMAP_CLEARS 1
+#endif  /* MMAP_CLEARS */
+#ifndef HAVE_MREMAP
+#ifdef linux
+#define HAVE_MREMAP 1
+#else   /* linux */
+#define HAVE_MREMAP 0
+#endif  /* linux */
+#endif  /* HAVE_MREMAP */
+#ifndef MALLOC_FAILURE_ACTION
+#define MALLOC_FAILURE_ACTION  errno = ENOMEM;
+#endif  /* MALLOC_FAILURE_ACTION */
+#ifndef HAVE_MORECORE
+#if ONLY_MSPACES
+#define HAVE_MORECORE 0
+#else   /* ONLY_MSPACES */
+#define HAVE_MORECORE 1
+#endif  /* ONLY_MSPACES */
+#endif  /* HAVE_MORECORE */
+#if !HAVE_MORECORE
+#define MORECORE_CONTIGUOUS 0
+#else   /* !HAVE_MORECORE */
+#define MORECORE_DEFAULT sbrk
+#ifndef MORECORE_CONTIGUOUS
+#define MORECORE_CONTIGUOUS 1
+#endif  /* MORECORE_CONTIGUOUS */
+#endif  /* HAVE_MORECORE */
+#ifndef DEFAULT_GRANULARITY
+#if (MORECORE_CONTIGUOUS || defined(WIN32))
+#define DEFAULT_GRANULARITY (0)  /* 0 means to compute in init_mparams */
+#else   /* MORECORE_CONTIGUOUS */
+#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U)
+#endif  /* MORECORE_CONTIGUOUS */
+#endif  /* DEFAULT_GRANULARITY */
+#ifndef DEFAULT_TRIM_THRESHOLD
+#ifndef MORECORE_CANNOT_TRIM
+#define DEFAULT_TRIM_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U)
+#else   /* MORECORE_CANNOT_TRIM */
+#define DEFAULT_TRIM_THRESHOLD MAX_SIZE_T
+#endif  /* MORECORE_CANNOT_TRIM */
+#endif  /* DEFAULT_TRIM_THRESHOLD */
+#ifndef DEFAULT_MMAP_THRESHOLD
+#if HAVE_MMAP
+#define DEFAULT_MMAP_THRESHOLD ((size_t)256U * (size_t)1024U)
+#else   /* HAVE_MMAP */
+#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T
+#endif  /* HAVE_MMAP */
+#endif  /* DEFAULT_MMAP_THRESHOLD */
+#ifndef MAX_RELEASE_CHECK_RATE
+#if HAVE_MMAP
+#define MAX_RELEASE_CHECK_RATE 4095
+#else
+#define MAX_RELEASE_CHECK_RATE MAX_SIZE_T
+#endif /* HAVE_MMAP */
+#endif /* MAX_RELEASE_CHECK_RATE */
+#ifndef USE_BUILTIN_FFS
+#define USE_BUILTIN_FFS 0
+#endif  /* USE_BUILTIN_FFS */
+#ifndef USE_DEV_RANDOM
+#define USE_DEV_RANDOM 0
+#endif  /* USE_DEV_RANDOM */
+#ifndef NO_MALLINFO
+#define NO_MALLINFO 0
+#endif  /* NO_MALLINFO */
+#ifndef MALLINFO_FIELD_TYPE
+#define MALLINFO_FIELD_TYPE size_t
+#endif  /* MALLINFO_FIELD_TYPE */
+#ifndef NO_SEGMENT_TRAVERSAL
+#define NO_SEGMENT_TRAVERSAL 0
+#endif /* NO_SEGMENT_TRAVERSAL */
+
+/*
+  mallopt tuning options.  SVID/XPG defines four standard parameter
+  numbers for mallopt, normally defined in malloc.h.  None of these
+  are used in this malloc, so setting them has no effect. But this
+  malloc does support the following options.
+*/
+
+#define M_TRIM_THRESHOLD     (-1)
+#define M_GRANULARITY        (-2)
+#define M_MMAP_THRESHOLD     (-3)
+
+/* ------------------------ Mallinfo declarations ------------------------ */
+
+#if !NO_MALLINFO
+/*
+  This version of malloc supports the standard SVID/XPG mallinfo
+  routine that returns a struct containing usage properties and
+  statistics. It should work on any system that has a
+  /usr/include/malloc.h defining struct mallinfo.  The main
+  declaration needed is the mallinfo struct that is returned (by-copy)
+  by mallinfo().  The mallinfo struct contains a number of fields that
+  are not even meaningful in this version of malloc.  These fields
+  are instead filled by mallinfo() with other numbers that might be of
+  interest.
+
+  HAVE_USR_INCLUDE_MALLOC_H should be set if you have a
+  /usr/include/malloc.h file that includes a declaration of struct
+  mallinfo.  If so, it is included; else a compliant version is
+  declared below.  These must be precisely the same for mallinfo() to
+  work.  The original SVID version of this struct, defined on most
+  systems with mallinfo, declares all fields as ints. But some others
+  define them as unsigned long. If your system defines the fields using a
+  type of different width than listed here, you MUST #include your
+  system version and #define HAVE_USR_INCLUDE_MALLOC_H.
+*/
+
+/* #define HAVE_USR_INCLUDE_MALLOC_H */
+
+#ifdef HAVE_USR_INCLUDE_MALLOC_H
+#include "/usr/include/malloc.h"
+#else /* HAVE_USR_INCLUDE_MALLOC_H */
+#ifndef STRUCT_MALLINFO_DECLARED
+#define STRUCT_MALLINFO_DECLARED 1
+struct mallinfo {
+  MALLINFO_FIELD_TYPE arena;    /* non-mmapped space allocated from system */
+  MALLINFO_FIELD_TYPE ordblks;  /* number of free chunks */
+  MALLINFO_FIELD_TYPE smblks;   /* always 0 */
+  MALLINFO_FIELD_TYPE hblks;    /* always 0 */
+  MALLINFO_FIELD_TYPE hblkhd;   /* space in mmapped regions */
+  MALLINFO_FIELD_TYPE usmblks;  /* maximum total allocated space */
+  MALLINFO_FIELD_TYPE fsmblks;  /* always 0 */
+  MALLINFO_FIELD_TYPE uordblks; /* total allocated space */
+  MALLINFO_FIELD_TYPE fordblks; /* total free space */
+  MALLINFO_FIELD_TYPE keepcost; /* releasable (via malloc_trim) space */
+};
+#endif /* STRUCT_MALLINFO_DECLARED */
+#endif /* HAVE_USR_INCLUDE_MALLOC_H */
+#endif /* NO_MALLINFO */
+
+/*
+  Try to persuade compilers to inline. The most critical functions for
+  inlining are defined as macros, so these aren't used for them.
+*/
+
+#ifndef FORCEINLINE
+  #if defined(__GNUC__)
+#define FORCEINLINE __inline __attribute__ ((always_inline))
+  #elif defined(_MSC_VER)
+    #define FORCEINLINE __forceinline
+  #endif
+#endif
+#ifndef NOINLINE
+  #if defined(__GNUC__)
+    #define NOINLINE __attribute__ ((noinline))
+  #elif defined(_MSC_VER)
+    #define NOINLINE __declspec(noinline)
+  #else
+    #define NOINLINE
+  #endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#ifndef FORCEINLINE
+ #define FORCEINLINE inline
+#endif
+#endif /* __cplusplus */
+#ifndef FORCEINLINE
+ #define FORCEINLINE
+#endif
+
+#if !ONLY_MSPACES
+
+/* ------------------- Declarations of public routines ------------------- */
+
+#ifndef USE_DL_PREFIX
+#define dlcalloc               calloc
+#define dlfree                 free
+#define dlmalloc               malloc
+#define dlmemalign             memalign
+#define dlrealloc              realloc
+#define dlvalloc               valloc
+#define dlpvalloc              pvalloc
+#define dlmallinfo             mallinfo
+#define dlmallopt              mallopt
+#define dlmalloc_trim          malloc_trim
+#define dlmalloc_stats         malloc_stats
+#define dlmalloc_usable_size   malloc_usable_size
+#define dlmalloc_footprint     malloc_footprint
+#define dlmalloc_max_footprint malloc_max_footprint
+#define dlindependent_calloc   independent_calloc
+#define dlindependent_comalloc independent_comalloc
+#endif /* USE_DL_PREFIX */
+
+
+/*
+  malloc(size_t n)
+  Returns a pointer to a newly allocated chunk of at least n bytes, or
+  null if no space is available, in which case errno is set to ENOMEM
+  on ANSI C systems.
+
+  If n is zero, malloc returns a minimum-sized chunk. (The minimum
+  size is 16 bytes on most 32bit systems, and 32 bytes on 64bit
+  systems.)  Note that size_t is an unsigned type, so calls with
+  arguments that would be negative if signed are interpreted as
+  requests for huge amounts of space, which will often fail. The
+  maximum supported value of n differs across systems, but is in all
+  cases less than the maximum representable value of a size_t.
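+
+  A minimal usage sketch (illustrative only; handle_oom is a hypothetical
+  error handler):
+
+    void* p = dlmalloc(1024);
+    if (p == 0)
+      handle_oom();      // errno is ENOMEM on ANSI C systems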
+*/
+void* dlmalloc(size_t);
+
+/*
+  free(void* p)
+  Releases the chunk of memory pointed to by p, which had previously been
+  allocated using malloc or a related routine such as realloc.
+  It has no effect if p is null. If p was not malloced or already
+  freed, free(p) will by default cause the current program to abort.
+*/
+void  dlfree(void*);
+
+/*
+  calloc(size_t n_elements, size_t element_size);
+  Returns a pointer to n_elements * element_size bytes, with all locations
+  set to zero.
+*/
+void* dlcalloc(size_t, size_t);
+
+/*
+  realloc(void* p, size_t n)
+  Returns a pointer to a chunk of size n that contains the same data
+  as does chunk p up to the minimum of (n, p's size) bytes, or null
+  if no space is available.
+
+  The returned pointer may or may not be the same as p. The algorithm
+  prefers extending p in most cases when possible, otherwise it
+  employs the equivalent of a malloc-copy-free sequence.
+
+  If p is null, realloc is equivalent to malloc.
+
+  If space is not available, realloc returns null, errno is set (if on
+  ANSI) and p is NOT freed.
+
+  If n is for fewer bytes than already held by p, the newly unused
+  space is lopped off and freed if possible.  realloc with a size
+  argument of zero (re)allocates a minimum-sized chunk.
+
+  The old unix realloc convention of allowing the last-free'd chunk
+  to be used as an argument to realloc is not supported.
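+
+  A sketch of the usual safe-growth pattern (illustrative only):
+
+    void* q = dlrealloc(p, new_size);
+    if (q == 0) {
+      // p is still valid here and must eventually be freed
+    } else {
+      p = q;
+    }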
+*/
+
+void* dlrealloc(void*, size_t);
+
+/*
+  memalign(size_t alignment, size_t n);
+  Returns a pointer to a newly allocated chunk of n bytes, aligned
+  in accord with the alignment argument.
+
+  The alignment argument should be a power of two. If the argument is
+  not a power of two, the nearest greater power is used.
+  8-byte alignment is guaranteed by normal malloc calls, so don't
+  bother calling memalign with an argument of 8 or less.
+
+  Overreliance on memalign is a sure way to fragment space.
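+
+  For example (an illustrative sketch; my_struct is hypothetical), to get
+  a 64-byte-aligned block for a cache-line-sensitive structure:
+
+    void* p = dlmemalign(64, sizeof(struct my_struct));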
+*/
+void* dlmemalign(size_t, size_t);
+
+/*
+  valloc(size_t n);
+  Equivalent to memalign(pagesize, n), where pagesize is the page
+  size of the system. If the pagesize is unknown, 4096 is used.
+*/
+void* dlvalloc(size_t);
+
+/*
+  mallopt(int parameter_number, int parameter_value)
+  Sets tunable parameters. The format is to provide a
+  (parameter-number, parameter-value) pair.  mallopt then sets the
+  corresponding parameter to the argument value if it can (i.e., so
+  long as the value is meaningful), and returns 1 if successful else
+  0.  To work around the fact that mallopt is specified to use int,
+  not size_t parameters, the value -1 is specially treated as the
+  maximum unsigned size_t value.
+
+  SVID/XPG/ANSI defines four standard param numbers for mallopt,
+  normally defined in malloc.h.  None of these are used in this malloc,
+  so setting them has no effect. But this malloc also supports other
+  options in mallopt. See below for details.  Briefly, supported
+  parameters are as follows (listed defaults are for "typical"
+  configurations).
+
+  Symbol            param #  default    allowed param values
+  M_TRIM_THRESHOLD     -1   2*1024*1024   any   (-1 disables)
+  M_GRANULARITY        -2     page size   any power of 2 >= page size
+  M_MMAP_THRESHOLD     -3      256*1024   any   (or 0 if no MMAP support)
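+
+  A sketch of typical tuning calls (values are illustrative):
+
+    dlmallopt(M_TRIM_THRESHOLD, 1024*1024);  // trim above 1MB of slack
+    dlmallopt(M_MMAP_THRESHOLD, 512*1024);   // mmap requests of 512K and up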
+*/
+int dlmallopt(int, int);
+
+/*
+  malloc_footprint();
+  Returns the number of bytes obtained from the system.  The total
+  number of bytes allocated by malloc, realloc etc., is less than this
+  value. Unlike mallinfo, this function returns only a precomputed
+  result, so can be called frequently to monitor memory consumption.
+  Even if locks are otherwise defined, this function does not use them,
+  so results might not be up to date.
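+
+  For example (illustrative), to log consumption periodically:
+
+    fprintf(stderr, "footprint: %lu bytes\n",
+            (unsigned long)dlmalloc_footprint());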
+*/
+size_t dlmalloc_footprint(void);
+
+/*
+  malloc_max_footprint();
+  Returns the maximum number of bytes obtained from the system. This
+  value will be greater than current footprint if deallocated space
+  has been reclaimed by the system. The peak number of bytes allocated
+  by malloc, realloc etc., is less than this value. Unlike mallinfo,
+  this function returns only a precomputed result, so can be called
+  frequently to monitor memory consumption.  Even if locks are
+  otherwise defined, this function does not use them, so results might
+  not be up to date.
+*/
+size_t dlmalloc_max_footprint(void);
+
+#if !NO_MALLINFO
+/*
+  mallinfo()
+  Returns (by copy) a struct containing various summary statistics:
+
+  arena:     current total non-mmapped bytes allocated from system
+  ordblks:   the number of free chunks
+  smblks:    always zero.
+  hblks:     current number of mmapped regions
+  hblkhd:    total bytes held in mmapped regions
+  usmblks:   the maximum total allocated space. This will be greater
+                than current total if trimming has occurred.
+  fsmblks:   always zero
+  uordblks:  current total allocated space (normal or mmapped)
+  fordblks:  total free space
+  keepcost:  the maximum number of bytes that could ideally be released
+               back to system via malloc_trim. ("ideally" means that
+               it ignores page restrictions etc.)
+
+  Because these fields are ints, but internal bookkeeping may
+  be kept as longs, the reported values may wrap around zero and
+  thus be inaccurate.
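+
+  A sketch of reading a couple of the fields (illustrative only):
+
+    struct mallinfo mi = dlmallinfo();
+    fprintf(stderr, "in use: %lu, free: %lu\n",
+            (unsigned long)mi.uordblks, (unsigned long)mi.fordblks);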
+*/
+struct mallinfo dlmallinfo(void);
+#endif /* NO_MALLINFO */
+
+/*
+  independent_calloc(size_t n_elements, size_t element_size, void* chunks[]);
+
+  independent_calloc is similar to calloc, but instead of returning a
+  single cleared space, it returns an array of pointers to n_elements
+  independent elements that can hold contents of size elem_size, each
+  of which starts out cleared, and can be independently freed,
+  realloc'ed etc. The elements are guaranteed to be adjacently
+  allocated (this is not guaranteed to occur with multiple callocs or
+  mallocs), which may also improve cache locality in some
+  applications.
+
+  The "chunks" argument is optional (i.e., may be null, which is
+  probably the most typical usage). If it is null, the returned array
+  is itself dynamically allocated and should also be freed when it is
+  no longer needed. Otherwise, the chunks array must be of at least
+  n_elements in length. It is filled in with the pointers to the
+  chunks.
+
+  In either case, independent_calloc returns this pointer array, or
+  null if the allocation failed.  If n_elements is zero and "chunks"
+  is null, it returns a chunk representing an array with zero elements
+  (which should be freed if not wanted).
+
+  Each element must be individually freed when it is no longer
+  needed. If you'd like to instead be able to free all at once, you
+  should instead use regular calloc and assign pointers into this
+  space to represent elements.  (In this case though, you cannot
+  independently free elements.)
+
+  independent_calloc simplifies and speeds up implementations of many
+  kinds of pools.  It may also be useful when constructing large data
+  structures that initially have a fixed number of fixed-sized nodes,
+  but the number is not known at compile time, and some of the nodes
+  may later need to be freed. For example:
+
+  struct Node { int item; struct Node* next; };
+
+  struct Node* build_list() {
+    struct Node** pool;
+    struct Node* first;
+    int i;
+    int n = read_number_of_nodes_needed();
+    if (n <= 0) return 0;
+    pool = (struct Node**)(independent_calloc(n, sizeof(struct Node), 0));
+    if (pool == 0) die();
+    // organize into a linked list...
+    first = pool[0];
+    for (i = 0; i < n-1; ++i)
+      pool[i]->next = pool[i+1];
+    free(pool);     // Can now free the array (or not, if it is needed later)
+    return first;
+  }
+*/
+void** dlindependent_calloc(size_t, size_t, void**);
+
+/*
+  independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]);
+
+  independent_comalloc allocates, all at once, a set of n_elements
+  chunks with sizes indicated in the "sizes" array.    It returns
+  an array of pointers to these elements, each of which can be
+  independently freed, realloc'ed etc. The elements are guaranteed to
+  be adjacently allocated (this is not guaranteed to occur with
+  multiple callocs or mallocs), which may also improve cache locality
+  in some applications.
+
+  The "chunks" argument is optional (i.e., may be null). If it is null
+  the returned array is itself dynamically allocated and should also
+  be freed when it is no longer needed. Otherwise, the chunks array
+  must be of at least n_elements in length. It is filled in with the
+  pointers to the chunks.
+
+  In either case, independent_comalloc returns this pointer array, or
+  null if the allocation failed.  If n_elements is zero and chunks is
+  null, it returns a chunk representing an array with zero elements
+  (which should be freed if not wanted).
+
+  Each element must be individually freed when it is no longer
+  needed. If you'd like to instead be able to free all at once, you
+  should instead use a single regular malloc, and assign pointers at
+  particular offsets in the aggregate space. (In this case though, you
+  cannot independently free elements.)
+
+  independent_comalloc differs from independent_calloc in that each
+  element may have a different size, and also that it does not
+  automatically clear elements.
+
+  independent_comalloc can be used to speed up allocation in cases
+  where several structs or objects must always be allocated at the
+  same time.  For example:
+
+  struct Head { ... };
+  struct Foot { ... };
+
+  void send_message(char* msg) {
+    int msglen = strlen(msg);
+    size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) };
+    void* chunks[3];
+    if (independent_comalloc(3, sizes, chunks) == 0)
+      die();
+    struct Head* head = (struct Head*)(chunks[0]);
+    char*        body = (char*)(chunks[1]);
+    struct Foot* foot = (struct Foot*)(chunks[2]);
+    // ...
+  }
+
+  In general though, independent_comalloc is worth using only for
+  larger values of n_elements. For small values, you probably won't
+  detect enough difference from series of malloc calls to bother.
+
+  Overuse of independent_comalloc can increase overall memory usage,
+  since it cannot reuse existing noncontiguous small chunks that
+  might be available for some of the elements.
+*/
+void** dlindependent_comalloc(size_t, size_t*, void**);
+
+
+/*
+  pvalloc(size_t n);
+  Equivalent to valloc(minimum-page-that-holds(n)), that is,
+  round up n to nearest pagesize.
+ */
+void*  dlpvalloc(size_t);
+
+/*
+  malloc_trim(size_t pad);
+
+  If possible, gives memory back to the system (via negative arguments
+  to sbrk) if there is unused memory at the `high' end of the malloc
+  pool or in unused MMAP segments. You can call this after freeing
+  large blocks of memory to potentially reduce the system-level memory
+  requirements of a program. However, it cannot guarantee to reduce
+  memory. Under some allocation patterns, some large free blocks of
+  memory will be locked between two used chunks, so they cannot be
+  given back to the system.
+
+  The `pad' argument to malloc_trim represents the amount of free
+  trailing space to leave untrimmed. If this argument is zero, only
+  the minimum amount of memory to maintain internal data structures
+  will be left. Non-zero arguments can be supplied to maintain enough
+  trailing space to service future expected allocations without having
+  to re-obtain memory from the system.
+
+  Malloc_trim returns 1 if it actually released any memory, else 0.
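+
+  For example (illustrative), after freeing a large working set:
+
+    int released = dlmalloc_trim(64*1024);  // keep 64K of trailing slack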
+*/
+int  dlmalloc_trim(size_t);
+
+/*
+  malloc_stats();
+  Prints on stderr the amount of space obtained from the system (both
+  via sbrk and mmap), the maximum amount (which may be more than
+  current if malloc_trim and/or munmap got called), and the current
+  number of bytes allocated via malloc (or realloc, etc) but not yet
+  freed. Note that this is the number of bytes allocated, not the
+  number requested. It will be larger than the number requested
+  because of alignment and bookkeeping overhead. Because it includes
+  alignment wastage as being in use, this figure may be greater than
+  zero even when no user-level chunks are allocated.
+
+  The reported current and maximum system memory can be inaccurate if
+  a program makes other calls to system memory allocation functions
+  (normally sbrk) outside of malloc.
+
+  malloc_stats prints only the most commonly interesting statistics.
+  More information can be obtained by calling mallinfo.
+*/
+void  dlmalloc_stats(void);
+
+#endif /* ONLY_MSPACES */
+
+/*
+  malloc_usable_size(void* p);
+
+  Returns the number of bytes you can actually use in
+  an allocated chunk, which may be more than you requested (although
+  often not) due to alignment and minimum size constraints.
+  You can use this many bytes without worrying about
+  overwriting other allocated objects. This is not a particularly great
+  programming practice. malloc_usable_size can be more useful in
+  debugging and assertions, for example:
+
+  p = malloc(n);
+  assert(malloc_usable_size(p) >= 256);
+*/
+size_t dlmalloc_usable_size(void*);
+
+
+#if MSPACES
+
+/*
+  mspace is an opaque type representing an independent
+  region of space that supports mspace_malloc, etc.
+*/
+typedef void* mspace;
+
+/*
+  create_mspace creates and returns a new independent space with the
+  given initial capacity, or, if 0, the default granularity size.  It
+  returns null if there is no system memory available to create the
+  space.  If argument locked is non-zero, the space uses a separate
+  lock to control access. The capacity of the space will grow
+  dynamically as needed to service mspace_malloc requests.  You can
+  control the sizes of incremental increases of this space by
+  compiling with a different DEFAULT_GRANULARITY or dynamically
+  setting with mallopt(M_GRANULARITY, value).
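+
+  A minimal lifecycle sketch (illustrative only):
+
+    mspace ms = create_mspace(0, 0);   // default capacity, no locking
+    void* p = mspace_malloc(ms, 128);
+    mspace_free(ms, p);
+    destroy_mspace(ms);                // returns all space to the system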
+*/
+mspace create_mspace(size_t capacity, int locked);
+
+/*
+  destroy_mspace destroys the given space, and attempts to return all
+  of its memory back to the system, returning the total number of
+  bytes freed. After destruction, the results of access to all memory
+  used by the space become undefined.
+*/
+size_t destroy_mspace(mspace msp);
+
+/*
+  create_mspace_with_base uses the memory supplied as the initial base
+  of a new mspace. Part (less than 128*sizeof(size_t) bytes) of this
+  space is used for bookkeeping, so the capacity must be at least this
+  large. (Otherwise 0 is returned.) When this initial space is
+  exhausted, additional memory will be obtained from the system.
+  Destroying this space will deallocate all additionally allocated
+  space (if possible) but not the initial base.
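+
+  For example (an illustrative sketch using a static arena):
+
+    static char arena[1 << 20];        // 1MB caller-supplied base
+    mspace ms = create_mspace_with_base(arena, sizeof(arena), 0);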
+*/
+mspace create_mspace_with_base(void* base, size_t capacity, int locked);
+
+/*
+  mspace_track_large_chunks controls whether requests for large chunks
+  are allocated in their own untracked mmapped regions, separate from
+  others in this mspace. By default large chunks are not tracked,
+  which reduces fragmentation. However, such chunks are not
+  necessarily released to the system upon destroy_mspace.  Enabling
+  tracking by setting to true may increase fragmentation, but avoids
+  leakage when relying on destroy_mspace to release all memory
+  allocated using this space.  The function returns the previous
+  setting.
+*/
+int mspace_track_large_chunks(mspace msp, int enable);
+
+
+/*
+  mspace_malloc behaves as malloc, but operates within
+  the given space.
+*/
+void* mspace_malloc(mspace msp, size_t bytes);
+
+/*
+  mspace_free behaves as free, but operates within
+  the given space.
+
+  If compiled with FOOTERS==1, mspace_free is not actually needed.
+  free may be called instead of mspace_free because freed chunks from
+  any space are handled by their originating spaces.
+*/
+void mspace_free(mspace msp, void* mem);
+
+/*
+  mspace_realloc behaves as realloc, but operates within
+  the given space.
+
+  If compiled with FOOTERS==1, mspace_realloc is not actually
+  needed.  realloc may be called instead of mspace_realloc because
+  realloced chunks from any space are handled by their originating
+  spaces.
+*/
+void* mspace_realloc(mspace msp, void* mem, size_t newsize);
+
+/*
+  mspace_calloc behaves as calloc, but operates within
+  the given space.
+*/
+void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size);
+
+/*
+  mspace_memalign behaves as memalign, but operates within
+  the given space.
+*/
+void* mspace_memalign(mspace msp, size_t alignment, size_t bytes);
+
+/*
+  mspace_independent_calloc behaves as independent_calloc, but
+  operates within the given space.
+*/
+void** mspace_independent_calloc(mspace msp, size_t n_elements,
+                                 size_t elem_size, void* chunks[]);
+
+/*
+  mspace_independent_comalloc behaves as independent_comalloc, but
+  operates within the given space.
+*/
+void** mspace_independent_comalloc(mspace msp, size_t n_elements,
+                                   size_t sizes[], void* chunks[]);
+
+/*
+  mspace_footprint() returns the number of bytes obtained from the
+  system for this space.
+*/
+size_t mspace_footprint(mspace msp);
+
+/*
+  mspace_max_footprint() returns the peak number of bytes obtained from the
+  system for this space.
+*/
+size_t mspace_max_footprint(mspace msp);
+
+
+#if !NO_MALLINFO
+/*
+  mspace_mallinfo behaves as mallinfo, but reports properties of
+  the given space.
+*/
+struct mallinfo mspace_mallinfo(mspace msp);
+#endif /* NO_MALLINFO */
+
+/*
+  mspace_usable_size(void* p) behaves the same as malloc_usable_size.
+*/
+size_t mspace_usable_size(void* mem);
+
+/*
+  mspace_malloc_stats behaves as malloc_stats, but reports
+  properties of the given space.
+*/
+void mspace_malloc_stats(mspace msp);
+
+/*
+  mspace_trim behaves as malloc_trim, but
+  operates within the given space.
+*/
+int mspace_trim(mspace msp, size_t pad);
+
+/*
+  An alias for mallopt.
+*/
+int mspace_mallopt(int, int);
+
+#endif /* MSPACES */
+
+#ifdef __cplusplus
+}  /* end of extern "C" */
+#endif /* __cplusplus */
+
+/*
+  ========================================================================
+  To make a fully customizable malloc.h header file, cut everything
+  above this line, put into file malloc.h, edit to suit, and #include it
+  on the next line, as well as in programs that use this malloc.
+  ========================================================================
+*/
+
+/* #include "malloc.h" */
+
+/*------------------------------ internal #includes ---------------------- */
+
+#ifdef WIN32
+#pragma warning( disable : 4146 ) /* no "unsigned" warnings */
+#endif /* WIN32 */
+
+#include <stdio.h>       /* for printing in malloc_stats */
+
+#ifndef LACKS_ERRNO_H
+#include <errno.h>       /* for MALLOC_FAILURE_ACTION */
+#endif /* LACKS_ERRNO_H */
+#if FOOTERS || DEBUG
+#include <time.h>        /* for magic initialization */
+#endif /* FOOTERS */
+#ifndef LACKS_STDLIB_H
+#include <stdlib.h>      /* for abort() */
+#endif /* LACKS_STDLIB_H */
+#ifdef DEBUG
+#if ABORT_ON_ASSERT_FAILURE
+#undef assert
+#define assert(x) if(!(x)) ABORT
+#else /* ABORT_ON_ASSERT_FAILURE */
+#include <assert.h>
+#endif /* ABORT_ON_ASSERT_FAILURE */
+#else  /* DEBUG */
+#ifndef assert
+#define assert(x)
+#endif
+#define DEBUG 0
+#endif /* DEBUG */
+#ifndef LACKS_STRING_H
+#include <string.h>      /* for memset etc */
+#endif  /* LACKS_STRING_H */
+#if USE_BUILTIN_FFS
+#ifndef LACKS_STRINGS_H
+#include <strings.h>     /* for ffs */
+#endif /* LACKS_STRINGS_H */
+#endif /* USE_BUILTIN_FFS */
+#if HAVE_MMAP
+#ifndef LACKS_SYS_MMAN_H
+/* On some versions of linux, mremap decl in mman.h needs __USE_GNU set */
+#if (defined(linux) && !defined(__USE_GNU))
+#define __USE_GNU 1
+#include <sys/mman.h>    /* for mmap */
+#undef __USE_GNU
+#else
+#include <sys/mman.h>    /* for mmap */
+#endif /* linux */
+#endif /* LACKS_SYS_MMAN_H */
+#ifndef LACKS_FCNTL_H
+#include <fcntl.h>
+#endif /* LACKS_FCNTL_H */
+#endif /* HAVE_MMAP */
+#ifndef LACKS_UNISTD_H
+#include <unistd.h>     /* for sbrk, sysconf */
+#else /* LACKS_UNISTD_H */
+#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
+extern void*     sbrk(ptrdiff_t);
+#endif /* FreeBSD etc */
+#endif /* LACKS_UNISTD_H */
+
+/* Declarations for locking */
+#if USE_LOCKS
+#ifndef WIN32
+#include <pthread.h>
+#if defined (__SVR4) && defined (__sun)  /* solaris */
+#include <thread.h>
+#endif /* solaris */
+#else
+#ifndef _M_AMD64
+/* These are already defined on AMD64 builds */
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+LONG __cdecl _InterlockedCompareExchange(LONG volatile *Dest, LONG Exchange, LONG Comp);
+LONG __cdecl _InterlockedExchange(LONG volatile *Target, LONG Value);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* _M_AMD64 */
+#pragma intrinsic (_InterlockedCompareExchange)
+#pragma intrinsic (_InterlockedExchange)
+#define interlockedcompareexchange _InterlockedCompareExchange
+#define interlockedexchange _InterlockedExchange
+#endif /* Win32 */
+#endif /* USE_LOCKS */
+
+/* Declarations for bit scanning on win32 */
+#if defined(_MSC_VER) && _MSC_VER>=1300
+#ifndef BitScanForward	/* Try to avoid pulling in WinNT.h */
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+unsigned char _BitScanForward(unsigned long *index, unsigned long mask);
+unsigned char _BitScanReverse(unsigned long *index, unsigned long mask);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#define BitScanForward _BitScanForward
+#define BitScanReverse _BitScanReverse
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+#endif /* BitScanForward */
+#endif /* defined(_MSC_VER) && _MSC_VER>=1300 */
+
+#ifndef WIN32
+#ifndef malloc_getpagesize
+#  ifdef _SC_PAGESIZE         /* some SVR4 systems omit an underscore */
+#    ifndef _SC_PAGE_SIZE
+#      define _SC_PAGE_SIZE _SC_PAGESIZE
+#    endif
+#  endif
+#  ifdef _SC_PAGE_SIZE
+#    define malloc_getpagesize sysconf(_SC_PAGE_SIZE)
+#  else
+#    if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE)
+       extern size_t getpagesize();
+#      define malloc_getpagesize getpagesize()
+#    else
+#      ifdef WIN32 /* use supplied emulation of getpagesize */
+#        define malloc_getpagesize getpagesize()
+#      else
+#        ifndef LACKS_SYS_PARAM_H
+#          include <sys/param.h>
+#        endif
+#        ifdef EXEC_PAGESIZE
+#          define malloc_getpagesize EXEC_PAGESIZE
+#        else
+#          ifdef NBPG
+#            ifndef CLSIZE
+#              define malloc_getpagesize NBPG
+#            else
+#              define malloc_getpagesize (NBPG * CLSIZE)
+#            endif
+#          else
+#            ifdef NBPC
+#              define malloc_getpagesize NBPC
+#            else
+#              ifdef PAGESIZE
+#                define malloc_getpagesize PAGESIZE
+#              else /* just guess */
+#                define malloc_getpagesize ((size_t)4096U)
+#              endif
+#            endif
+#          endif
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+#endif
+
+
+
+/* ------------------- size_t and alignment properties -------------------- */
+
+/* The byte and bit size of a size_t */
+#define SIZE_T_SIZE         (sizeof(size_t))
+#define SIZE_T_BITSIZE      (sizeof(size_t) << 3)
+
+/* Some constants coerced to size_t */
+/* Annoying but necessary to avoid errors on some platforms */
+#define SIZE_T_ZERO         ((size_t)0)
+#define SIZE_T_ONE          ((size_t)1)
+#define SIZE_T_TWO          ((size_t)2)
+#define SIZE_T_FOUR         ((size_t)4)
+#define TWO_SIZE_T_SIZES    (SIZE_T_SIZE<<1)
+#define FOUR_SIZE_T_SIZES   (SIZE_T_SIZE<<2)
+#define SIX_SIZE_T_SIZES    (FOUR_SIZE_T_SIZES+TWO_SIZE_T_SIZES)
+#define HALF_MAX_SIZE_T     (MAX_SIZE_T / 2U)
+
+/* The bit mask value corresponding to MALLOC_ALIGNMENT */
+#define CHUNK_ALIGN_MASK    (MALLOC_ALIGNMENT - SIZE_T_ONE)
+
+/* True if address a has acceptable alignment */
+#define is_aligned(A)       (((size_t)((A)) & (CHUNK_ALIGN_MASK)) == 0)
+
+/* the number of bytes to offset an address to align it */
+#define align_offset(A)\
+ ((((size_t)(A) & CHUNK_ALIGN_MASK) == 0)? 0 :\
+  ((MALLOC_ALIGNMENT - ((size_t)(A) & CHUNK_ALIGN_MASK)) & CHUNK_ALIGN_MASK))
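+
+/*
+  Worked example (editor's note): with MALLOC_ALIGNMENT == 16,
+  CHUNK_ALIGN_MASK == 15, so for an address value of 0x1003
+  align_offset yields (16 - (0x1003 & 15)) & 15 == 13, and
+  0x1003 + 13 == 0x1010 is the next aligned address; for an already
+  aligned value such as 0x1010 the offset is 0.
+*/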
+
+/*
+  malloc_params holds global properties, including those that can be
+  dynamically set using mallopt. There is a single instance, mparams,
+  initialized in init_mparams. Note that the non-zeroness of "magic"
+  also serves as an initialization flag.
+*/
+typedef unsigned int flag_t;
+struct malloc_params {
+  volatile size_t magic;
+  size_t page_size;
+  size_t granularity;
+  size_t mmap_threshold;
+  size_t trim_threshold;
+  flag_t default_mflags;
+};
+
+static struct malloc_params mparams;
+
+/* Ensure mparams initialized */
+#define ensure_initialization() (void)(mparams.magic != 0 || init_mparams())
+
+/* -------------------------- MMAP preliminaries ------------------------- */
+
+/*
+   If HAVE_MORECORE or HAVE_MMAP is false, we just define calls and
+   checks to fail so the compiler optimizer can delete code rather than
+   using so many "#if"s.
+*/
+
+
+/* MORECORE and MMAP must return MFAIL on failure */
+#define MFAIL                ((void*)(MAX_SIZE_T))
+#define CMFAIL               ((char*)(MFAIL)) /* defined for convenience */
+
+#if HAVE_MMAP
+
+#ifndef WIN32
+#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
+#define MAP_ANONYMOUS        MAP_ANON
+#endif /* MAP_ANON */
+#ifdef DEFAULT_GRANULARITY_ALIGNED
+#define MMAP_IMPL mmap_aligned
+static void* lastAlignedmmap; /* Used as a hint */
+static void* mmap_aligned(void *start, size_t length, int prot, int flags, int fd, off_t offset) {
+  void* baseaddress = 0;
+  void* ptr = 0;
+  if(!start) {
+    baseaddress = lastAlignedmmap;
+    for(;;) {
+      if(baseaddress) flags|=MAP_FIXED;
+      ptr = mmap(baseaddress, length, prot, flags, fd, offset);
+      if(!ptr)
+        baseaddress = (void*)((size_t)baseaddress + mparams.granularity);
+      else if((size_t)ptr & (mparams.granularity - SIZE_T_ONE)) {
+        munmap(ptr, length);
+        baseaddress = (void*)(((size_t)ptr + mparams.granularity) & ~(mparams.granularity - SIZE_T_ONE));
+      }
+      else break;
+    }
+  }
+  else ptr = mmap(start, length, prot, flags, fd, offset);
+  if(ptr) lastAlignedmmap = (void*)((size_t) ptr + mparams.granularity);
+  return ptr;
+}
+#else
+#define MMAP_IMPL mmap
+#endif /* DEFAULT_GRANULARITY_ALIGNED */
+#define MUNMAP_DEFAULT(a, s)  munmap((a), (s))
+#define MMAP_PROT            (PROT_READ|PROT_WRITE)
+#ifdef MAP_ANONYMOUS
+#define MMAP_FLAGS           (MAP_PRIVATE|MAP_ANONYMOUS)
+#define MMAP_DEFAULT(s)       MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, -1, 0)
+#else /* MAP_ANONYMOUS */
+/*
+   Nearly all versions of mmap support MAP_ANONYMOUS, so the following
+   is unlikely to be needed, but is supplied just in case.
+*/
+#define MMAP_FLAGS           (MAP_PRIVATE)
+static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */
+#define MMAP_DEFAULT(s) ((dev_zero_fd < 0) ? \
+           (dev_zero_fd = open("/dev/zero", O_RDWR), \
+            MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) : \
+            MMAP_IMPL(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0))
+#endif /* MAP_ANONYMOUS */
+
+#define DIRECT_MMAP_DEFAULT(s) MMAP_DEFAULT(s)
+
+#else /* WIN32 */
+
+/* Win32 MMAP via VirtualAlloc */
+#ifdef DEFAULT_GRANULARITY_ALIGNED
+static void* lastWin32mmap; /* Used as a hint */
+#endif /* DEFAULT_GRANULARITY_ALIGNED */
+#ifdef ENABLE_LARGE_PAGES
+static int largepagesavailable = 1;
+#endif /* ENABLE_LARGE_PAGES */
+static FORCEINLINE void* win32mmap(size_t size) {
+  void* baseaddress = 0;
+  void* ptr = 0;
+#ifdef ENABLE_LARGE_PAGES
+  /* Note that large pages are *always* allocated on a large page boundary.
+  If however the granularity is small, don't waste a kernel call when the
+  size isn't near that of a large page */
+  if(largepagesavailable && size >= 1*1024*1024) {
+    ptr = VirtualAlloc(baseaddress, size, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
+    if(!ptr && ERROR_PRIVILEGE_NOT_HELD==GetLastError()) largepagesavailable=0;
+  }
+#endif
+  if(!ptr) {
+#ifdef DEFAULT_GRANULARITY_ALIGNED
+    /* We try to avoid overhead by speculatively reserving at aligned
+    addresses until we succeed */
+    baseaddress = lastWin32mmap;
+    for(;;) {
+      void* reserveaddr = VirtualAlloc(baseaddress, size, MEM_RESERVE, PAGE_READWRITE);
+      if(!reserveaddr)
+        baseaddress = (void*)((size_t)baseaddress + mparams.granularity);
+      else if((size_t)reserveaddr & (mparams.granularity - SIZE_T_ONE)) {
+        VirtualFree(reserveaddr, 0, MEM_RELEASE);
+        baseaddress = (void*)(((size_t)reserveaddr + mparams.granularity) & ~(mparams.granularity - SIZE_T_ONE));
+      }
+      else break;
+    }
+#endif
+    if(!ptr) ptr = VirtualAlloc(baseaddress, size, baseaddress ? MEM_COMMIT : MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
+#if DEBUG
+    if(lastWin32mmap && ptr!=lastWin32mmap) printf("Non-contiguous VirtualAlloc between %p and %p\n", ptr, lastWin32mmap);
+#endif
+#ifdef DEFAULT_GRANULARITY_ALIGNED
+    if(ptr) lastWin32mmap = (void*)((size_t) ptr + mparams.granularity);
+#endif
+  }
+#if DEBUG
+#ifdef ENABLE_LARGE_PAGES
+  printf("VirtualAlloc returns %p size %u. LargePagesAvailable=%d\n", ptr, size, largepagesavailable);
+#else
+  printf("VirtualAlloc returns %p size %u\n", ptr, size);
+#endif
+#endif
+  return (ptr != 0)? ptr: MFAIL;
+}
+
+/* For direct MMAP, use MEM_TOP_DOWN to minimize interference */
+static FORCEINLINE void* win32direct_mmap(size_t size) {
+  void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN,
+                           PAGE_READWRITE);
+  return (ptr != 0)? ptr: MFAIL;
+}
+
+/* This function supports releasing coalesced segments */
+static FORCEINLINE int win32munmap(void* ptr, size_t size) {
+  MEMORY_BASIC_INFORMATION minfo;
+  char* cptr = (char*)ptr;
+  while (size) {
+    if (VirtualQuery(cptr, &minfo, sizeof(minfo)) == 0)
+      return -1;
+    if (minfo.BaseAddress != cptr || minfo.AllocationBase != cptr ||
+        minfo.State != MEM_COMMIT || minfo.RegionSize > size)
+      return -1;
+    if (VirtualFree(cptr, 0, MEM_RELEASE) == 0)
+      return -1;
+    cptr += minfo.RegionSize;
+    size -= minfo.RegionSize;
+  }
+  return 0;
+}
+
+#define MMAP_DEFAULT(s)             win32mmap(s)
+#define MUNMAP_DEFAULT(a, s)        win32munmap((a), (s))
+#define DIRECT_MMAP_DEFAULT(s)      win32direct_mmap(s)
+#endif /* WIN32 */
+#endif /* HAVE_MMAP */
+
+#if HAVE_MREMAP
+#ifndef WIN32
+#define MREMAP_DEFAULT(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv))
+#endif /* WIN32 */
+#endif /* HAVE_MREMAP */
+
+
+/**
+ * Define CALL_MORECORE
+ */
+#if HAVE_MORECORE
+    #ifdef MORECORE
+        #define CALL_MORECORE(S)    MORECORE(S)
+    #else  /* MORECORE */
+        #define CALL_MORECORE(S)    MORECORE_DEFAULT(S)
+    #endif /* MORECORE */
+#else  /* HAVE_MORECORE */
+    #define CALL_MORECORE(S)        MFAIL
+#endif /* HAVE_MORECORE */
+
+/**
+ * Define CALL_MMAP/CALL_MUNMAP/CALL_DIRECT_MMAP
+ */
+#if HAVE_MMAP
+    #define USE_MMAP_BIT            (SIZE_T_ONE)
+
+    #ifdef MMAP
+        #define CALL_MMAP(s)        MMAP(s)
+    #else /* MMAP */
+        #define CALL_MMAP(s)        MMAP_DEFAULT(s)
+    #endif /* MMAP */
+    #ifdef MUNMAP
+        #define CALL_MUNMAP(a, s)   MUNMAP((a), (s))
+    #else /* MUNMAP */
+        #define CALL_MUNMAP(a, s)   MUNMAP_DEFAULT((a), (s))
+    #endif /* MUNMAP */
+    #ifdef DIRECT_MMAP
+        #define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s)
+    #else /* DIRECT_MMAP */
+        #define CALL_DIRECT_MMAP(s) DIRECT_MMAP_DEFAULT(s)
+    #endif /* DIRECT_MMAP */
+#else  /* HAVE_MMAP */
+    #define USE_MMAP_BIT            (SIZE_T_ZERO)
+
+    #define MMAP(s)                 MFAIL
+    #define MUNMAP(a, s)            (-1)
+    #define DIRECT_MMAP(s)          MFAIL
+    #define CALL_DIRECT_MMAP(s)     DIRECT_MMAP(s)
+    #define CALL_MMAP(s)            MMAP(s)
+    #define CALL_MUNMAP(a, s)       MUNMAP((a), (s))
+#endif /* HAVE_MMAP */
+
+/**
+ * Define CALL_MREMAP
+ */
+#if HAVE_MMAP && HAVE_MREMAP
+    #ifdef MREMAP
+        #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP((addr), (osz), (nsz), (mv))
+    #else /* MREMAP */
+        #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP_DEFAULT((addr), (osz), (nsz), (mv))
+    #endif /* MREMAP */
+#else  /* HAVE_MMAP && HAVE_MREMAP */
+    #define CALL_MREMAP(addr, osz, nsz, mv)     MFAIL
+#endif /* HAVE_MMAP && HAVE_MREMAP */
+
+/* mstate bit set if contiguous morecore disabled or failed */
+#define USE_NONCONTIGUOUS_BIT (4U)
+
+/* segment bit set in create_mspace_with_base */
+#define EXTERN_BIT            (8U)
+
+
+/* --------------------------- Lock preliminaries ------------------------ */
+
+/*
+  When locks are defined, there is one global lock, plus
+  one per-mspace lock.
+
+  The global lock ensures that mparams.magic and other unique
+  mparams values are initialized only once. It also protects
+  sequences of calls to MORECORE.  In many cases sys_alloc requires
+  two calls that should not be interleaved with calls by other
+  threads.  This does not protect against direct calls to MORECORE
+  by other threads not using this lock, so there is still code to
+  cope as best we can with interference.
+
+  Per-mspace locks surround calls to malloc, free, etc.  To enable use
+  in layered extensions, per-mspace locks are reentrant.
+
+  Because lock-protected regions generally have bounded times, it is
+  OK to use the supplied simple spinlocks in the custom versions for
+  x86. Spinlocks are likely to improve performance for lightly
+  contended applications, but worsen performance under heavy
+  contention.
+
+  If USE_LOCKS is > 1, the definitions of lock routines here are
+  bypassed, in which case you will need to define the type MLOCK_T,
+  and at least INITIAL_LOCK, ACQUIRE_LOCK, RELEASE_LOCK and possibly
+  TRY_LOCK (which is not used in this malloc, but commonly needed in
+  extensions.)  You must also declare a
+    static MLOCK_T malloc_global_mutex = { initialization values };.
+
+*/
+
+#if USE_LOCKS == 1
+
+#if USE_SPIN_LOCKS && SPIN_LOCKS_AVAILABLE
+#ifndef WIN32
+
+/* Custom pthread-style spin locks on x86 and x64 for gcc */
+struct pthread_mlock_t {
+  volatile unsigned int l;
+  char cachelinepadding[64];
+  unsigned int c;
+  pthread_t threadid;
+};
+#define MLOCK_T               struct pthread_mlock_t
+#define CURRENT_THREAD        pthread_self()
+#define INITIAL_LOCK(sl)      ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0)
+#define ACQUIRE_LOCK(sl)      pthread_acquire_lock(sl)
+#define RELEASE_LOCK(sl)      pthread_release_lock(sl)
+#define TRY_LOCK(sl)          pthread_try_lock(sl)
+#define SPINS_PER_YIELD       63
+
+static MLOCK_T malloc_global_mutex = { 0, "", 0, 0};
+
+static FORCEINLINE int pthread_acquire_lock (MLOCK_T *sl) {
+  int spins = 0;
+  volatile unsigned int* lp = &sl->l;
+  for (;;) {
+    if (*lp != 0) {
+      if (sl->threadid == CURRENT_THREAD) {
+        ++sl->c;
+        return 0;
+      }
+    }
+    else {
+      /* place args to cmpxchgl in locals to evade oddities in some gccs */
+      int cmp = 0;
+      int val = 1;
+      int ret;
+      __asm__ __volatile__  ("lock; cmpxchgl %1, %2"
+                             : "=a" (ret)
+                             : "r" (val), "m" (*(lp)), "0"(cmp)
+                             : "memory", "cc");
+      if (!ret) {
+        assert(!sl->threadid);
+        sl->threadid = CURRENT_THREAD;
+        sl->c = 1;
+        return 0;
+      }
+    }
+    if ((++spins & SPINS_PER_YIELD) == 0) {
+#if defined (__SVR4) && defined (__sun) /* solaris */
+      thr_yield();
+#else
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)
+      sched_yield();
+#else  /* no-op yield on unknown systems */
+      ;
+#endif /* __linux__ || __FreeBSD__ || __APPLE__ */
+#endif /* solaris */
+    }
+  }
+}
+
+static FORCEINLINE void pthread_release_lock (MLOCK_T *sl) {
+  volatile unsigned int* lp = &sl->l;
+  assert(*lp != 0);
+  assert(sl->threadid == CURRENT_THREAD);
+  if (--sl->c == 0) {
+    sl->threadid = 0;
+    int prev = 0;
+    int ret;
+    __asm__ __volatile__ ("lock; xchgl %0, %1"
+                          : "=r" (ret)
+                          : "m" (*(lp)), "0"(prev)
+                          : "memory");
+  }
+}
+
+static FORCEINLINE int pthread_try_lock (MLOCK_T *sl) {
+  volatile unsigned int* lp = &sl->l;
+  if (*lp != 0) {
+    if (sl->threadid == CURRENT_THREAD) {
+      ++sl->c;
+      return 1;
+    }
+  }
+  else {
+    int cmp = 0;
+    int val = 1;
+    int ret;
+    __asm__ __volatile__  ("lock; cmpxchgl %1, %2"
+                           : "=a" (ret)
+                           : "r" (val), "m" (*(lp)), "0"(cmp)
+                           : "memory", "cc");
+    if (!ret) {
+      assert(!sl->threadid);
+      sl->threadid = CURRENT_THREAD;
+      sl->c = 1;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+
+#else /* WIN32 */
+/* Custom win32-style spin locks on x86 and x64 for MSC */
+struct win32_mlock_t {
+  volatile long l;
+  char cachelinepadding[64];
+  unsigned int c;
+  long threadid;
+};
+
+#define MLOCK_T               struct win32_mlock_t
+#define CURRENT_THREAD        ((long)GetCurrentThreadId())
+#define INITIAL_LOCK(sl)      ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0)
+#define ACQUIRE_LOCK(sl)      win32_acquire_lock(sl)
+#define RELEASE_LOCK(sl)      win32_release_lock(sl)
+#define TRY_LOCK(sl)          win32_try_lock(sl)
+#define SPINS_PER_YIELD       63
+
+static MLOCK_T malloc_global_mutex = { 0, 0, 0};
+
+static FORCEINLINE int win32_acquire_lock (MLOCK_T *sl) {
+  int spins = 0;
+  for (;;) {
+    if (sl->l != 0) {
+      if (sl->threadid == CURRENT_THREAD) {
+        ++sl->c;
+        return 0;
+      }
+    }
+    else {
+      if (!interlockedexchange(&sl->l, 1)) {
+        assert(!sl->threadid);
+        sl->threadid = CURRENT_THREAD;
+        sl->c = 1;
+        return 0;
+      }
+    }
+    if ((++spins & SPINS_PER_YIELD) == 0)
+      SleepEx(0, FALSE);
+  }
+}
+
+static FORCEINLINE void win32_release_lock (MLOCK_T *sl) {
+  assert(sl->threadid == CURRENT_THREAD);
+  assert(sl->l != 0);
+  if (--sl->c == 0) {
+    sl->threadid = 0;
+    interlockedexchange (&sl->l, 0);
+  }
+}
+
+static FORCEINLINE int win32_try_lock (MLOCK_T *sl) {
+  if (sl->l != 0) {
+    if (sl->threadid == CURRENT_THREAD) {
+      ++sl->c;
+      return 1;
+    }
+  }
+  else {
+    if (!interlockedexchange(&sl->l, 1)){
+      assert(!sl->threadid);
+      sl->threadid = CURRENT_THREAD;
+      sl->c = 1;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+#endif /* WIN32 */
+#else /* USE_SPIN_LOCKS */
+
+#ifndef WIN32
+/* pthreads-based locks */
+
+#define MLOCK_T               pthread_mutex_t
+#define CURRENT_THREAD        pthread_self()
+#define INITIAL_LOCK(sl)      pthread_init_lock(sl)
+#define ACQUIRE_LOCK(sl)      pthread_mutex_lock(sl)
+#define RELEASE_LOCK(sl)      pthread_mutex_unlock(sl)
+#define TRY_LOCK(sl)          (!pthread_mutex_trylock(sl))
+
+static MLOCK_T malloc_global_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Cope with old-style linux recursive lock initialization by adding */
+/* skipped internal declaration from pthread.h */
+#ifdef linux
+#ifndef PTHREAD_MUTEX_RECURSIVE
+extern int pthread_mutexattr_setkind_np __P ((pthread_mutexattr_t *__attr,
+					   int __kind));
+#define PTHREAD_MUTEX_RECURSIVE PTHREAD_MUTEX_RECURSIVE_NP
+#define pthread_mutexattr_settype(x,y) pthread_mutexattr_setkind_np(x,y)
+#endif
+#endif
+
+static int pthread_init_lock (MLOCK_T *sl) {
+  pthread_mutexattr_t attr;
+  if (pthread_mutexattr_init(&attr)) return 1;
+  if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) return 1;
+  if (pthread_mutex_init(sl, &attr)) return 1;
+  if (pthread_mutexattr_destroy(&attr)) return 1;
+  return 0;
+}
+
+#else /* WIN32 */
+/* Win32 critical sections */
+#define MLOCK_T               CRITICAL_SECTION
+#define CURRENT_THREAD        GetCurrentThreadId()
+#define INITIAL_LOCK(s)       (!InitializeCriticalSectionAndSpinCount((s), 0x80000000|4000))
+#define ACQUIRE_LOCK(s)       (EnterCriticalSection(s), 0)
+#define RELEASE_LOCK(s)       LeaveCriticalSection(s)
+#define TRY_LOCK(s)           TryEnterCriticalSection(s)
+#define NEED_GLOBAL_LOCK_INIT
+
+static MLOCK_T malloc_global_mutex;
+static volatile long malloc_global_mutex_status;
+
+/* Use spin loop to initialize global lock */
+static void init_malloc_global_mutex() {
+  for (;;) {
+    long stat = malloc_global_mutex_status;
+    if (stat > 0)
+      return;
+    /* transition to < 0 while initializing, then to > 0 */
+    if (stat == 0 &&
+        interlockedcompareexchange(&malloc_global_mutex_status, -1, 0) == 0) {
+      InitializeCriticalSection(&malloc_global_mutex);
+      interlockedexchange(&malloc_global_mutex_status,1);
+      return;
+    }
+    SleepEx(0, FALSE);
+  }
+}
+
+#endif /* WIN32 */
+#endif /* USE_SPIN_LOCKS */
+#endif /* USE_LOCKS == 1 */
+
+/* -----------------------  User-defined locks ------------------------ */
+
+#if USE_LOCKS > 1
+/* Define your own lock implementation here */
+/* #define INITIAL_LOCK(sl)  ... */
+/* #define ACQUIRE_LOCK(sl)  ... */
+/* #define RELEASE_LOCK(sl)  ... */
+/* #define TRY_LOCK(sl) ... */
+/* static MLOCK_T malloc_global_mutex = ... */
+#endif /* USE_LOCKS > 1 */
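+
+/*
+  Hypothetical sketch (editor's note, not part of dlmalloc): one way the
+  USE_LOCKS > 1 template above could be filled in using C11 <threads.h>
+  recursive mutexes.  Untested; per-mspace locks must be reentrant, and
+  since mtx_t has no static initializer, malloc_global_mutex would need
+  a one-time mtx_init at program startup.
+
+    #include <threads.h>
+    #define MLOCK_T          mtx_t
+    #define INITIAL_LOCK(sl) (mtx_init((sl), mtx_plain|mtx_recursive) != thrd_success)
+    #define ACQUIRE_LOCK(sl) (mtx_lock(sl) != thrd_success)
+    #define RELEASE_LOCK(sl) mtx_unlock(sl)
+    #define TRY_LOCK(sl)     (mtx_trylock(sl) == thrd_success)
+    static MLOCK_T malloc_global_mutex;
+*/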
+
+/* -----------------------  Lock-based state ------------------------ */
+
+#if USE_LOCKS
+#define USE_LOCK_BIT               (2U)
+#else  /* USE_LOCKS */
+#define USE_LOCK_BIT               (0U)
+#define INITIAL_LOCK(l)
+#endif /* USE_LOCKS */
+
+#if USE_LOCKS
+#ifndef ACQUIRE_MALLOC_GLOBAL_LOCK
+#define ACQUIRE_MALLOC_GLOBAL_LOCK()  ACQUIRE_LOCK(&malloc_global_mutex);
+#endif
+#ifndef RELEASE_MALLOC_GLOBAL_LOCK
+#define RELEASE_MALLOC_GLOBAL_LOCK()  RELEASE_LOCK(&malloc_global_mutex);
+#endif
+#else  /* USE_LOCKS */
+#define ACQUIRE_MALLOC_GLOBAL_LOCK()
+#define RELEASE_MALLOC_GLOBAL_LOCK()
+#endif /* USE_LOCKS */
+
+
+/* -----------------------  Chunk representations ------------------------ */
+
+/*
+  (The following includes lightly edited explanations by Colin Plumb.)
+
+  The malloc_chunk declaration below is misleading (but accurate and
+  necessary).  It declares a "view" into memory allowing access to
+  necessary fields at known offsets from a given base.
+
+  Chunks of memory are maintained using a `boundary tag' method as
+  originally described by Knuth.  (See the paper by Paul Wilson
+  ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a survey of such
+  techniques.)  Sizes of free chunks are stored both in the front of
+  each chunk and at the end.  This makes consolidating fragmented
+  chunks into bigger chunks fast.  The head fields also hold bits
+  representing whether chunks are free or in use.
+
+  Here are some pictures to make it clearer.  They are "exploded" to
+  show that the state of a chunk can be thought of as extending from
+  the high 31 bits of the head field of its header through the
+  prev_foot and PINUSE_BIT bit of the following chunk header.
+
+  A chunk that's in use looks like:
+
+   chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+           | Size of previous chunk (if P = 0)                             |
+           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P|
+         | Size of this chunk                                         1| +-+
+   mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         |                                                               |
+         +-                                                             -+
+         |                                                               |
+         +-                                                             -+
+         |                                                               :
+         +-      size - sizeof(size_t) available payload bytes          -+
+         :                                                               |
+ chunk-> +-                                                             -+
+         |                                                               |
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |1|
+       | Size of next chunk (may or may not be in use)               | +-+
+ mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+    And if it's free, it looks like this:
+
+   chunk-> +-                                                             -+
+           | User payload (must be in use, or we would have merged!)       |
+           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P|
+         | Size of this chunk                                         0| +-+
+   mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         | Next pointer                                                  |
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         | Prev pointer                                                  |
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         |                                                               :
+         +-      size - sizeof(struct chunk) unused bytes               -+
+         :                                                               |
+ chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+         | Size of this chunk                                            |
+         +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |0|
+       | Size of next chunk (must be in use, or we would have merged)| +-+
+ mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       |                                                               :
+       +- User payload                                                -+
+       :                                                               |
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+                                                                     |0|
+                                                                     +-+
+  Note that since we always merge adjacent free chunks, the chunks
+  adjacent to a free chunk must be in use.
+
+  Given a pointer to a chunk (which can be derived trivially from the
+  payload pointer) we can, in O(1) time, find out whether the adjacent
+  chunks are free, and if so, unlink them from the lists that they
+  are on and merge them with the current chunk.
+
+  Chunks always begin on even word boundaries, so the mem portion
+  (which is returned to the user) is also on an even word boundary, and
+  thus at least double-word aligned.
+
+  The P (PINUSE_BIT) bit, stored in the unused low-order bit of the
+  chunk size (which is always a multiple of two words), is an in-use
+  bit for the *previous* chunk.  If that bit is *clear*, then the
+  word before the current chunk size contains the previous chunk
+  size, and can be used to find the front of the previous chunk.
+  The very first chunk allocated always has this bit set, preventing
+  access to non-existent (or non-owned) memory. If pinuse is set for
+  any given chunk, then you CANNOT determine the size of the
+  previous chunk, and might even get a memory addressing fault when
+  trying to do so.
+
+  The C (CINUSE_BIT) bit, stored in the unused second-lowest bit of
+  the chunk size redundantly records whether the current chunk is
+  inuse (unless the chunk is mmapped). This redundancy enables usage
+  checks within free and realloc, and reduces indirection when freeing
+  and consolidating chunks.
+
+  Each freshly allocated chunk must have both cinuse and pinuse set.
+  That is, each allocated chunk borders either a previously allocated
+  and still in-use chunk, or the base of its memory arena. This is
+  ensured by making all allocations from the `lowest' part of any
+  found chunk.  Further, no free chunk physically borders another one,
+  so each free chunk is known to be preceded and followed by either
+  inuse chunks or the ends of memory.
+
+  Note that the `foot' of the current chunk is actually represented
+  as the prev_foot of the NEXT chunk. This makes it easier to
+  deal with alignments etc but can be very confusing when trying
+  to extend or adapt this code.
+
+  The exceptions to all this are
+
+     1. The special chunk `top' is the top-most available chunk (i.e.,
+        the one bordering the end of available memory). It is treated
+        specially.  Top is never included in any bin, is used only if
+        no other chunk is available, and is released back to the
+        system if it is very large (see M_TRIM_THRESHOLD).  In effect,
+        the top chunk is treated as larger (and thus less well
+        fitting) than any other available chunk.  The top chunk
+        doesn't update its trailing size field since there is no next
+        contiguous chunk that would have to index off it. However,
+        space is still allocated for it (TOP_FOOT_SIZE) to enable
+        separation or merging when space is extended.
+
+     2. Chunks allocated via mmap have both cinuse and pinuse bits
+        cleared in their head fields.  Because they are allocated
+        one-by-one, each must carry its own prev_foot field, which is
+        also used to hold the offset this chunk has within its mmapped
+        region, which is needed to preserve alignment. Each mmapped
+        chunk is trailed by the first two fields of a fake next-chunk
+        for the sake of usage checks.
+
+*/
+
+struct malloc_chunk {
+  size_t               prev_foot;  /* Size of previous chunk (if free).  */
+  size_t               head;       /* Size and inuse bits. */
+  struct malloc_chunk* fd;         /* double links -- used only if free. */
+  struct malloc_chunk* bk;
+};
+
+typedef struct malloc_chunk  mchunk;
+typedef struct malloc_chunk* mchunkptr;
+typedef struct malloc_chunk* sbinptr;  /* The type of bins of chunks */
+typedef unsigned int bindex_t;         /* Described below */
+typedef unsigned int binmap_t;         /* Described below */
+
+/* ------------------- Chunks sizes and alignments ----------------------- */
+
+#define MCHUNK_SIZE         (sizeof(mchunk))
+
+#if FOOTERS
+#define CHUNK_OVERHEAD      (TWO_SIZE_T_SIZES)
+#else /* FOOTERS */
+#define CHUNK_OVERHEAD      (SIZE_T_SIZE)
+#endif /* FOOTERS */
+
+/* MMapped chunks need a second word of overhead ... */
+#define MMAP_CHUNK_OVERHEAD (TWO_SIZE_T_SIZES)
+/* ... and additional padding for fake next-chunk at foot */
+#define MMAP_FOOT_PAD       (FOUR_SIZE_T_SIZES)
+
+/* The smallest size we can malloc is an aligned minimal chunk */
+#define MIN_CHUNK_SIZE\
+  ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
+
+/* conversion from malloc headers to user pointers, and back */
+#define chunk2mem(p)        ((void*)((char*)(p)       + TWO_SIZE_T_SIZES))
+#define mem2chunk(mem)      ((mchunkptr)((char*)(mem) - TWO_SIZE_T_SIZES))
+/* chunk associated with aligned address A */
+#define align_as_chunk(A)   (mchunkptr)((A) + align_offset(chunk2mem(A)))
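+
+/*
+  Worked example (editor's note): chunk2mem and mem2chunk are exact
+  inverses that skip the prev_foot and head words.  With 32-bit size_t
+  (TWO_SIZE_T_SIZES == 8), a chunk at address 0x1000 hands out
+  mem == 0x1008, and mem2chunk recovers the chunk at 0x1000:
+
+    mchunkptr p = mem2chunk(mem);
+    assert(chunk2mem(p) == mem);
+*/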
+
+/* Bounds on request (not chunk) sizes. */
+#define MAX_REQUEST         ((-MIN_CHUNK_SIZE) << 2)
+#define MIN_REQUEST         (MIN_CHUNK_SIZE - CHUNK_OVERHEAD - SIZE_T_ONE)
+
+/* pad request bytes into a usable size */
+#define pad_request(req) \
+   (((req) + CHUNK_OVERHEAD + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
+
+/* pad request, checking for minimum (but not maximum) */
+#define request2size(req) \
+  (((req) < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(req))
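+
+/*
+  Worked example (editor's note): on a 32-bit build without FOOTERS,
+  CHUNK_OVERHEAD == 4 and CHUNK_ALIGN_MASK == 7, so
+  pad_request(25) == (25 + 4 + 7) & ~7 == 32, while request2size(1)
+  clamps small requests up to MIN_CHUNK_SIZE (16 on such systems).
+*/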
+
+
+/* ------------------ Operations on head and foot fields ----------------- */
+
+/*
+  The head field of a chunk is or'ed with PINUSE_BIT when the previous
+  adjacent chunk is in use, and or'ed with CINUSE_BIT if this chunk is in
+  use, unless mmapped, in which case both bits are cleared.
+
+  FLAG4_BIT is not used by this malloc, but might be useful in extensions.
+*/
+
+#define PINUSE_BIT          (SIZE_T_ONE)
+#define CINUSE_BIT          (SIZE_T_TWO)
+#define FLAG4_BIT           (SIZE_T_FOUR)
+#define INUSE_BITS          (PINUSE_BIT|CINUSE_BIT)
+#define FLAG_BITS           (PINUSE_BIT|CINUSE_BIT|FLAG4_BIT)
+
+/* Head value for fenceposts */
+#define FENCEPOST_HEAD      (INUSE_BITS|SIZE_T_SIZE)
+
+/* extraction of fields from head words */
+#define cinuse(p)           ((p)->head & CINUSE_BIT)
+#define pinuse(p)           ((p)->head & PINUSE_BIT)
+#define is_inuse(p)         (((p)->head & INUSE_BITS) != PINUSE_BIT)
+#define is_mmapped(p)       (((p)->head & INUSE_BITS) == 0)
+
+#define chunksize(p)        ((p)->head & ~(FLAG_BITS))
+
+#define clear_pinuse(p)     ((p)->head &= ~PINUSE_BIT)
+
+/* Treat space at ptr +/- offset as a chunk */
+#define chunk_plus_offset(p, s)  ((mchunkptr)(((char*)(p)) + (s)))
+#define chunk_minus_offset(p, s) ((mchunkptr)(((char*)(p)) - (s)))
+
+/* Ptr to next or previous physical malloc_chunk. */
+#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->head & ~FLAG_BITS)))
+#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_foot) ))
+
+/* extract next chunk's pinuse bit */
+#define next_pinuse(p)  ((next_chunk(p)->head) & PINUSE_BIT)
+
+/* Get/set size at footer */
+#define get_foot(p, s)  (((mchunkptr)((char*)(p) + (s)))->prev_foot)
+#define set_foot(p, s)  (((mchunkptr)((char*)(p) + (s)))->prev_foot = (s))
+
+/* Set size, pinuse bit, and foot */
+#define set_size_and_pinuse_of_free_chunk(p, s)\
+  ((p)->head = (s|PINUSE_BIT), set_foot(p, s))
+
+/* Set size, pinuse bit, foot, and clear next pinuse */
+#define set_free_with_pinuse(p, s, n)\
+  (clear_pinuse(n), set_size_and_pinuse_of_free_chunk(p, s))
+
+/* Get the internal overhead associated with chunk p */
+#define overhead_for(p)\
+ (is_mmapped(p)? MMAP_CHUNK_OVERHEAD : CHUNK_OVERHEAD)
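+
+/*
+  Worked example (editor's note): a head word of 0x103 encodes
+  chunksize(p) == 0x100 with both CINUSE_BIT and PINUSE_BIT set, i.e.
+  an in-use chunk whose predecessor is also in use.  A head of 0x100
+  (both bits clear) marks an mmapped chunk, so is_mmapped(p) holds and
+  overhead_for(p) is MMAP_CHUNK_OVERHEAD.
+*/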
+
+/* Return true if malloced space is not necessarily cleared */
+#if MMAP_CLEARS
+#define calloc_must_clear(p) (!is_mmapped(p))
+#else /* MMAP_CLEARS */
+#define calloc_must_clear(p) (1)
+#endif /* MMAP_CLEARS */
+
+/* ---------------------- Overlaid data structures ----------------------- */
+
+/*
+  When chunks are not in use, they are treated as nodes of either
+  lists or trees.
+
+  "Small"  chunks are stored in circular doubly-linked lists, and look
+  like this:
+
+    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Size of previous chunk                            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `head:' |             Size of chunk, in bytes                         |P|
+      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Forward pointer to next chunk in list             |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Back pointer to previous chunk in list            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Unused space (may be 0 bytes long)                .
+            .                                                               .
+            .                                                               |
+nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `foot:' |             Size of chunk, in bytes                           |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+  Larger chunks are kept in a form of bitwise digital trees (aka
+  tries) keyed on chunksizes.  Because malloc_tree_chunks are only for
+  free chunks greater than 256 bytes, their size doesn't impose any
+  constraints on user chunk sizes.  Each node looks like:
+
+    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Size of previous chunk                            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `head:' |             Size of chunk, in bytes                         |P|
+      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Forward pointer to next chunk of same size        |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Back pointer to previous chunk of same size       |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Pointer to left child (child[0])                  |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Pointer to right child (child[1])                 |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Pointer to parent                                 |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             bin index of this chunk                           |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Unused space                                      .
+            .                                                               |
+nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `foot:' |             Size of chunk, in bytes                           |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+  Each tree holding treenodes is a tree of unique chunk sizes.  Chunks
+  of the same size are arranged in a circularly-linked list, with only
+  the oldest chunk (the next to be used, in our FIFO ordering)
+  actually in the tree.  (Tree members are distinguished by a non-null
+  parent pointer.)  If a chunk with the same size as an existing node
+  is inserted, it is linked off the existing node using pointers that
+  work in the same way as fd/bk pointers of small chunks.
+
+  Each tree contains a power of 2 sized range of chunk sizes (the
+  smallest is 0x100 <= x < 0x180), which is divided in half at each
+  tree level, with the chunks in the smaller half of the range (0x100
+  <= x < 0x140 for the top node) in the left subtree and the larger
+  half (0x140 <= x < 0x180) in the right subtree.  This is, of course,
+  done by inspecting individual bits.
+
+  Using these rules, each node's left subtree contains all smaller
+  sizes than its right subtree.  However, the node at the root of each
+  subtree has no particular ordering relationship to either.  (The
+  dividing line between the subtree sizes is based on trie relation.)
+  If we remove the last chunk of a given size from the interior of the
+  tree, we need to replace it with a leaf node.  The tree ordering
+  rules permit a node to be replaced by any leaf below it.
+
+  The smallest chunk in a tree (a common operation in a best-fit
+  allocator) can be found by walking a path to the leftmost leaf in
+  the tree.  Unlike a usual binary tree, where we follow left child
+  pointers until we reach a null, here we follow the right child
+  pointer any time the left one is null, until we reach a leaf with
+  both child pointers null. The smallest chunk in the tree will be
+  somewhere along that path.
+
+  The worst case number of steps to add, find, or remove a node is
+  bounded by the number of bits differentiating chunks within
+  bins. Under current bin calculations, this ranges from 6 up to 21
+  (for 32 bit sizes) or up to 53 (for 64 bit sizes). The typical case
+  is of course much better.
+*/
+
+struct malloc_tree_chunk {
+  /* The first four fields must be compatible with malloc_chunk */
+  size_t                    prev_foot;
+  size_t                    head;
+  struct malloc_tree_chunk* fd;
+  struct malloc_tree_chunk* bk;
+
+  struct malloc_tree_chunk* child[2];
+  struct malloc_tree_chunk* parent;
+  bindex_t                  index;
+};
+
+typedef struct malloc_tree_chunk  tchunk;
+typedef struct malloc_tree_chunk* tchunkptr;
+typedef struct malloc_tree_chunk* tbinptr; /* The type of bins of trees */
+
+/* A little helper macro for trees */
+#define leftmost_child(t) ((t)->child[0] != 0? (t)->child[0] : (t)->child[1])
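+
+/*
+  Illustrative sketch (editor's note, not a function in this file): the
+  "leftmost leaf" walk described above, expressed with leftmost_child.
+  The real lookups (e.g. tmalloc_small) also track the best fit seen
+  along the path rather than returning only the leaf.
+
+    static tchunkptr leftmost_leaf(tchunkptr t) {
+      while (leftmost_child(t) != 0)
+        t = leftmost_child(t);    // prefer left child, else right
+      return t;                   // both children null: a leaf
+    }
+*/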
+
+/* ----------------------------- Segments -------------------------------- */
+
+/*
+  Each malloc space may include non-contiguous segments, held in a
+  list headed by an embedded malloc_segment record representing the
+  top-most space. Segments also include flags holding properties of
+  the space. Large chunks that are directly allocated by mmap are not
+  included in this list. They are instead independently created and
+  destroyed without otherwise keeping track of them.
+
+  Segment management mainly comes into play for spaces allocated by
+  MMAP.  Any call to MMAP might or might not return memory that is
+  adjacent to an existing segment.  MORECORE normally contiguously
+  extends the current space, so this space is almost always adjacent,
+  which is simpler and faster to deal with. (This is why MORECORE is
+  used preferentially to MMAP when both are available -- see
+  sys_alloc.)  When allocating using MMAP, we don't use any of the
+  hinting mechanisms (inconsistently) supported in various
+  implementations of unix mmap, or distinguish reserving from
+  committing memory. Instead, we just ask for space, and exploit
+  contiguity when we get it.  It is probably possible to do
+  better than this on some systems, but no general scheme seems
+  to be significantly better.
+
+  Management entails a simpler variant of the consolidation scheme
+  used for chunks to reduce fragmentation -- new adjacent memory is
+  normally prepended or appended to an existing segment. However,
+  there are limitations compared to chunk consolidation that mostly
+  reflect the fact that segment processing is relatively infrequent
+    (occurring only when getting memory from the system) and that we
+  don't expect to have huge numbers of segments:
+
+  * Segments are not indexed, so traversal requires linear scans.  (It
+    would be possible to index these, but is not worth the extra
+    overhead and complexity for most programs on most platforms.)
+  * New segments are only appended to old ones when holding top-most
+    memory; if they cannot be prepended to others, they are held in
+    different segments.
+
+  Except for the top-most segment of an mstate, each segment record
+  is kept at the tail of its segment. Segments are added by pushing
+  segment records onto the list headed by &mstate.seg for the
+  containing mstate.
+
+  Segment flags control allocation/merge/deallocation policies:
+  * If EXTERN_BIT set, then we did not allocate this segment,
+    and so should not try to deallocate or merge with others.
+    (This currently holds only for the initial segment passed
+    into create_mspace_with_base.)
+  * If USE_MMAP_BIT set, the segment may be merged with
+    other surrounding mmapped segments and trimmed/de-allocated
+    using munmap.
+  * If neither bit is set, then the segment was obtained using
+    MORECORE so can be merged with surrounding MORECORE'd segments
+    and deallocated/trimmed using MORECORE with negative arguments.
+*/
+
+struct malloc_segment {
+  char*        base;             /* base address */
+  size_t       size;             /* allocated size */
+  struct malloc_segment* next;   /* ptr to next segment */
+  flag_t       sflags;           /* mmap and extern flag */
+};
+
+#define is_mmapped_segment(S)  ((S)->sflags & USE_MMAP_BIT)
+#define is_extern_segment(S)   ((S)->sflags & EXTERN_BIT)
+
+typedef struct malloc_segment  msegment;
+typedef struct malloc_segment* msegmentptr;
+
+/* ---------------------------- malloc_state ----------------------------- */
+
+/*
+   A malloc_state holds all of the bookkeeping for a space.
+   The main fields are:
+
+  Top
+    The topmost chunk of the currently active segment. Its size is
+    cached in topsize.  The actual size of topmost space is
+    topsize+TOP_FOOT_SIZE, which includes space reserved for adding
+    fenceposts and segment records if necessary when getting more
+    space from the system.  The size at which to autotrim top is
+    cached from mparams in trim_check, except that it is disabled if
+    an autotrim fails.
+
+  Designated victim (dv)
+    This is the preferred chunk for servicing small requests that
+    don't have exact fits.  It is normally the chunk split off most
+    recently to service another small request.  Its size is cached in
+    dvsize. The link fields of this chunk are not maintained since it
+    is not kept in a bin.
+
+  SmallBins
+    An array of bin headers for free chunks.  These bins hold chunks
+    with sizes less than MIN_LARGE_SIZE bytes. Each bin contains
+    chunks of all the same size, spaced 8 bytes apart.  To simplify
+    use in double-linked lists, each bin header acts as a malloc_chunk
+    pointing to the real first node, if it exists (else pointing to
+    itself).  This avoids special-casing for headers.  But to avoid
+    waste, we allocate only the fd/bk pointers of bins, and then use
+    repositioning tricks to treat these as the fields of a chunk.
+
+  TreeBins
+    Treebins are pointers to the roots of trees holding a range of
+    sizes. There are 2 equally spaced treebins for each power of two
+    from TREEBIN_SHIFT to TREEBIN_SHIFT+16. The last bin holds anything
+    larger.
+
+  Bin maps
+    There is one bit map for small bins ("smallmap") and one for
+    treebins ("treemap).  Each bin sets its bit when non-empty, and
+    clears the bit when empty.  Bit operations are then used to avoid
+    bin-by-bin searching -- nearly all "search" is done without ever
+    looking at bins that won't be selected.  The bit maps
+    conservatively use 32 bits per map word, even on a 64-bit system.
+    For a good description of some of the bit-based techniques used
+    here, see Henry S. Warren Jr's book "Hacker's Delight" (and
+    supplement at http://hackersdelight.org/). Many of these are
+    intended to reduce the branchiness of paths through malloc etc, as
+    well as to reduce the number of memory locations read or written.
+
+  Segments
+    A list of segments headed by an embedded malloc_segment record
+    representing the initial space.
+
+  Address check support
+    The least_addr field is the least address ever obtained from
+    MORECORE or MMAP. Attempted frees and reallocs of any address less
+    than this are trapped (unless INSECURE is defined).
+
+  Magic tag
+    A cross-check field that should always hold same value as mparams.magic.
+
+  Flags
+    Bits recording whether to use MMAP, locks, or contiguous MORECORE
+
+  Statistics
+    Each space keeps track of current and maximum system memory
+    obtained via MORECORE or MMAP.
+
+  Trim support
+    Fields holding the amount of unused topmost memory that should trigger
+    trimming, and a counter to force periodic scanning to release unused
+    non-topmost segments.
+
+  Locking
+    If USE_LOCKS is defined, the "mutex" lock is acquired and released
+    around every public call using this mspace.
+
+  Extension support
+    A void* pointer and a size_t field that can be used to help implement
+    extensions to this malloc.
+*/
+
+/* Bin types, widths and sizes */
+#define NSMALLBINS        (32U)
+#define NTREEBINS         (32U)
+#define SMALLBIN_SHIFT    (3U)
+#define SMALLBIN_WIDTH    (SIZE_T_ONE << SMALLBIN_SHIFT)
+#define TREEBIN_SHIFT     (8U)
+#define MIN_LARGE_SIZE    (SIZE_T_ONE << TREEBIN_SHIFT)
+#define MAX_SMALL_SIZE    (MIN_LARGE_SIZE - SIZE_T_ONE)
+#define MAX_SMALL_REQUEST (MAX_SMALL_SIZE - CHUNK_ALIGN_MASK - CHUNK_OVERHEAD)
+
+struct malloc_state {
+  binmap_t   smallmap;
+  binmap_t   treemap;
+  size_t     dvsize;
+  size_t     topsize;
+  char*      least_addr;
+  mchunkptr  dv;
+  mchunkptr  top;
+  size_t     trim_check;
+  size_t     release_checks;
+  size_t     magic;
+  mchunkptr  smallbins[(NSMALLBINS+1)*2];
+  tbinptr    treebins[NTREEBINS];
+  size_t     footprint;
+  size_t     max_footprint;
+  flag_t     mflags;
+  msegment   seg;
+#if USE_LOCKS
+  MLOCK_T    mutex;     /* locate lock among fields that rarely change */
+#endif /* USE_LOCKS */
+  void*      extp;      /* Unused but available for extensions */
+  size_t     exts;
+};
+
+typedef struct malloc_state*    mstate;
+
+/* ------------- Global malloc_state and malloc_params ------------------- */
+
+#if !ONLY_MSPACES
+
+/* The global malloc_state used for all non-"mspace" calls */
+static struct malloc_state _gm_;
+#define gm                 (&_gm_)
+#define is_global(M)       ((M) == &_gm_)
+
+#endif /* !ONLY_MSPACES */
+
+#define is_initialized(M)  ((M)->top != 0)
+
+/* -------------------------- system alloc setup ------------------------- */
+
+/* Operations on mflags */
+
+#define use_lock(M)           ((M)->mflags &   USE_LOCK_BIT)
+#define enable_lock(M)        ((M)->mflags |=  USE_LOCK_BIT)
+#define disable_lock(M)       ((M)->mflags &= ~USE_LOCK_BIT)
+
+#define use_mmap(M)           ((M)->mflags &   USE_MMAP_BIT)
+#define enable_mmap(M)        ((M)->mflags |=  USE_MMAP_BIT)
+#define disable_mmap(M)       ((M)->mflags &= ~USE_MMAP_BIT)
+
+#define use_noncontiguous(M)  ((M)->mflags &   USE_NONCONTIGUOUS_BIT)
+#define disable_contiguous(M) ((M)->mflags |=  USE_NONCONTIGUOUS_BIT)
+
+#define set_lock(M,L)\
+ ((M)->mflags = (L)?\
+  ((M)->mflags | USE_LOCK_BIT) :\
+  ((M)->mflags & ~USE_LOCK_BIT))
+
+/* page-align a size */
+#define page_align(S)\
+ (((S) + (mparams.page_size - SIZE_T_ONE)) & ~(mparams.page_size - SIZE_T_ONE))
+
+/* granularity-align a size */
+#define granularity_align(S)\
+  (((S) + (mparams.granularity - SIZE_T_ONE))\
+   & ~(mparams.granularity - SIZE_T_ONE))
+
+
+/* For mmap, use granularity alignment on windows, else page-align */
+#ifdef WIN32
+#define mmap_align(S) granularity_align(S)
+#else
+#define mmap_align(S) page_align(S)
+#endif
+
+/* For sys_alloc, enough padding to ensure the request can be malloc'd on success */
+#define SYS_ALLOC_PADDING (TOP_FOOT_SIZE + MALLOC_ALIGNMENT)
+
+#define is_page_aligned(S)\
+   (((size_t)(S) & (mparams.page_size - SIZE_T_ONE)) == 0)
+#define is_granularity_aligned(S)\
+   (((size_t)(S) & (mparams.granularity - SIZE_T_ONE)) == 0)
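+
+/*
+  Worked example (editor's note): with mparams.page_size == 4096,
+  page_align(5000) == (5000 + 4095) & ~4095 == 8192, and
+  is_page_aligned(8192) holds.  granularity_align works the same way
+  with mparams.granularity (typically 64KB on Windows).
+*/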
+
+/*  True if segment S holds address A */
+#define segment_holds(S, A)\
+  ((char*)(A) >= S->base && (char*)(A) < S->base + S->size)
+
+/* Return segment holding given address */
+static msegmentptr segment_holding(mstate m, char* addr) {
+  msegmentptr sp = &m->seg;
+  for (;;) {
+    if (addr >= sp->base && addr < sp->base + sp->size)
+      return sp;
+    if ((sp = sp->next) == 0)
+      return 0;
+  }
+}
+
+/* Return true if segment contains a segment link */
+static int has_segment_link(mstate m, msegmentptr ss) {
+  msegmentptr sp = &m->seg;
+  for (;;) {
+    if ((char*)sp >= ss->base && (char*)sp < ss->base + ss->size)
+      return 1;
+    if ((sp = sp->next) == 0)
+      return 0;
+  }
+}
+
+#ifndef MORECORE_CANNOT_TRIM
+#define should_trim(M,s)  ((s) > (M)->trim_check)
+#else  /* MORECORE_CANNOT_TRIM */
+#define should_trim(M,s)  (0)
+#endif /* MORECORE_CANNOT_TRIM */
+
+/*
+  TOP_FOOT_SIZE is padding at the end of a segment, including space
+  that may be needed to place segment records and fenceposts when new
+  noncontiguous segments are added.
+*/
+#define TOP_FOOT_SIZE\
+  (align_offset(chunk2mem(0))+pad_request(sizeof(struct malloc_segment))+MIN_CHUNK_SIZE)
+
+
+/* -------------------------------  Hooks -------------------------------- */
+
+/*
+  PREACTION should be defined to return 0 on success, and nonzero on
+  failure. If you are not using locking, you can redefine these to do
+  anything you like.
+*/
+
+#if USE_LOCKS
+
+#define PREACTION(M)  ((use_lock(M))? ACQUIRE_LOCK(&(M)->mutex) : 0)
+#define POSTACTION(M) { if (use_lock(M)) RELEASE_LOCK(&(M)->mutex); }
+#else /* USE_LOCKS */
+
+#ifndef PREACTION
+#define PREACTION(M) (0)
+#endif  /* PREACTION */
+
+#ifndef POSTACTION
+#define POSTACTION(M)
+#endif  /* POSTACTION */
+
+#endif /* USE_LOCKS */
+
+/*
+  CORRUPTION_ERROR_ACTION is triggered upon detected bad addresses.
+  USAGE_ERROR_ACTION is triggered on detected bad frees and
+  reallocs. The argument p is an address that might have triggered the
+  fault. It is ignored by the two predefined actions, but might be
+  useful in custom actions that try to help diagnose errors.
+*/
+
+#if PROCEED_ON_ERROR
+
+/* A count of the number of corruption errors causing resets */
+int malloc_corruption_error_count;
+
+/* default corruption action */
+static void reset_on_error(mstate m);
+
+#define CORRUPTION_ERROR_ACTION(m)  reset_on_error(m)
+#define USAGE_ERROR_ACTION(m, p)
+
+#else /* PROCEED_ON_ERROR */
+
+#ifndef CORRUPTION_ERROR_ACTION
+#define CORRUPTION_ERROR_ACTION(m) ABORT
+#endif /* CORRUPTION_ERROR_ACTION */
+
+#ifndef USAGE_ERROR_ACTION
+#define USAGE_ERROR_ACTION(m,p) ABORT
+#endif /* USAGE_ERROR_ACTION */
+
+#endif /* PROCEED_ON_ERROR */
+
+/* -------------------------- Debugging setup ---------------------------- */
+
+#if ! DEBUG
+
+#define check_free_chunk(M,P)
+#define check_inuse_chunk(M,P)
+#define check_malloced_chunk(M,P,N)
+#define check_mmapped_chunk(M,P)
+#define check_malloc_state(M)
+#define check_top_chunk(M,P)
+
+#else /* DEBUG */
+#define check_free_chunk(M,P)       do_check_free_chunk(M,P)
+#define check_inuse_chunk(M,P)      do_check_inuse_chunk(M,P)
+#define check_top_chunk(M,P)        do_check_top_chunk(M,P)
+#define check_malloced_chunk(M,P,N) do_check_malloced_chunk(M,P,N)
+#define check_mmapped_chunk(M,P)    do_check_mmapped_chunk(M,P)
+#define check_malloc_state(M)       do_check_malloc_state(M)
+
+static void   do_check_any_chunk(mstate m, mchunkptr p);
+static void   do_check_top_chunk(mstate m, mchunkptr p);
+static void   do_check_mmapped_chunk(mstate m, mchunkptr p);
+static void   do_check_inuse_chunk(mstate m, mchunkptr p);
+static void   do_check_free_chunk(mstate m, mchunkptr p);
+static void   do_check_malloced_chunk(mstate m, void* mem, size_t s);
+static void   do_check_tree(mstate m, tchunkptr t);
+static void   do_check_treebin(mstate m, bindex_t i);
+static void   do_check_smallbin(mstate m, bindex_t i);
+static void   do_check_malloc_state(mstate m);
+static int    bin_find(mstate m, mchunkptr x);
+static size_t traverse_and_check(mstate m);
+#endif /* DEBUG */
+
+/* ---------------------------- Indexing Bins ---------------------------- */
+
+#define is_small(s)         (((s) >> SMALLBIN_SHIFT) < NSMALLBINS)
+#define small_index(s)      (bindex_t)((s)  >> SMALLBIN_SHIFT)
+#define small_index2size(i) ((i)  << SMALLBIN_SHIFT)
+#define MIN_SMALL_INDEX     (small_index(MIN_CHUNK_SIZE))
+
+/* addressing by index. See above about smallbin repositioning */
+#define smallbin_at(M, i)   ((sbinptr)((char*)&((M)->smallbins[(i)<<1])))
+#define treebin_at(M,i)     (&((M)->treebins[i]))
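+
+/*
+  Worked example (editor's note): a free chunk of size 40 has
+  small_index(40) == 40 >> 3 == 5 and small_index2size(5) == 40;
+  smallbin_at(M, 5) then points at the two-word fake chunk header for
+  that bin (see the repositioning note above).
+*/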
+
+/* assign tree index for size S to variable I. Use x86 asm if possible  */
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define compute_tree_index(S, I)\
+{\
+  unsigned int X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int K;\
+    __asm__("bsrl\t%1, %0\n\t" : "=r" (K) : "g"  (X));\
+    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
+  }\
+}
+
+#elif defined (__INTEL_COMPILER)
+#define compute_tree_index(S, I)\
+{\
+  size_t X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int K = _bit_scan_reverse (X); \
+    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
+  }\
+}
+
+#elif defined(_MSC_VER) && _MSC_VER>=1300
+#define compute_tree_index(S, I)\
+{\
+  size_t X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int K;\
+    _BitScanReverse((DWORD *) &K, (DWORD) X);\
+    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
+  }\
+}
+
+#else /* GNUC */
+#define compute_tree_index(S, I)\
+{\
+  size_t X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int Y = (unsigned int)X;\
+    unsigned int N = ((Y - 0x100) >> 16) & 8;\
+    unsigned int K = (((Y <<= N) - 0x1000) >> 16) & 4;\
+    N += K;\
+    N += K = (((Y <<= K) - 0x4000) >> 16) & 2;\
+    K = 14 - N + ((Y <<= K) >> 15);\
+    I = (K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1));\
+  }\
+}
+#endif /* GNUC */
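+
+/* Worked example (annotation): with the default TREEBIN_SHIFT of 8, a
+   chunk of size S = 1536 gives X = 1536 >> 8 = 6, whose highest set bit
+   is K = 2, so I = (2 << 1) + ((1536 >> (2 + 7)) & 1) = 4 + 1 = 5.
+   All four variants above compute the same index; they differ only in
+   how the highest set bit of X is located. */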
+
+/* Bit representing maximum resolved size in a treebin at i */
+#define bit_for_tree_index(i) \
+   (i == NTREEBINS-1)? (SIZE_T_BITSIZE-1) : (((i) >> 1) + TREEBIN_SHIFT - 2)
+
+/* Shift placing maximum resolved bit in a treebin at i as sign bit */
+#define leftshift_for_tree_index(i) \
+   ((i == NTREEBINS-1)? 0 : \
+    ((SIZE_T_BITSIZE-SIZE_T_ONE) - (((i) >> 1) + TREEBIN_SHIFT - 2)))
+
+/* The size of the smallest chunk held in bin with index i */
+#define minsize_for_tree_index(i) \
+   ((SIZE_T_ONE << (((i) >> 1) + TREEBIN_SHIFT)) |  \
+   (((size_t)((i) & SIZE_T_ONE)) << (((i) >> 1) + TREEBIN_SHIFT - 1)))
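+
+/* Consistency check (annotation): for the index computed in the example
+   above, minsize_for_tree_index(5) = (1 << (2 + 8)) | (1 << (2 + 8 - 1))
+   = 1024 | 512 = 1536, i.e. the example size S = 1536 is exactly the
+   smallest size mapped to treebin 5. */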
+
+
+/* ------------------------ Operations on bin maps ----------------------- */
+
+/* bit corresponding to given index */
+#define idx2bit(i)              ((binmap_t)(1) << (i))
+
+/* Mark/Clear bits with given index */
+#define mark_smallmap(M,i)      ((M)->smallmap |=  idx2bit(i))
+#define clear_smallmap(M,i)     ((M)->smallmap &= ~idx2bit(i))
+#define smallmap_is_marked(M,i) ((M)->smallmap &   idx2bit(i))
+
+#define mark_treemap(M,i)       ((M)->treemap  |=  idx2bit(i))
+#define clear_treemap(M,i)      ((M)->treemap  &= ~idx2bit(i))
+#define treemap_is_marked(M,i)  ((M)->treemap  &   idx2bit(i))
+
+/* isolate the least set bit of a bitmap */
+#define least_bit(x)         ((x) & -(x))
+
+/* mask with all bits to left of least bit of x on */
+#define left_bits(x)         ((x<<1) | -(x<<1))
+
+/* mask with all bits to left of or equal to least bit of x on */
+#define same_or_left_bits(x) ((x) | -(x))
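+
+/* Worked example (annotation): for x = 0x14 (binary 10100),
+   least_bit(x) = 0x04, left_bits(x) sets every bit strictly above bit 2,
+   and same_or_left_bits(x) sets every bit at or above bit 2.  These
+   masks let the allocator find the nearest non-empty bin at or above a
+   given index with a single AND against smallmap/treemap. */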
+
+/* index corresponding to given bit. Use x86 asm if possible */
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int J;\
+  __asm__("bsfl\t%1, %0\n\t" : "=r" (J) : "g" (X));\
+  I = (bindex_t)J;\
+}
+
+#elif defined (__INTEL_COMPILER)
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int J;\
+  J = _bit_scan_forward (X); \
+  I = (bindex_t)J;\
+}
+
+#elif defined(_MSC_VER) && _MSC_VER>=1300
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int J;\
+  _BitScanForward((DWORD *) &J, X);\
+  I = (bindex_t)J;\
+}
+
+#elif USE_BUILTIN_FFS
+#define compute_bit2idx(X, I) I = ffs(X)-1
+
+#else
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int Y = X - 1;\
+  unsigned int K = Y >> (16-4) & 16;\
+  unsigned int N = K;        Y >>= K;\
+  N += K = Y >> (8-3) &  8;  Y >>= K;\
+  N += K = Y >> (4-2) &  4;  Y >>= K;\
+  N += K = Y >> (2-1) &  2;  Y >>= K;\
+  N += K = Y >> (1-0) &  1;  Y >>= K;\
+  I = (bindex_t)(N + Y);\
+}
+#endif /* GNUC */
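+
+/* Example (annotation): compute_bit2idx is only applied to bitmaps with
+   a single bit set (the result of least_bit), so for X = 0x20 every
+   variant above yields I = 5. */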
+
+
+/* ----------------------- Runtime Check Support ------------------------- */
+
+/*
+  For security, the main invariant is that malloc/free/etc never
+  writes to a static address other than malloc_state, unless static
+  malloc_state itself has been corrupted, which cannot occur via
+  malloc (because of these checks). In essence this means that we
+  believe all pointers, sizes, maps etc held in malloc_state, but
+  check all of those linked or offsetted from other embedded data
+  structures.  These checks are interspersed with main code in a way
+  that tends to minimize their run-time cost.
+
+  When FOOTERS is defined, in addition to range checking, we also
+  verify footer fields of inuse chunks, which can be used to guarantee
+  that the mstate controlling malloc/free is intact.  This is a
+  streamlined version of the approach described by William Robertson
+  et al. in "Run-time Detection of Heap-based Overflows" (LISA '03,
+  http://www.usenix.org/events/lisa03/tech/robertson.html). The footer
+  of an inuse chunk holds the xor of its mstate and a random seed,
+  which is checked upon calls to free() and realloc().  This is
+  (probabilistically) unguessable from outside the program, but can be
+  computed by any code successfully malloc'ing any chunk, so does not
+  itself provide protection against code that has already broken
+  security through some other means.  Unlike Robertson et al, we
+  always dynamically check addresses of all offset chunks (previous,
+  next, etc). This turns out to be cheaper than relying on hashes.
+*/
+
+#if !INSECURE
+/* Check if address a is at least as high as any from MORECORE or MMAP */
+#define ok_address(M, a) ((char*)(a) >= (M)->least_addr)
+/* Check if address of next chunk n is higher than base chunk p */
+#define ok_next(p, n)    ((char*)(p) < (char*)(n))
+/* Check if p has inuse status */
+#define ok_inuse(p)     is_inuse(p)
+/* Check if p has its pinuse bit on */
+#define ok_pinuse(p)     pinuse(p)
+
+#else /* !INSECURE */
+#define ok_address(M, a) (1)
+#define ok_next(b, n)    (1)
+#define ok_inuse(p)      (1)
+#define ok_pinuse(p)     (1)
+#endif /* !INSECURE */
+
+#if (FOOTERS && !INSECURE)
+/* Check if (alleged) mstate m has expected magic field */
+#define ok_magic(M)      ((M)->magic == mparams.magic)
+#else  /* (FOOTERS && !INSECURE) */
+#define ok_magic(M)      (1)
+#endif /* (FOOTERS && !INSECURE) */
+
+
+/* In gcc, use __builtin_expect to minimize impact of checks */
+#if !INSECURE
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define RTCHECK(e)  __builtin_expect(e, 1)
+#else /* GNUC */
+#define RTCHECK(e)  (e)
+#endif /* GNUC */
+#else /* !INSECURE */
+#define RTCHECK(e)  (1)
+#endif /* !INSECURE */
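+
+/* Typical usage (annotation): validity checks are wrapped in RTCHECK so
+   gcc can treat the failure path as unlikely, e.g.:
+
+     if (RTCHECK(ok_address(M, p)))
+       ... normal path ...
+     else
+       CORRUPTION_ERROR_ACTION(M);
+
+   With INSECURE set, RTCHECK(e) is the constant 1 and the corruption
+   branch is compiled away entirely. */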
+
+/* macros to set up inuse chunks with or without footers */
+
+#if !FOOTERS
+
+#define mark_inuse_foot(M,p,s)
+
+/* Macros for setting head/foot of non-mmapped chunks */
+
+/* Set cinuse bit and pinuse bit of next chunk */
+#define set_inuse(M,p,s)\
+  ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\
+  ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT)
+
+/* Set cinuse and pinuse of this chunk and pinuse of next chunk */
+#define set_inuse_and_pinuse(M,p,s)\
+  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
+  ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT)
+
+/* Set size, cinuse and pinuse bit of this chunk */
+#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
+  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT))
+
+#else /* FOOTERS */
+
+/* Set foot of inuse chunk to be xor of mstate and seed */
+#define mark_inuse_foot(M,p,s)\
+  (((mchunkptr)((char*)(p) + (s)))->prev_foot = ((size_t)(M) ^ mparams.magic))
+
+#define get_mstate_for(p)\
+  ((mstate)(((mchunkptr)((char*)(p) +\
+    (chunksize(p))))->prev_foot ^ mparams.magic))
+
+#define set_inuse(M,p,s)\
+  ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\
+  (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT), \
+  mark_inuse_foot(M,p,s))
+
+#define set_inuse_and_pinuse(M,p,s)\
+  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
+  (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT),\
+ mark_inuse_foot(M,p,s))
+
+#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
+  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
+  mark_inuse_foot(M, p, s))
+
+#endif /* !FOOTERS */
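+
+/* Bit-layout example (annotation): PINUSE_BIT and CINUSE_BIT occupy the
+   low bits of `head`, so set_size_and_pinuse_of_inuse_chunk on a
+   48-byte chunk stores head = 0x30 | PINUSE_BIT | CINUSE_BIT = 0x33,
+   and chunksize() masks the flag bits back off to recover 0x30. */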
+
+/* ---------------------------- setting mparams -------------------------- */
+
+#ifdef ENABLE_LARGE_PAGES
+typedef size_t (WINAPI *GetLargePageMinimum_t)(void);
+#endif
+
+/* Initialize mparams */
+static int init_mparams(void) {
+#ifdef NEED_GLOBAL_LOCK_INIT
+  if (malloc_global_mutex_status <= 0)
+    init_malloc_global_mutex();
+#endif
+
+  ACQUIRE_MALLOC_GLOBAL_LOCK();
+  if (mparams.magic == 0) {
+    size_t magic;
+    size_t psize;
+    size_t gsize;
+
+#ifndef WIN32
+    psize = malloc_getpagesize;
+    gsize = ((DEFAULT_GRANULARITY != 0)? DEFAULT_GRANULARITY : psize);
+#else /* WIN32 */
+    {
+      SYSTEM_INFO system_info;
+      GetSystemInfo(&system_info);
+      psize = system_info.dwPageSize;
+      gsize = ((DEFAULT_GRANULARITY != 0)?
+               DEFAULT_GRANULARITY : system_info.dwAllocationGranularity);
+#ifdef ENABLE_LARGE_PAGES
+      {
+          GetLargePageMinimum_t GetLargePageMinimum_ =
+              (GetLargePageMinimum_t) GetProcAddress(
+                  GetModuleHandle(__T("kernel32.dll")),
+                  "GetLargePageMinimum");
+          if(GetLargePageMinimum_) {
+              size_t largepagesize = GetLargePageMinimum_();
+              if(largepagesize) {
+                  psize = largepagesize;
+                  gsize = ((DEFAULT_GRANULARITY != 0)?
+                           DEFAULT_GRANULARITY : largepagesize);
+                  if(gsize < largepagesize) gsize = largepagesize;
+              }
+          }
+      }
+#endif
+    }
+#endif /* WIN32 */
+
+    /* Sanity-check configuration:
+       size_t must be unsigned and as wide as pointer type.
+       ints must be at least 4 bytes.
+       alignment must be at least 8.
+       Alignment, min chunk size, and page size must all be powers of 2.
+    */
+    if ((sizeof(size_t) != sizeof(char*)) ||
+        (MAX_SIZE_T < MIN_CHUNK_SIZE)  ||
+        (sizeof(int) < 4)  ||
+        (MALLOC_ALIGNMENT < (size_t)8U) ||
+        ((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-SIZE_T_ONE)) != 0) ||
+        ((MCHUNK_SIZE      & (MCHUNK_SIZE-SIZE_T_ONE))      != 0) ||
+        ((gsize            & (gsize-SIZE_T_ONE))            != 0) ||
+        ((psize            & (psize-SIZE_T_ONE))            != 0))
+      ABORT;
+
+    mparams.granularity = gsize;
+    mparams.page_size = psize;
+    mparams.mmap_threshold = DEFAULT_MMAP_THRESHOLD;
+    mparams.trim_threshold = DEFAULT_TRIM_THRESHOLD;
+#if MORECORE_CONTIGUOUS
+    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT;
+#else  /* MORECORE_CONTIGUOUS */
+    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT|USE_NONCONTIGUOUS_BIT;
+#endif /* MORECORE_CONTIGUOUS */
+
+#if !ONLY_MSPACES
+    /* Set up lock for main malloc area */
+    gm->mflags = mparams.default_mflags;
+    INITIAL_LOCK(&gm->mutex);
+#endif
+
+    {
+#if USE_DEV_RANDOM
+      int fd;
+      unsigned char buf[sizeof(size_t)];
+      /* Try to use /dev/urandom, else fall back on using time */
+      if ((fd = open("/dev/urandom", O_RDONLY)) >= 0 &&
+          read(fd, buf, sizeof(buf)) == sizeof(buf)) {
+        magic = *((size_t *) buf);
+        close(fd);
+      }
+      else
+#endif /* USE_DEV_RANDOM */
+#ifdef WIN32
+        magic = (size_t)(GetTickCount() ^ (size_t)0x55555555U);
+#else
+        magic = (size_t)(time(0) ^ (size_t)0x55555555U);
+#endif
+      magic |= (size_t)8U;    /* ensure nonzero */
+      magic &= ~(size_t)7U;   /* improve chances of fault for bad values */
+      mparams.magic = magic;
+    }
+  }
+
+  RELEASE_MALLOC_GLOBAL_LOCK();
+  return 1;
+}
+
+/* support for mallopt */
+static int change_mparam(int param_number, int value) {
+  size_t val;
+  ensure_initialization();
+  val = (value == -1)? MAX_SIZE_T : (size_t)value;
+  switch(param_number) {
+  case M_TRIM_THRESHOLD:
+    mparams.trim_threshold = val;
+    return 1;
+  case M_GRANULARITY:
+    if (val >= mparams.page_size && ((val & (val-1)) == 0)) {
+      mparams.granularity = val;
+      return 1;
+    }
+    else
+      return 0;
+  case M_MMAP_THRESHOLD:
+    mparams.mmap_threshold = val;
+    return 1;
+  default:
+    return 0;
+  }
+}
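+
+/* Usage sketch (annotation, assuming the mallopt-style entry point that
+   forwards here): change_mparam(M_GRANULARITY, 262144) succeeds on a
+   4 KiB-page system because 256 KiB is a power of two not below the
+   page size, while change_mparam(M_GRANULARITY, 100000) returns 0.
+   Passing -1 as the value stores MAX_SIZE_T, which for the trim and
+   mmap thresholds effectively disables trimming/direct mmapping. */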
+
+#if DEBUG
+/* ------------------------- Debugging Support --------------------------- */
+
+/* Check properties of any chunk, whether free, inuse, mmapped etc  */
+static void do_check_any_chunk(mstate m, mchunkptr p) {
+  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
+  assert(ok_address(m, p));
+}
+
+/* Check properties of top chunk */
+static void do_check_top_chunk(mstate m, mchunkptr p) {
+  msegmentptr sp = segment_holding(m, (char*)p);
+  size_t  sz = p->head & ~INUSE_BITS; /* third-lowest bit can be set! */
+  assert(sp != 0);
+  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
+  assert(ok_address(m, p));
+  assert(sz == m->topsize);
+  assert(sz > 0);
+  assert(sz == ((sp->base + sp->size) - (char*)p) - TOP_FOOT_SIZE);
+  assert(pinuse(p));
+  assert(!pinuse(chunk_plus_offset(p, sz)));
+}
+
+/* Check properties of (inuse) mmapped chunks */
+static void do_check_mmapped_chunk(mstate m, mchunkptr p) {
+  size_t  sz = chunksize(p);
+  size_t len = (sz + (p->prev_foot) + MMAP_FOOT_PAD);
+  assert(is_mmapped(p));
+  assert(use_mmap(m));
+  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
+  assert(ok_address(m, p));
+  assert(!is_small(sz));
+  assert((len & (mparams.page_size-SIZE_T_ONE)) == 0);
+  assert(chunk_plus_offset(p, sz)->head == FENCEPOST_HEAD);
+  assert(chunk_plus_offset(p, sz+SIZE_T_SIZE)->head == 0);
+}
+
+/* Check properties of inuse chunks */
+static void do_check_inuse_chunk(mstate m, mchunkptr p) {
+  do_check_any_chunk(m, p);
+  assert(is_inuse(p));
+  assert(next_pinuse(p));
+  /* If not pinuse and not mmapped, previous chunk has OK offset */
+  assert(is_mmapped(p) || pinuse(p) || next_chunk(prev_chunk(p)) == p);
+  if (is_mmapped(p))
+    do_check_mmapped_chunk(m, p);
+}
+
+/* Check properties of free chunks */
+static void do_check_free_chunk(mstate m, mchunkptr p) {
+  size_t sz = chunksize(p);
+  mchunkptr next = chunk_plus_offset(p, sz);
+  do_check_any_chunk(m, p);
+  assert(!is_inuse(p));
+  assert(!next_pinuse(p));
+  assert (!is_mmapped(p));
+  if (p != m->dv && p != m->top) {
+    if (sz >= MIN_CHUNK_SIZE) {
+      assert((sz & CHUNK_ALIGN_MASK) == 0);
+      assert(is_aligned(chunk2mem(p)));
+      assert(next->prev_foot == sz);
+      assert(pinuse(p));
+      assert (next == m->top || is_inuse(next));
+      assert(p->fd->bk == p);
+      assert(p->bk->fd == p);
+    }
+    else  /* markers are always of size SIZE_T_SIZE */
+      assert(sz == SIZE_T_SIZE);
+  }
+}
+
+/* Check properties of malloced chunks at the point they are malloced */
+static void do_check_malloced_chunk(mstate m, void* mem, size_t s) {
+  if (mem != 0) {
+    mchunkptr p = mem2chunk(mem);
+    size_t sz = p->head & ~INUSE_BITS;
+    do_check_inuse_chunk(m, p);
+    assert((sz & CHUNK_ALIGN_MASK) == 0);
+    assert(sz >= MIN_CHUNK_SIZE);
+    assert(sz >= s);
+    /* unless mmapped, size is less than MIN_CHUNK_SIZE more than request */
+    assert(is_mmapped(p) || sz < (s + MIN_CHUNK_SIZE));
+  }
+}
+
+/* Check a tree and its subtrees.  */
+static void do_check_tree(mstate m, tchunkptr t) {
+  tchunkptr head = 0;
+  tchunkptr u = t;
+  bindex_t tindex = t->index;
+  size_t tsize = chunksize(t);
+  bindex_t idx;
+  compute_tree_index(tsize, idx);
+  assert(tindex == idx);
+  assert(tsize >= MIN_LARGE_SIZE);
+  assert(tsize >= minsize_for_tree_index(idx));
+  assert((idx == NTREEBINS-1) || (tsize < minsize_for_tree_index((idx+1))));
+
+  do { /* traverse through chain of same-sized nodes */
+    do_check_any_chunk(m, ((mchunkptr)u));
+    assert(u->index == tindex);
+    assert(chunksize(u) == tsize);
+    assert(!is_inuse(u));
+    assert(!next_pinuse(u));
+    assert(u->fd->bk == u);
+    assert(u->bk->fd == u);
+    if (u->parent == 0) {
+      assert(u->child[0] == 0);
+      assert(u->child[1] == 0);
+    }
+    else {
+      assert(head == 0); /* only one node on chain has parent */
+      head = u;
+      assert(u->parent != u);
+      assert (u->parent->child[0] == u ||
+              u->parent->child[1] == u ||
+              *((tbinptr*)(u->parent)) == u);
+      if (u->child[0] != 0) {
+        assert(u->child[0]->parent == u);
+        assert(u->child[0] != u);
+        do_check_tree(m, u->child[0]);
+      }
+      if (u->child[1] != 0) {
+        assert(u->child[1]->parent == u);
+        assert(u->child[1] != u);
+        do_check_tree(m, u->child[1]);
+      }
+      if (u->child[0] != 0 && u->child[1] != 0) {
+        assert(chunksize(u->child[0]) < chunksize(u->child[1]));
+      }
+    }
+    u = u->fd;
+  } while (u != t);
+  assert(head != 0);
+}
+
+/*  Check all the chunks in a treebin.  */
+static void do_check_treebin(mstate m, bindex_t i) {
+  tbinptr* tb = treebin_at(m, i);
+  tchunkptr t = *tb;
+  int empty = (m->treemap & (1U << i)) == 0;
+  if (t == 0)
+    assert(empty);
+  if (!empty)
+    do_check_tree(m, t);
+}
+
+/*  Check all the chunks in a smallbin.  */
+static void do_check_smallbin(mstate m, bindex_t i) {
+  sbinptr b = smallbin_at(m, i);
+  mchunkptr p = b->bk;
+  unsigned int empty = (m->smallmap & (1U << i)) == 0;
+  if (p == b)
+    assert(empty);
+  if (!empty) {
+    for (; p != b; p = p->bk) {
+      size_t size = chunksize(p);
+      mchunkptr q;
+      /* each chunk claims to be free */
+      do_check_free_chunk(m, p);
+      /* chunk belongs in bin */
+      assert(small_index(size) == i);
+      assert(p->bk == b || chunksize(p->bk) == chunksize(p));
+      /* chunk is followed by an inuse chunk */
+      q = next_chunk(p);
+      if (q->head != FENCEPOST_HEAD)
+        do_check_inuse_chunk(m, q);
+    }
+  }
+}
+
+/* Find x in a bin. Used in other check functions. */
+static int bin_find(mstate m, mchunkptr x) {
+  size_t size = chunksize(x);
+  if (is_small(size)) {
+    bindex_t sidx = small_index(size);
+    sbinptr b = smallbin_at(m, sidx);
+    if (smallmap_is_marked(m, sidx)) {
+      mchunkptr p = b;
+      do {
+        if (p == x)
+          return 1;
+      } while ((p = p->fd) != b);
+    }
+  }
+  else {
+    bindex_t tidx;
+    compute_tree_index(size, tidx);
+    if (treemap_is_marked(m, tidx)) {
+      tchunkptr t = *treebin_at(m, tidx);
+      size_t sizebits = size << leftshift_for_tree_index(tidx);
+      while (t != 0 && chunksize(t) != size) {
+        t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
+        sizebits <<= 1;
+      }
+      if (t != 0) {
+        tchunkptr u = t;
+        do {
+          if (u == (tchunkptr)x)
+            return 1;
+        } while ((u = u->fd) != t);
+      }
+    }
+  }
+  return 0;
+}
+
+/* Traverse each chunk and check it; return total */
+static size_t traverse_and_check(mstate m) {
+  size_t sum = 0;
+  if (is_initialized(m)) {
+    msegmentptr s = &m->seg;
+    sum += m->topsize + TOP_FOOT_SIZE;
+    while (s != 0) {
+      mchunkptr q = align_as_chunk(s->base);
+      mchunkptr lastq = 0;
+      assert(pinuse(q));
+      while (segment_holds(s, q) &&
+             q != m->top && q->head != FENCEPOST_HEAD) {
+        sum += chunksize(q);
+        if (is_inuse(q)) {
+          assert(!bin_find(m, q));
+          do_check_inuse_chunk(m, q);
+        }
+        else {
+          assert(q == m->dv || bin_find(m, q));
+          assert(lastq == 0 || is_inuse(lastq)); /* Not 2 consecutive free */
+          do_check_free_chunk(m, q);
+        }
+        lastq = q;
+        q = next_chunk(q);
+      }
+      s = s->next;
+    }
+  }
+  return sum;
+}
+
+/* Check all properties of malloc_state. */
+static void do_check_malloc_state(mstate m) {
+  bindex_t i;
+  size_t total;
+  /* check bins */
+  for (i = 0; i < NSMALLBINS; ++i)
+    do_check_smallbin(m, i);
+  for (i = 0; i < NTREEBINS; ++i)
+    do_check_treebin(m, i);
+
+  if (m->dvsize != 0) { /* check dv chunk */
+    do_check_any_chunk(m, m->dv);
+    assert(m->dvsize == chunksize(m->dv));
+    assert(m->dvsize >= MIN_CHUNK_SIZE);
+    assert(bin_find(m, m->dv) == 0);
+  }
+
+  if (m->top != 0) {   /* check top chunk */
+    do_check_top_chunk(m, m->top);
+    /*assert(m->topsize == chunksize(m->top)); redundant */
+    assert(m->topsize > 0);
+    assert(bin_find(m, m->top) == 0);
+  }
+
+  total = traverse_and_check(m);
+  assert(total <= m->footprint);
+  assert(m->footprint <= m->max_footprint);
+}
+#endif /* DEBUG */
+
+/* ----------------------------- statistics ------------------------------ */
+
+#if !NO_MALLINFO
+static struct mallinfo internal_mallinfo(mstate m) {
+  struct mallinfo nm = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  ensure_initialization();
+  if (!PREACTION(m)) {
+    check_malloc_state(m);
+    if (is_initialized(m)) {
+      size_t nfree = SIZE_T_ONE; /* top always free */
+      size_t mfree = m->topsize + TOP_FOOT_SIZE;
+      size_t sum = mfree;
+      msegmentptr s = &m->seg;
+      while (s != 0) {
+        mchunkptr q = align_as_chunk(s->base);
+        while (segment_holds(s, q) &&
+               q != m->top && q->head != FENCEPOST_HEAD) {
+          size_t sz = chunksize(q);
+          sum += sz;
+          if (!is_inuse(q)) {
+            mfree += sz;
+            ++nfree;
+          }
+          q = next_chunk(q);
+        }
+        s = s->next;
+      }
+
+      nm.arena    = sum;
+      nm.ordblks  = nfree;
+      nm.hblkhd   = m->footprint - sum;
+      nm.usmblks  = m->max_footprint;
+      nm.uordblks = m->footprint - mfree;
+      nm.fordblks = mfree;
+      nm.keepcost = m->topsize;
+    }
+
+    POSTACTION(m);
+  }
+  return nm;
+}
+#endif /* !NO_MALLINFO */
+
+static void internal_malloc_stats(mstate m) {
+  ensure_initialization();
+  if (!PREACTION(m)) {
+    size_t maxfp = 0;
+    size_t fp = 0;
+    size_t used = 0;
+    check_malloc_state(m);
+    if (is_initialized(m)) {
+      msegmentptr s = &m->seg;
+      maxfp = m->max_footprint;
+      fp = m->footprint;
+      used = fp - (m->topsize + TOP_FOOT_SIZE);
+
+      while (s != 0) {
+        mchunkptr q = align_as_chunk(s->base);
+        while (segment_holds(s, q) &&
+               q != m->top && q->head != FENCEPOST_HEAD) {
+          if (!is_inuse(q))
+            used -= chunksize(q);
+          q = next_chunk(q);
+        }
+        s = s->next;
+      }
+    }
+
+    fprintf(stderr, "max system bytes = %10lu\n", (unsigned long)(maxfp));
+    fprintf(stderr, "system bytes     = %10lu\n", (unsigned long)(fp));
+    fprintf(stderr, "in use bytes     = %10lu\n", (unsigned long)(used));
+
+    POSTACTION(m);
+  }
+}
+
+/* ----------------------- Operations on smallbins ----------------------- */
+
+/*
+  Various forms of linking and unlinking are defined as macros.  Even
+  the ones for trees, which are very long but have very short typical
+  paths.  This is ugly but reduces reliance on inlining support of
+  compilers.
+*/
+
+/* Link a free chunk into a smallbin  */
+#define insert_small_chunk(M, P, S) {\
+  bindex_t I  = small_index(S);\
+  mchunkptr B = smallbin_at(M, I);\
+  mchunkptr F = B;\
+  assert(S >= MIN_CHUNK_SIZE);\
+  if (!smallmap_is_marked(M, I))\
+    mark_smallmap(M, I);\
+  else if (RTCHECK(ok_address(M, B->fd)))\
+    F = B->fd;\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+  B->fd = P;\
+  F->bk = P;\
+  P->fd = F;\
+  P->bk = B;\
+}
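+
+/* Pointer trace (annotation): inserting P into an empty bin takes the
+   F = B branch, leaving the circular list B <-> P.  Inserting into a
+   non-empty bin splices P in at the front (B->fd), which is also where
+   unlink_first_small_chunk typically removes from, so smallbins behave
+   in an effectively LIFO manner. */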
+
+/* Unlink a chunk from a smallbin  */
+#define unlink_small_chunk(M, P, S) {\
+  mchunkptr F = P->fd;\
+  mchunkptr B = P->bk;\
+  bindex_t I = small_index(S);\
+  assert(P != B);\
+  assert(P != F);\
+  assert(chunksize(P) == small_index2size(I));\
+  if (F == B)\
+    clear_smallmap(M, I);\
+  else if (RTCHECK((F == smallbin_at(M,I) || ok_address(M, F)) &&\
+                   (B == smallbin_at(M,I) || ok_address(M, B)))) {\
+    F->bk = B;\
+    B->fd = F;\
+  }\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+}
+
+/* Unlink the first chunk from a smallbin */
+#define unlink_first_small_chunk(M, B, P, I) {\
+  mchunkptr F = P->fd;\
+  assert(P != B);\
+  assert(P != F);\
+  assert(chunksize(P) == small_index2size(I));\
+  if (B == F)\
+    clear_smallmap(M, I);\
+  else if (RTCHECK(ok_address(M, F))) {\
+    B->fd = F;\
+    F->bk = B;\
+  }\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+}
+
+
+
+/* Replace dv node, binning the old one */
+/* Used only when dvsize known to be small */
+#define replace_dv(M, P, S) {\
+  size_t DVS = M->dvsize;\
+  if (DVS != 0) {\
+    mchunkptr DV = M->dv;\
+    assert(is_small(DVS));\
+    insert_small_chunk(M, DV, DVS);\
+  }\
+  M->dvsize = S;\
+  M->dv = P;\
+}
+
+/* ------------------------- Operations on trees ------------------------- */
+
+/* Insert chunk into tree */
+#define insert_large_chunk(M, X, S) {\
+  tbinptr* H;\
+  bindex_t I;\
+  compute_tree_index(S, I);\
+  H = treebin_at(M, I);\
+  X->index = I;\
+  X->child[0] = X->child[1] = 0;\
+  if (!treemap_is_marked(M, I)) {\
+    mark_treemap(M, I);\
+    *H = X;\
+    X->parent = (tchunkptr)H;\
+    X->fd = X->bk = X;\
+  }\
+  else {\
+    tchunkptr T = *H;\
+    size_t K = S << leftshift_for_tree_index(I);\
+    for (;;) {\
+      if (chunksize(T) != S) {\
+        tchunkptr* C = &(T->child[(K >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]);\
+        K <<= 1;\
+        if (*C != 0)\
+          T = *C;\
+        else if (RTCHECK(ok_address(M, C))) {\
+          *C = X;\
+          X->parent = T;\
+          X->fd = X->bk = X;\
+          break;\
+        }\
+        else {\
+          CORRUPTION_ERROR_ACTION(M);\
+          break;\
+        }\
+      }\
+      else {\
+        tchunkptr F = T->fd;\
+        if (RTCHECK(ok_address(M, T) && ok_address(M, F))) {\
+          T->fd = F->bk = X;\
+          X->fd = F;\
+          X->bk = T;\
+          X->parent = 0;\
+          break;\
+        }\
+        else {\
+          CORRUPTION_ERROR_ACTION(M);\
+          break;\
+        }\
+      }\
+    }\
+  }\
+}
+
+/*
+  Unlink steps:
+
+  1. If x is a chained node, unlink it from its same-sized fd/bk links
+     and choose its bk node as its replacement.
+  2. If x was the last node of its size, but not a leaf node, it must
+     be replaced with a leaf node (not merely one with an open left or
+     right), to make sure that lefts and rights of descendants
+     correspond properly to bit masks.  We use the rightmost descendant
+     of x.  We could use any other leaf, but this is easy to locate and
+     tends to counteract removal of leftmosts elsewhere, and so keeps
+     paths shorter than minimally guaranteed.  This doesn't loop much
+     because on average a node in a tree is near the bottom.
+  3. If x is the base of a chain (i.e., has parent links) relink
+     x's parent and children to x's replacement (or null if none).
+*/
+
+#define unlink_large_chunk(M, X) {\
+  tchunkptr XP = X->parent;\
+  tchunkptr R;\
+  if (X->bk != X) {\
+    tchunkptr F = X->fd;\
+    R = X->bk;\
+    if (RTCHECK(ok_address(M, F))) {\
+      F->bk = R;\
+      R->fd = F;\
+    }\
+    else {\
+      CORRUPTION_ERROR_ACTION(M);\
+    }\
+  }\
+  else {\
+    tchunkptr* RP;\
+    if (((R = *(RP = &(X->child[1]))) != 0) ||\
+        ((R = *(RP = &(X->child[0]))) != 0)) {\
+      tchunkptr* CP;\
+      while ((*(CP = &(R->child[1])) != 0) ||\
+             (*(CP = &(R->child[0])) != 0)) {\
+        R = *(RP = CP);\
+      }\
+      if (RTCHECK(ok_address(M, RP)))\
+        *RP = 0;\
+      else {\
+        CORRUPTION_ERROR_ACTION(M);\
+      }\
+    }\
+  }\
+  if (XP != 0) {\
+    tbinptr* H = treebin_at(M, X->index);\
+    if (X == *H) {\
+      if ((*H = R) == 0) \
+        clear_treemap(M, X->index);\
+    }\
+    else if (RTCHECK(ok_address(M, XP))) {\
+      if (XP->child[0] == X) \
+        XP->child[0] = R;\
+      else \
+        XP->child[1] = R;\
+    }\
+    else\
+      CORRUPTION_ERROR_ACTION(M);\
+    if (R != 0) {\
+      if (RTCHECK(ok_address(M, R))) {\
+        tchunkptr C0, C1;\
+        R->parent = XP;\
+        if ((C0 = X->child[0]) != 0) {\
+          if (RTCHECK(ok_address(M, C0))) {\
+            R->child[0] = C0;\
+            C0->parent = R;\
+          }\
+          else\
+            CORRUPTION_ERROR_ACTION(M);\
+        }\
+        if ((C1 = X->child[1]) != 0) {\
+          if (RTCHECK(ok_address(M, C1))) {\
+            R->child[1] = C1;\
+            C1->parent = R;\
+          }\
+          else\
+            CORRUPTION_ERROR_ACTION(M);\
+        }\
+      }\
+      else\
+        CORRUPTION_ERROR_ACTION(M);\
+    }\
+  }\
+}
+
+/* Relays to large vs small bin operations */
+
+#define insert_chunk(M, P, S)\
+  if (is_small(S)) insert_small_chunk(M, P, S)\
+  else { tchunkptr TP = (tchunkptr)(P); insert_large_chunk(M, TP, S); }
+
+#define unlink_chunk(M, P, S)\
+  if (is_small(S)) unlink_small_chunk(M, P, S)\
+  else { tchunkptr TP = (tchunkptr)(P); unlink_large_chunk(M, TP); }
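+
+/* Dispatch example (annotation): with the default thresholds, a free
+   chunk of size 192 goes through insert_small_chunk, while one of size
+   640 is cast to tchunkptr and placed in a treebin, since is_small()
+   only holds below 256 bytes. */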
+
+
+/* Relays to internal calls to malloc/free from realloc, memalign etc */
+
+#if ONLY_MSPACES
+#define internal_malloc(m, b) mspace_malloc(m, b)
+#define internal_free(m, mem) mspace_free(m,mem);
+#else /* ONLY_MSPACES */
+#if MSPACES
+#define internal_malloc(m, b)\
+   (m == gm)? dlmalloc(b) : mspace_malloc(m, b)
+#define internal_free(m, mem)\
+   if (m == gm) dlfree(mem); else mspace_free(m,mem);
+#else /* MSPACES */
+#define internal_malloc(m, b) dlmalloc(b)
+#define internal_free(m, mem) dlfree(mem)
+#endif /* MSPACES */
+#endif /* ONLY_MSPACES */
+
+/* -----------------------  Direct-mmapping chunks ----------------------- */
+
+/*
+  Directly mmapped chunks are set up with an offset to the start of
+  the mmapped region stored in the prev_foot field of the chunk. This
+  allows reconstruction of the required argument to MUNMAP when freed,
+  and also allows adjustment of the returned chunk to meet alignment
+  requirements (especially in memalign).
+*/
+
+/* Malloc using mmap */
+static void* mmap_alloc(mstate m, size_t nb) {
+  size_t mmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+  if (mmsize > nb) {     /* Check for wrap around 0 */
+    char* mm = (char*)(CALL_DIRECT_MMAP(mmsize));
+    if (mm != CMFAIL) {
+      size_t offset = align_offset(chunk2mem(mm));
+      size_t psize = mmsize - offset - MMAP_FOOT_PAD;
+      mchunkptr p = (mchunkptr)(mm + offset);
+      p->prev_foot = offset;
+      p->head = psize;
+      mark_inuse_foot(m, p, psize);
+      chunk_plus_offset(p, psize)->head = FENCEPOST_HEAD;
+      chunk_plus_offset(p, psize+SIZE_T_SIZE)->head = 0;
+
+      if (m->least_addr == 0 || mm < m->least_addr)
+        m->least_addr = mm;
+      if ((m->footprint += mmsize) > m->max_footprint)
+        m->max_footprint = m->footprint;
+      assert(is_aligned(chunk2mem(p)));
+      check_mmapped_chunk(m, p);
+      return chunk2mem(p);
+    }
+  }
+  return 0;
+}
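+
+/* Size arithmetic sketch (annotation, assuming 64-bit size_t, 16-byte
+   alignment, and 4 KiB pages): for nb = 100000,
+   mmsize = mmap_align(100000 + 48 + 15) = 102400 (25 pages), so the
+   wraparound test mmsize > nb passes and the chunk carved out of the
+   mapping covers the request plus the fencepost overhead. */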
+
+/* Realloc using mmap */
+static mchunkptr mmap_resize(mstate m, mchunkptr oldp, size_t nb) {
+  size_t oldsize = chunksize(oldp);
+  if (is_small(nb)) /* Can't shrink mmap regions below small size */
+    return 0;
+  /* Keep old chunk if big enough but not too big */
+  if (oldsize >= nb + SIZE_T_SIZE &&
+      (oldsize - nb) <= (mparams.granularity << 1))
+    return oldp;
+  else {
+    size_t offset = oldp->prev_foot;
+    size_t oldmmsize = oldsize + offset + MMAP_FOOT_PAD;
+    size_t newmmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+    char* cp = (char*)CALL_MREMAP((char*)oldp - offset,
+                                  oldmmsize, newmmsize, 1);
+    if (cp != CMFAIL) {
+      mchunkptr newp = (mchunkptr)(cp + offset);
+      size_t psize = newmmsize - offset - MMAP_FOOT_PAD;
+      newp->head = psize;
+      mark_inuse_foot(m, newp, psize);
+      chunk_plus_offset(newp, psize)->head = FENCEPOST_HEAD;
+      chunk_plus_offset(newp, psize+SIZE_T_SIZE)->head = 0;
+
+      if (cp < m->least_addr)
+        m->least_addr = cp;
+      if ((m->footprint += newmmsize - oldmmsize) > m->max_footprint)
+        m->max_footprint = m->footprint;
+      check_mmapped_chunk(m, newp);
+      return newp;
+    }
+  }
+  return 0;
+}
+
+/* -------------------------- mspace management -------------------------- */
+
+/* Initialize top chunk and its size */
+static void init_top(mstate m, mchunkptr p, size_t psize) {
+  /* Ensure alignment */
+  size_t offset = align_offset(chunk2mem(p));
+  p = (mchunkptr)((char*)p + offset);
+  psize -= offset;
+
+  m->top = p;
+  m->topsize = psize;
+  p->head = psize | PINUSE_BIT;
+  /* set size of fake trailing chunk holding overhead space only once */
+  chunk_plus_offset(p, psize)->head = TOP_FOOT_SIZE;
+  m->trim_check = mparams.trim_threshold; /* reset on each update */
+}
+
+/* Initialize bins for a new mstate that is otherwise zeroed out */
+static void init_bins(mstate m) {
+  /* Establish circular links for smallbins */
+  bindex_t i;
+  for (i = 0; i < NSMALLBINS; ++i) {
+    sbinptr bin = smallbin_at(m,i);
+    bin->fd = bin->bk = bin;
+  }
+}
+
+#if PROCEED_ON_ERROR
+
+/* default corruption action */
+static void reset_on_error(mstate m) {
+  int i;
+  ++malloc_corruption_error_count;
+  /* Reinitialize fields to forget about all memory */
+  m->smallbins = m->treebins = 0;
+  m->dvsize = m->topsize = 0;
+  m->seg.base = 0;
+  m->seg.size = 0;
+  m->seg.next = 0;
+  m->top = m->dv = 0;
+  for (i = 0; i < NTREEBINS; ++i)
+    *treebin_at(m, i) = 0;
+  init_bins(m);
+}
+#endif /* PROCEED_ON_ERROR */
+
+/* Allocate chunk and prepend remainder with chunk in successor base. */
+static void* prepend_alloc(mstate m, char* newbase, char* oldbase,
+                           size_t nb) {
+  mchunkptr p = align_as_chunk(newbase);
+  mchunkptr oldfirst = align_as_chunk(oldbase);
+  size_t psize = (char*)oldfirst - (char*)p;
+  mchunkptr q = chunk_plus_offset(p, nb);
+  size_t qsize = psize - nb;
+  set_size_and_pinuse_of_inuse_chunk(m, p, nb);
+
+  assert((char*)oldfirst > (char*)q);
+  assert(pinuse(oldfirst));
+  assert(qsize >= MIN_CHUNK_SIZE);
+
+  /* consolidate remainder with first chunk of old base */
+  if (oldfirst == m->top) {
+    size_t tsize = m->topsize += qsize;
+    m->top = q;
+    q->head = tsize | PINUSE_BIT;
+    check_top_chunk(m, q);
+  }
+  else if (oldfirst == m->dv) {
+    size_t dsize = m->dvsize += qsize;
+    m->dv = q;
+    set_size_and_pinuse_of_free_chunk(q, dsize);
+  }
+  else {
+    if (!is_inuse(oldfirst)) {
+      size_t nsize = chunksize(oldfirst);
+      unlink_chunk(m, oldfirst, nsize);
+      oldfirst = chunk_plus_offset(oldfirst, nsize);
+      qsize += nsize;
+    }
+    set_free_with_pinuse(q, qsize, oldfirst);
+    insert_chunk(m, q, qsize);
+    check_free_chunk(m, q);
+  }
+
+  check_malloced_chunk(m, chunk2mem(p), nb);
+  return chunk2mem(p);
+}
+
+/* Add a segment to hold a new noncontiguous region */
+static void add_segment(mstate m, char* tbase, size_t tsize, flag_t mmapped) {
+  /* Determine locations and sizes of segment, fenceposts, old top */
+  char* old_top = (char*)m->top;
+  msegmentptr oldsp = segment_holding(m, old_top);
+  char* old_end = oldsp->base + oldsp->size;
+  size_t ssize = pad_request(sizeof(struct malloc_segment));
+  char* rawsp = old_end - (ssize + FOUR_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+  size_t offset = align_offset(chunk2mem(rawsp));
+  char* asp = rawsp + offset;
+  char* csp = (asp < (old_top + MIN_CHUNK_SIZE))? old_top : asp;
+  mchunkptr sp = (mchunkptr)csp;
+  msegmentptr ss = (msegmentptr)(chunk2mem(sp));
+  mchunkptr tnext = chunk_plus_offset(sp, ssize);
+  mchunkptr p = tnext;
+  int nfences = 0;
+
+  /* reset top to new space */
+  init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
+
+  /* Set up segment record */
+  assert(is_aligned(ss));
+  set_size_and_pinuse_of_inuse_chunk(m, sp, ssize);
+  *ss = m->seg; /* Push current record */
+  m->seg.base = tbase;
+  m->seg.size = tsize;
+  m->seg.sflags = mmapped;
+  m->seg.next = ss;
+
+  /* Insert trailing fenceposts */
+  for (;;) {
+    mchunkptr nextp = chunk_plus_offset(p, SIZE_T_SIZE);
+    p->head = FENCEPOST_HEAD;
+    ++nfences;
+    if ((char*)(&(nextp->head)) < old_end)
+      p = nextp;
+    else
+      break;
+  }
+  assert(nfences >= 2);
+
+  /* Insert the rest of old top into a bin as an ordinary free chunk */
+  if (csp != old_top) {
+    mchunkptr q = (mchunkptr)old_top;
+    size_t psize = csp - old_top;
+    mchunkptr tn = chunk_plus_offset(q, psize);
+    set_free_with_pinuse(q, psize, tn);
+    insert_chunk(m, q, psize);
+  }
+
+  check_top_chunk(m, m->top);
+}
+
+/* -------------------------- System allocation -------------------------- */
+
+/* Get memory from system using MORECORE or MMAP */
+static void* sys_alloc(mstate m, size_t nb) {
+  char* tbase = CMFAIL;
+  size_t tsize = 0;
+  flag_t mmap_flag = 0;
+
+  ensure_initialization();
+
+  /* Directly map large chunks, but only if already initialized */
+  if (use_mmap(m) && nb >= mparams.mmap_threshold && m->topsize != 0) {
+    void* mem = mmap_alloc(m, nb);
+    if (mem != 0)
+      return mem;
+  }
+
+  /*
+    Try getting memory in any of three ways (in most-preferred to
+    least-preferred order):
+    1. A call to MORECORE that can normally contiguously extend memory.
+       (disabled if not MORECORE_CONTIGUOUS or not HAVE_MORECORE, or if
+       main space is mmapped or a previous contiguous call failed)
+    2. A call to MMAP new space (disabled if not HAVE_MMAP).
+       Note that under the default settings, if MORECORE is unable to
+       fulfill a request, and HAVE_MMAP is true, then mmap is
+       used as a noncontiguous system allocator. This is a useful backup
+       strategy for systems with holes in address spaces -- in this case
+       sbrk cannot contiguously expand the heap, but mmap may be able to
+       find space.
+    3. A call to MORECORE that cannot usually contiguously extend memory.
+       (disabled if not HAVE_MORECORE)
+
+   In all cases, we need to request enough bytes from system to ensure
+   we can malloc nb bytes upon success, so pad with enough space for
+   top_foot, plus alignment-pad to make sure we don't lose bytes if
+   not on boundary, and round this up to a granularity unit.
+  */
+
+  if (MORECORE_CONTIGUOUS && !use_noncontiguous(m)) {
+    char* br = CMFAIL;
+    msegmentptr ss = (m->top == 0)? 0 : segment_holding(m, (char*)m->top);
+    size_t asize = 0;
+    ACQUIRE_MALLOC_GLOBAL_LOCK();
+
+    if (ss == 0) {  /* First time through or recovery */
+      char* base = (char*)CALL_MORECORE(0);
+      if (base != CMFAIL) {
+        asize = granularity_align(nb + SYS_ALLOC_PADDING);
+        /* Adjust to end on a page boundary */
+        if (!is_page_aligned(base))
+          asize += (page_align((size_t)base) - (size_t)base);
+        /* Can't call MORECORE if size is negative when treated as signed */
+        if (asize < HALF_MAX_SIZE_T &&
+            (br = (char*)(CALL_MORECORE(asize))) == base) {
+          tbase = base;
+          tsize = asize;
+        }
+      }
+    }
+    else {
+      /* Subtract out existing available top space from MORECORE request. */
+      asize = granularity_align(nb - m->topsize + SYS_ALLOC_PADDING);
+      /* Use mem here only if it did contiguously extend old space */
+      if (asize < HALF_MAX_SIZE_T &&
+          (br = (char*)(CALL_MORECORE(asize))) == ss->base+ss->size) {
+        tbase = br;
+        tsize = asize;
+      }
+    }
+
+    if (tbase == CMFAIL) {    /* Cope with partial failure */
+      if (br != CMFAIL) {    /* Try to use/extend the space we did get */
+        if (asize < HALF_MAX_SIZE_T &&
+            asize < nb + SYS_ALLOC_PADDING) {
+          size_t esize = granularity_align(nb + SYS_ALLOC_PADDING - asize);
+          if (esize < HALF_MAX_SIZE_T) {
+            char* end = (char*)CALL_MORECORE(esize);
+            if (end != CMFAIL)
+              asize += esize;
+            else {            /* Can't use; try to release */
+              (void) CALL_MORECORE(-asize);
+              br = CMFAIL;
+            }
+          }
+        }
+      }
+      if (br != CMFAIL) {    /* Use the space we did get */
+        tbase = br;
+        tsize = asize;
+      }
+      else
+        disable_contiguous(m); /* Don't try contiguous path in the future */
+    }
+
+    RELEASE_MALLOC_GLOBAL_LOCK();
+  }
+
+  if (HAVE_MMAP && tbase == CMFAIL) {  /* Try MMAP */
+    size_t rsize = granularity_align(nb + SYS_ALLOC_PADDING);
+    if (rsize > nb) { /* Fail if wraps around zero */
+      char* mp = (char*)(CALL_MMAP(rsize));
+      if (mp != CMFAIL) {
+        tbase = mp;
+        tsize = rsize;
+        mmap_flag = USE_MMAP_BIT;
+      }
+    }
+  }
+
+  if (HAVE_MORECORE && tbase == CMFAIL) { /* Try noncontiguous MORECORE */
+    size_t asize = granularity_align(nb + SYS_ALLOC_PADDING);
+    if (asize < HALF_MAX_SIZE_T) {
+      char* br = CMFAIL;
+      char* end = CMFAIL;
+      ACQUIRE_MALLOC_GLOBAL_LOCK();
+      br = (char*)(CALL_MORECORE(asize));
+      end = (char*)(CALL_MORECORE(0));
+      RELEASE_MALLOC_GLOBAL_LOCK();
+      if (br != CMFAIL && end != CMFAIL && br < end) {
+        size_t ssize = end - br;
+        if (ssize > nb + TOP_FOOT_SIZE) {
+          tbase = br;
+          tsize = ssize;
+        }
+      }
+    }
+  }
+
+  if (tbase != CMFAIL) {
+
+    if ((m->footprint += tsize) > m->max_footprint)
+      m->max_footprint = m->footprint;
+
+    if (!is_initialized(m)) { /* first-time initialization */
+      if (m->least_addr == 0 || tbase < m->least_addr)
+        m->least_addr = tbase;
+      m->seg.base = tbase;
+      m->seg.size = tsize;
+      m->seg.sflags = mmap_flag;
+      m->magic = mparams.magic;
+      m->release_checks = MAX_RELEASE_CHECK_RATE;
+      init_bins(m);
+#if !ONLY_MSPACES
+      if (is_global(m))
+        init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
+      else
+#endif
+      {
+        /* Offset top by embedded malloc_state */
+        mchunkptr mn = next_chunk(mem2chunk(m));
+        init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) -TOP_FOOT_SIZE);
+      }
+    }
+
+    else {
+      /* Try to merge with an existing segment */
+      msegmentptr sp = &m->seg;
+      /* Only consider most recent segment if traversal suppressed */
+      while (sp != 0 && tbase != sp->base + sp->size)
+        sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
+      if (sp != 0 &&
+          !is_extern_segment(sp) &&
+          (sp->sflags & USE_MMAP_BIT) == mmap_flag &&
+          segment_holds(sp, m->top)) { /* append */
+        sp->size += tsize;
+        init_top(m, m->top, m->topsize + tsize);
+      }
+      else {
+        if (tbase < m->least_addr)
+          m->least_addr = tbase;
+        sp = &m->seg;
+        while (sp != 0 && sp->base != tbase + tsize)
+          sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
+        if (sp != 0 &&
+            !is_extern_segment(sp) &&
+            (sp->sflags & USE_MMAP_BIT) == mmap_flag) {
+          char* oldbase = sp->base;
+          sp->base = tbase;
+          sp->size += tsize;
+          return prepend_alloc(m, tbase, oldbase, nb);
+        }
+        else
+          add_segment(m, tbase, tsize, mmap_flag);
+      }
+    }
+
+    if (nb < m->topsize) { /* Allocate from new or extended top space */
+      size_t rsize = m->topsize -= nb;
+      mchunkptr p = m->top;
+      mchunkptr r = m->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(m, p, nb);
+      check_top_chunk(m, m->top);
+      check_malloced_chunk(m, chunk2mem(p), nb);
+      return chunk2mem(p);
+    }
+  }
+
+  MALLOC_FAILURE_ACTION;
+  return 0;
+}
+
+/* -----------------------  system deallocation -------------------------- */
+
+/* Unmap and unlink any mmapped segments that don't contain used chunks */
+static size_t release_unused_segments(mstate m) {
+  size_t released = 0;
+  int nsegs = 0;
+  msegmentptr pred = &m->seg;
+  msegmentptr sp = pred->next;
+  while (sp != 0) {
+    char* base = sp->base;
+    size_t size = sp->size;
+    msegmentptr next = sp->next;
+    ++nsegs;
+    if (is_mmapped_segment(sp) && !is_extern_segment(sp)) {
+      mchunkptr p = align_as_chunk(base);
+      size_t psize = chunksize(p);
+      /* Can unmap if first chunk holds entire segment and not pinned */
+      if (!is_inuse(p) && (char*)p + psize >= base + size - TOP_FOOT_SIZE) {
+        tchunkptr tp = (tchunkptr)p;
+        assert(segment_holds(sp, (char*)sp));
+        if (p == m->dv) {
+          m->dv = 0;
+          m->dvsize = 0;
+        }
+        else {
+          unlink_large_chunk(m, tp);
+        }
+        if (CALL_MUNMAP(base, size) == 0) {
+          released += size;
+          m->footprint -= size;
+          /* unlink obsoleted record */
+          sp = pred;
+          sp->next = next;
+        }
+        else { /* back out if cannot unmap */
+          insert_large_chunk(m, tp, psize);
+        }
+      }
+    }
+    if (NO_SEGMENT_TRAVERSAL) /* scan only first segment */
+      break;
+    pred = sp;
+    sp = next;
+  }
+  /* Reset check counter */
+  m->release_checks = ((nsegs > MAX_RELEASE_CHECK_RATE)?
+                       nsegs : MAX_RELEASE_CHECK_RATE);
+  return released;
+}
+
+static int sys_trim(mstate m, size_t pad) {
+  size_t released = 0;
+  ensure_initialization();
+  if (pad < MAX_REQUEST && is_initialized(m)) {
+    pad += TOP_FOOT_SIZE; /* ensure enough room for segment overhead */
+
+    if (m->topsize > pad) {
+      /* Shrink top space in granularity-size units, keeping at least one */
+      size_t unit = mparams.granularity;
+      size_t extra = ((m->topsize - pad + (unit - SIZE_T_ONE)) / unit -
+                      SIZE_T_ONE) * unit;
+      msegmentptr sp = segment_holding(m, (char*)m->top);
+
+      if (!is_extern_segment(sp)) {
+        if (is_mmapped_segment(sp)) {
+          if (HAVE_MMAP &&
+              sp->size >= extra &&
+              !has_segment_link(m, sp)) { /* can't shrink if pinned */
+            size_t newsize = sp->size - extra;
+            /* Prefer mremap, fall back to munmap */
+            if ((CALL_MREMAP(sp->base, sp->size, newsize, 0) != MFAIL) ||
+                (CALL_MUNMAP(sp->base + newsize, extra) == 0)) {
+              released = extra;
+            }
+          }
+        }
+        else if (HAVE_MORECORE) {
+          if (extra >= HALF_MAX_SIZE_T) /* Avoid wrapping negative */
+            extra = (HALF_MAX_SIZE_T) + SIZE_T_ONE - unit;
+          ACQUIRE_MALLOC_GLOBAL_LOCK();
+          {
+            /* Make sure end of memory is where we last set it. */
+            char* old_br = (char*)(CALL_MORECORE(0));
+            if (old_br == sp->base + sp->size) {
+              char* rel_br = (char*)(CALL_MORECORE(-extra));
+              char* new_br = (char*)(CALL_MORECORE(0));
+              if (rel_br != CMFAIL && new_br < old_br)
+                released = old_br - new_br;
+            }
+          }
+          RELEASE_MALLOC_GLOBAL_LOCK();
+        }
+      }
+
+      if (released != 0) {
+        sp->size -= released;
+        m->footprint -= released;
+        init_top(m, m->top, m->topsize - released);
+        check_top_chunk(m, m->top);
+      }
+    }
+
+    /* Unmap any unused mmapped segments */
+    if (HAVE_MMAP)
+      released += release_unused_segments(m);
+
+    /* On failure, disable autotrim to avoid repeated failed future calls */
+    if (released == 0 && m->topsize > m->trim_check)
+      m->trim_check = MAX_SIZE_T;
+  }
+
+  return (released != 0)? 1 : 0;
+}
+
+
+/* ---------------------------- malloc support --------------------------- */
+
+/* allocate a large request from the best fitting chunk in a treebin */
+static void* tmalloc_large(mstate m, size_t nb) {
+  tchunkptr v = 0;
+  size_t rsize = -nb; /* Unsigned negation */
+  tchunkptr t;
+  bindex_t idx;
+  compute_tree_index(nb, idx);
+  if ((t = *treebin_at(m, idx)) != 0) {
+    /* Traverse tree for this bin looking for node with size == nb */
+    size_t sizebits = nb << leftshift_for_tree_index(idx);
+    tchunkptr rst = 0;  /* The deepest untaken right subtree */
+    for (;;) {
+      tchunkptr rt;
+      size_t trem = chunksize(t) - nb;
+      if (trem < rsize) {
+        v = t;
+        if ((rsize = trem) == 0)
+          break;
+      }
+      rt = t->child[1];
+      t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
+      if (rt != 0 && rt != t)
+        rst = rt;
+      if (t == 0) {
+        t = rst; /* set t to least subtree holding sizes > nb */
+        break;
+      }
+      sizebits <<= 1;
+    }
+  }
+  if (t == 0 && v == 0) { /* set t to root of next non-empty treebin */
+    binmap_t leftbits = left_bits(idx2bit(idx)) & m->treemap;
+    if (leftbits != 0) {
+      bindex_t i;
+      binmap_t leastbit = least_bit(leftbits);
+      compute_bit2idx(leastbit, i);
+      t = *treebin_at(m, i);
+    }
+  }
+
+  while (t != 0) { /* find smallest of tree or subtree */
+    size_t trem = chunksize(t) - nb;
+    if (trem < rsize) {
+      rsize = trem;
+      v = t;
+    }
+    t = leftmost_child(t);
+  }
+
+  /*  If dv is a better fit, return 0 so malloc will use it */
+  if (v != 0 && rsize < (size_t)(m->dvsize - nb)) {
+    if (RTCHECK(ok_address(m, v))) { /* split */
+      mchunkptr r = chunk_plus_offset(v, nb);
+      assert(chunksize(v) == rsize + nb);
+      if (RTCHECK(ok_next(v, r))) {
+        unlink_large_chunk(m, v);
+        if (rsize < MIN_CHUNK_SIZE)
+          set_inuse_and_pinuse(m, v, (rsize + nb));
+        else {
+          set_size_and_pinuse_of_inuse_chunk(m, v, nb);
+          set_size_and_pinuse_of_free_chunk(r, rsize);
+          insert_chunk(m, r, rsize);
+        }
+        return chunk2mem(v);
+      }
+    }
+    CORRUPTION_ERROR_ACTION(m);
+  }
+  return 0;
+}
+
+/* allocate a small request from the best fitting chunk in a treebin */
+static void* tmalloc_small(mstate m, size_t nb) {
+  tchunkptr t, v;
+  size_t rsize;
+  bindex_t i;
+  binmap_t leastbit = least_bit(m->treemap);
+  compute_bit2idx(leastbit, i);
+  v = t = *treebin_at(m, i);
+  rsize = chunksize(t) - nb;
+
+  while ((t = leftmost_child(t)) != 0) {
+    size_t trem = chunksize(t) - nb;
+    if (trem < rsize) {
+      rsize = trem;
+      v = t;
+    }
+  }
+
+  if (RTCHECK(ok_address(m, v))) {
+    mchunkptr r = chunk_plus_offset(v, nb);
+    assert(chunksize(v) == rsize + nb);
+    if (RTCHECK(ok_next(v, r))) {
+      unlink_large_chunk(m, v);
+      if (rsize < MIN_CHUNK_SIZE)
+        set_inuse_and_pinuse(m, v, (rsize + nb));
+      else {
+        set_size_and_pinuse_of_inuse_chunk(m, v, nb);
+        set_size_and_pinuse_of_free_chunk(r, rsize);
+        replace_dv(m, r, rsize);
+      }
+      return chunk2mem(v);
+    }
+  }
+
+  CORRUPTION_ERROR_ACTION(m);
+  return 0;
+}
+
+/* --------------------------- realloc support --------------------------- */
+
+static void* internal_realloc(mstate m, void* oldmem, size_t bytes) {
+  if (bytes >= MAX_REQUEST) {
+    MALLOC_FAILURE_ACTION;
+    return 0;
+  }
+  if (!PREACTION(m)) {
+    mchunkptr oldp = mem2chunk(oldmem);
+    size_t oldsize = chunksize(oldp);
+    mchunkptr next = chunk_plus_offset(oldp, oldsize);
+    mchunkptr newp = 0;
+    void* extra = 0;
+
+    /* Try to either shrink or extend into top. Else malloc-copy-free */
+
+    if (RTCHECK(ok_address(m, oldp) && ok_inuse(oldp) &&
+                ok_next(oldp, next) && ok_pinuse(next))) {
+      size_t nb = request2size(bytes);
+      if (is_mmapped(oldp))
+        newp = mmap_resize(m, oldp, nb);
+      else if (oldsize >= nb) { /* already big enough */
+        size_t rsize = oldsize - nb;
+        newp = oldp;
+        if (rsize >= MIN_CHUNK_SIZE) {
+          mchunkptr remainder = chunk_plus_offset(newp, nb);
+          set_inuse(m, newp, nb);
+          set_inuse_and_pinuse(m, remainder, rsize);
+          extra = chunk2mem(remainder);
+        }
+      }
+      else if (next == m->top && oldsize + m->topsize > nb) {
+        /* Expand into top */
+        size_t newsize = oldsize + m->topsize;
+        size_t newtopsize = newsize - nb;
+        mchunkptr newtop = chunk_plus_offset(oldp, nb);
+        set_inuse(m, oldp, nb);
+        newtop->head = newtopsize |PINUSE_BIT;
+        m->top = newtop;
+        m->topsize = newtopsize;
+        newp = oldp;
+      }
+    }
+    else {
+      USAGE_ERROR_ACTION(m, oldmem);
+      POSTACTION(m);
+      return 0;
+    }
+#if DEBUG
+    if (newp != 0) {
+      check_inuse_chunk(m, newp); /* Check requires lock */
+    }
+#endif
+
+    POSTACTION(m);
+
+    if (newp != 0) {
+      if (extra != 0) {
+        internal_free(m, extra);
+      }
+      return chunk2mem(newp);
+    }
+    else {
+      void* newmem = internal_malloc(m, bytes);
+      if (newmem != 0) {
+        size_t oc = oldsize - overhead_for(oldp);
+        memcpy(newmem, oldmem, (oc < bytes)? oc : bytes);
+        internal_free(m, oldmem);
+      }
+      return newmem;
+    }
+  }
+  return 0;
+}
+
+/* --------------------------- memalign support -------------------------- */
+
+static void* internal_memalign(mstate m, size_t alignment, size_t bytes) {
+  if (alignment <= MALLOC_ALIGNMENT)    /* Can just use malloc */
+    return internal_malloc(m, bytes);
+  if (alignment <  MIN_CHUNK_SIZE) /* must be at least a minimum chunk size */
+    alignment = MIN_CHUNK_SIZE;
+  if ((alignment & (alignment-SIZE_T_ONE)) != 0) {/* Ensure a power of 2 */
+    size_t a = MALLOC_ALIGNMENT << 1;
+    while (a < alignment) a <<= 1;
+    alignment = a;
+  }
+
+  if (bytes >= MAX_REQUEST - alignment) {
+    if (m != 0)  { /* Test isn't needed but avoids compiler warning */
+      MALLOC_FAILURE_ACTION;
+    }
+  }
+  else {
+    size_t nb = request2size(bytes);
+    size_t req = nb + alignment + MIN_CHUNK_SIZE - CHUNK_OVERHEAD;
+    char* mem = (char*)internal_malloc(m, req);
+    if (mem != 0) {
+      void* leader = 0;
+      void* trailer = 0;
+      mchunkptr p = mem2chunk(mem);
+
+      if (PREACTION(m)) return 0;
+      if ((((size_t)(mem)) % alignment) != 0) { /* misaligned */
+        /*
+          Find an aligned spot inside chunk.  Since we need to give
+          back leading space in a chunk of at least MIN_CHUNK_SIZE, if
+          the first calculation places us at a spot with less than
+          MIN_CHUNK_SIZE leader, we can move to the next aligned spot.
+          We've allocated enough total room so that this is always
+          possible.
+        */
+        char* br = (char*)mem2chunk((size_t)(((size_t)(mem +
+                                                       alignment -
+                                                       SIZE_T_ONE)) &
+                                             -alignment));
+        char* pos = ((size_t)(br - (char*)(p)) >= MIN_CHUNK_SIZE)?
+          br : br+alignment;
+        mchunkptr newp = (mchunkptr)pos;
+        size_t leadsize = pos - (char*)(p);
+        size_t newsize = chunksize(p) - leadsize;
+
+        if (is_mmapped(p)) { /* For mmapped chunks, just adjust offset */
+          newp->prev_foot = p->prev_foot + leadsize;
+          newp->head = newsize;
+        }
+        else { /* Otherwise, give back leader, use the rest */
+          set_inuse(m, newp, newsize);
+          set_inuse(m, p, leadsize);
+          leader = chunk2mem(p);
+        }
+        p = newp;
+      }
+
+      /* Give back spare room at the end */
+      if (!is_mmapped(p)) {
+        size_t size = chunksize(p);
+        if (size > nb + MIN_CHUNK_SIZE) {
+          size_t remainder_size = size - nb;
+          mchunkptr remainder = chunk_plus_offset(p, nb);
+          set_inuse(m, p, nb);
+          set_inuse(m, remainder, remainder_size);
+          trailer = chunk2mem(remainder);
+        }
+      }
+
+      assert (chunksize(p) >= nb);
+      assert((((size_t)(chunk2mem(p))) % alignment) == 0);
+      check_inuse_chunk(m, p);
+      POSTACTION(m);
+      if (leader != 0) {
+        internal_free(m, leader);
+      }
+      if (trailer != 0) {
+        internal_free(m, trailer);
+      }
+      return chunk2mem(p);
+    }
+  }
+  return 0;
+}
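+
+/* Usage sketch (annotation, assuming the standard dlmemalign entry
+   point that forwards here): dlmemalign(64, 100) over-allocates, picks
+   an aligned spot inside the chunk, and frees the "leader" and
+   "trailer" slop back to the allocator, so the returned pointer
+   satisfies ((size_t)p % 64) == 0 while remaining an ordinary chunk
+   that dlfree() can handle. */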
+
+/* ------------------------ comalloc/coalloc support --------------------- */
+
+static void** ialloc(mstate m,
+                     size_t n_elements,
+                     size_t* sizes,
+                     int opts,
+                     void* chunks[]) {
+  /*
+    This provides common support for independent_X routines, handling
+    all of the combinations that can result.
+
+    The opts arg has:
+    bit 0 set if all elements are same size (using sizes[0])
+    bit 1 set if elements should be zeroed
+  */
+
+  size_t    element_size;   /* chunksize of each element, if all same */
+  size_t    contents_size;  /* total size of elements */
+  size_t    array_size;     /* request size of pointer array */
+  void*     mem;            /* malloced aggregate space */
+  mchunkptr p;              /* corresponding chunk */
+  size_t    remainder_size; /* remaining bytes while splitting */
+  void**    marray;         /* either "chunks" or malloced ptr array */
+  mchunkptr array_chunk;    /* chunk for malloced ptr array */
+  flag_t    was_enabled;    /* to disable mmap */
+  size_t    size;
+  size_t    i;
+
+  ensure_initialization();
+  /* compute array length, if needed */
+  if (chunks != 0) {
+    if (n_elements == 0)
+      return chunks; /* nothing to do */
+    marray = chunks;
+    array_size = 0;
+  }
+  else {
+    /* if empty req, must still return chunk representing empty array */
+    if (n_elements == 0)
+      return (void**)internal_malloc(m, 0);
+    marray = 0;
+    array_size = request2size(n_elements * (sizeof(void*)));
+  }
+
+  /* compute total element size */
+  if (opts & 0x1) { /* all-same-size */
+    element_size = request2size(*sizes);
+    contents_size = n_elements * element_size;
+  }
+  else { /* add up all the sizes */
+    element_size = 0;
+    contents_size = 0;
+    for (i = 0; i != n_elements; ++i)
+      contents_size += request2size(sizes[i]);
+  }
+
+  size = contents_size + array_size;
+
+  /*
+     Allocate the aggregate chunk.  First disable direct-mmapping so
+     malloc won't use it, since we would not be able to later
+     free/realloc space internal to a segregated mmap region.
+  */
+  was_enabled = use_mmap(m);
+  disable_mmap(m);
+  mem = internal_malloc(m, size - CHUNK_OVERHEAD);
+  if (was_enabled)
+    enable_mmap(m);
+  if (mem == 0)
+    return 0;
+
+  if (PREACTION(m)) return 0;
+  p = mem2chunk(mem);
+  remainder_size = chunksize(p);
+
+  assert(!is_mmapped(p));
+
+  if (opts & 0x2) {       /* optionally clear the elements */
+    memset((size_t*)mem, 0, remainder_size - SIZE_T_SIZE - array_size);
+  }
+
+  /* If not provided, allocate the pointer array as final part of chunk */
+  if (marray == 0) {
+    size_t  array_chunk_size;
+    array_chunk = chunk_plus_offset(p, contents_size);
+    array_chunk_size = remainder_size - contents_size;
+    marray = (void**) (chunk2mem(array_chunk));
+    set_size_and_pinuse_of_inuse_chunk(m, array_chunk, array_chunk_size);
+    remainder_size = contents_size;
+  }
+
+  /* split out elements */
+  for (i = 0; ; ++i) {
+    marray[i] = chunk2mem(p);
+    if (i != n_elements-1) {
+      if (element_size != 0)
+        size = element_size;
+      else
+        size = request2size(sizes[i]);
+      remainder_size -= size;
+      set_size_and_pinuse_of_inuse_chunk(m, p, size);
+      p = chunk_plus_offset(p, size);
+    }
+    else { /* the final element absorbs any overallocation slop */
+      set_size_and_pinuse_of_inuse_chunk(m, p, remainder_size);
+      break;
+    }
+  }
+
+#if DEBUG
+  if (marray != chunks) {
+    /* final element must have exactly exhausted chunk */
+    if (element_size != 0) {
+      assert(remainder_size == element_size);
+    }
+    else {
+      assert(remainder_size == request2size(sizes[i]));
+    }
+    check_inuse_chunk(m, mem2chunk(marray));
+  }
+  for (i = 0; i != n_elements; ++i)
+    check_inuse_chunk(m, mem2chunk(marray[i]));
+
+#endif /* DEBUG */
+
+  POSTACTION(m);
+  return marray;
+}
+
+
+/* -------------------------- public routines ---------------------------- */
+
+#if !ONLY_MSPACES
+
+void* dlmalloc(size_t bytes) {
+  /*
+     Basic algorithm:
+     If a small request (< 256 bytes minus per-chunk overhead):
+       1. If one exists, use a remainderless chunk in associated smallbin.
+          (Remainderless means that there are too few excess bytes to
+          represent as a chunk.)
+       2. If it is big enough, use the dv chunk, which is normally the
+          chunk adjacent to the one used for the most recent small request.
+       3. If one exists, split the smallest available chunk in a bin,
+          saving remainder in dv.
+       4. If it is big enough, use the top chunk.
+       5. If available, get memory from system and use it.
+     Otherwise, for a large request:
+       1. Find the smallest available binned chunk that fits, and use it
+          if it is better fitting than dv chunk, splitting if necessary.
+       2. If better fitting than any binned chunk, use the dv chunk.
+       3. If it is big enough, use the top chunk.
+       4. If request size >= mmap threshold, try to directly mmap this chunk.
+       5. If available, get memory from system and use it.
+
+     The ugly goto's here ensure that postaction occurs along all paths.
+  */
+
+#if USE_LOCKS
+  ensure_initialization(); /* initialize in sys_alloc if not using locks */
+#endif
+
+  if (!PREACTION(gm)) {
+    void* mem;
+    size_t nb;
+    if (bytes <= MAX_SMALL_REQUEST) {
+      bindex_t idx;
+      binmap_t smallbits;
+      nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
+      idx = small_index(nb);
+      smallbits = gm->smallmap >> idx;
+
+      if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
+        mchunkptr b, p;
+        idx += ~smallbits & 1;       /* Uses next bin if idx empty */
+        b = smallbin_at(gm, idx);
+        p = b->fd;
+        assert(chunksize(p) == small_index2size(idx));
+        unlink_first_small_chunk(gm, b, p, idx);
+        set_inuse_and_pinuse(gm, p, small_index2size(idx));
+        mem = chunk2mem(p);
+        check_malloced_chunk(gm, mem, nb);
+        goto postaction;
+      }
+
+      else if (nb > gm->dvsize) {
+        if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
+          mchunkptr b, p, r;
+          size_t rsize;
+          bindex_t i;
+          binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
+          binmap_t leastbit = least_bit(leftbits);
+          compute_bit2idx(leastbit, i);
+          b = smallbin_at(gm, i);
+          p = b->fd;
+          assert(chunksize(p) == small_index2size(i));
+          unlink_first_small_chunk(gm, b, p, i);
+          rsize = small_index2size(i) - nb;
+          /* Fit here cannot be remainderless if 4-byte sizes */
+          if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
+            set_inuse_and_pinuse(gm, p, small_index2size(i));
+          else {
+            set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
+            r = chunk_plus_offset(p, nb);
+            set_size_and_pinuse_of_free_chunk(r, rsize);
+            replace_dv(gm, r, rsize);
+          }
+          mem = chunk2mem(p);
+          check_malloced_chunk(gm, mem, nb);
+          goto postaction;
+        }
+
+        else if (gm->treemap != 0 && (mem = tmalloc_small(gm, nb)) != 0) {
+          check_malloced_chunk(gm, mem, nb);
+          goto postaction;
+        }
+      }
+    }
+    else if (bytes >= MAX_REQUEST)
+      nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
+    else {
+      nb = pad_request(bytes);
+      if (gm->treemap != 0 && (mem = tmalloc_large(gm, nb)) != 0) {
+        check_malloced_chunk(gm, mem, nb);
+        goto postaction;
+      }
+    }
+
+    if (nb <= gm->dvsize) {
+      size_t rsize = gm->dvsize - nb;
+      mchunkptr p = gm->dv;
+      if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
+        mchunkptr r = gm->dv = chunk_plus_offset(p, nb);
+        gm->dvsize = rsize;
+        set_size_and_pinuse_of_free_chunk(r, rsize);
+        set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
+      }
+      else { /* exhaust dv */
+        size_t dvs = gm->dvsize;
+        gm->dvsize = 0;
+        gm->dv = 0;
+        set_inuse_and_pinuse(gm, p, dvs);
+      }
+      mem = chunk2mem(p);
+      check_malloced_chunk(gm, mem, nb);
+      goto postaction;
+    }
+
+    else if (nb < gm->topsize) { /* Split top */
+      size_t rsize = gm->topsize -= nb;
+      mchunkptr p = gm->top;
+      mchunkptr r = gm->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
+      mem = chunk2mem(p);
+      check_top_chunk(gm, gm->top);
+      check_malloced_chunk(gm, mem, nb);
+      goto postaction;
+    }
+
+    mem = sys_alloc(gm, nb);
+
+  postaction:
+    POSTACTION(gm);
+    return mem;
+  }
+
+  return 0;
+}
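+
+/*
+  Editorial note: the "dv" referenced throughout is dlmalloc's
+  designated victim -- the preferred chunk for servicing small requests
+  that have no exact fit, normally the remainder split off by the most
+  recent small request.  Reusing it keeps consecutive small allocations
+  adjacent in memory, which improves locality.
+*/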
+
+void dlfree(void* mem) {
+  /*
+     Consolidate freed chunks with preceding or succeeding bordering
+     free chunks, if they exist, and then place in a bin.  Intermixed
+     with special cases for top, dv, mmapped chunks, and usage errors.
+  */
+
+  if (mem != 0) {
+    mchunkptr p  = mem2chunk(mem);
+#if FOOTERS
+    mstate fm = get_mstate_for(p);
+    if (!ok_magic(fm)) {
+      USAGE_ERROR_ACTION(fm, p);
+      return;
+    }
+#else /* FOOTERS */
+#define fm gm
+#endif /* FOOTERS */
+    if (!PREACTION(fm)) {
+      check_inuse_chunk(fm, p);
+      if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {
+        size_t psize = chunksize(p);
+        mchunkptr next = chunk_plus_offset(p, psize);
+        if (!pinuse(p)) {
+          size_t prevsize = p->prev_foot;
+          if (is_mmapped(p)) {
+            psize += prevsize + MMAP_FOOT_PAD;
+            if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
+              fm->footprint -= psize;
+            goto postaction;
+          }
+          else {
+            mchunkptr prev = chunk_minus_offset(p, prevsize);
+            psize += prevsize;
+            p = prev;
+            if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
+              if (p != fm->dv) {
+                unlink_chunk(fm, p, prevsize);
+              }
+              else if ((next->head & INUSE_BITS) == INUSE_BITS) {
+                fm->dvsize = psize;
+                set_free_with_pinuse(p, psize, next);
+                goto postaction;
+              }
+            }
+            else
+              goto erroraction;
+          }
+        }
+
+        if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
+          if (!cinuse(next)) {  /* consolidate forward */
+            if (next == fm->top) {
+              size_t tsize = fm->topsize += psize;
+              fm->top = p;
+              p->head = tsize | PINUSE_BIT;
+              if (p == fm->dv) {
+                fm->dv = 0;
+                fm->dvsize = 0;
+              }
+              if (should_trim(fm, tsize))
+                sys_trim(fm, 0);
+              goto postaction;
+            }
+            else if (next == fm->dv) {
+              size_t dsize = fm->dvsize += psize;
+              fm->dv = p;
+              set_size_and_pinuse_of_free_chunk(p, dsize);
+              goto postaction;
+            }
+            else {
+              size_t nsize = chunksize(next);
+              psize += nsize;
+              unlink_chunk(fm, next, nsize);
+              set_size_and_pinuse_of_free_chunk(p, psize);
+              if (p == fm->dv) {
+                fm->dvsize = psize;
+                goto postaction;
+              }
+            }
+          }
+          else
+            set_free_with_pinuse(p, psize, next);
+
+          if (is_small(psize)) {
+            insert_small_chunk(fm, p, psize);
+            check_free_chunk(fm, p);
+          }
+          else {
+            tchunkptr tp = (tchunkptr)p;
+            insert_large_chunk(fm, tp, psize);
+            check_free_chunk(fm, p);
+            if (--fm->release_checks == 0)
+              release_unused_segments(fm);
+          }
+          goto postaction;
+        }
+      }
+    erroraction:
+      USAGE_ERROR_ACTION(fm, p);
+    postaction:
+      POSTACTION(fm);
+    }
+  }
+#if !FOOTERS
+#undef fm
+#endif /* FOOTERS */
+}
+
+void* dlcalloc(size_t n_elements, size_t elem_size) {
+  void* mem;
+  size_t req = 0;
+  if (n_elements != 0) {
+    req = n_elements * elem_size;
+    if (((n_elements | elem_size) & ~(size_t)0xffff) &&
+        (req / n_elements != elem_size))
+      req = MAX_SIZE_T; /* force downstream failure on overflow */
+  }
+  mem = dlmalloc(req);
+  if (mem != 0 && calloc_must_clear(mem2chunk(mem)))
+    memset(mem, 0, req);
+  return mem;
+}
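+
+/*
+  Editorial note on the overflow test above: when both n_elements and
+  elem_size fit in 16 bits, their product fits in 32 bits and so cannot
+  overflow the 32- or 64-bit size_t configurations this code targets.
+  The mask test detects exactly that case, skipping the division on the
+  common small-argument path.
+*/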
+
+void* dlrealloc(void* oldmem, size_t bytes) {
+  if (oldmem == 0)
+    return dlmalloc(bytes);
+#ifdef REALLOC_ZERO_BYTES_FREES
+  if (bytes == 0) {
+    dlfree(oldmem);
+    return 0;
+  }
+#endif /* REALLOC_ZERO_BYTES_FREES */
+  else {
+#if ! FOOTERS
+    mstate m = gm;
+#else /* FOOTERS */
+    mstate m = get_mstate_for(mem2chunk(oldmem));
+    if (!ok_magic(m)) {
+      USAGE_ERROR_ACTION(m, oldmem);
+      return 0;
+    }
+#endif /* FOOTERS */
+    return internal_realloc(m, oldmem, bytes);
+  }
+}
+
+void* dlmemalign(size_t alignment, size_t bytes) {
+  return internal_memalign(gm, alignment, bytes);
+}
+
+void** dlindependent_calloc(size_t n_elements, size_t elem_size,
+                                 void* chunks[]) {
+  size_t sz = elem_size; /* serves as 1-element array */
+  return ialloc(gm, n_elements, &sz, 3, chunks);
+}
+
+void** dlindependent_comalloc(size_t n_elements, size_t sizes[],
+                                   void* chunks[]) {
+  return ialloc(gm, n_elements, sizes, 0, chunks);
+}
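+
+/*
+  A minimal usage sketch (illustrative only -- the struct names are
+  hypothetical, not part of this file): carving three independently
+  freeable objects out of one contiguous allocation with
+  dlindependent_comalloc.
+
+    size_t sizes[3] = { sizeof(struct header),
+                        sizeof(struct payload),
+                        sizeof(struct footer) };
+    void*  parts[3];
+    if (dlindependent_comalloc(3, sizes, parts) != 0) {
+      struct header*  h = (struct header*)  parts[0];
+      struct payload* b = (struct payload*) parts[1];
+      struct footer*  f = (struct footer*)  parts[2];
+      // ... use h, b, f; each may later be passed to dlfree() ...
+    }
+*/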
+
+void* dlvalloc(size_t bytes) {
+  size_t pagesz;
+  ensure_initialization();
+  pagesz = mparams.page_size;
+  return dlmemalign(pagesz, bytes);
+}
+
+void* dlpvalloc(size_t bytes) {
+  size_t pagesz;
+  ensure_initialization();
+  pagesz = mparams.page_size;
+  return dlmemalign(pagesz, (bytes + pagesz - SIZE_T_ONE) & ~(pagesz - SIZE_T_ONE));
+}
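+
+/* Editorial note: both page-granular variants above derive their
+   alignment from mparams.page_size.  dlvalloc returns page-aligned
+   memory of the requested size, while dlpvalloc also rounds the size
+   itself up to a whole page -- e.g. with 4096-byte pages a 1-byte
+   request yields 4096 usable bytes and a 4097-byte request yields
+   8192. */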
+
+int dlmalloc_trim(size_t pad) {
+  int result = 0;
+  ensure_initialization();
+  if (!PREACTION(gm)) {
+    result = sys_trim(gm, pad);
+    POSTACTION(gm);
+  }
+  return result;
+}
+
+size_t dlmalloc_footprint(void) {
+  return gm->footprint;
+}
+
+size_t dlmalloc_max_footprint(void) {
+  return gm->max_footprint;
+}
+
+#if !NO_MALLINFO
+struct mallinfo dlmallinfo(void) {
+  return internal_mallinfo(gm);
+}
+#endif /* NO_MALLINFO */
+
+void dlmalloc_stats() {
+  internal_malloc_stats(gm);
+}
+
+int dlmallopt(int param_number, int value) {
+  return change_mparam(param_number, value);
+}
+
+#endif /* !ONLY_MSPACES */
+
+size_t dlmalloc_usable_size(void* mem) {
+  if (mem != 0) {
+    mchunkptr p = mem2chunk(mem);
+    if (is_inuse(p))
+      return chunksize(p) - overhead_for(p);
+  }
+  return 0;
+}
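+
+/* Editorial note: the value reported above is derived from the chunk
+   size, so it may exceed the byte count originally requested (requests
+   are padded up to chunk granularity).  Callers should treat it as an
+   upper bound on usable space, not as a record of the request size. */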
+
+/* ----------------------------- user mspaces ---------------------------- */
+
+#if MSPACES
+
+static mstate init_user_mstate(char* tbase, size_t tsize) {
+  size_t msize = pad_request(sizeof(struct malloc_state));
+  mchunkptr mn;
+  mchunkptr msp = align_as_chunk(tbase);
+  mstate m = (mstate)(chunk2mem(msp));
+  memset(m, 0, msize);
+  INITIAL_LOCK(&m->mutex);
+  msp->head = (msize|INUSE_BITS);
+  m->seg.base = m->least_addr = tbase;
+  m->seg.size = m->footprint = m->max_footprint = tsize;
+  m->magic = mparams.magic;
+  m->release_checks = MAX_RELEASE_CHECK_RATE;
+  m->mflags = mparams.default_mflags;
+  m->extp = 0;
+  m->exts = 0;
+  disable_contiguous(m);
+  init_bins(m);
+  mn = next_chunk(mem2chunk(m));
+  init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) - TOP_FOOT_SIZE);
+  check_top_chunk(m, m->top);
+  return m;
+}
+
+mspace create_mspace(size_t capacity, int locked) {
+  mstate m = 0;
+  size_t msize;
+  ensure_initialization();
+  msize = pad_request(sizeof(struct malloc_state));
+  if (capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
+    size_t rs = ((capacity == 0)? mparams.granularity :
+                 (capacity + TOP_FOOT_SIZE + msize));
+    size_t tsize = granularity_align(rs);
+    char* tbase = (char*)(CALL_MMAP(tsize));
+    if (tbase != CMFAIL) {
+      m = init_user_mstate(tbase, tsize);
+      m->seg.sflags = USE_MMAP_BIT;
+      set_lock(m, locked);
+    }
+  }
+  return (mspace)m;
+}
+
+mspace create_mspace_with_base(void* base, size_t capacity, int locked) {
+  mstate m = 0;
+  size_t msize;
+  ensure_initialization();
+  msize = pad_request(sizeof(struct malloc_state));
+  if (capacity > msize + TOP_FOOT_SIZE &&
+      capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
+    m = init_user_mstate((char*)base, capacity);
+    m->seg.sflags = EXTERN_BIT;
+    set_lock(m, locked);
+  }
+  return (mspace)m;
+}
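+
+/*
+  A minimal usage sketch (illustrative only): a private heap created,
+  used, and torn down through the mspace API.
+
+    mspace arena = create_mspace(0, 0);  // default capacity, no locking
+    if (arena != 0) {
+      void* p = mspace_malloc(arena, 128);
+      mspace_free(arena, p);
+      destroy_mspace(arena);             // releases the arena's segments
+    }
+*/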
+
+int mspace_track_large_chunks(mspace msp, int enable) {
+  int ret = 0;
+  mstate ms = (mstate)msp;
+  if (!PREACTION(ms)) {
+    if (!use_mmap(ms))
+      ret = 1;
+    if (!enable)
+      enable_mmap(ms);
+    else
+      disable_mmap(ms);
+    POSTACTION(ms);
+  }
+  return ret;
+}
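+
+/* Editorial note: "tracking" large chunks means routing big requests
+   through ordinary segments rather than direct mmap, which is why a
+   nonzero enable argument disables mmap above.  The return value
+   reports whether tracking was already in effect before the call. */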
+
+size_t destroy_mspace(mspace msp) {
+  size_t freed = 0;
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms)) {
+    msegmentptr sp = &ms->seg;
+    while (sp != 0) {
+      char* base = sp->base;
+      size_t size = sp->size;
+      flag_t flag = sp->sflags;
+      sp = sp->next;
+      if ((flag & USE_MMAP_BIT) && !(flag & EXTERN_BIT) &&
+          CALL_MUNMAP(base, size) == 0)
+        freed += size;
+    }
+  }
+  else {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+  return freed;
+}
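+
+/* Editorial note: the segment walk above deliberately skips EXTERN_BIT
+   segments, so base memory supplied through create_mspace_with_base is
+   left for the caller to reclaim; only segments the mspace mmapped
+   itself are unmapped and counted in the returned total. */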
+
+/*
+  mspace versions of routines are near-clones of the global
+  versions. This is not so nice but better than the alternatives.
+*/
+
+
+void* mspace_malloc(mspace msp, size_t bytes) {
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  if (!PREACTION(ms)) {
+    void* mem;
+    size_t nb;
+    if (bytes <= MAX_SMALL_REQUEST) {
+      bindex_t idx;
+      binmap_t smallbits;
+      nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
+      idx = small_index(nb);
+      smallbits = ms->smallmap >> idx;
+
+      if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
+        mchunkptr b, p;
+        idx += ~smallbits & 1;       /* Uses next bin if idx empty */
+        b = smallbin_at(ms, idx);
+        p = b->fd;
+        assert(chunksize(p) == small_index2size(idx));
+        unlink_first_small_chunk(ms, b, p, idx);
+        set_inuse_and_pinuse(ms, p, small_index2size(idx));
+        mem = chunk2mem(p);
+        check_malloced_chunk(ms, mem, nb);
+        goto postaction;
+      }
+
+      else if (nb > ms->dvsize) {
+        if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
+          mchunkptr b, p, r;
+          size_t rsize;
+          bindex_t i;
+          binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
+          binmap_t leastbit = least_bit(leftbits);
+          compute_bit2idx(leastbit, i);
+          b = smallbin_at(ms, i);
+          p = b->fd;
+          assert(chunksize(p) == small_index2size(i));
+          unlink_first_small_chunk(ms, b, p, i);
+          rsize = small_index2size(i) - nb;
+          /* Fit here cannot be remainderless if 4-byte sizes */
+          if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
+            set_inuse_and_pinuse(ms, p, small_index2size(i));
+          else {
+            set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+            r = chunk_plus_offset(p, nb);
+            set_size_and_pinuse_of_free_chunk(r, rsize);
+            replace_dv(ms, r, rsize);
+          }
+          mem = chunk2mem(p);
+          check_malloced_chunk(ms, mem, nb);
+          goto postaction;
+        }
+
+        else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) {
+          check_malloced_chunk(ms, mem, nb);
+          goto postaction;
+        }
+      }
+    }
+    else if (bytes >= MAX_REQUEST)
+      nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
+    else {
+      nb = pad_request(bytes);
+      if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) {
+        check_malloced_chunk(ms, mem, nb);
+        goto postaction;
+      }
+    }
+
+    if (nb <= ms->dvsize) {
+      size_t rsize = ms->dvsize - nb;
+      mchunkptr p = ms->dv;
+      if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
+        mchunkptr r = ms->dv = chunk_plus_offset(p, nb);
+        ms->dvsize = rsize;
+        set_size_and_pinuse_of_free_chunk(r, rsize);
+        set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+      }
+      else { /* exhaust dv */
+        size_t dvs = ms->dvsize;
+        ms->dvsize = 0;
+        ms->dv = 0;
+        set_inuse_and_pinuse(ms, p, dvs);
+      }
+      mem = chunk2mem(p);
+      check_malloced_chunk(ms, mem, nb);
+      goto postaction;
+    }
+
+    else if (nb < ms->topsize) { /* Split top */
+      size_t rsize = ms->topsize -= nb;
+      mchunkptr p = ms->top;
+      mchunkptr r = ms->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+      mem = chunk2mem(p);
+      check_top_chunk(ms, ms->top);
+      check_malloced_chunk(ms, mem, nb);
+      goto postaction;
+    }
+
+    mem = sys_alloc(ms, nb);
+
+  postaction:
+    POSTACTION(ms);
+    return mem;
+  }
+
+  return 0;
+}
+
+void mspace_free(mspace msp, void* mem) {
+  if (mem != 0) {
+    mchunkptr p  = mem2chunk(mem);
+#if FOOTERS
+    mstate fm = get_mstate_for(p);
+    msp = msp; /* placate people compiling -Wunused */
+#else /* FOOTERS */
+    mstate fm = (mstate)msp;
+#endif /* FOOTERS */
+    if (!ok_magic(fm)) {
+      USAGE_ERROR_ACTION(fm, p);
+      return;
+    }
+    if (!PREACTION(fm)) {
+      check_inuse_chunk(fm, p);
+      if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {
+        size_t psize = chunksize(p);
+        mchunkptr next = chunk_plus_offset(p, psize);
+        if (!pinuse(p)) {
+          size_t prevsize = p->prev_foot;
+          if (is_mmapped(p)) {
+            psize += prevsize + MMAP_FOOT_PAD;
+            if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
+              fm->footprint -= psize;
+            goto postaction;
+          }
+          else {
+            mchunkptr prev = chunk_minus_offset(p, prevsize);
+            psize += prevsize;
+            p = prev;
+            if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
+              if (p != fm->dv) {
+                unlink_chunk(fm, p, prevsize);
+              }
+              else if ((next->head & INUSE_BITS) == INUSE_BITS) {
+                fm->dvsize = psize;
+                set_free_with_pinuse(p, psize, next);
+                goto postaction;
+              }
+            }
+            else
+              goto erroraction;
+          }
+        }
+
+        if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
+          if (!cinuse(next)) {  /* consolidate forward */
+            if (next == fm->top) {
+              size_t tsize = fm->topsize += psize;
+              fm->top = p;
+              p->head = tsize | PINUSE_BIT;
+              if (p == fm->dv) {
+                fm->dv = 0;
+                fm->dvsize = 0;
+              }
+              if (should_trim(fm, tsize))
+                sys_trim(fm, 0);
+              goto postaction;
+            }
+            else if (next == fm->dv) {
+              size_t dsize = fm->dvsize += psize;
+              fm->dv = p;
+              set_size_and_pinuse_of_free_chunk(p, dsize);
+              goto postaction;
+            }
+            else {
+              size_t nsize = chunksize(next);
+              psize += nsize;
+              unlink_chunk(fm, next, nsize);
+              set_size_and_pinuse_of_free_chunk(p, psize);
+              if (p == fm->dv) {
+                fm->dvsize = psize;
+                goto postaction;
+              }
+            }
+          }
+          else
+            set_free_with_pinuse(p, psize, next);
+
+          if (is_small(psize)) {
+            insert_small_chunk(fm, p, psize);
+            check_free_chunk(fm, p);
+          }
+          else {
+            tchunkptr tp = (tchunkptr)p;
+            insert_large_chunk(fm, tp, psize);
+            check_free_chunk(fm, p);
+            if (--fm->release_checks == 0)
+              release_unused_segments(fm);
+          }
+          goto postaction;
+        }
+      }
+    erroraction:
+      USAGE_ERROR_ACTION(fm, p);
+    postaction:
+      POSTACTION(fm);
+    }
+  }
+}
+
+void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size) {
+  void* mem;
+  size_t req = 0;
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  if (n_elements != 0) {
+    req = n_elements * elem_size;
+    if (((n_elements | elem_size) & ~(size_t)0xffff) &&
+        (req / n_elements != elem_size))
+      req = MAX_SIZE_T; /* force downstream failure on overflow */
+  }
+  mem = internal_malloc(ms, req);
+  if (mem != 0 && calloc_must_clear(mem2chunk(mem)))
+    memset(mem, 0, req);
+  return mem;
+}
+
+void* mspace_realloc(mspace msp, void* oldmem, size_t bytes) {
+  if (oldmem == 0)
+    return mspace_malloc(msp, bytes);
+#ifdef REALLOC_ZERO_BYTES_FREES
+  if (bytes == 0) {
+    mspace_free(msp, oldmem);
+    return 0;
+  }
+#endif /* REALLOC_ZERO_BYTES_FREES */
+  else {
+#if FOOTERS
+    mchunkptr p  = mem2chunk(oldmem);
+    mstate ms = get_mstate_for(p);
+#else /* FOOTERS */
+    mstate ms = (mstate)msp;
+#endif /* FOOTERS */
+    if (!ok_magic(ms)) {
+      USAGE_ERROR_ACTION(ms,ms);
+      return 0;
+    }
+    return internal_realloc(ms, oldmem, bytes);
+  }
+}
+
+void* mspace_memalign(mspace msp, size_t alignment, size_t bytes) {
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  return internal_memalign(ms, alignment, bytes);
+}
+
+void** mspace_independent_calloc(mspace msp, size_t n_elements,
+                                 size_t elem_size, void* chunks[]) {
+  size_t sz = elem_size; /* serves as 1-element array */
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  return ialloc(ms, n_elements, &sz, 3, chunks);
+}
+
+void** mspace_independent_comalloc(mspace msp, size_t n_elements,
+                                   size_t sizes[], void* chunks[]) {
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  return ialloc(ms, n_elements, sizes, 0, chunks);
+}
+
+int mspace_trim(mspace msp, size_t pad) {
+  int result = 0;
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms)) {
+    if (!PREACTION(ms)) {
+      result = sys_trim(ms, pad);
+      POSTACTION(ms);
+    }
+  }
+  else {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+  return result;
+}
+
+void mspace_malloc_stats(mspace msp) {
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms)) {
+    internal_malloc_stats(ms);
+  }
+  else {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+}
+
+size_t mspace_footprint(mspace msp) {
+  size_t result = 0;
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms)) {
+    result = ms->footprint;
+  }
+  else {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+  return result;
+}
+
+
+size_t mspace_max_footprint(mspace msp) {
+  size_t result = 0;
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms)) {
+    result = ms->max_footprint;
+  }
+  else {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+  return result;
+}
+
+
+#if !NO_MALLINFO
+struct mallinfo mspace_mallinfo(mspace msp) {
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+  }
+  return internal_mallinfo(ms);
+}
+#endif /* NO_MALLINFO */
+
+size_t mspace_usable_size(void* mem) {
+  if (mem != 0) {
+    mchunkptr p = mem2chunk(mem);
+    if (is_inuse(p))
+      return chunksize(p) - overhead_for(p);
+  }
+  return 0;
+}
+
+int mspace_mallopt(int param_number, int value) {
+  return change_mparam(param_number, value);
+}
+
+#endif /* MSPACES */
+
+
+/* -------------------- Alternative MORECORE functions ------------------- */
+
+/*
+  Guidelines for creating a custom version of MORECORE:
+
+  * For best performance, MORECORE should allocate in multiples of pagesize.
+  * MORECORE may allocate more memory than requested. (Or even less,
+      but this will usually result in a malloc failure.)
+  * MORECORE must not allocate memory when given argument zero, but
+      instead return one past the end address of memory from previous
+      nonzero call.
+  * For best performance, consecutive calls to MORECORE with positive
+      arguments should return increasing addresses, indicating that
+      space has been contiguously extended.
+  * Even though consecutive calls to MORECORE need not return contiguous
+      addresses, it must be OK for malloc'ed chunks to span multiple
+      regions in those cases where they do happen to be contiguous.
+  * MORECORE need not handle negative arguments -- it may instead
+      just return MFAIL when given negative arguments.
+      Negative arguments are always multiples of pagesize. MORECORE
+      must not misinterpret negative args as large positive unsigned
+      args. You can suppress all such calls from even occurring by defining
+      MORECORE_CANNOT_TRIM.
+
+  As an example alternative MORECORE, here is a custom allocator
+  kindly contributed for pre-OS X Mac OS.  It uses virtually but not
+  necessarily physically contiguous non-paged memory (locked in,
+  present and won't get swapped out).  You can use it by uncommenting
+  this section, adding some #includes, and setting up the appropriate
+  defines above:
+
+      #define MORECORE osMoreCore
+
+  There is also a shutdown routine that should somehow be called for
+  cleanup upon program exit.
+
+  #define MAX_POOL_ENTRIES 100
+  #define MINIMUM_MORECORE_SIZE  (64 * 1024U)
+  static int next_os_pool;
+  void *our_os_pools[MAX_POOL_ENTRIES];
+
+  void *osMoreCore(int size)
+  {
+    void *ptr = 0;
+    static void *sbrk_top = 0;
+
+    if (size > 0)
+    {
+      if (size < MINIMUM_MORECORE_SIZE)
+         size = MINIMUM_MORECORE_SIZE;
+      if (CurrentExecutionLevel() == kTaskLevel)
+         ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0);
+      if (ptr == 0)
+      {
+        return (void *) MFAIL;
+      }
+      // save ptrs so they can be freed during cleanup
+      our_os_pools[next_os_pool] = ptr;
+      next_os_pool++;
+      ptr = (void *) ((((size_t) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK);
+      sbrk_top = (char *) ptr + size;
+      return ptr;
+    }
+    else if (size < 0)
+    {
+      // we don't currently support shrink behavior
+      return (void *) MFAIL;
+    }
+    else
+    {
+      return sbrk_top;
+    }
+  }
+
+  // cleanup any allocated memory pools
+  // called as last thing before shutting down driver
+
+  void osCleanupMem(void)
+  {
+    void **ptr;
+
+    for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++)
+      if (*ptr)
+      {
+         PoolDeallocate(*ptr);
+         *ptr = 0;
+      }
+  }
+
+*/
+
+
+/* -----------------------------------------------------------------------
+History:
+    V2.8.4 Wed May 27 09:56:23 2009  Doug Lea  (dl at gee)
+      * Use zeros instead of prev foot for is_mmapped
+      * Add mspace_track_large_chunks; thanks to Jean Brouwers
+      * Fix set_inuse in internal_realloc; thanks to Jean Brouwers
+      * Fix insufficient sys_alloc padding when using 16byte alignment
+      * Fix bad error check in mspace_footprint
+      * Adaptations for ptmalloc; thanks to Wolfram Gloger.
+      * Reentrant spin locks; thanks to Earl Chew and others
+      * Win32 improvements; thanks to Niall Douglas and Earl Chew
+      * Add NO_SEGMENT_TRAVERSAL and MAX_RELEASE_CHECK_RATE options
+      * Extension hook in malloc_state
+      * Various small adjustments to reduce warnings on some compilers
+      * Various configuration extensions/changes for more platforms. Thanks
+         to all who contributed these.
+
+    V2.8.3 Thu Sep 22 11:16:32 2005  Doug Lea  (dl at gee)
+      * Add max_footprint functions
+      * Ensure all appropriate literals are size_t
+      * Fix conditional compilation problem for some #define settings
+      * Avoid concatenating segments with the one provided
+        in create_mspace_with_base
+      * Rename some variables to avoid compiler shadowing warnings
+      * Use explicit lock initialization.
+      * Better handling of sbrk interference.
+      * Simplify and fix segment insertion, trimming and mspace_destroy
+      * Reinstate REALLOC_ZERO_BYTES_FREES option from 2.7.x
+      * Thanks especially to Dennis Flanagan for help on these.
+
+    V2.8.2 Sun Jun 12 16:01:10 2005  Doug Lea  (dl at gee)
+      * Fix memalign brace error.
+
+    V2.8.1 Wed Jun  8 16:11:46 2005  Doug Lea  (dl at gee)
+      * Fix improper #endif nesting in C++
+      * Add explicit casts needed for C++
+
+    V2.8.0 Mon May 30 14:09:02 2005  Doug Lea  (dl at gee)
+      * Use trees for large bins
+      * Support mspaces
+      * Use segments to unify sbrk-based and mmap-based system allocation,
+        removing need for emulation on most platforms without sbrk.
+      * Default safety checks
+      * Optional footer checks. Thanks to William Robertson for the idea.
+      * Internal code refactoring
+      * Incorporate suggestions and platform-specific changes.
+        Thanks to Dennis Flanagan, Colin Plumb, Niall Douglas,
+        Aaron Bachmann,  Emery Berger, and others.
+      * Speed up non-fastbin processing enough to remove fastbins.
+      * Remove useless cfree() to avoid conflicts with other apps.
+      * Remove internal memcpy, memset. Compilers handle builtins better.
+      * Remove some options that no one ever used and rename others.
+
+    V2.7.2 Sat Aug 17 09:07:30 2002  Doug Lea  (dl at gee)
+      * Fix malloc_state bitmap array misdeclaration
+
+    V2.7.1 Thu Jul 25 10:58:03 2002  Doug Lea  (dl at gee)
+      * Allow tuning of FIRST_SORTED_BIN_SIZE
+      * Use PTR_UINT as type for all ptr->int casts. Thanks to John Belmonte.
+      * Better detection and support for non-contiguousness of MORECORE.
+        Thanks to Andreas Mueller, Conal Walsh, and Wolfram Gloger
+      * Bypass most of malloc if no frees. Thanks To Emery Berger.
+      * Fix freeing of old top non-contiguous chunk in sysmalloc.
+      * Raised default trim and map thresholds to 256K.
+      * Fix mmap-related #defines. Thanks to Lubos Lunak.
+      * Fix copy macros; added LACKS_FCNTL_H. Thanks to Neal Walfield.
+      * Branch-free bin calculation
+      * Default trim and mmap thresholds now 256K.
+
+    V2.7.0 Sun Mar 11 14:14:06 2001  Doug Lea  (dl at gee)
+      * Introduce independent_comalloc and independent_calloc.
+        Thanks to Michael Pachos for motivation and help.
+      * Make optional .h file available
+      * Allow > 2GB requests on 32bit systems.
+      * new WIN32 sbrk, mmap, munmap, lock code from <[email protected]>.
+        Thanks also to Andreas Mueller <a.mueller at paradatec.de>,
+        and Anonymous.
+      * Allow override of MALLOC_ALIGNMENT (Thanks to Ruud Waij for
+        helping test this.)
+      * memalign: check alignment arg
+      * realloc: don't try to shift chunks backwards, since this
+        leads to  more fragmentation in some programs and doesn't
+        seem to help in any others.
+      * Collect all cases in malloc requiring system memory into sysmalloc
+      * Use mmap as backup to sbrk
+      * Place all internal state in malloc_state
+      * Introduce fastbins (although similar to 2.5.1)
+      * Many minor tunings and cosmetic improvements
+      * Introduce USE_PUBLIC_MALLOC_WRAPPERS, USE_MALLOC_LOCK
+      * Introduce MALLOC_FAILURE_ACTION, MORECORE_CONTIGUOUS
+        Thanks to Tony E. Bennett <[email protected]> and others.
+      * Include errno.h to support default failure action.
+
+    V2.6.6 Sun Dec  5 07:42:19 1999  Doug Lea  (dl at gee)
+      * return null for negative arguments
+      * Added Several WIN32 cleanups from Martin C. Fong <mcfong at yahoo.com>
+         * Add 'LACKS_SYS_PARAM_H' for those systems without 'sys/param.h'
+          (e.g. WIN32 platforms)
+         * Cleanup header file inclusion for WIN32 platforms
+         * Cleanup code to avoid Microsoft Visual C++ compiler complaints
+         * Add 'USE_DL_PREFIX' to quickly allow co-existence with existing
+           memory allocation routines
+         * Set 'malloc_getpagesize' for WIN32 platforms (needs more work)
+         * Use 'assert' rather than 'ASSERT' in WIN32 code to conform to
+           usage of 'assert' in non-WIN32 code
+         * Improve WIN32 'sbrk()' emulation's 'findRegion()' routine to
+           avoid infinite loop
+      * Always call 'fREe()' rather than 'free()'
+
+    V2.6.5 Wed Jun 17 15:57:31 1998  Doug Lea  (dl at gee)
+      * Fixed ordering problem with boundary-stamping
+
+    V2.6.3 Sun May 19 08:17:58 1996  Doug Lea  (dl at gee)
+      * Added pvalloc, as recommended by H.J. Liu
+      * Added 64bit pointer support mainly from Wolfram Gloger
+      * Added anonymously donated WIN32 sbrk emulation
+      * Malloc, calloc, getpagesize: add optimizations from Raymond Nijssen
+      * malloc_extend_top: fix mask error that caused wastage after
+        foreign sbrks
+      * Add linux mremap support code from HJ Liu
+
+    V2.6.2 Tue Dec  5 06:52:55 1995  Doug Lea  (dl at gee)
+      * Integrated most documentation with the code.
+      * Add support for mmap, with help from
+        Wolfram Gloger ([email protected]).
+      * Use last_remainder in more cases.
+      * Pack bins using idea from  [email protected]
+      * Use ordered bins instead of best-fit threshold
+      * Eliminate block-local decls to simplify tracing and debugging.
+      * Support another case of realloc via move into top
+      * Fix error occurring when initial sbrk_base not word-aligned.
+      * Rely on page size for units instead of SBRK_UNIT to
+        avoid surprises about sbrk alignment conventions.
+      * Add mallinfo, mallopt. Thanks to Raymond Nijssen
+        ([email protected]) for the suggestion.
+      * Add `pad' argument to malloc_trim and top_pad mallopt parameter.
+      * More precautions for cases where other routines call sbrk,
+        courtesy of Wolfram Gloger ([email protected]).
+      * Added macros etc., allowing use in linux libc from
+        H.J. Lu ([email protected])
+      * Inverted this history list
+
+    V2.6.1 Sat Dec  2 14:10:57 1995  Doug Lea  (dl at gee)
+      * Re-tuned and fixed to behave more nicely with V2.6.0 changes.
+      * Removed all preallocation code since under current scheme
+        the work required to undo bad preallocations exceeds
+        the work saved in good cases for most test programs.
+      * No longer use return list or unconsolidated bins since
+        no scheme using them consistently outperforms those that don't
+        given above changes.
+      * Use best fit for very large chunks to prevent some worst-cases.
+      * Added some support for debugging
+
+    V2.6.0 Sat Nov  4 07:05:23 1995  Doug Lea  (dl at gee)
+      * Removed footers when chunks are in use. Thanks to
+        Paul Wilson ([email protected]) for the suggestion.
+
+    V2.5.4 Wed Nov  1 07:54:51 1995  Doug Lea  (dl at gee)
+      * Added malloc_trim, with help from Wolfram Gloger
+        ([email protected]).
+
+    V2.5.3 Tue Apr 26 10:16:01 1994  Doug Lea  (dl at g)
+
+    V2.5.2 Tue Apr  5 16:20:40 1994  Doug Lea  (dl at g)
+      * realloc: try to expand in both directions
+      * malloc: swap order of clean-bin strategy;
+      * realloc: only conditionally expand backwards
+      * Try not to scavenge used bins
+      * Use bin counts as a guide to preallocation
+      * Occasionally bin return list chunks in first scan
+      * Add a few optimizations from [email protected]
+
+    V2.5.1 Sat Aug 14 15:40:43 1993  Doug Lea  (dl at g)
+      * faster bin computation & slightly different binning
+      * merged all consolidations to one part of malloc proper
+         (eliminating old malloc_find_space & malloc_clean_bin)
+      * Scan 2 returns chunks (not just 1)
+      * Propagate failure in realloc if malloc returns 0
+      * Add stuff to allow compilation on non-ANSI compilers
+          from [email protected]
+
+    V2.5 Sat Aug  7 07:41:59 1993  Doug Lea  (dl at g.oswego.edu)
+      * removed potential for odd address access in prev_chunk
+      * removed dependency on getpagesize.h
+      * misc cosmetics and a bit more internal documentation
+      * anticosmetics: mangled names in macros to evade debugger strangeness
+      * tested on sparc, hp-700, dec-mips, rs6000
+          with gcc & native cc (hp, dec only) allowing
+          Detlefs & Zorn comparison study (in SIGPLAN Notices.)
+
+    Trial version Fri Aug 28 13:14:29 1992  Doug Lea  (dl at g.oswego.edu)
+      * Based loosely on libg++-1.2X malloc. (It retains some of the overall
+         structure of old version,  but most details differ.)
+
+*/
+
+#endif

+ 1467 - 1467
drivers/nedmalloc/nedmalloc.cpp

@@ -1,1467 +1,1467 @@
-#ifdef NEDMALLOC_ENABLED
-/* Alternative malloc implementation for multiple threads without
-lock contention based on dlmalloc. (C) 2005-2009 Niall Douglas
-
-Boost Software License - Version 1.0 - August 17th, 2003
-
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-#ifdef _MSC_VER
-/* Enable full aliasing on MSVC */
-/*#pragma optimize("a", on)*/
-#pragma warning(push)
-#pragma warning(disable:4100)	/* unreferenced formal parameter */
-#pragma warning(disable:4127)	/* conditional expression is constant */
-#pragma warning(disable:4706)	/* assignment within conditional expression */
-#endif
-
-/*#define ENABLE_TOLERANT_NEDMALLOC 1*/
-/*#define ENABLE_FAST_HEAP_DETECTION 1*/
-/*#define NEDMALLOC_DEBUG 1*/
-
-/*#define FULLSANITYCHECKS*/
-/* If link time code generation is on, don't force or prevent inlining */
-#if defined(_MSC_VER) && defined(NEDMALLOC_DLL_EXPORTS)
-#define FORCEINLINE
-#define NOINLINE
-#endif
-
-
-#include "nedmalloc.h"
-#ifdef WIN32
- #include <malloc.h>
- #include <stddef.h>
-#endif
-#if USE_ALLOCATOR==1
- #define MSPACES 1
- #define ONLY_MSPACES 1
-#endif
-#define USE_DL_PREFIX 1
-#ifndef USE_LOCKS
- #define USE_LOCKS 1
-#endif
-#define FOOTERS 1           /* Need to enable footers so frees lock the right mspace */
-#ifndef NEDMALLOC_DEBUG
- #if defined(DEBUG) || defined(_DEBUG)
-  #define NEDMALLOC_DEBUG 1
- #else
-  #define NEDMALLOC_DEBUG 0
- #endif
-#endif
-/* We need to consistently define DEBUG=0|1, _DEBUG and NDEBUG for dlmalloc */
-#undef DEBUG
-#undef _DEBUG
-#if NEDMALLOC_DEBUG
- #define _DEBUG
- #define DEBUG 1
-#else
- #define DEBUG 0
-#endif
-#ifdef NDEBUG               /* Disable assert checking on release builds */
- #undef DEBUG
- #undef _DEBUG
-#endif
-/* The default of 64Kb means we spend too much time kernel-side */
-#ifndef DEFAULT_GRANULARITY
-#define DEFAULT_GRANULARITY (1*1024*1024)
-#if DEBUG
-#define DEFAULT_GRANULARITY_ALIGNED
-#endif
-#endif
-/*#define USE_SPIN_LOCKS 0*/
-
-
-#include "malloc.c.h"
-#ifdef NDEBUG               /* Disable assert checking on release builds */
- #undef DEBUG
-#elif !NEDMALLOC_DEBUG
- #ifdef __GNUC__
-  #warning DEBUG is defined so allocator will run with assert checking! Define NDEBUG to run at full speed.
- #elif defined(_MSC_VER)
-  #pragma message(__FILE__ ": WARNING: DEBUG is defined so allocator will run with assert checking! Define NDEBUG to run at full speed.")
- #endif
-#endif
-
-/* The maximum concurrent threads in a pool possible */
-#ifndef MAXTHREADSINPOOL
-#define MAXTHREADSINPOOL 16
-#endif
-/* The maximum number of threadcaches which can be allocated */
-#ifndef THREADCACHEMAXCACHES
-#define THREADCACHEMAXCACHES 256
-#endif
-/* The maximum size to be allocated from the thread cache */
-#ifndef THREADCACHEMAX
-#define THREADCACHEMAX 8192
-#endif
-#if 0
-/* The number of cache entries for finer grained bins. This is (topbitpos(THREADCACHEMAX)-4)*2 */
-#define THREADCACHEMAXBINS ((13-4)*2)
-#else
-/* The number of cache entries. This is (topbitpos(THREADCACHEMAX)-4) */
-#define THREADCACHEMAXBINS (13-4)
-#endif
-/* Point at which the free space in a thread cache is garbage collected */
-#ifndef THREADCACHEMAXFREESPACE
-#define THREADCACHEMAXFREESPACE (512*1024)
-#endif
-
-
-#ifdef WIN32
- #define TLSVAR			DWORD
- #define TLSALLOC(k)	(*(k)=TlsAlloc(), TLS_OUT_OF_INDEXES==*(k))
- #define TLSFREE(k)		(!TlsFree(k))
- #define TLSGET(k)		TlsGetValue(k)
- #define TLSSET(k, a)	(!TlsSetValue(k, a))
- #ifdef DEBUG
-static LPVOID ChkedTlsGetValue(DWORD idx)
-{
-	LPVOID ret=TlsGetValue(idx);
-	assert(S_OK==GetLastError());
-	return ret;
-}
-  #undef TLSGET
-  #define TLSGET(k) ChkedTlsGetValue(k)
- #endif
-#else
- #define TLSVAR			pthread_key_t
- #define TLSALLOC(k)	pthread_key_create(k, 0)
- #define TLSFREE(k)		pthread_key_delete(k)
- #define TLSGET(k)		pthread_getspecific(k)
- #define TLSSET(k, a)	pthread_setspecific(k, a)
-#endif
-
-#if defined(__cplusplus)
-#if !defined(NO_NED_NAMESPACE)
-namespace nedalloc {
-#else
-extern "C" {
-#endif
-#endif
-
-#if USE_ALLOCATOR==0
-static void *unsupported_operation(const char *opname) THROWSPEC
-{
-	fprintf(stderr, "nedmalloc: The operation %s is not supported under this build configuration\n", opname);
-	abort();
-	return 0;
-}
-static size_t mspacecounter=(size_t) 0xdeadbeef;
-#endif
-#ifndef ENABLE_FAST_HEAP_DETECTION
-static void *RESTRICT leastusedaddress;
-static size_t largestusedblock;
-#endif
-
-static FORCEINLINE void *CallMalloc(void *RESTRICT mspace, size_t size, size_t alignment) THROWSPEC
-{
-	void *RESTRICT ret=0;
-	size_t _alignment=alignment;
-#if USE_MAGIC_HEADERS
-	size_t *_ret=0;
-	size+=alignment+3*sizeof(size_t);
-	_alignment=0;
-#endif
-#if USE_ALLOCATOR==0
-	ret=_alignment ? 
-#ifdef _MSC_VER
-	/* This is the MSVCRT equivalent */
-		_aligned_malloc(size, _alignment)
-#elif defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)
-	/* This is the glibc/ptmalloc2/dlmalloc/BSD libc equivalent.  */
-		memalign(_alignment, size)
-#else
-#error Cannot aligned allocate with the memory allocator of an unknown system!
-#endif
-		: malloc(size);
-#elif USE_ALLOCATOR==1
-	ret=_alignment ? mspace_memalign((mstate) mspace, _alignment, size) : mspace_malloc((mstate) mspace, size);
-#ifndef ENABLE_FAST_HEAP_DETECTION
-	if(ret)
-	{
-		size_t truesize=chunksize(mem2chunk(ret));
-		if(!leastusedaddress || (void *)((mstate) mspace)->least_addr<leastusedaddress) leastusedaddress=(void *)((mstate) mspace)->least_addr;
-		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
-	}
-#endif
-#endif
-	if(!ret) return 0;
-#if USE_MAGIC_HEADERS
-	_ret=(size_t *) ret;
-	ret=(void *)(_ret+3);
-	if(alignment) ret=(void *)(((size_t) ret+alignment-1)&~(alignment-1));
-	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *)"NEDMALOC";
-	_ret[0]=(size_t) mspace;
-	_ret[1]=size-3*sizeof(size_t);
-#endif
-	return ret;
-}
-
-static FORCEINLINE void *CallCalloc(void *RESTRICT mspace, size_t size, size_t alignment) THROWSPEC
-{
-	void *RESTRICT ret=0;
-#if USE_MAGIC_HEADERS
-	size_t *_ret=0;
-	size+=alignment+3*sizeof(size_t);
-#endif
-#if USE_ALLOCATOR==0
-	ret=calloc(1, size);
-#elif USE_ALLOCATOR==1
-	ret=mspace_calloc((mstate) mspace, 1, size);
-#ifndef ENABLE_FAST_HEAP_DETECTION
-	if(ret)
-	{
-		size_t truesize=chunksize(mem2chunk(ret));
-		if(!leastusedaddress || (void *)((mstate) mspace)->least_addr<leastusedaddress) leastusedaddress=(void *)((mstate) mspace)->least_addr;
-		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
-	}
-#endif
-#endif
-	if(!ret) return 0;
-#if USE_MAGIC_HEADERS
-	_ret=(size_t *) ret;
-	ret=(void *)(_ret+3);
-	if(alignment) ret=(void *)(((size_t) ret+alignment-1)&~(alignment-1));
-	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *) "NEDMALOC";
-	_ret[0]=(size_t) mspace;
-	_ret[1]=size-3*sizeof(size_t);
-#endif
-	return ret;
-}
-
-static FORCEINLINE void *CallRealloc(void *RESTRICT mspace, void *RESTRICT mem, int isforeign, size_t oldsize, size_t newsize) THROWSPEC
-{
-	void *RESTRICT ret=0;
-#if USE_MAGIC_HEADERS
-	mstate oldmspace=0;
-	size_t *_ret=0, *_mem=(size_t *) mem-3;
-#endif
-	if(isforeign)
-	{	/* Transfer */
-#if USE_MAGIC_HEADERS
-		assert(_mem[0]!=*(size_t *) "NEDMALOC");
-#endif
-		if((ret=CallMalloc(mspace, newsize, 0)))
-		{
-#if defined(DEBUG)
-			printf("*** nedmalloc frees system allocated block %p\n", mem);
-#endif
-			memcpy(ret, mem, oldsize<newsize ? oldsize : newsize);
-			free(mem);
-		}
-		return ret;
-	}
-#if USE_MAGIC_HEADERS
-	assert(_mem[0]==*(size_t *) "NEDMALOC");
-	newsize+=3*sizeof(size_t);
-	oldmspace=(mstate) _mem[1];
-	assert(oldsize>=_mem[2]);
-	for(; *_mem==*(size_t *) "NEDMALOC"; *_mem--=*(size_t *) "nedmaloc");
-	mem=(void *)(++_mem);
-#endif
-#if USE_ALLOCATOR==0
-	ret=realloc(mem, newsize);
-#elif USE_ALLOCATOR==1
-	ret=mspace_realloc((mstate) mspace, mem, newsize);
-#ifndef ENABLE_FAST_HEAP_DETECTION
-	if(ret)
-	{
-		size_t truesize=chunksize(mem2chunk(ret));
-		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
-	}
-#endif
-#endif
-	if(!ret)
-	{	/* Put it back the way it was */
-#if USE_MAGIC_HEADERS
-		for(; *_mem==0; *_mem++=*(size_t *) "NEDMALOC");
-#endif
-		return 0;
-	}
-#if USE_MAGIC_HEADERS
-	_ret=(size_t *) ret;
-	ret=(void *)(_ret+3);
-	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *) "NEDMALOC";
-	_ret[0]=(size_t) mspace;
-	_ret[1]=newsize-3*sizeof(size_t);
-#endif
-	return ret;
-}
-
-static FORCEINLINE void CallFree(void *RESTRICT mspace, void *RESTRICT mem, int isforeign) THROWSPEC
-{
-#if USE_MAGIC_HEADERS
-	mstate oldmspace=0;
-	size_t *_mem=(size_t *) mem-3, oldsize=0;
-#endif
-	if(isforeign)
-	{
-#if USE_MAGIC_HEADERS
-		assert(_mem[0]!=*(size_t *) "NEDMALOC");
-#endif
-#if defined(DEBUG)
-		printf("*** nedmalloc frees system allocated block %p\n", mem);
-#endif
-		free(mem);
-		return;
-	}
-#if USE_MAGIC_HEADERS
-	assert(_mem[0]==*(size_t *) "NEDMALOC");
-	oldmspace=(mstate) _mem[1];
-	oldsize=_mem[2];
-	for(; *_mem==*(size_t *) "NEDMALOC"; *_mem--=*(size_t *) "nedmaloc");
-	mem=(void *)(++_mem);
-#endif
-#if USE_ALLOCATOR==0
-	free(mem);
-#elif USE_ALLOCATOR==1
-	mspace_free((mstate) mspace, mem);
-#endif
-}
-
-static NEDMALLOCNOALIASATTR mstate nedblkmstate(void *RESTRICT mem) THROWSPEC
-{
-	if(mem)
-	{
-#if USE_MAGIC_HEADERS
-		size_t *_mem=(size_t *) mem-3;
-		if(_mem[0]==*(size_t *) "NEDMALOC")
-		{
-			return (mstate) _mem[1];
-		}
-		else return 0;
-#else
-#if USE_ALLOCATOR==0
-		/* Fail everything */
-		return 0;
-#elif USE_ALLOCATOR==1
-#ifdef ENABLE_FAST_HEAP_DETECTION
-#ifdef WIN32
-		/*  On Windows for RELEASE both x86 and x64 the NT heap precedes each block with an eight byte header
-			which looks like:
-				normal: 4 bytes of size, 4 bytes of [char < 64, char < 64, char < 64 bit 0 always set, char random ]
-				mmaped: 4 bytes of size  4 bytes of [zero,      zero,      0xb,                        zero        ]
-
-			On Windows for DEBUG both x86 and x64 the preceding four bytes is always 0xfdfdfdfd (no man's land).
-		*/
-#pragma pack(push, 1)
-		struct _HEAP_ENTRY
-		{
-			USHORT Size;
-			USHORT PreviousSize;
-			UCHAR Cookie;			/* SegmentIndex */
-			UCHAR Flags;			/* always bit 0 (HEAP_ENTRY_BUSY). bit 1=(HEAP_ENTRY_EXTRA_PRESENT), bit 2=normal block (HEAP_ENTRY_FILL_PATTERN), bit 3=mmap block (HEAP_ENTRY_VIRTUAL_ALLOC). Bit 4 (HEAP_ENTRY_LAST_ENTRY) could be set */
-			UCHAR UnusedBytes;
-			UCHAR SmallTagIndex;	/* fastbin index. Always one of 0x02, 0x03, 0x04 < 0x80 */
-		} *RESTRICT he=((struct _HEAP_ENTRY *) mem)-1;
-#pragma pack(pop)
-		unsigned int header=((unsigned int *)mem)[-1], mask1=0x8080E100, result1, mask2=0xFFFFFF06, result2;
-		result1=header & mask1;	/* Positive testing for NT heap */
-		result2=header & mask2;	/* Positive testing for dlmalloc */
-		if(result1==0x00000100 && result2!=0x00000102)
-		{	/* This is likely a NT heap block */
-			return 0;
-		}
-#endif
-#ifdef __linux__
-		/* On Linux glibc uses ptmalloc2 (really dlmalloc) just as we do, but prev_foot contains rubbish
-		when the preceding block is allocated because ptmalloc2 finds the local mstate by rounding the ptr
-		down to the nearest megabyte. It's like dlmalloc with FOOTERS disabled. */
-		mchunkptr p=mem2chunk(mem);
-		mstate fm=get_mstate_for(p);
-		/* If it's a ptmalloc2 block, fm is likely to be some crazy value */
-		if(!is_aligned(fm)) return 0;
-		if((size_t)mem-(size_t)fm>=(size_t)1<<(SIZE_T_BITSIZE-1)) return 0;
-		if(ok_magic(fm))
-			return fm;
-		else
-			return 0;
-		if(1) { }
-#endif
-		else
-		{
-			mchunkptr p=mem2chunk(mem);
-			mstate fm=get_mstate_for(p);
-			assert(ok_magic(fm));	/* If this fails, someone tried to free a block twice */
-			if(ok_magic(fm))
-				return fm;
-		}
-#else
-//#ifdef WIN32
-//		__try
-//#endif
-		{
-			/* We try to return zero here if it isn't one of our own blocks, however
-			the current block annotation scheme used by dlmalloc makes it impossible
-			to be absolutely sure of avoiding a segfault.
-
-			mchunkptr->prev_foot = mem-(2*size_t) = mstate ^ mparams.magic for PRECEDING block;
-			mchunkptr->head      = mem-(1*size_t) = 8 multiple size of this block with bottom three bits = FLAG_BITS
-			    FLAG_BITS = bit 0 is CINUSE (currently in use unless is mmap), bit 1 is PINUSE (previous block currently
-				            in use unless mmap), bit 2 is UNUSED and currently is always zero.
-			*/
-			register void *RESTRICT leastusedaddress_=leastusedaddress;		/* Cache these to avoid register reloading */
-			register size_t largestusedblock_=largestusedblock;
-			if(!is_aligned(mem)) return 0;		/* Would fail very rarely as all allocators return aligned blocks */
-			if(mem<leastusedaddress_) return 0;	/* Simple but effective */
-			{
-				mchunkptr p=mem2chunk(mem);
-				mstate fm=0;
-				int ismmapped=is_mmapped(p);
-				if((!ismmapped && !is_inuse(p)) || (p->head & FLAG4_BIT)) return 0;
-				/* Reduced uncertainty by 0.5^2 = 25.0% */
-				/* size should never exceed largestusedblock */
-				if(chunksize(p)>largestusedblock_) return 0;
-				/* Reduced uncertainty by a minimum of 0.5^3 = 12.5%, maximum 0.5^16 = 0.0015% */
-				/* Having sanity checked prev_foot and head, check next block */
-				if(!ismmapped && (!next_pinuse(p) || (next_chunk(p)->head & FLAG4_BIT))) return 0;
-				/* Reduced uncertainty by 0.5^5 = 3.13% or 0.5^18 = 0.00038% */
-	#if 0
-				/* If previous block is free, check that its next block pointer equals us */
-				if(!ismmapped && !pinuse(p))
-					if(next_chunk(prev_chunk(p))!=p) return 0;
-				/* We could start comparing prev_foot's for similarity but it starts getting slow. */
-	#endif
-				fm = get_mstate_for(p);
-				if(!is_aligned(fm) || (void *)fm<leastusedaddress_) return 0;
-				if((size_t)mem-(size_t)fm>=(size_t)1<<(SIZE_T_BITSIZE-1)) return 0;
-				assert(ok_magic(fm));	/* If this fails, someone tried to free a block twice */
-				if(ok_magic(fm))
-					return fm;
-			}
-		}
-//#ifdef WIN32
-//		__except(1) { }
-//#endif
-#endif
-#endif
-#endif
-	}
-	return 0;
-}
-NEDMALLOCNOALIASATTR size_t nedblksize(int *RESTRICT isforeign, void *RESTRICT mem) THROWSPEC
-{
-	if(mem)
-	{
-		if(isforeign) *isforeign=1;
-#if USE_MAGIC_HEADERS
-		{
-			size_t *_mem=(size_t *) mem-3;
-			if(_mem[0]==*(size_t *) "NEDMALOC")
-			{
-				mstate mspace=(mstate) _mem[1];
-				size_t size=_mem[2];
-				if(isforeign) *isforeign=0;
-				return size;
-			}
-		}
-#elif USE_ALLOCATOR==1
-		if(nedblkmstate(mem))
-		{
-			mchunkptr p=mem2chunk(mem);
-			if(isforeign) *isforeign=0;
-			return chunksize(p)-overhead_for(p);
-		}
-#ifdef DEBUG
-		else
-		{
-			int a=1; /* Set breakpoints here if needed */
-		}
-#endif
-#endif
-#if defined(ENABLE_TOLERANT_NEDMALLOC) || USE_ALLOCATOR==0
-#ifdef _MSC_VER
-		/* This is the MSVCRT equivalent */
-		return _msize(mem);
-#elif defined(__linux__)
-		/* This is the glibc/ptmalloc2/dlmalloc equivalent.  */
-		return malloc_usable_size(mem);
-#elif defined(__FreeBSD__) || defined(__APPLE__)
-		/* This is the BSD libc equivalent.  */
-		return malloc_size(mem);
-#else
-#error Cannot tolerate the memory allocator of an unknown system!
-#endif
-#endif
-	}
-	return 0;
-}
-
-NEDMALLOCNOALIASATTR void nedsetvalue(void *v) THROWSPEC											{ nedpsetvalue((nedpool *) 0, v); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmalloc(size_t size) THROWSPEC						{ return nedpmalloc((nedpool *) 0, size); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedcalloc(size_t no, size_t size) THROWSPEC			{ return nedpcalloc((nedpool *) 0, no, size); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedrealloc(void *mem, size_t size) THROWSPEC			{ return nedprealloc((nedpool *) 0, mem, size); }
-NEDMALLOCNOALIASATTR void   nedfree(void *mem) THROWSPEC											{ nedpfree((nedpool *) 0, mem); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmemalign(size_t alignment, size_t bytes) THROWSPEC	{ return nedpmemalign((nedpool *) 0, alignment, bytes); }
-NEDMALLOCNOALIASATTR struct nedmallinfo nedmallinfo(void) THROWSPEC									{ return nedpmallinfo((nedpool *) 0); }
-NEDMALLOCNOALIASATTR int    nedmallopt(int parno, int value) THROWSPEC								{ return nedpmallopt((nedpool *) 0, parno, value); }
-NEDMALLOCNOALIASATTR int    nedmalloc_trim(size_t pad) THROWSPEC									{ return nedpmalloc_trim((nedpool *) 0, pad); }
-void   nedmalloc_stats() THROWSPEC																	{ nedpmalloc_stats((nedpool *) 0); }
-NEDMALLOCNOALIASATTR size_t nedmalloc_footprint() THROWSPEC											{ return nedpmalloc_footprint((nedpool *) 0); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_calloc(size_t elemsno, size_t elemsize, void **chunks) THROWSPEC	{ return nedpindependent_calloc((nedpool *) 0, elemsno, elemsize, chunks); }
-NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_comalloc(size_t elems, size_t *sizes, void **chunks) THROWSPEC		{ return nedpindependent_comalloc((nedpool *) 0, elems, sizes, chunks); }
-
-struct threadcacheblk_t;
-typedef struct threadcacheblk_t threadcacheblk;
-struct threadcacheblk_t
-{	/* Keep less than 16 bytes on 32 bit systems and 32 bytes on 64 bit systems */
-#ifdef FULLSANITYCHECKS
-	unsigned int magic;
-#endif
-	unsigned int lastUsed, size;
-	threadcacheblk *next, *prev;
-};
-typedef struct threadcache_t
-{
-#ifdef FULLSANITYCHECKS
-	unsigned int magic1;
-#endif
-	int mymspace;						/* Last mspace entry this thread used */
-	long threadid;
-	unsigned int mallocs, frees, successes;
-	size_t freeInCache;					/* How much free space is stored in this cache */
-	threadcacheblk *bins[(THREADCACHEMAXBINS+1)*2];
-#ifdef FULLSANITYCHECKS
-	unsigned int magic2;
-#endif
-} threadcache;
-struct nedpool_t
-{
-	MLOCK_T mutex;
-	void *uservalue;
-	int threads;						/* Max entries in m to use */
-	threadcache *caches[THREADCACHEMAXCACHES];
-	TLSVAR mycache;						/* Thread cache for this thread. 0 for unset, negative for use mspace-1 directly, otherwise is cache-1 */
-	mstate m[MAXTHREADSINPOOL+1];		/* mspace entries for this pool */
-};
-static nedpool syspool;
-
-static FORCEINLINE NEDMALLOCNOALIASATTR unsigned int size2binidx(size_t _size) THROWSPEC
-{	/* 8=1000	16=10000	20=10100	24=11000	32=100000	48=110000	4096=1000000000000 */
-	unsigned int topbit, size=(unsigned int)(_size>>4);
-	/* 16=1		20=1	24=1	32=10	48=11	64=100	96=110	128=1000	4096=100000000 */
-
-#if defined(__GNUC__)
-        topbit = sizeof(size)*__CHAR_BIT__ - 1 - __builtin_clz(size);
-#elif defined(_MSC_VER) && _MSC_VER>=1300
-	{
-            unsigned long bsrTopBit;
-
-            _BitScanReverse(&bsrTopBit, size);
-
-            topbit = bsrTopBit;
-        }
-#else
-#if 0
-	union {
-		unsigned asInt[2];
-		double asDouble;
-	};
-	int n;
-
-	asDouble = (double)size + 0.5;
-	topbit = (asInt[!FOX_BIGENDIAN] >> 20) - 1023;
-#else
-	{
-		unsigned int x=size;
-		x = x | (x >> 1);
-		x = x | (x >> 2);
-		x = x | (x >> 4);
-		x = x | (x >> 8);
-		x = x | (x >>16);
-		x = ~x;
-		x = x - ((x >> 1) & 0x55555555);
-		x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
-		x = (x + (x >> 4)) & 0x0F0F0F0F;
-		x = x + (x << 8);
-		x = x + (x << 16);
-		topbit=31 - (x >> 24);
-	}
-#endif
-#endif
-	return topbit;
-}
-
-
-#ifdef FULLSANITYCHECKS
-static void tcsanitycheck(threadcacheblk **ptr) THROWSPEC
-{
-	assert((ptr[0] && ptr[1]) || (!ptr[0] && !ptr[1]));
-	if(ptr[0] && ptr[1])
-	{
-		assert(nedblksize(ptr[0])>=sizeof(threadcacheblk));
-		assert(nedblksize(ptr[1])>=sizeof(threadcacheblk));
-		assert(*(unsigned int *) "NEDN"==ptr[0]->magic);
-		assert(*(unsigned int *) "NEDN"==ptr[1]->magic);
-		assert(!ptr[0]->prev);
-		assert(!ptr[1]->next);
-		if(ptr[0]==ptr[1])
-		{
-			assert(!ptr[0]->next);
-			assert(!ptr[1]->prev);
-		}
-	}
-}
-static void tcfullsanitycheck(threadcache *tc) THROWSPEC
-{
-	threadcacheblk **tcbptr=tc->bins;
-	int n;
-	for(n=0; n<=THREADCACHEMAXBINS; n++, tcbptr+=2)
-	{
-		threadcacheblk *b, *ob=0;
-		tcsanitycheck(tcbptr);
-		for(b=tcbptr[0]; b; ob=b, b=b->next)
-		{
-			assert(*(unsigned int *) "NEDN"==b->magic);
-			assert(!ob || ob->next==b);
-			assert(!ob || b->prev==ob);
-		}
-	}
-}
-#endif
-
-static NOINLINE void RemoveCacheEntries(nedpool *RESTRICT p, threadcache *RESTRICT tc, unsigned int age) THROWSPEC
-{
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-	if(tc->freeInCache)
-	{
-		threadcacheblk **tcbptr=tc->bins;
-		int n;
-		for(n=0; n<=THREADCACHEMAXBINS; n++, tcbptr+=2)
-		{
-			threadcacheblk **tcb=tcbptr+1;		/* come from oldest end of list */
-			/*tcsanitycheck(tcbptr);*/
-			for(; *tcb && tc->frees-(*tcb)->lastUsed>=age; )
-			{
-				threadcacheblk *f=*tcb;
-				size_t blksize=f->size; /*nedblksize(f);*/
-				assert(blksize<=nedblksize(0, f));
-				assert(blksize);
-#ifdef FULLSANITYCHECKS
-				assert(*(unsigned int *) "NEDN"==(*tcb)->magic);
-#endif
-				*tcb=(*tcb)->prev;
-				if(*tcb)
-					(*tcb)->next=0;
-				else
-					*tcbptr=0;
-				tc->freeInCache-=blksize;
-				assert((long) tc->freeInCache>=0);
-				CallFree(0, f, 0);
-				/*tcsanitycheck(tcbptr);*/
-			}
-		}
-	}
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-}
-static void DestroyCaches(nedpool *RESTRICT p) THROWSPEC
-{
-	if(p->caches)
-	{
-		threadcache *tc;
-		int n;
-		for(n=0; n<THREADCACHEMAXCACHES; n++)
-		{
-			if((tc=p->caches[n]))
-			{
-				tc->frees++;
-				RemoveCacheEntries(p, tc, 0);
-				assert(!tc->freeInCache);
-				tc->mymspace=-1;
-				tc->threadid=0;
-				CallFree(0, tc, 0);
-				p->caches[n]=0;
-			}
-		}
-	}
-}
-
-static NOINLINE threadcache *AllocCache(nedpool *RESTRICT p) THROWSPEC
-{
-	threadcache *tc=0;
-	int n, end;
-	ACQUIRE_LOCK(&p->mutex);
-	for(n=0; n<THREADCACHEMAXCACHES && p->caches[n]; n++);
-	if(THREADCACHEMAXCACHES==n)
-	{	/* List exhausted, so disable for this thread */
-		RELEASE_LOCK(&p->mutex);
-		return 0;
-	}
-	tc=p->caches[n]=(threadcache *) CallCalloc(p->m[0], sizeof(threadcache), 0);
-	if(!tc)
-	{
-		RELEASE_LOCK(&p->mutex);
-		return 0;
-	}
-#ifdef FULLSANITYCHECKS
-	tc->magic1=*(unsigned int *)"NEDMALC1";
-	tc->magic2=*(unsigned int *)"NEDMALC2";
-#endif
-	tc->threadid=(long)(size_t)CURRENT_THREAD;
-	for(end=0; p->m[end]; end++);
-	tc->mymspace=abs(tc->threadid) % end;
-	RELEASE_LOCK(&p->mutex);
-	if(TLSSET(p->mycache, (void *)(size_t)(n+1))) abort();
-	return tc;
-}
-
-static void *threadcache_malloc(nedpool *RESTRICT p, threadcache *RESTRICT tc, size_t *RESTRICT _size) THROWSPEC
-{
-	void *RESTRICT ret=0;
-	size_t size=*_size, blksize=0;
-	unsigned int bestsize;
-	unsigned int idx=size2binidx(size);
-	threadcacheblk *RESTRICT blk, **RESTRICT binsptr;
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-	/* Calculate best fit bin size */
-	bestsize=1<<(idx+4);
-#if 0
-	/* Finer grained bin fit */
-	idx<<=1;
-	if(size>bestsize)
-	{
-		idx++;
-		bestsize+=bestsize>>1;
-	}
-	if(size>bestsize)
-	{
-		idx++;
-		bestsize=1<<(4+(idx>>1));
-	}
-#else
-	if(size>bestsize)
-	{
-		idx++;
-		bestsize<<=1;
-	}
-#endif
-	assert(bestsize>=size);
-	if(size<bestsize) size=bestsize;
-	assert(size<=THREADCACHEMAX);
-	assert(idx<=THREADCACHEMAXBINS);
-	binsptr=&tc->bins[idx*2];
-	/* Try to match close, but move up a bin if necessary */
-	blk=*binsptr;
-	if(!blk || blk->size<size)
-	{	/* Bump it up a bin */
-		if(idx<THREADCACHEMAXBINS)
-		{
-			idx++;
-			binsptr+=2;
-			blk=*binsptr;
-		}
-	}
-	if(blk)
-	{
-		blksize=blk->size; /*nedblksize(blk);*/
-		assert(nedblksize(0, blk)>=blksize);
-		assert(blksize>=size);
-		if(blk->next)
-			blk->next->prev=0;
-		*binsptr=blk->next;
-		if(!*binsptr)
-			binsptr[1]=0;
-#ifdef FULLSANITYCHECKS
-		blk->magic=0;
-#endif
-		assert(binsptr[0]!=blk && binsptr[1]!=blk);
-		assert(nedblksize(0, blk)>=sizeof(threadcacheblk) && nedblksize(0, blk)<=THREADCACHEMAX+CHUNK_OVERHEAD);
-		/*printf("malloc: %p, %p, %p, %lu\n", p, tc, blk, (long) _size);*/
-		ret=(void *) blk;
-	}
-	++tc->mallocs;
-	if(ret)
-	{
-		assert(blksize>=size);
-		++tc->successes;
-		tc->freeInCache-=blksize;
-		assert((long) tc->freeInCache>=0);
-	}
-#if defined(DEBUG) && 0
-	if(!(tc->mallocs & 0xfff))
-	{
-		printf("*** threadcache=%u, mallocs=%u (%f), free=%u (%f), freeInCache=%u\n", (unsigned int) tc->threadid, tc->mallocs,
-			(float) tc->successes/tc->mallocs, tc->frees, (float) tc->successes/tc->frees, (unsigned int) tc->freeInCache);
-	}
-#endif
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-	*_size=size;
-	return ret;
-}
-static NOINLINE void ReleaseFreeInCache(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace) THROWSPEC
-{
-	unsigned int age=THREADCACHEMAXFREESPACE/8192;
-	/*ACQUIRE_LOCK(&p->m[mymspace]->mutex);*/
-	while(age && tc->freeInCache>=THREADCACHEMAXFREESPACE)
-	{
-		RemoveCacheEntries(p, tc, age);
-		/*printf("*** Removing cache entries older than %u (%u)\n", age, (unsigned int) tc->freeInCache);*/
-		age>>=1;
-	}
-	/*RELEASE_LOCK(&p->m[mymspace]->mutex);*/
-}
-static void threadcache_free(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace, void *RESTRICT mem, size_t size) THROWSPEC
-{
-	unsigned int bestsize;
-	unsigned int idx=size2binidx(size);
-	threadcacheblk **RESTRICT binsptr, *RESTRICT tck=(threadcacheblk *) mem;
-	assert(size>=sizeof(threadcacheblk) && size<=THREADCACHEMAX+CHUNK_OVERHEAD);
-#ifdef DEBUG
-	/* Make sure this is a valid memory block */
-	assert(nedblksize(0, mem));
-#endif
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-	/* Calculate best fit bin size */
-	bestsize=1<<(idx+4);
-#if 0
-	/* Finer grained bin fit */
-	idx<<=1;
-	if(size>bestsize)
-	{
-		unsigned int biggerbestsize=bestsize+(bestsize>>1);	/* 1.5x bestsize, as in threadcache_malloc(); parentheses needed since + binds tighter than a shift */
-		if(size>=biggerbestsize)
-		{
-			idx++;
-			bestsize=biggerbestsize;
-		}
-	}
-#endif
-	if(bestsize!=size)	/* dlmalloc can round up, so we round down to preserve indexing */
-		size=bestsize;
-	binsptr=&tc->bins[idx*2];
-	assert(idx<=THREADCACHEMAXBINS);
-	if(tck==*binsptr)
-	{
-		fprintf(stderr, "nedmalloc: Attempt to free already freed memory block %p - aborting!\n", tck);
-		abort();
-	}
-#ifdef FULLSANITYCHECKS
-	tck->magic=*(unsigned int *) "NEDN";
-#endif
-	tck->lastUsed=++tc->frees;
-	tck->size=(unsigned int) size;
-	tck->next=*binsptr;
-	tck->prev=0;
-	if(tck->next)
-		tck->next->prev=tck;
-	else
-		binsptr[1]=tck;
-	assert(!*binsptr || (*binsptr)->size==tck->size);
-	*binsptr=tck;
-	assert(tck==tc->bins[idx*2]);
-	assert(tc->bins[idx*2+1]==tck || binsptr[0]->next->prev==tck);
-	/*printf("free: %p, %p, %p, %lu\n", p, tc, mem, (long) size);*/
-	tc->freeInCache+=size;
-#ifdef FULLSANITYCHECKS
-	tcfullsanitycheck(tc);
-#endif
-#if 1
-	if(tc->freeInCache>=THREADCACHEMAXFREESPACE)
-		ReleaseFreeInCache(p, tc, mymspace);
-#endif
-}
-
-
-
-
-static NOINLINE int InitPool(nedpool *RESTRICT p, size_t capacity, int threads) THROWSPEC
-{	/* threads is -1 for system pool */
-	ensure_initialization();
-	ACQUIRE_MALLOC_GLOBAL_LOCK();
-	if(p->threads) goto done;
-	if(INITIAL_LOCK(&p->mutex)) goto err;
-	if(TLSALLOC(&p->mycache)) goto err;
-#if USE_ALLOCATOR==0
-	p->m[0]=(mstate) mspacecounter++;
-#elif USE_ALLOCATOR==1
-	if(!(p->m[0]=(mstate) create_mspace(capacity, 1))) goto err;
-	p->m[0]->extp=p;
-#endif
-	p->threads=(threads<1 || threads>MAXTHREADSINPOOL) ? MAXTHREADSINPOOL : threads;
-done:
-	RELEASE_MALLOC_GLOBAL_LOCK();
-	return 1;
-err:
-	if(threads<0)
-		abort();			/* If you can't allocate for system pool, we're screwed */
-	DestroyCaches(p);
-	if(p->m[0])
-	{
-#if USE_ALLOCATOR==1
-		destroy_mspace(p->m[0]);
-#endif
-		p->m[0]=0;
-	}
-	if(p->mycache)
-	{
-		if(TLSFREE(p->mycache)) abort();
-		p->mycache=0;
-	}
-	RELEASE_MALLOC_GLOBAL_LOCK();
-	return 0;
-}
-static NOINLINE mstate FindMSpace(nedpool *RESTRICT p, threadcache *RESTRICT tc, int *RESTRICT lastUsed, size_t size) THROWSPEC
-{	/* Gets called when the thread's last used mspace is already locked. The
-	strategy is to run through the list of all available mspaces looking for
-	an unlocked one; if that fails, we create a new one, so long as we don't
-	exceed p->threads */
-	int n, end;
-	for(n=end=*lastUsed+1; p->m[n]; end=++n)
-	{
-		if(TRY_LOCK(&p->m[n]->mutex)) goto found;
-	}
-	for(n=0; n<*lastUsed && p->m[n]; n++)
-	{
-		if(TRY_LOCK(&p->m[n]->mutex)) goto found;
-	}
-	if(end<p->threads)
-	{
-		mstate temp;
-#if USE_ALLOCATOR==0
-		temp=(mstate) mspacecounter++;
-#elif USE_ALLOCATOR==1
-		if(!(temp=(mstate) create_mspace(size, 1)))
-			goto badexit;
-#endif
-		/* Now we're ready to modify the lists, we lock */
-		ACQUIRE_LOCK(&p->mutex);
-		while(p->m[end] && end<p->threads)
-			end++;
-		if(end>=p->threads)
-		{	/* Drat, must destroy it now */
-			RELEASE_LOCK(&p->mutex);
-#if USE_ALLOCATOR==1
-			destroy_mspace((mstate) temp);
-#endif
-			goto badexit;
-		}
-		/* We really want to make sure this goes into memory now but we
-		have to be careful of breaking aliasing rules, so write it twice */
-		*((volatile struct malloc_state **) &p->m[end])=p->m[end]=temp;
-		ACQUIRE_LOCK(&p->m[end]->mutex);
-		/*printf("Created mspace idx %d\n", end);*/
-		RELEASE_LOCK(&p->mutex);
-		n=end;
-		goto found;
-	}
-	/* Let it lock on the last one it used */
-badexit:
-	ACQUIRE_LOCK(&p->m[*lastUsed]->mutex);
-	return p->m[*lastUsed];
-found:
-	*lastUsed=n;
-	if(tc)
-		tc->mymspace=n;
-	else
-	{
-		if(TLSSET(p->mycache, (void *)(size_t)(-(n+1)))) abort();
-	}
-	return p->m[n];
-}
-
-typedef struct PoolList_t
-{
-	size_t size;			/* Size of list */
-	size_t length;			/* Actual entries in list */
-#ifdef DEBUG
-	nedpool *list[1];		/* Force testing of list expansion */
-#else
-	nedpool *list[16];
-#endif
-} PoolList;
-static MLOCK_T poollistlock;
-static PoolList *poollist;
-NEDMALLOCPTRATTR nedpool *nedcreatepool(size_t capacity, int threads) THROWSPEC
-{
-	nedpool *ret=0;
-	if(!poollist)
-	{
-		PoolList *newpoollist=0;
-		if(!(newpoollist=(PoolList *) nedpcalloc(0, 1, sizeof(PoolList)+sizeof(nedpool *)))) return 0;
-		INITIAL_LOCK(&poollistlock);
-		ACQUIRE_LOCK(&poollistlock);
-		poollist=newpoollist;
-		poollist->size=sizeof(poollist->list)/sizeof(nedpool *);
-	}
-	else
-		ACQUIRE_LOCK(&poollistlock);
-	if(poollist->length==poollist->size)
-	{
-		PoolList *newpoollist=0;
-		size_t newsize=0;
-		newsize=sizeof(PoolList)+(poollist->size+1)*sizeof(nedpool *);
-		if(!(newpoollist=(PoolList *) nedprealloc(0, poollist, newsize))) goto badexit;
-		poollist=newpoollist;
-		memset(&poollist->list[poollist->size], 0, newsize-((size_t)&poollist->list[poollist->size]-(size_t)&poollist->list[0]));
-		poollist->size=((newsize-((char *)&poollist->list[0]-(char *)poollist))/sizeof(nedpool *))-1;
-		assert(poollist->size>poollist->length);
-	}
-	if(!(ret=(nedpool *) nedpcalloc(0, 1, sizeof(nedpool)))) goto badexit;
-	if(!InitPool(ret, capacity, threads))
-	{
-		nedpfree(0, ret);
-		goto badexit;
-	}
-	poollist->list[poollist->length++]=ret;
-badexit:
-	RELEASE_LOCK(&poollistlock);
-	return ret;
-}
-void neddestroypool(nedpool *p) THROWSPEC
-{
-	unsigned int n;
-	ACQUIRE_LOCK(&p->mutex);
-	DestroyCaches(p);
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1
-		destroy_mspace(p->m[n]);
-#endif
-		p->m[n]=0;
-	}
-	RELEASE_LOCK(&p->mutex);
-	if(TLSFREE(p->mycache)) abort();
-	nedpfree(0, p);
-	ACQUIRE_LOCK(&poollistlock);
-	assert(poollist);
-	for(n=0; n<poollist->length && poollist->list[n]!=p; n++);
-	assert(n!=poollist->length);
-	memmove(&poollist->list[n], &poollist->list[n+1], (size_t)&poollist->list[poollist->length]-(size_t)&poollist->list[n]);
-	if(!--poollist->length)
-	{
-		assert(!poollist->list[0]);
-		nedpfree(0, poollist);
-		poollist=0;
-	}
-	RELEASE_LOCK(&poollistlock);
-}
-void neddestroysyspool() THROWSPEC
-{
-	nedpool *p=&syspool;
-	int n;
-	ACQUIRE_LOCK(&p->mutex);
-	DestroyCaches(p);
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1
-		destroy_mspace(p->m[n]);
-#endif
-		p->m[n]=0;
-	}
-	/* Render syspool unusable */
-	for(n=0; n<THREADCACHEMAXCACHES; n++)
-		p->caches[n]=(threadcache *)(size_t)(sizeof(size_t)>4 ? 0xdeadbeefdeadbeefULL : 0xdeadbeefUL);
-	for(n=0; n<MAXTHREADSINPOOL+1; n++)
-		p->m[n]=(mstate)(size_t)(sizeof(size_t)>4 ? 0xdeadbeefdeadbeefULL : 0xdeadbeefUL);
-	if(TLSFREE(p->mycache)) abort();
-	RELEASE_LOCK(&p->mutex);
-}
-nedpool **nedpoollist() THROWSPEC
-{
-	nedpool **ret=0;
-	if(poollist)
-	{
-		ACQUIRE_LOCK(&poollistlock);
-		if(!(ret=(nedpool **) nedmalloc((poollist->length+1)*sizeof(nedpool *)))) goto badexit;
-		memcpy(ret, poollist->list, (poollist->length+1)*sizeof(nedpool *));
-badexit:
-		RELEASE_LOCK(&poollistlock);
-	}
-	return ret;
-}
-
-void nedpsetvalue(nedpool *p, void *v) THROWSPEC
-{
-	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
-	p->uservalue=v;
-}
-void *nedgetvalue(nedpool **p, void *mem) THROWSPEC
-{
-	nedpool *np=0;
-	mstate fm=nedblkmstate(mem);
-	if(!fm || !fm->extp) return 0;
-	np=(nedpool *) fm->extp;
-	if(p) *p=np;
-	return np->uservalue;
-}
-
-void nedtrimthreadcache(nedpool *p, int disable) THROWSPEC
-{
-	int mycache;
-	if(!p)
-	{
-		p=&syspool;
-		if(!syspool.threads) InitPool(&syspool, 0, -1);
-	}
-	mycache=(int)(size_t) TLSGET(p->mycache);
-	if(!mycache)
-	{	/* Set to mspace 0 */
-		if(disable && TLSSET(p->mycache, (void *)(size_t)-1)) abort();
-	}
-	else if(mycache>0)
-	{	/* Set to last used mspace */
-		threadcache *tc=p->caches[mycache-1];
-#if defined(DEBUG)
-		printf("Threadcache utilisation: %lf%% in cache with %lf%% lost to other threads\n",
-			100.0*tc->successes/tc->mallocs, 100.0*((double) tc->mallocs-tc->frees)/tc->mallocs);
-#endif
-		if(disable && TLSSET(p->mycache, (void *)(size_t)(-tc->mymspace))) abort();
-		tc->frees++;
-		RemoveCacheEntries(p, tc, 0);
-		assert(!tc->freeInCache);
-		if(disable)
-		{
-			tc->mymspace=-1;
-			tc->threadid=0;
-			CallFree(0, p->caches[mycache-1], 0);
-			p->caches[mycache-1]=0;
-		}
-	}
-}
-void neddisablethreadcache(nedpool *p) THROWSPEC
-{
-	nedtrimthreadcache(p, 1);
-}
-
-#define GETMSPACE(m,p,tc,ms,s,action)                 \
-  do                                                  \
-  {                                                   \
-    mstate m = GetMSpace((p),(tc),(ms),(s));          \
-    action;                                           \
-	if(USE_ALLOCATOR==1) { RELEASE_LOCK(&m->mutex); } \
-  } while (0)
-
-static FORCEINLINE mstate GetMSpace(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace, size_t size) THROWSPEC
-{	/* Returns a locked and ready for use mspace */
-	mstate m=p->m[mymspace];
-	assert(m);
-#if USE_ALLOCATOR==1
-	if(!TRY_LOCK(&p->m[mymspace]->mutex)) m=FindMSpace(p, tc, &mymspace, size);
-	/*assert(IS_LOCKED(&p->m[mymspace]->mutex));*/
-#endif
-	return m;
-}
-static NOINLINE void GetThreadCache_cold1(nedpool *RESTRICT *RESTRICT p) THROWSPEC
-{
-	*p=&syspool;
-	if(!syspool.threads) InitPool(&syspool, 0, -1);
-}
-static NOINLINE void GetThreadCache_cold2(nedpool *RESTRICT *RESTRICT p, threadcache *RESTRICT *RESTRICT tc, int *RESTRICT mymspace, int mycache) THROWSPEC
-{
-	if(!mycache)
-	{	/* Need to allocate a new cache */
-		*tc=AllocCache(*p);
-		if(!*tc)
-		{	/* Disable */
-			if(TLSSET((*p)->mycache, (void *)(size_t)-1)) abort();
-			*mymspace=0;
-		}
-		else
-			*mymspace=(*tc)->mymspace;
-	}
-	else
-	{	/* Cache disabled, but we do have an assigned thread pool */
-		*tc=0;
-		*mymspace=-mycache-1;
-	}
-}
-static FORCEINLINE void GetThreadCache(nedpool *RESTRICT *RESTRICT p, threadcache *RESTRICT *RESTRICT tc, int *RESTRICT mymspace, size_t *RESTRICT size) THROWSPEC
-{
-	int mycache;
-	if(size && *size<sizeof(threadcacheblk)) *size=sizeof(threadcacheblk);
-	if(!*p)
-		GetThreadCache_cold1(p);
-	mycache=(int)(size_t) TLSGET((*p)->mycache);
-	if(mycache>0)
-	{	/* Already have a cache */
-		*tc=(*p)->caches[mycache-1];
-		*mymspace=(*tc)->mymspace;
-	}
-	else GetThreadCache_cold2(p, tc, mymspace, mycache);
-	assert(*mymspace>=0);
-	assert(!(*tc) || (long)(size_t)CURRENT_THREAD==(*tc)->threadid);
-#ifdef FULLSANITYCHECKS
-	if(*tc)
-	{
-		if(*(unsigned int *)"NEDMALC1"!=(*tc)->magic1 || *(unsigned int *)"NEDMALC2"!=(*tc)->magic2)
-		{
-			abort();
-		}
-	}
-#endif
-}
-
-NEDMALLOCPTRATTR void * nedpmalloc(nedpool *p, size_t size) THROWSPEC
-{
-	void *ret=0;
-	threadcache *tc;
-	int mymspace;
-	GetThreadCache(&p, &tc, &mymspace, &size);
-#if THREADCACHEMAX
-	if(tc && size<=THREADCACHEMAX)
-	{	/* Use the thread cache */
-		ret=threadcache_malloc(p, tc, &size);
-	}
-#endif
-	if(!ret)
-	{	/* Use this thread's mspace */
-        GETMSPACE(m, p, tc, mymspace, size,
-                  ret=CallMalloc(m, size, 0));
-	}
-	return ret;
-}
-NEDMALLOCPTRATTR void * nedpcalloc(nedpool *p, size_t no, size_t size) THROWSPEC
-{
-	size_t rsize=size*no;
-	void *ret=0;
-	threadcache *tc;
-	int mymspace;
-	GetThreadCache(&p, &tc, &mymspace, &rsize);
-#if THREADCACHEMAX
-	if(tc && rsize<=THREADCACHEMAX)
-	{	/* Use the thread cache */
-		if((ret=threadcache_malloc(p, tc, &rsize)))
-			memset(ret, 0, rsize);
-	}
-#endif
-	if(!ret)
-	{	/* Use this thread's mspace */
-        GETMSPACE(m, p, tc, mymspace, rsize,
-                  ret=CallCalloc(m, rsize, 0));
-	}
-	return ret;
-}
-NEDMALLOCPTRATTR void * nedprealloc(nedpool *p, void *mem, size_t size) THROWSPEC
-{
-	void *ret=0;
-	threadcache *tc;
-	int mymspace, isforeign=1;
-	size_t memsize;
-	if(!mem) return nedpmalloc(p, size);
-	memsize=nedblksize(&isforeign, mem);
-	assert(memsize);
-	if(!memsize)
-	{
-		fprintf(stderr, "nedmalloc: nedprealloc() called with a block not created by nedmalloc!\n");
-		abort();
-	}
-	else if(size<=memsize && memsize-size<
-#ifdef DEBUG
-		32
-#else
-		1024
-#endif
-		)		/* If the new size is the same or less than 1Kb smaller than the existing block, no-op it */
-		return mem;
-	GetThreadCache(&p, &tc, &mymspace, &size);
-#if THREADCACHEMAX
-	if(tc && size && size<=THREADCACHEMAX)
-	{	/* Use the thread cache */
-		if((ret=threadcache_malloc(p, tc, &size)))
-		{
-			memcpy(ret, mem, memsize<size ? memsize : size);
-			if(memsize>=sizeof(threadcacheblk) && memsize<=(THREADCACHEMAX+CHUNK_OVERHEAD))
-				threadcache_free(p, tc, mymspace, mem, memsize);
-			else
-				CallFree(0, mem, isforeign);
-		}
-	}
-#endif
-	if(!ret)
-	{	/* Reallocs always happen in the mspace the block was allocated from, so skip
-		locking the preferred mspace for this thread */
-		ret=CallRealloc(p->m[mymspace], mem, isforeign, memsize, size);
-	}
-	return ret;
-}
-void   nedpfree(nedpool *p, void *mem) THROWSPEC
-{	/* Frees always happen in the mspace the block was allocated from, so skip
-	locking the preferred mspace for this thread */
-	threadcache *tc;
-	int mymspace, isforeign=1;
-	size_t memsize;
-	if(!mem)
-	{	/* If you tried this on FreeBSD you'd be sorry! */
-#ifdef DEBUG
-		fprintf(stderr, "nedmalloc: WARNING nedpfree() called with zero. This is not portable behaviour!\n");
-#endif
-		return;
-	}
-	memsize=nedblksize(&isforeign, mem);
-	assert(memsize);
-	if(!memsize)
-	{
-		fprintf(stderr, "nedmalloc: nedpfree() called with a block not created by nedmalloc!\n");
-		abort();
-	}
-	GetThreadCache(&p, &tc, &mymspace, 0);
-#if THREADCACHEMAX
-	if(mem && tc && memsize>=sizeof(threadcacheblk) && memsize<=(THREADCACHEMAX+CHUNK_OVERHEAD))
-		threadcache_free(p, tc, mymspace, mem, memsize);
-	else
-#endif
-		CallFree(0, mem, isforeign);
-}
-NEDMALLOCPTRATTR void * nedpmemalign(nedpool *p, size_t alignment, size_t bytes) THROWSPEC
-{
-	void *ret;
-	threadcache *tc;
-	int mymspace;
-	GetThreadCache(&p, &tc, &mymspace, &bytes);
-	{	/* Use this thread's mspace */
-        GETMSPACE(m, p, tc, mymspace, bytes,
-                  ret=CallMalloc(m, bytes, alignment));
-	}
-	return ret;
-}
-struct nedmallinfo nedpmallinfo(nedpool *p) THROWSPEC
-{
-	int n;
-	struct nedmallinfo ret={0};
-	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1 && !NO_MALLINFO
-		struct mallinfo t=mspace_mallinfo(p->m[n]);
-		ret.arena+=t.arena;
-		ret.ordblks+=t.ordblks;
-		ret.hblkhd+=t.hblkhd;
-		ret.usmblks+=t.usmblks;
-		ret.uordblks+=t.uordblks;
-		ret.fordblks+=t.fordblks;
-		ret.keepcost+=t.keepcost;
-#endif
-	}
-	return ret;
-}
-int    nedpmallopt(nedpool *p, int parno, int value) THROWSPEC
-{
-#if USE_ALLOCATOR==1
-	return mspace_mallopt(parno, value);
-#else
-	return 0;
-#endif
-}
-NEDMALLOCNOALIASATTR void*  nedmalloc_internals(size_t *granularity, size_t *magic) THROWSPEC
-{
-#if USE_ALLOCATOR==1
-	if(granularity) *granularity=mparams.granularity;
-	if(magic) *magic=mparams.magic;
-	return (void *) &syspool;
-#else
-	if(granularity) *granularity=0;
-	if(magic) *magic=0;
-	return 0;
-#endif
-}
-int    nedpmalloc_trim(nedpool *p, size_t pad) THROWSPEC
-{
-	int n, ret=0;
-	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1
-		ret+=mspace_trim(p->m[n], pad);
-#endif
-	}
-	return ret;
-}
-void   nedpmalloc_stats(nedpool *p) THROWSPEC
-{
-	int n;
-	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1
-		mspace_malloc_stats(p->m[n]);
-#endif
-	}
-}
-size_t nedpmalloc_footprint(nedpool *p) THROWSPEC
-{
-	size_t ret=0;
-	int n;
-	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
-	for(n=0; p->m[n]; n++)
-	{
-#if USE_ALLOCATOR==1
-		ret+=mspace_footprint(p->m[n]);
-#endif
-	}
-	return ret;
-}
-NEDMALLOCPTRATTR void **nedpindependent_calloc(nedpool *p, size_t elemsno, size_t elemsize, void **chunks) THROWSPEC
-{
-	void **ret;
-	threadcache *tc;
-	int mymspace;
-	GetThreadCache(&p, &tc, &mymspace, &elemsize);
-#if USE_ALLOCATOR==0
-    GETMSPACE(m, p, tc, mymspace, elemsno*elemsize,
-              ret=unsupported_operation("independent_calloc"));
-#elif USE_ALLOCATOR==1
-    GETMSPACE(m, p, tc, mymspace, elemsno*elemsize,
-              ret=mspace_independent_calloc(m, elemsno, elemsize, chunks));
-#endif
-	return ret;
-}
-NEDMALLOCPTRATTR void **nedpindependent_comalloc(nedpool *p, size_t elems, size_t *sizes, void **chunks) THROWSPEC
-{
-	void **ret;
-	threadcache *tc;
-	int mymspace;
-    size_t i, *adjustedsizes=(size_t *) alloca(elems*sizeof(size_t));
-    if(!adjustedsizes) return 0;
-    for(i=0; i<elems; i++)
-        adjustedsizes[i]=sizes[i]<sizeof(threadcacheblk) ? sizeof(threadcacheblk) : sizes[i];
-	GetThreadCache(&p, &tc, &mymspace, 0);
-#if USE_ALLOCATOR==0
-	GETMSPACE(m, p, tc, mymspace, 0,
-              ret=unsupported_operation("independent_comalloc"));
-#elif USE_ALLOCATOR==1
-	GETMSPACE(m, p, tc, mymspace, 0,
-              ret=mspace_independent_comalloc(m, elems, adjustedsizes, chunks));
-#endif
-	return ret;
-}
-
-#if defined(__cplusplus)
-}
-#endif
-
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-#endif
+#ifdef NEDMALLOC_ENABLED
+/* Alternative malloc implementation for multiple threads without
+lock contention based on dlmalloc. (C) 2005-2009 Niall Douglas
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+#ifdef _MSC_VER
+/* Enable full aliasing on MSVC */
+/*#pragma optimize("a", on)*/
+#pragma warning(push)
+#pragma warning(disable:4100)	/* unreferenced formal parameter */
+#pragma warning(disable:4127)	/* conditional expression is constant */
+#pragma warning(disable:4706)	/* assignment within conditional expression */
+#endif
+
+/*#define ENABLE_TOLERANT_NEDMALLOC 1*/
+/*#define ENABLE_FAST_HEAP_DETECTION 1*/
+/*#define NEDMALLOC_DEBUG 1*/
+
+/*#define FULLSANITYCHECKS*/
+/* If link time code generation is on, don't force or prevent inlining */
+#if defined(_MSC_VER) && defined(NEDMALLOC_DLL_EXPORTS)
+#define FORCEINLINE
+#define NOINLINE
+#endif
+
+
+#include "nedmalloc.h"
+#ifdef WIN32
+ #include <malloc.h>
+ #include <stddef.h>
+#endif
+#if USE_ALLOCATOR==1
+ #define MSPACES 1
+ #define ONLY_MSPACES 1
+#endif
+#define USE_DL_PREFIX 1
+#ifndef USE_LOCKS
+ #define USE_LOCKS 1
+#endif
+#define FOOTERS 1           /* Need to enable footers so frees lock the right mspace */
+#ifndef NEDMALLOC_DEBUG
+ #if defined(DEBUG) || defined(_DEBUG)
+  #define NEDMALLOC_DEBUG 1
+ #else
+  #define NEDMALLOC_DEBUG 0
+ #endif
+#endif
+/* We need to consistently define DEBUG=0|1, _DEBUG and NDEBUG for dlmalloc */
+#undef DEBUG
+#undef _DEBUG
+#if NEDMALLOC_DEBUG
+ #define _DEBUG
+ #define DEBUG 1
+#else
+ #define DEBUG 0
+#endif
+#ifdef NDEBUG               /* Disable assert checking on release builds */
+ #undef DEBUG
+ #undef _DEBUG
+#endif
+/* The default of 64Kb means we spend too much time kernel-side */
+#ifndef DEFAULT_GRANULARITY
+#define DEFAULT_GRANULARITY (1*1024*1024)
+#if DEBUG
+#define DEFAULT_GRANULARITY_ALIGNED
+#endif
+#endif
+/*#define USE_SPIN_LOCKS 0*/
+
+
+#include "malloc.c.h"
+#ifdef NDEBUG               /* Disable assert checking on release builds */
+ #undef DEBUG
+#elif !NEDMALLOC_DEBUG
+ #ifdef __GNUC__
+  #warning DEBUG is defined so allocator will run with assert checking! Define NDEBUG to run at full speed.
+ #elif defined(_MSC_VER)
+  #pragma message(__FILE__ ": WARNING: DEBUG is defined so allocator will run with assert checking! Define NDEBUG to run at full speed.")
+ #endif
+#endif
+
+/* The maximum number of concurrent threads possible in a pool */
+#ifndef MAXTHREADSINPOOL
+#define MAXTHREADSINPOOL 16
+#endif
+/* The maximum number of threadcaches which can be allocated */
+#ifndef THREADCACHEMAXCACHES
+#define THREADCACHEMAXCACHES 256
+#endif
+/* The maximum size to be allocated from the thread cache */
+#ifndef THREADCACHEMAX
+#define THREADCACHEMAX 8192
+#endif
+#if 0
+/* The number of cache entries for finer grained bins. This is (topbitpos(THREADCACHEMAX)-4)*2 */
+#define THREADCACHEMAXBINS ((13-4)*2)
+#else
+/* The number of cache entries. This is (topbitpos(THREADCACHEMAX)-4) */
+#define THREADCACHEMAXBINS (13-4)
+#endif
+/* Point at which the free space in a thread cache is garbage collected */
+#ifndef THREADCACHEMAXFREESPACE
+#define THREADCACHEMAXFREESPACE (512*1024)
+#endif
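+
+/* Worked example for the defaults above: THREADCACHEMAX=8192=2^13 with a
+smallest bin of 2^4=16 bytes gives bin indices 0..(13-4)=9, i.e. ten coarse
+bins at 16, 32, 64, ..., 8192 bytes, and a thread's cache starts being
+trimmed once it holds 512Kb of free blocks. */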
+
+
+#ifdef WIN32
+ #define TLSVAR			DWORD
+ #define TLSALLOC(k)	(*(k)=TlsAlloc(), TLS_OUT_OF_INDEXES==*(k))
+ #define TLSFREE(k)		(!TlsFree(k))
+ #define TLSGET(k)		TlsGetValue(k)
+ #define TLSSET(k, a)	(!TlsSetValue(k, a))
+ #ifdef DEBUG
+static LPVOID ChkedTlsGetValue(DWORD idx)
+{
+	LPVOID ret=TlsGetValue(idx);
+	assert(S_OK==GetLastError());
+	return ret;
+}
+  #undef TLSGET
+  #define TLSGET(k) ChkedTlsGetValue(k)
+ #endif
+#else
+ #define TLSVAR			pthread_key_t
+ #define TLSALLOC(k)	pthread_key_create(k, 0)
+ #define TLSFREE(k)		pthread_key_delete(k)
+ #define TLSGET(k)		pthread_getspecific(k)
+ #define TLSSET(k, a)	pthread_setspecific(k, a)
+#endif
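+
+/* Sketch of how the TLS macros above are used below: InitPool() TLSALLOCs a
+key per pool, AllocCache() TLSSETs this thread's cache index+1 into it, and
+GetThreadCache() TLSGETs it back on every allocation. TLSALLOC, TLSFREE and
+TLSSET all return zero on success on both Windows and pthreads, so failures
+can be tested uniformly; TLSGET returns the stored value itself. */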
+
+#if defined(__cplusplus)
+#if !defined(NO_NED_NAMESPACE)
+namespace nedalloc {
+#else
+extern "C" {
+#endif
+#endif
+
+#if USE_ALLOCATOR==0
+static void *unsupported_operation(const char *opname) THROWSPEC
+{
+	fprintf(stderr, "nedmalloc: The operation %s is not supported under this build configuration\n", opname);
+	abort();
+	return 0;
+}
+static size_t mspacecounter=(size_t) 0xdeadbeef;
+#endif
+#ifndef ENABLE_FAST_HEAP_DETECTION
+static void *RESTRICT leastusedaddress;
+static size_t largestusedblock;
+#endif
+
+static FORCEINLINE void *CallMalloc(void *RESTRICT mspace, size_t size, size_t alignment) THROWSPEC
+{
+	void *RESTRICT ret=0;
+	size_t _alignment=alignment;
+#if USE_MAGIC_HEADERS
+	size_t *_ret=0;
+	size+=alignment+3*sizeof(size_t);
+	_alignment=0;
+#endif
+#if USE_ALLOCATOR==0
+	ret=_alignment ? 
+#ifdef _MSC_VER
+	/* This is the MSVCRT equivalent */
+		_aligned_malloc(size, _alignment)
+#elif defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)
+	/* This is the glibc/ptmalloc2/dlmalloc/BSD libc equivalent.  */
+		memalign(_alignment, size)
+#else
+#error Cannot aligned allocate with the memory allocator of an unknown system!
+#endif
+		: malloc(size);
+#elif USE_ALLOCATOR==1
+	ret=_alignment ? mspace_memalign((mstate) mspace, _alignment, size) : mspace_malloc((mstate) mspace, size);
+#ifndef ENABLE_FAST_HEAP_DETECTION
+	if(ret)
+	{
+		size_t truesize=chunksize(mem2chunk(ret));
+		if(!leastusedaddress || (void *)((mstate) mspace)->least_addr<leastusedaddress) leastusedaddress=(void *)((mstate) mspace)->least_addr;
+		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
+	}
+#endif
+#endif
+	if(!ret) return 0;
+#if USE_MAGIC_HEADERS
+	_ret=(size_t *) ret;
+	ret=(void *)(_ret+3);
+	if(alignment) ret=(void *)(((size_t) ret+alignment-1)&~(alignment-1));
+	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *)"NEDMALOC";
+	_ret[0]=(size_t) mspace;
+	_ret[1]=size-3*sizeof(size_t);
+#endif
+	return ret;
+}
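+
+/* Layout produced above when USE_MAGIC_HEADERS is on (one cell per size_t,
+illustrative): [..."NEDMALOC" pad...]["NEDMALOC"][mspace][usable size][user
+data <- ret]. nedblkmstate() and nedblksize() below can therefore recover
+the owning mspace and the size from mem[-2] and mem[-1] after checking the
+magic word at mem[-3], without consulting the underlying allocator. */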
+
+static FORCEINLINE void *CallCalloc(void *RESTRICT mspace, size_t size, size_t alignment) THROWSPEC
+{
+	void *RESTRICT ret=0;
+#if USE_MAGIC_HEADERS
+	size_t *_ret=0;
+	size+=alignment+3*sizeof(size_t);
+#endif
+#if USE_ALLOCATOR==0
+	ret=calloc(1, size);
+#elif USE_ALLOCATOR==1
+	ret=mspace_calloc((mstate) mspace, 1, size);
+#ifndef ENABLE_FAST_HEAP_DETECTION
+	if(ret)
+	{
+		size_t truesize=chunksize(mem2chunk(ret));
+		if(!leastusedaddress || (void *)((mstate) mspace)->least_addr<leastusedaddress) leastusedaddress=(void *)((mstate) mspace)->least_addr;
+		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
+	}
+#endif
+#endif
+	if(!ret) return 0;
+#if USE_MAGIC_HEADERS
+	_ret=(size_t *) ret;
+	ret=(void *)(_ret+3);
+	if(alignment) ret=(void *)(((size_t) ret+alignment-1)&~(alignment-1));
+	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *) "NEDMALOC";
+	_ret[0]=(size_t) mspace;
+	_ret[1]=size-3*sizeof(size_t);
+#endif
+	return ret;
+}
+
+static FORCEINLINE void *CallRealloc(void *RESTRICT mspace, void *RESTRICT mem, int isforeign, size_t oldsize, size_t newsize) THROWSPEC
+{
+	void *RESTRICT ret=0;
+#if USE_MAGIC_HEADERS
+	mstate oldmspace=0;
+	size_t *_ret=0, *_mem=(size_t *) mem-3;
+#endif
+	if(isforeign)
+	{	/* Transfer */
+#if USE_MAGIC_HEADERS
+		assert(_mem[0]!=*(size_t *) "NEDMALOC");
+#endif
+		if((ret=CallMalloc(mspace, newsize, 0)))
+		{
+#if defined(DEBUG)
+			printf("*** nedmalloc frees system allocated block %p\n", mem);
+#endif
+			memcpy(ret, mem, oldsize<newsize ? oldsize : newsize);
+			free(mem);
+		}
+		return ret;
+	}
+#if USE_MAGIC_HEADERS
+	assert(_mem[0]==*(size_t *) "NEDMALOC");
+	newsize+=3*sizeof(size_t);
+	oldmspace=(mstate) _mem[1];
+	assert(oldsize>=_mem[2]);
+	for(; *_mem==*(size_t *) "NEDMALOC"; *_mem--=*(size_t *) "nedmaloc");
+	mem=(void *)(++_mem);
+#endif
+#if USE_ALLOCATOR==0
+	ret=realloc(mem, newsize);
+#elif USE_ALLOCATOR==1
+	ret=mspace_realloc((mstate) mspace, mem, newsize);
+#ifndef ENABLE_FAST_HEAP_DETECTION
+	if(ret)
+	{
+		size_t truesize=chunksize(mem2chunk(ret));
+		if(!largestusedblock || truesize>largestusedblock) largestusedblock=(truesize+mparams.page_size) & ~(mparams.page_size-1);
+	}
+#endif
+#endif
+	if(!ret)
+	{	/* Put it back the way it was */
+#if USE_MAGIC_HEADERS
+		for(; *_mem==*(size_t *) "nedmaloc"; *_mem++=*(size_t *) "NEDMALOC");	/* the header words were scribbled to "nedmaloc" above, so test for that when restoring */
+#endif
+		return 0;
+	}
+#if USE_MAGIC_HEADERS
+	_ret=(size_t *) ret;
+	ret=(void *)(_ret+3);
+	for(; _ret<(size_t *)ret-2; _ret++) *_ret=*(size_t *) "NEDMALOC";
+	_ret[0]=(size_t) mspace;
+	_ret[1]=newsize-3*sizeof(size_t);
+#endif
+	return ret;
+}
+
+static FORCEINLINE void CallFree(void *RESTRICT mspace, void *RESTRICT mem, int isforeign) THROWSPEC
+{
+#if USE_MAGIC_HEADERS
+	mstate oldmspace=0;
+	size_t *_mem=(size_t *) mem-3, oldsize=0;
+#endif
+	if(isforeign)
+	{
+#if USE_MAGIC_HEADERS
+		assert(_mem[0]!=*(size_t *) "NEDMALOC");
+#endif
+#if defined(DEBUG)
+		printf("*** nedmalloc frees system allocated block %p\n", mem);
+#endif
+		free(mem);
+		return;
+	}
+#if USE_MAGIC_HEADERS
+	assert(_mem[0]==*(size_t *) "NEDMALOC");
+	oldmspace=(mstate) _mem[1];
+	oldsize=_mem[2];
+	for(; *_mem==*(size_t *) "NEDMALOC"; *_mem--=*(size_t *) "nedmaloc");
+	mem=(void *)(++_mem);
+#endif
+#if USE_ALLOCATOR==0
+	free(mem);
+#elif USE_ALLOCATOR==1
+	mspace_free((mstate) mspace, mem);
+#endif
+}
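+
+/* Note on the header walk above: CallMalloc() may have padded for alignment,
+so the true start of the underlying block is found by stepping downwards
+while each word still reads "NEDMALOC"; every word passed is rewritten to
+"nedmaloc" so that a second free of the same pointer fails the magic check
+instead of corrupting the mspace. */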
+
+static NEDMALLOCNOALIASATTR mstate nedblkmstate(void *RESTRICT mem) THROWSPEC
+{
+	if(mem)
+	{
+#if USE_MAGIC_HEADERS
+		size_t *_mem=(size_t *) mem-3;
+		if(_mem[0]==*(size_t *) "NEDMALOC")
+		{
+			return (mstate) _mem[1];
+		}
+		else return 0;
+#else
+#if USE_ALLOCATOR==0
+		/* Fail everything */
+		return 0;
+#elif USE_ALLOCATOR==1
+#ifdef ENABLE_FAST_HEAP_DETECTION
+#ifdef WIN32
+		/*  On Windows for RELEASE both x86 and x64 the NT heap precedes each block with an eight byte header
+			which looks like:
+				normal: 4 bytes of size, 4 bytes of [char < 64, char < 64, char < 64 bit 0 always set, char random ]
+				mmapped: 4 bytes of size, 4 bytes of [zero,      zero,      0xb,                        zero        ]
+
+			On Windows for DEBUG both x86 and x64 the preceding four bytes are always 0xfdfdfdfd (no man's land).
+		*/
+#pragma pack(push, 1)
+		struct _HEAP_ENTRY
+		{
+			USHORT Size;
+			USHORT PreviousSize;
+			UCHAR Cookie;			/* SegmentIndex */
+			UCHAR Flags;			/* always bit 0 (HEAP_ENTRY_BUSY). bit 1=(HEAP_ENTRY_EXTRA_PRESENT), bit 2=normal block (HEAP_ENTRY_FILL_PATTERN), bit 3=mmap block (HEAP_ENTRY_VIRTUAL_ALLOC). Bit 4 (HEAP_ENTRY_LAST_ENTRY) could be set */
+			UCHAR UnusedBytes;
+			UCHAR SmallTagIndex;	/* fastbin index. Always one of 0x02, 0x03, 0x04 < 0x80 */
+		} *RESTRICT he=((struct _HEAP_ENTRY *) mem)-1;
+#pragma pack(pop)
+		unsigned int header=((unsigned int *)mem)[-1], mask1=0x8080E100, result1, mask2=0xFFFFFF06, result2;
+		result1=header & mask1;	/* Positive testing for NT heap */
+		result2=header & mask2;	/* Positive testing for dlmalloc */
+		if(result1==0x00000100 && result2!=0x00000102)
+		{	/* This is likely a NT heap block */
+			return 0;
+		}
+#endif
+#ifdef __linux__
+		/* On Linux glibc uses ptmalloc2 (really dlmalloc) just as we do, but prev_foot contains rubbish
+		when the preceding block is allocated because ptmalloc2 finds the local mstate by rounding the ptr
+		down to the nearest megabyte. It's like dlmalloc with FOOTERS disabled. */
+		mchunkptr p=mem2chunk(mem);
+		mstate fm=get_mstate_for(p);
+		/* If it's a ptmalloc2 block, fm is likely to be some crazy value */
+		if(!is_aligned(fm)) return 0;
+		if((size_t)mem-(size_t)fm>=(size_t)1<<(SIZE_T_BITSIZE-1)) return 0;
+		if(ok_magic(fm))
+			return fm;
+		else
+			return 0;
+		if(1) { }	/* dangling if: keeps the "else" below syntactically valid when __linux__ is defined */
+#endif
+		else
+		{
+			mchunkptr p=mem2chunk(mem);
+			mstate fm=get_mstate_for(p);
+			assert(ok_magic(fm));	/* If this fails, someone tried to free a block twice */
+			if(ok_magic(fm))
+				return fm;
+		}
+#else
+//#ifdef WIN32
+//		__try
+//#endif
+		{
+			/* We try to return zero here if it isn't one of our own blocks; however,
+			the current block annotation scheme used by dlmalloc makes it impossible
+			to be absolutely sure of avoiding a segfault.
+
+			mchunkptr->prev_foot = mem-(2*size_t) = mstate ^ mparams.magic for the PRECEDING block;
+			mchunkptr->head      = mem-(1*size_t) = the size of this block (a multiple of 8) with the bottom three bits = FLAG_BITS
+			    FLAG_BITS = bit 0 is CINUSE (currently in use unless mmapped), bit 1 is PINUSE (previous block currently
+				            in use unless mmapped), bit 2 is UNUSED and currently always zero.
+			*/
+			register void *RESTRICT leastusedaddress_=leastusedaddress;		/* Cache these to avoid register reloading */
+			register size_t largestusedblock_=largestusedblock;
+			if(!is_aligned(mem)) return 0;		/* Would fail very rarely as all allocators return aligned blocks */
+			if(mem<leastusedaddress_) return 0;	/* Simple but effective */
+			{
+				mchunkptr p=mem2chunk(mem);
+				mstate fm=0;
+				int ismmapped=is_mmapped(p);
+				if((!ismmapped && !is_inuse(p)) || (p->head & FLAG4_BIT)) return 0;
+				/* Reduced uncertainty by 0.5^2 = 25.0% */
+				/* size should never exceed largestusedblock */
+				if(chunksize(p)>largestusedblock_) return 0;
+				/* Reduced uncertainty by a minimum of 0.5^3 = 12.5%, maximum 0.5^16 = 0.0015% */
+				/* Having sanity checked prev_foot and head, check next block */
+				if(!ismmapped && (!next_pinuse(p) || (next_chunk(p)->head & FLAG4_BIT))) return 0;
+				/* Reduced uncertainty by 0.5^5 = 3.13% or 0.5^18 = 0.00038% */
+	#if 0
+				/* If previous block is free, check that its next block pointer equals us */
+				if(!ismmapped && !pinuse(p))
+					if(next_chunk(prev_chunk(p))!=p) return 0;
+				/* We could start comparing prev_foot's for similarity but it starts getting slow. */
+	#endif
+				fm = get_mstate_for(p);
+				if(!is_aligned(fm) || (void *)fm<leastusedaddress_) return 0;
+				if((size_t)mem-(size_t)fm>=(size_t)1<<(SIZE_T_BITSIZE-1)) return 0;
+				assert(ok_magic(fm));	/* If this fails, someone tried to free a block twice */
+				if(ok_magic(fm))
+					return fm;
+			}
+		}
+//#ifdef WIN32
+//		__except(1) { }
+//#endif
+#endif
+#endif
+#endif
+	}
+	return 0;
+}
+NEDMALLOCNOALIASATTR size_t nedblksize(int *RESTRICT isforeign, void *RESTRICT mem) THROWSPEC
+{
+	if(mem)
+	{
+		if(isforeign) *isforeign=1;
+#if USE_MAGIC_HEADERS
+		{
+			size_t *_mem=(size_t *) mem-3;
+			if(_mem[0]==*(size_t *) "NEDMALOC")
+			{
+				mstate mspace=(mstate) _mem[1];
+				size_t size=_mem[2];
+				if(isforeign) *isforeign=0;
+				return size;
+			}
+		}
+#elif USE_ALLOCATOR==1
+		if(nedblkmstate(mem))
+		{
+			mchunkptr p=mem2chunk(mem);
+			if(isforeign) *isforeign=0;
+			return chunksize(p)-overhead_for(p);
+		}
+#ifdef DEBUG
+		else
+		{
+			int a=1; /* Set breakpoints here if needed */
+		}
+#endif
+#endif
+#if defined(ENABLE_TOLERANT_NEDMALLOC) || USE_ALLOCATOR==0
+#ifdef _MSC_VER
+		/* This is the MSVCRT equivalent */
+		return _msize(mem);
+#elif defined(__linux__)
+		/* This is the glibc/ptmalloc2/dlmalloc equivalent.  */
+		return malloc_usable_size(mem);
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+		/* This is the BSD libc equivalent.  */
+		return malloc_size(mem);
+#else
+#error Cannot tolerate the memory allocator of an unknown system!
+#endif
+#endif
+	}
+	return 0;
+}
+
+NEDMALLOCNOALIASATTR void nedsetvalue(void *v) THROWSPEC											{ nedpsetvalue((nedpool *) 0, v); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmalloc(size_t size) THROWSPEC						{ return nedpmalloc((nedpool *) 0, size); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedcalloc(size_t no, size_t size) THROWSPEC			{ return nedpcalloc((nedpool *) 0, no, size); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedrealloc(void *mem, size_t size) THROWSPEC			{ return nedprealloc((nedpool *) 0, mem, size); }
+NEDMALLOCNOALIASATTR void   nedfree(void *mem) THROWSPEC											{ nedpfree((nedpool *) 0, mem); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void * nedmemalign(size_t alignment, size_t bytes) THROWSPEC	{ return nedpmemalign((nedpool *) 0, alignment, bytes); }
+NEDMALLOCNOALIASATTR struct nedmallinfo nedmallinfo(void) THROWSPEC									{ return nedpmallinfo((nedpool *) 0); }
+NEDMALLOCNOALIASATTR int    nedmallopt(int parno, int value) THROWSPEC								{ return nedpmallopt((nedpool *) 0, parno, value); }
+NEDMALLOCNOALIASATTR int    nedmalloc_trim(size_t pad) THROWSPEC									{ return nedpmalloc_trim((nedpool *) 0, pad); }
+void   nedmalloc_stats() THROWSPEC																	{ nedpmalloc_stats((nedpool *) 0); }
+NEDMALLOCNOALIASATTR size_t nedmalloc_footprint() THROWSPEC											{ return nedpmalloc_footprint((nedpool *) 0); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_calloc(size_t elemsno, size_t elemsize, void **chunks) THROWSPEC	{ return nedpindependent_calloc((nedpool *) 0, elemsno, elemsize, chunks); }
+NEDMALLOCNOALIASATTR NEDMALLOCPTRATTR void **nedindependent_comalloc(size_t elems, size_t *sizes, void **chunks) THROWSPEC		{ return nedpindependent_comalloc((nedpool *) 0, elems, sizes, chunks); }
+
+struct threadcacheblk_t;
+typedef struct threadcacheblk_t threadcacheblk;
+struct threadcacheblk_t
+{	/* Keep less than 16 bytes on 32 bit systems and 32 bytes on 64 bit systems */
+#ifdef FULLSANITYCHECKS
+	unsigned int magic;
+#endif
+	unsigned int lastUsed, size;
+	threadcacheblk *next, *prev;
+};
+typedef struct threadcache_t
+{
+#ifdef FULLSANITYCHECKS
+	unsigned int magic1;
+#endif
+	int mymspace;						/* Last mspace entry this thread used */
+	long threadid;
+	unsigned int mallocs, frees, successes;
+	size_t freeInCache;					/* How much free space is stored in this cache */
+	threadcacheblk *bins[(THREADCACHEMAXBINS+1)*2];
+#ifdef FULLSANITYCHECKS
+	unsigned int magic2;
+#endif
+} threadcache;
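+
+/* Each bin is a pair of pointers into a doubly linked list of free blocks:
+bins[idx*2] is the head (most recently freed) and bins[idx*2+1] the tail
+(least recently used), so threadcache_malloc() pops from the head while
+RemoveCacheEntries() ages blocks out from the tail. */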
+struct nedpool_t
+{
+	MLOCK_T mutex;
+	void *uservalue;
+	int threads;						/* Max entries in m to use */
+	threadcache *caches[THREADCACHEMAXCACHES];
+	TLSVAR mycache;						/* Thread cache for this thread. 0 for unset, negative for use mspace-1 directly, otherwise is cache-1 */
+	mstate m[MAXTHREADSINPOOL+1];		/* mspace entries for this pool */
+};
+static nedpool syspool;
+
+static FORCEINLINE NEDMALLOCNOALIASATTR unsigned int size2binidx(size_t _size) THROWSPEC
+{	/* 8=1000	16=10000	20=10100	24=11000	32=100000	48=110000	4096=1000000000000 */
+	unsigned int topbit, size=(unsigned int)(_size>>4);
+	/* 16=1		20=1	24=1	32=10	48=11	64=100	96=110	128=1000	4096=100000000 */
+
+#if defined(__GNUC__)
+        topbit = sizeof(size)*__CHAR_BIT__ - 1 - __builtin_clz(size);
+#elif defined(_MSC_VER) && _MSC_VER>=1300
+	{
+            unsigned long bsrTopBit;
+
+            _BitScanReverse(&bsrTopBit, size);
+
+            topbit = bsrTopBit;
+        }
+#else
+#if 0
+	union {
+		unsigned asInt[2];
+		double asDouble;
+	};
+	int n;
+
+	asDouble = (double)size + 0.5;
+	topbit = (asInt[!FOX_BIGENDIAN] >> 20) - 1023;
+#else
+	{
+		unsigned int x=size;
+		x = x | (x >> 1);
+		x = x | (x >> 2);
+		x = x | (x >> 4);
+		x = x | (x >> 8);
+		x = x | (x >>16);
+		x = ~x;
+		x = x - ((x >> 1) & 0x55555555);
+		x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+		x = (x + (x >> 4)) & 0x0F0F0F0F;
+		x = x + (x << 8);
+		x = x + (x << 16);
+		topbit=31 - (x >> 24);
+	}
+#endif
+#endif
+	return topbit;
+}
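+
+/* Worked example: a 100 byte request gives size=100>>4=6 (binary 110), so
+topbit=2. threadcache_malloc() below then computes bestsize=1<<(2+4)=64 and,
+as 100>64, bumps to idx=3 with bestsize=128: 100 byte requests live in the
+128 byte bin. */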
+
+
+#ifdef FULLSANITYCHECKS
+static void tcsanitycheck(threadcacheblk **ptr) THROWSPEC
+{
+	assert((ptr[0] && ptr[1]) || (!ptr[0] && !ptr[1]));
+	if(ptr[0] && ptr[1])
+	{
+		assert(nedblksize(ptr[0])>=sizeof(threadcacheblk));
+		assert(nedblksize(ptr[1])>=sizeof(threadcacheblk));
+		assert(*(unsigned int *) "NEDN"==ptr[0]->magic);
+		assert(*(unsigned int *) "NEDN"==ptr[1]->magic);
+		assert(!ptr[0]->prev);
+		assert(!ptr[1]->next);
+		if(ptr[0]==ptr[1])
+		{
+			assert(!ptr[0]->next);
+			assert(!ptr[1]->prev);
+		}
+	}
+}
+static void tcfullsanitycheck(threadcache *tc) THROWSPEC
+{
+	threadcacheblk **tcbptr=tc->bins;
+	int n;
+	for(n=0; n<=THREADCACHEMAXBINS; n++, tcbptr+=2)
+	{
+		threadcacheblk *b, *ob=0;
+		tcsanitycheck(tcbptr);
+		for(b=tcbptr[0]; b; ob=b, b=b->next)
+		{
+			assert(*(unsigned int *) "NEDN"==b->magic);
+			assert(!ob || ob->next==b);
+			assert(!ob || b->prev==ob);
+		}
+	}
+}
+#endif
+
+static NOINLINE void RemoveCacheEntries(nedpool *RESTRICT p, threadcache *RESTRICT tc, unsigned int age) THROWSPEC
+{
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	if(tc->freeInCache)
+	{
+		threadcacheblk **tcbptr=tc->bins;
+		int n;
+		for(n=0; n<=THREADCACHEMAXBINS; n++, tcbptr+=2)
+		{
+			threadcacheblk **tcb=tcbptr+1;		/* come from oldest end of list */
+			/*tcsanitycheck(tcbptr);*/
+			for(; *tcb && tc->frees-(*tcb)->lastUsed>=age; )
+			{
+				threadcacheblk *f=*tcb;
+				size_t blksize=f->size; /*nedblksize(f);*/
+				assert(blksize<=nedblksize(0, f));
+				assert(blksize);
+#ifdef FULLSANITYCHECKS
+				assert(*(unsigned int *) "NEDN"==(*tcb)->magic);
+#endif
+				*tcb=(*tcb)->prev;
+				if(*tcb)
+					(*tcb)->next=0;
+				else
+					*tcbptr=0;
+				tc->freeInCache-=blksize;
+				assert((long) tc->freeInCache>=0);
+				CallFree(0, f, 0);
+				/*tcsanitycheck(tcbptr);*/
+			}
+		}
+	}
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+}
+static void DestroyCaches(nedpool *RESTRICT p) THROWSPEC
+{
+	if(p->caches)
+	{
+		threadcache *tc;
+		int n;
+		for(n=0; n<THREADCACHEMAXCACHES; n++)
+		{
+			if((tc=p->caches[n]))
+			{
+				tc->frees++;
+				RemoveCacheEntries(p, tc, 0);
+				assert(!tc->freeInCache);
+				tc->mymspace=-1;
+				tc->threadid=0;
+				CallFree(0, tc, 0);
+				p->caches[n]=0;
+			}
+		}
+	}
+}
+
+static NOINLINE threadcache *AllocCache(nedpool *RESTRICT p) THROWSPEC
+{
+	threadcache *tc=0;
+	int n, end;
+	ACQUIRE_LOCK(&p->mutex);
+	for(n=0; n<THREADCACHEMAXCACHES && p->caches[n]; n++);
+	if(THREADCACHEMAXCACHES==n)
+	{	/* List exhausted, so disable for this thread */
+		RELEASE_LOCK(&p->mutex);
+		return 0;
+	}
+	tc=p->caches[n]=(threadcache *) CallCalloc(p->m[0], sizeof(threadcache), 0);
+	if(!tc)
+	{
+		RELEASE_LOCK(&p->mutex);
+		return 0;
+	}
+#ifdef FULLSANITYCHECKS
+	tc->magic1=*(unsigned int *)"NEDMALC1";
+	tc->magic2=*(unsigned int *)"NEDMALC2";
+#endif
+	tc->threadid=(long)(size_t)CURRENT_THREAD;
+	for(end=0; p->m[end]; end++);
+	tc->mymspace=abs(tc->threadid) % end;
+	RELEASE_LOCK(&p->mutex);
+	if(TLSSET(p->mycache, (void *)(size_t)(n+1))) abort();
+	return tc;
+}
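+
+/* The modulo above spreads new threads across the pool's mspaces by thread
+id, e.g. with end=4 mspaces, thread ids 5 and 6 start on mspaces 1 and 2;
+if that initial choice contends, FindMSpace() below migrates the thread to
+an unlocked mspace. */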
+
+static void *threadcache_malloc(nedpool *RESTRICT p, threadcache *RESTRICT tc, size_t *RESTRICT _size) THROWSPEC
+{
+	void *RESTRICT ret=0;
+	size_t size=*_size, blksize=0;
+	unsigned int bestsize;
+	unsigned int idx=size2binidx(size);
+	threadcacheblk *RESTRICT blk, **RESTRICT binsptr;
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	/* Calculate best fit bin size */
+	bestsize=1<<(idx+4);
+#if 0
+	/* Finer grained bin fit */
+	idx<<=1;
+	if(size>bestsize)
+	{
+		idx++;
+		bestsize+=bestsize>>1;
+	}
+	if(size>bestsize)
+	{
+		idx++;
+		bestsize=1<<(4+(idx>>1));
+	}
+#else
+	if(size>bestsize)
+	{
+		idx++;
+		bestsize<<=1;
+	}
+#endif
+	assert(bestsize>=size);
+	if(size<bestsize) size=bestsize;
+	assert(size<=THREADCACHEMAX);
+	assert(idx<=THREADCACHEMAXBINS);
+	binsptr=&tc->bins[idx*2];
+	/* Try to match close, but move up a bin if necessary */
+	blk=*binsptr;
+	if(!blk || blk->size<size)
+	{	/* Bump it up a bin */
+		if(idx<THREADCACHEMAXBINS)
+		{
+			idx++;
+			binsptr+=2;
+			blk=*binsptr;
+		}
+	}
+	if(blk)
+	{
+		blksize=blk->size; /*nedblksize(blk);*/
+		assert(nedblksize(0, blk)>=blksize);
+		assert(blksize>=size);
+		if(blk->next)
+			blk->next->prev=0;
+		*binsptr=blk->next;
+		if(!*binsptr)
+			binsptr[1]=0;
+#ifdef FULLSANITYCHECKS
+		blk->magic=0;
+#endif
+		assert(binsptr[0]!=blk && binsptr[1]!=blk);
+		assert(nedblksize(0, blk)>=sizeof(threadcacheblk) && nedblksize(0, blk)<=THREADCACHEMAX+CHUNK_OVERHEAD);
+		/*printf("malloc: %p, %p, %p, %lu\n", p, tc, blk, (long) _size);*/
+		ret=(void *) blk;
+	}
+	++tc->mallocs;
+	if(ret)
+	{
+		assert(blksize>=size);
+		++tc->successes;
+		tc->freeInCache-=blksize;
+		assert((long) tc->freeInCache>=0);
+	}
+#if defined(DEBUG) && 0
+	if(!(tc->mallocs & 0xfff))
+	{
+		printf("*** threadcache=%u, mallocs=%u (%f), free=%u (%f), freeInCache=%u\n", (unsigned int) tc->threadid, tc->mallocs,
+			(float) tc->successes/tc->mallocs, tc->frees, (float) tc->successes/tc->frees, (unsigned int) tc->freeInCache);
+	}
+#endif
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	*_size=size;
+	return ret;
+}
+static NOINLINE void ReleaseFreeInCache(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace) THROWSPEC
+{
+	unsigned int age=THREADCACHEMAXFREESPACE/8192;
+	/*ACQUIRE_LOCK(&p->m[mymspace]->mutex);*/
+	while(age && tc->freeInCache>=THREADCACHEMAXFREESPACE)
+	{
+		RemoveCacheEntries(p, tc, age);
+		/*printf("*** Removing cache entries older than %u (%u)\n", age, (unsigned int) tc->freeInCache);*/
+		age>>=1;
+	}
+	/*RELEASE_LOCK(&p->m[mymspace]->mutex);*/
+}
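+
+/* With the default THREADCACHEMAXFREESPACE of 512Kb the first pass evicts
+blocks not reused within the last 64 frees (512*1024/8192), then 32, 16 and
+so on, halving the age window until enough space has been returned or age
+reaches zero. */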
+static void threadcache_free(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace, void *RESTRICT mem, size_t size) THROWSPEC
+{
+	unsigned int bestsize;
+	unsigned int idx=size2binidx(size);
+	threadcacheblk **RESTRICT binsptr, *RESTRICT tck=(threadcacheblk *) mem;
+	assert(size>=sizeof(threadcacheblk) && size<=THREADCACHEMAX+CHUNK_OVERHEAD);
+#ifdef DEBUG
+	/* Make sure this is a valid memory block */
+	assert(nedblksize(0, mem));
+#endif
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	/* Calculate best fit bin size */
+	bestsize=1<<(idx+4);
+#if 0
+	/* Finer grained bin fit */
+	idx<<=1;
+	if(size>bestsize)
+	{
+		unsigned int biggerbestsize=bestsize+(bestsize>>1);	/* 1.5x bestsize, as in threadcache_malloc(); parentheses needed since + binds tighter than a shift */
+		if(size>=biggerbestsize)
+		{
+			idx++;
+			bestsize=biggerbestsize;
+		}
+	}
+#endif
+	if(bestsize!=size)	/* dlmalloc can round up, so we round down to preserve indexing */
+		size=bestsize;
+	binsptr=&tc->bins[idx*2];
+	assert(idx<=THREADCACHEMAXBINS);
+	if(tck==*binsptr)
+	{
+		fprintf(stderr, "nedmalloc: Attempt to free already freed memory block %p - aborting!\n", tck);
+		abort();
+	}
+#ifdef FULLSANITYCHECKS
+	tck->magic=*(unsigned int *) "NEDN";
+#endif
+	tck->lastUsed=++tc->frees;
+	tck->size=(unsigned int) size;
+	tck->next=*binsptr;
+	tck->prev=0;
+	if(tck->next)
+		tck->next->prev=tck;
+	else
+		binsptr[1]=tck;
+	assert(!*binsptr || (*binsptr)->size==tck->size);
+	*binsptr=tck;
+	assert(tck==tc->bins[idx*2]);
+	assert(tc->bins[idx*2+1]==tck || binsptr[0]->next->prev==tck);
+	/*printf("free: %p, %p, %p, %lu\n", p, tc, mem, (long) size);*/
+	tc->freeInCache+=size;
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+#if 1
+	if(tc->freeInCache>=THREADCACHEMAXFREESPACE)
+		ReleaseFreeInCache(p, tc, mymspace);
+#endif
+}
+
+
+
+
+static NOINLINE int InitPool(nedpool *RESTRICT p, size_t capacity, int threads) THROWSPEC
+{	/* threads is -1 for system pool */
+	ensure_initialization();
+	ACQUIRE_MALLOC_GLOBAL_LOCK();
+	if(p->threads) goto done;
+	if(INITIAL_LOCK(&p->mutex)) goto err;
+	if(TLSALLOC(&p->mycache)) goto err;
+#if USE_ALLOCATOR==0
+	p->m[0]=(mstate) mspacecounter++;
+#elif USE_ALLOCATOR==1
+	if(!(p->m[0]=(mstate) create_mspace(capacity, 1))) goto err;
+	p->m[0]->extp=p;
+#endif
+	p->threads=(threads<1 || threads>MAXTHREADSINPOOL) ? MAXTHREADSINPOOL : threads;
+done:
+	RELEASE_MALLOC_GLOBAL_LOCK();
+	return 1;
+err:
+	if(threads<0)
+		abort();			/* If you can't allocate for system pool, we're screwed */
+	DestroyCaches(p);
+	if(p->m[0])
+	{
+#if USE_ALLOCATOR==1
+		destroy_mspace(p->m[0]);
+#endif
+		p->m[0]=0;
+	}
+	if(p->mycache)
+	{
+		if(TLSFREE(p->mycache)) abort();
+		p->mycache=0;
+	}
+	RELEASE_MALLOC_GLOBAL_LOCK();
+	return 0;
+}
+static NOINLINE mstate FindMSpace(nedpool *RESTRICT p, threadcache *RESTRICT tc, int *RESTRICT lastUsed, size_t size) THROWSPEC
+{	/* Gets called when the thread's last used mspace is already locked. The
+	strategy is to run through the list of all available mspaces looking for
+	an unlocked one; if that fails, we create a new one, so long as we don't
+	exceed p->threads */
+	int n, end;
+	for(n=end=*lastUsed+1; p->m[n]; end=++n)
+	{
+		if(TRY_LOCK(&p->m[n]->mutex)) goto found;
+	}
+	for(n=0; n<*lastUsed && p->m[n]; n++)
+	{
+		if(TRY_LOCK(&p->m[n]->mutex)) goto found;
+	}
+	if(end<p->threads)
+	{
+		mstate temp;
+#if USE_ALLOCATOR==0
+		temp=(mstate) mspacecounter++;
+#elif USE_ALLOCATOR==1
+		if(!(temp=(mstate) create_mspace(size, 1)))
+			goto badexit;
+#endif
+		/* Now we're ready to modify the lists, we lock */
+		ACQUIRE_LOCK(&p->mutex);
+		while(p->m[end] && end<p->threads)
+			end++;
+		if(end>=p->threads)
+		{	/* Drat, must destroy it now */
+			RELEASE_LOCK(&p->mutex);
+#if USE_ALLOCATOR==1
+			destroy_mspace((mstate) temp);
+#endif
+			goto badexit;
+		}
+		/* We really want to make sure this goes into memory now but we
+		have to be careful of breaking aliasing rules, so write it twice */
+		*((volatile struct malloc_state **) &p->m[end])=p->m[end]=temp;
+		ACQUIRE_LOCK(&p->m[end]->mutex);
+		/*printf("Created mspace idx %d\n", end);*/
+		RELEASE_LOCK(&p->mutex);
+		n=end;
+		goto found;
+	}
+	/* Let it lock on the last one it used */
+badexit:
+	ACQUIRE_LOCK(&p->m[*lastUsed]->mutex);
+	return p->m[*lastUsed];
+found:
+	*lastUsed=n;
+	if(tc)
+		tc->mymspace=n;
+	else
+	{
+		if(TLSSET(p->mycache, (void *)(size_t)(-(n+1)))) abort();
+	}
+	return p->m[n];
+}
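+
+/* Probe order example: with *lastUsed=2 and mspaces m[0..3] in use, the
+loops above try m[3], then m[0] and m[1] (the contended m[2] is skipped),
+and only then create a fresh mspace if end<p->threads permits. */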
+
+typedef struct PoolList_t
+{
+	size_t size;			/* Size of list */
+	size_t length;			/* Actual entries in list */
+#ifdef DEBUG
+	nedpool *list[1];		/* Force testing of list expansion */
+#else
+	nedpool *list[16];
+#endif
+} PoolList;
+static MLOCK_T poollistlock;
+static PoolList *poollist;
+NEDMALLOCPTRATTR nedpool *nedcreatepool(size_t capacity, int threads) THROWSPEC
+{
+	nedpool *ret=0;
+	if(!poollist)
+	{
+		PoolList *newpoollist=0;
+		if(!(newpoollist=(PoolList *) nedpcalloc(0, 1, sizeof(PoolList)+sizeof(nedpool *)))) return 0;
+		INITIAL_LOCK(&poollistlock);
+		ACQUIRE_LOCK(&poollistlock);
+		poollist=newpoollist;
+		poollist->size=sizeof(poollist->list)/sizeof(nedpool *);
+	}
+	else
+		ACQUIRE_LOCK(&poollistlock);
+	if(poollist->length==poollist->size)
+	{
+		PoolList *newpoollist=0;
+		size_t newsize=0;
+		newsize=sizeof(PoolList)+(poollist->size+1)*sizeof(nedpool *);
+		if(!(newpoollist=(PoolList *) nedprealloc(0, poollist, newsize))) goto badexit;
+		poollist=newpoollist;
+		memset(&poollist->list[poollist->size], 0, newsize-((size_t)&poollist->list[poollist->size]-(size_t)&poollist->list[0]));
+		poollist->size=((newsize-((char *)&poollist->list[0]-(char *)poollist))/sizeof(nedpool *))-1;
+		assert(poollist->size>poollist->length);
+	}
+	if(!(ret=(nedpool *) nedpcalloc(0, 1, sizeof(nedpool)))) goto badexit;
+	if(!InitPool(ret, capacity, threads))
+	{
+		nedpfree(0, ret);
+		goto badexit;
+	}
+	poollist->list[poollist->length++]=ret;
+badexit:
+	RELEASE_LOCK(&poollistlock);
+	return ret;
+}
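+
+/* Usage sketch (names and sizes below are illustrative only):
+
+       nedpool *pool=nedcreatepool(1024*1024, 4); // capacity hint, <=4 threads
+       if(pool)
+       {
+           void *buf=nedpmalloc(pool, 256);
+           nedpfree(pool, buf);
+           neddestroypool(pool);
+       }
+*/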
+void neddestroypool(nedpool *p) THROWSPEC
+{
+	unsigned int n;
+	ACQUIRE_LOCK(&p->mutex);
+	DestroyCaches(p);
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1
+		destroy_mspace(p->m[n]);
+#endif
+		p->m[n]=0;
+	}
+	RELEASE_LOCK(&p->mutex);
+	if(TLSFREE(p->mycache)) abort();
+	nedpfree(0, p);
+	ACQUIRE_LOCK(&poollistlock);
+	assert(poollist);
+	for(n=0; n<poollist->length && poollist->list[n]!=p; n++);
+	assert(n!=poollist->length);
+	memmove(&poollist->list[n], &poollist->list[n+1], (size_t)&poollist->list[poollist->length]-(size_t)&poollist->list[n]);
+	if(!--poollist->length)
+	{
+		assert(!poollist->list[0]);
+		nedpfree(0, poollist);
+		poollist=0;
+	}
+	RELEASE_LOCK(&poollistlock);
+}
+void neddestroysyspool() THROWSPEC
+{
+	nedpool *p=&syspool;
+	int n;
+	ACQUIRE_LOCK(&p->mutex);
+	DestroyCaches(p);
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1
+		destroy_mspace(p->m[n]);
+#endif
+		p->m[n]=0;
+	}
+	/* Render syspool unusable */
+	for(n=0; n<THREADCACHEMAXCACHES; n++)
+		p->caches[n]=(threadcache *)(size_t)(sizeof(size_t)>4 ? 0xdeadbeefdeadbeefULL : 0xdeadbeefUL);
+	for(n=0; n<MAXTHREADSINPOOL+1; n++)
+		p->m[n]=(mstate)(size_t)(sizeof(size_t)>4 ? 0xdeadbeefdeadbeefULL : 0xdeadbeefUL);
+	if(TLSFREE(p->mycache)) abort();
+	RELEASE_LOCK(&p->mutex);
+}
+nedpool **nedpoollist() THROWSPEC
+{
+	nedpool **ret=0;
+	if(poollist)
+	{
+		ACQUIRE_LOCK(&poollistlock);
+		if(!(ret=(nedpool **) nedmalloc((poollist->length+1)*sizeof(nedpool *)))) goto badexit;
+		memcpy(ret, poollist->list, (poollist->length+1)*sizeof(nedpool *));
+badexit:
+		RELEASE_LOCK(&poollistlock);
+	}
+	return ret;
+}
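+
+/* Note: the returned array is nedmalloc()ed, NULL terminated (the extra
+   slot copied above is always zero) and must be released by the caller
+   with nedfree(). A sketch:
+
+       nedpool **pools=nedpoollist();
+       if(pools)
+       {
+           size_t n;
+           for(n=0; pools[n]; n++); // n now counts the live pools
+           nedfree(pools);
+       }
+*/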
+
+void nedpsetvalue(nedpool *p, void *v) THROWSPEC
+{
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	p->uservalue=v;
+}
+void *nedgetvalue(nedpool **p, void *mem) THROWSPEC
+{
+	nedpool *np=0;
+	mstate fm=nedblkmstate(mem);
+	if(!fm || !fm->extp) return 0;
+	np=(nedpool *) fm->extp;
+	if(p) *p=np;
+	return np->uservalue;
+}
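+
+/* Sketch (pool, my_context and block are illustrative names): uservalue
+   lets a pool carry arbitrary context that can later be recovered from a
+   block allocated out of it; this relies on mstate's extp field, so it
+   only works when USE_ALLOCATOR==1:
+
+       nedpool *owner;
+       nedpsetvalue(pool, my_context);
+       void *ctx=nedgetvalue(&owner, block); // ctx==my_context, owner==pool
+*/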
+
+void nedtrimthreadcache(nedpool *p, int disable) THROWSPEC
+{
+	int mycache;
+	if(!p)
+	{
+		p=&syspool;
+		if(!syspool.threads) InitPool(&syspool, 0, -1);
+	}
+	mycache=(int)(size_t) TLSGET(p->mycache);
+	if(!mycache)
+	{	/* Set to mspace 0 */
+		if(disable && TLSSET(p->mycache, (void *)(size_t)-1)) abort();
+	}
+	else if(mycache>0)
+	{	/* Set to last used mspace */
+		threadcache *tc=p->caches[mycache-1];
+#if defined(DEBUG)
+		printf("Threadcache utilisation: %lf%% in cache with %lf%% lost to other threads\n",
+			100.0*tc->successes/tc->mallocs, 100.0*((double) tc->mallocs-tc->frees)/tc->mallocs);
+#endif
+		if(disable && TLSSET(p->mycache, (void *)(size_t)(-tc->mymspace))) abort();
+		tc->frees++;
+		RemoveCacheEntries(p, tc, 0);
+		assert(!tc->freeInCache);
+		if(disable)
+		{
+			tc->mymspace=-1;
+			tc->threadid=0;
+			CallFree(0, p->caches[mycache-1], 0);
+			p->caches[mycache-1]=0;
+		}
+	}
+}
+void neddisablethreadcache(nedpool *p) THROWSPEC
+{
+	nedtrimthreadcache(p, 1);
+}
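+
+/* Sketch: a thread that is about to exit can hand its cached blocks back
+   so they are not stranded until pool destruction:
+
+       neddisablethreadcache(0); // 0 selects the system pool
+*/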
+
+#define GETMSPACE(m,p,tc,ms,s,action)                 \
+  do                                                  \
+  {                                                   \
+    mstate m = GetMSpace((p),(tc),(ms),(s));          \
+    action;                                           \
+    if(USE_ALLOCATOR==1) { RELEASE_LOCK(&m->mutex); } \
+  } while (0)
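+
+/* GETMSPACE runs `action' with mspace `m' locked (GetMSpace returns it
+   already locked when USE_ALLOCATOR==1) and releases the lock afterwards;
+   the if() costs nothing since USE_ALLOCATOR is a compile-time constant. */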
+
+static FORCEINLINE mstate GetMSpace(nedpool *RESTRICT p, threadcache *RESTRICT tc, int mymspace, size_t size) THROWSPEC
+{	/* Returns a locked and ready for use mspace */
+	mstate m=p->m[mymspace];
+	assert(m);
+#if USE_ALLOCATOR==1
+	if(!TRY_LOCK(&p->m[mymspace]->mutex)) m=FindMSpace(p, tc, &mymspace, size);
+	/*assert(IS_LOCKED(&p->m[mymspace]->mutex));*/
+#endif
+	return m;
+}
+static NOINLINE void GetThreadCache_cold1(nedpool *RESTRICT *RESTRICT p) THROWSPEC
+{
+	*p=&syspool;
+	if(!syspool.threads) InitPool(&syspool, 0, -1);
+}
+static NOINLINE void GetThreadCache_cold2(nedpool *RESTRICT *RESTRICT p, threadcache *RESTRICT *RESTRICT tc, int *RESTRICT mymspace, int mycache) THROWSPEC
+{
+	if(!mycache)
+	{	/* Need to allocate a new cache */
+		*tc=AllocCache(*p);
+		if(!*tc)
+		{	/* Disable */
+			if(TLSSET((*p)->mycache, (void *)(size_t)-1)) abort();
+			*mymspace=0;
+		}
+		else
+			*mymspace=(*tc)->mymspace;
+	}
+	else
+	{	/* Cache disabled, but we do have an assigned thread pool */
+		*tc=0;
+		*mymspace=-mycache-1;
+	}
+}
+static FORCEINLINE void GetThreadCache(nedpool *RESTRICT *RESTRICT p, threadcache *RESTRICT *RESTRICT tc, int *RESTRICT mymspace, size_t *RESTRICT size) THROWSPEC
+{
+	int mycache;
+	if(size && *size<sizeof(threadcacheblk)) *size=sizeof(threadcacheblk);
+	if(!*p)
+		GetThreadCache_cold1(p);
+	mycache=(int)(size_t) TLSGET((*p)->mycache);
+	if(mycache>0)
+	{	/* Already have a cache */
+		*tc=(*p)->caches[mycache-1];
+		*mymspace=(*tc)->mymspace;
+	}
+	else GetThreadCache_cold2(p, tc, mymspace, mycache);
+	assert(*mymspace>=0);
+	assert(!(*tc) || (long)(size_t)CURRENT_THREAD==(*tc)->threadid);
+#ifdef FULLSANITYCHECKS
+	if(*tc)
+	{
+		if(*(unsigned int *)"NEDMALC1"!=(*tc)->magic1 || *(unsigned int *)"NEDMALC2"!=(*tc)->magic2)
+		{
+			abort();
+		}
+	}
+#endif
+}
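+
+/* The TLS value read above encodes three states:
+     0   - no threadcache allocated for this thread yet,
+     n>0 - use p->caches[n-1],
+     n<0 - threadcache disabled; allocate directly from mspace -n-1. */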
+
+NEDMALLOCPTRATTR void * nedpmalloc(nedpool *p, size_t size) THROWSPEC
+{
+	void *ret=0;
+	threadcache *tc;
+	int mymspace;
+	GetThreadCache(&p, &tc, &mymspace, &size);
+#if THREADCACHEMAX
+	if(tc && size<=THREADCACHEMAX)
+	{	/* Use the thread cache */
+		ret=threadcache_malloc(p, tc, &size);
+	}
+#endif
+	if(!ret)
+	{	/* Use this thread's mspace */
+		GETMSPACE(m, p, tc, mymspace, size,
+		          ret=CallMalloc(m, size, 0));
+	}
+	return ret;
+}
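+
+/* Usage sketch: passing p==0 allocates from the system pool; requests of
+   THREADCACHEMAX bytes or less are served from the calling thread's cache
+   when one exists, avoiding the mspace lock entirely:
+
+       void *tmp=nedpmalloc(0, 128);
+       nedpfree(0, tmp);
+*/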
+NEDMALLOCPTRATTR void * nedpcalloc(nedpool *p, size_t no, size_t size) THROWSPEC
+{
+	size_t rsize=size*no;
+	void *ret=0;
+	threadcache *tc;
+	int mymspace;
+	GetThreadCache(&p, &tc, &mymspace, &rsize);
+#if THREADCACHEMAX
+	if(tc && rsize<=THREADCACHEMAX)
+	{	/* Use the thread cache */
+		if((ret=threadcache_malloc(p, tc, &rsize)))
+			memset(ret, 0, rsize);
+	}
+#endif
+	if(!ret)
+	{	/* Use this thread's mspace */
+		GETMSPACE(m, p, tc, mymspace, rsize,
+		          ret=CallCalloc(m, rsize, 0));
+	}
+	return ret;
+}
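+
+/* Note: the size*no multiplication above appears to be unchecked for
+   overflow, so callers passing untrusted element counts may wish to
+   reject the request themselves when size!=0 && no>(size_t)-1/size. */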
+NEDMALLOCPTRATTR void * nedprealloc(nedpool *p, void *mem, size_t size) THROWSPEC
+{
+	void *ret=0;
+	threadcache *tc;
+	int mymspace, isforeign=1;
+	size_t memsize;
+	if(!mem) return nedpmalloc(p, size);
+	memsize=nedblksize(&isforeign, mem);
+	assert(memsize);
+	if(!memsize)
+	{
+		fprintf(stderr, "nedmalloc: nedprealloc() called with a block not created by nedmalloc!\n");
+		abort();
+	}
+	else if(size<=memsize && memsize-size<
+#ifdef DEBUG
+		32
+#else
+		1024
+#endif
+		)		/* If the new size is equal to or up to 1Kb (32 bytes in DEBUG builds) smaller than the existing block, make it a no-op */
+		return mem;
+	GetThreadCache(&p, &tc, &mymspace, &size);
+#if THREADCACHEMAX
+	if(tc && size && size<=THREADCACHEMAX)
+	{	/* Use the thread cache */
+		if((ret=threadcache_malloc(p, tc, &size)))
+		{
+			memcpy(ret, mem, memsize<size ? memsize : size);
+			if(memsize>=sizeof(threadcacheblk) && memsize<=(THREADCACHEMAX+CHUNK_OVERHEAD))
+				threadcache_free(p, tc, mymspace, mem, memsize);
+			else
+				CallFree(0, mem, isforeign);
+		}
+	}
+#endif
+	if(!ret)
+	{	/* Reallocs always happen in the mspace they happened in, so skip
+		locking the preferred mspace for this thread */
+		ret=CallRealloc(p->m[mymspace], mem, isforeign, memsize, size);
+	}
+	return ret;
+}
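+
+/* Sketch of the shrink fast path above (sizes illustrative): shrinking by
+   less than 1Kb returns the original pointer untouched,
+
+       void *a=nedpmalloc(0, 4096);
+       void *b=nedprealloc(0, a, 4000); // b==a: the 96 byte shrink is absorbed
+*/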
+void   nedpfree(nedpool *p, void *mem) THROWSPEC
+{	/* Frees always happen in the mspace they happened in, so skip
+	locking the preferred mspace for this thread */
+	threadcache *tc;
+	int mymspace, isforeign=1;
+	size_t memsize;
+	if(!mem)
+	{	/* If you tried this on FreeBSD you'd be sorry! */
+#ifdef DEBUG
+		fprintf(stderr, "nedmalloc: WARNING nedpfree() called with zero. This is not portable behaviour!\n");
+#endif
+		return;
+	}
+	memsize=nedblksize(&isforeign, mem);
+	assert(memsize);
+	if(!memsize)
+	{
+		fprintf(stderr, "nedmalloc: nedpfree() called with a block not created by nedmalloc!\n");
+		abort();
+	}
+	GetThreadCache(&p, &tc, &mymspace, 0);
+#if THREADCACHEMAX
+	if(mem && tc && memsize>=sizeof(threadcacheblk) && memsize<=(THREADCACHEMAX+CHUNK_OVERHEAD))
+		threadcache_free(p, tc, mymspace, mem, memsize);
+	else
+#endif
+		CallFree(0, mem, isforeign);
+}
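+
+/* Note: blocks between sizeof(threadcacheblk) and THREADCACHEMAX +
+   CHUNK_OVERHEAD bytes are parked in the thread cache above rather than
+   returned to their mspace immediately; anything else goes straight back
+   via CallFree(). */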
+NEDMALLOCPTRATTR void * nedpmemalign(nedpool *p, size_t alignment, size_t bytes) THROWSPEC
+{
+	void *ret;
+	threadcache *tc;
+	int mymspace;
+	GetThreadCache(&p, &tc, &mymspace, &bytes);
+	{	/* Use this thread's mspace */
+		GETMSPACE(m, p, tc, mymspace, bytes,
+		          ret=CallMalloc(m, bytes, alignment));
+	}
+	return ret;
+}
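+
+/* Sketch (as with dlmalloc's memalign, alignment should be a power of
+   two):
+
+       void *blk=nedpmemalign(0, 64, 1000); // 64 byte aligned block
+       nedpfree(0, blk);
+*/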
+struct nedmallinfo nedpmallinfo(nedpool *p) THROWSPEC
+{
+	int n;
+	struct nedmallinfo ret={0};
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1 && !NO_MALLINFO
+		struct mallinfo t=mspace_mallinfo(p->m[n]);
+		ret.arena+=t.arena;
+		ret.ordblks+=t.ordblks;
+		ret.hblkhd+=t.hblkhd;
+		ret.usmblks+=t.usmblks;
+		ret.uordblks+=t.uordblks;
+		ret.fordblks+=t.fordblks;
+		ret.keepcost+=t.keepcost;
+#endif
+	}
+	return ret;
+}
+int    nedpmallopt(nedpool *p, int parno, int value) THROWSPEC
+{
+#if USE_ALLOCATOR==1
+	return mspace_mallopt(parno, value);
+#else
+	return 0;
+#endif
+}
+NEDMALLOCNOALIASATTR void*  nedmalloc_internals(size_t *granularity, size_t *magic) THROWSPEC
+{
+#if USE_ALLOCATOR==1
+	if(granularity) *granularity=mparams.granularity;
+	if(magic) *magic=mparams.magic;
+	return (void *) &syspool;
+#else
+	if(granularity) *granularity=0;
+	if(magic) *magic=0;
+	return 0;
+#endif
+}
+int    nedpmalloc_trim(nedpool *p, size_t pad) THROWSPEC
+{
+	int n, ret=0;
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1
+		ret+=mspace_trim(p->m[n], pad);
+#endif
+	}
+	return ret;
+}
+void   nedpmalloc_stats(nedpool *p) THROWSPEC
+{
+	int n;
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1
+		mspace_malloc_stats(p->m[n]);
+#endif
+	}
+}
+size_t nedpmalloc_footprint(nedpool *p) THROWSPEC
+{
+	size_t ret=0;
+	int n;
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(n=0; p->m[n]; n++)
+	{
+#if USE_ALLOCATOR==1
+		ret+=mspace_footprint(p->m[n]);
+#endif
+	}
+	return ret;
+}
+NEDMALLOCPTRATTR void **nedpindependent_calloc(nedpool *p, size_t elemsno, size_t elemsize, void **chunks) THROWSPEC
+{
+	void **ret;
+	threadcache *tc;
+	int mymspace;
+	GetThreadCache(&p, &tc, &mymspace, &elemsize);
+#if USE_ALLOCATOR==0
+	GETMSPACE(m, p, tc, mymspace, elemsno*elemsize,
+	          ret=unsupported_operation("independent_calloc"));
+#elif USE_ALLOCATOR==1
+	GETMSPACE(m, p, tc, mymspace, elemsno*elemsize,
+	          ret=mspace_independent_calloc(m, elemsno, elemsize, chunks));
+#endif
+	return ret;
+}
+NEDMALLOCPTRATTR void **nedpindependent_comalloc(nedpool *p, size_t elems, size_t *sizes, void **chunks) THROWSPEC
+{
+	void **ret;
+	threadcache *tc;
+	int mymspace;
+	size_t i, *adjustedsizes=(size_t *) alloca(elems*sizeof(size_t));
+	if(!adjustedsizes) return 0;
+	for(i=0; i<elems; i++)
+		adjustedsizes[i]=sizes[i]<sizeof(threadcacheblk) ? sizeof(threadcacheblk) : sizes[i];
+	GetThreadCache(&p, &tc, &mymspace, 0);
+#if USE_ALLOCATOR==0
+	GETMSPACE(m, p, tc, mymspace, 0,
+	          ret=unsupported_operation("independent_comalloc"));
+#elif USE_ALLOCATOR==1
+	GETMSPACE(m, p, tc, mymspace, 0,
+	          ret=mspace_independent_comalloc(m, elems, adjustedsizes, chunks));
+#endif
+	return ret;
+}
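+
+/* Usage sketch (sizes illustrative): comalloc carves several blocks out
+   of one underlying chunk, which helps locality for linked structures;
+   each element may be freed individually with nedpfree():
+
+       size_t sizes[3]={32, 256, 1024};
+       void *chunks[3];
+       if(nedpindependent_comalloc(0, 3, sizes, chunks))
+       {
+           nedpfree(0, chunks[0]);
+           nedpfree(0, chunks[1]);
+           nedpfree(0, chunks[2]);
+       }
+*/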
+
+#if defined(__cplusplus)
+}
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif

Too many files changed in this diff; some files are not shown.