Sfoglia il codice sorgente

Merge branch 'master' into webgl-port

rdb 9 anni fa
parent
commit
40cc045a52
100 ha cambiato i file con 13923 aggiunte e 1828 eliminazioni
  1. 1 1
      direct/src/fsm/FSM.py
  2. 24 25
      direct/src/gui/DirectScrolledList.py
  3. 1 2
      direct/src/gui/OnscreenText.py
  4. 16 16
      direct/src/interval/MetaInterval.py
  5. 1 0
      direct/src/showbase/ShowBaseGlobal.py
  6. 0 374
      direct/src/showbase/pandaSqueezeTool.py
  7. 0 57
      direct/src/showbase/pandaSqueezer.py
  8. 17 3
      dtool/src/dtoolbase/pvector.h
  9. 3 2
      dtool/src/dtoolbase/typeRegistry.h
  10. 31 4
      dtool/src/interrogate/interfaceMakerPythonNative.cxx
  11. 4 1
      dtool/src/interrogate/interfaceMakerPythonNative.h
  12. 1 0
      dtool/src/parser-inc/sys/time.h
  13. 0 6
      makepanda/installer.nsi
  14. 23 12
      makepanda/makepanda.py
  15. 49 19
      makepanda/makepandacore.py
  16. 1 1
      panda/src/bullet/bulletContactResult.I
  17. 1 1
      panda/src/bullet/bulletContactResult.h
  18. 1 1
      panda/src/bullet/bulletHeightfieldShape.I
  19. 4 0
      panda/src/bullet/bulletTriangleMesh.cxx
  20. 10 4
      panda/src/chan/animChannelMatrixXfmTable.cxx
  21. 10 4
      panda/src/chan/animChannelScalarTable.cxx
  22. 11 0
      panda/src/chan/config_chan.cxx
  23. 9 9
      panda/src/display/drawableRegion.I
  24. 10 13
      panda/src/display/drawableRegion.cxx
  25. 1 1
      panda/src/display/drawableRegion.h
  26. 7 0
      panda/src/display/frameBufferProperties.cxx
  27. 3 0
      panda/src/display/graphicsEngine.cxx
  28. 9 0
      panda/src/display/graphicsStateGuardian.I
  29. 5 1
      panda/src/display/graphicsStateGuardian.cxx
  30. 3 0
      panda/src/display/graphicsStateGuardian.h
  31. 12 0
      panda/src/downloader/socketStream.h
  32. 5 0
      panda/src/dxgsg9/config_dxgsg9.cxx
  33. 55 31
      panda/src/dxgsg9/dxGraphicsStateGuardian9.cxx
  34. 1 0
      panda/src/dxgsg9/dxGraphicsStateGuardian9.h
  35. 53 0
      panda/src/express/zStreamBuf.cxx
  36. 3 0
      panda/src/express/zStreamBuf.h
  37. 15 5
      panda/src/gles2gsg/gles2gsg.h
  38. 760 1039
      panda/src/gles2gsg/panda_esgl2ext.h
  39. 4 5
      panda/src/glstuff/glGraphicsBuffer_src.cxx
  40. 251 65
      panda/src/glstuff/glGraphicsStateGuardian_src.cxx
  41. 6 0
      panda/src/glstuff/glGraphicsStateGuardian_src.h
  42. 9 8
      panda/src/glstuff/glShaderContext_src.cxx
  43. 2 1
      panda/src/gobj/geomPrimitive.cxx
  44. 9 0
      panda/src/gobj/geomVertexFormat.I
  45. 2 4
      panda/src/gobj/geomVertexFormat.cxx
  46. 4 0
      panda/src/gobj/geomVertexFormat.h
  47. 5 3
      panda/src/gobj/shader.cxx
  48. 174 78
      panda/src/gobj/texture.cxx
  49. 2 0
      panda/src/grutil/config_grutil.cxx
  50. 1 0
      panda/src/grutil/p3grutil_composite1.cxx
  51. 191 0
      panda/src/grutil/shaderTerrainMesh.I
  52. 715 0
      panda/src/grutil/shaderTerrainMesh.cxx
  53. 205 0
      panda/src/grutil/shaderTerrainMesh.h
  54. 5 0
      panda/src/movies/config_movies.cxx
  55. 2976 0
      panda/src/movies/dr_flac.h
  56. 12 0
      panda/src/movies/flacAudio.I
  57. 64 0
      panda/src/movies/flacAudio.cxx
  58. 54 0
      panda/src/movies/flacAudio.h
  59. 12 0
      panda/src/movies/flacAudioCursor.I
  60. 120 0
      panda/src/movies/flacAudioCursor.cxx
  61. 65 0
      panda/src/movies/flacAudioCursor.h
  62. 2 0
      panda/src/movies/p3movies_composite1.cxx
  63. 4 0
      panda/src/pgraph/alphaTestAttrib.h
  64. 5 0
      panda/src/pgraph/antialiasAttrib.h
  65. 3 0
      panda/src/pgraph/audioVolumeAttrib.h
  66. 3 0
      panda/src/pgraph/auxBitplaneAttrib.h
  67. 8 3
      panda/src/pgraph/camera.cxx
  68. 4 0
      panda/src/pgraph/colorAttrib.h
  69. 44 15
      panda/src/pgraph/colorBlendAttrib.I
  70. 67 5
      panda/src/pgraph/colorBlendAttrib.cxx
  71. 36 5
      panda/src/pgraph/colorBlendAttrib.h
  72. 3 0
      panda/src/pgraph/colorScaleAttrib.h
  73. 3 0
      panda/src/pgraph/colorWriteAttrib.h
  74. 4 0
      panda/src/pgraph/cullBinAttrib.h
  75. 5 0
      panda/src/pgraph/cullFaceAttrib.h
  76. 5 0
      panda/src/pgraph/depthOffsetAttrib.h
  77. 3 0
      panda/src/pgraph/depthTestAttrib.h
  78. 3 0
      panda/src/pgraph/depthWriteAttrib.h
  79. 3 0
      panda/src/pgraph/fogAttrib.h
  80. 3 0
      panda/src/pgraph/lightRampAttrib.h
  81. 3 0
      panda/src/pgraph/materialAttrib.h
  82. 6 1
      panda/src/pgraph/renderModeAttrib.h
  83. 1 0
      panda/src/pgraph/rescaleNormalAttrib.h
  84. 3 0
      panda/src/pgraph/scissorAttrib.h
  85. 1 0
      panda/src/pgraph/shadeModelAttrib.h
  86. 4 0
      panda/src/pgraph/shaderAttrib.h
  87. 1 0
      panda/src/pgraph/transparencyAttrib.h
  88. 3 0
      panda/src/pgraphnodes/config_pgraphnodes.cxx
  89. 2 0
      panda/src/pgraphnodes/p3pgraphnodes_composite1.cxx
  90. 1 2
      panda/src/pgraphnodes/p3pgraphnodes_composite2.cxx
  91. 48 0
      panda/src/pgraphnodes/sphereLight.I
  92. 146 0
      panda/src/pgraphnodes/sphereLight.cxx
  93. 90 0
      panda/src/pgraphnodes/sphereLight.h
  94. 7 0
      panda/src/pnmimagetypes/config_pnmimagetypes.cxx
  95. 1 0
      panda/src/pnmimagetypes/p3pnmimagetypes_composite2.cxx
  96. 509 0
      panda/src/pnmimagetypes/pnmFileTypeStbImage.cxx
  97. 73 0
      panda/src/pnmimagetypes/pnmFileTypeStbImage.h
  98. 6755 0
      panda/src/pnmimagetypes/stb_image.h
  99. 2 1
      panda/src/putil/bam.h
  100. 11 0
      panda/src/putil/bamReader.I

+ 1 - 1
direct/src/fsm/FSM.py

@@ -310,7 +310,7 @@ class FSM(DirectObject):
                 self.name, request, str(args)[1:]))
                 self.name, request, str(args)[1:]))
 
 
             filter = self.getCurrentFilter()
             filter = self.getCurrentFilter()
-            result = list(filter(request, args))
+            result = filter(request, args)
             if result:
             if result:
                 if isinstance(result, str):
                 if isinstance(result, str):
                     # If the return value is a string, it's just the name
                     # If the return value is a string, it's just the name

+ 24 - 25
direct/src/gui/DirectScrolledList.py

@@ -8,7 +8,6 @@ from direct.directnotify import DirectNotifyGlobal
 from direct.task.Task import Task
 from direct.task.Task import Task
 from .DirectFrame import *
 from .DirectFrame import *
 from .DirectButton import *
 from .DirectButton import *
-import types
 
 
 
 
 class DirectScrolledListItem(DirectButton):
 class DirectScrolledListItem(DirectButton):
@@ -49,7 +48,7 @@ class DirectScrolledList(DirectFrame):
     def __init__(self, parent = None, **kw):
     def __init__(self, parent = None, **kw):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
         self.index = 0
         self.index = 0
-        self.forceHeight = None
+        self.__forceHeight = None
 
 
         """ If one were to want a scrolledList that makes and adds its items
         """ If one were to want a scrolledList that makes and adds its items
            as needed, simply pass in an items list of strings (type 'str')
            as needed, simply pass in an items list of strings (type 'str')
@@ -115,12 +114,12 @@ class DirectScrolledList(DirectFrame):
 
 
     def setForceHeight(self):
     def setForceHeight(self):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
-        self.forceHeight = self["forceHeight"]
+        self.__forceHeight = self["forceHeight"]
 
 
     def recordMaxHeight(self):
     def recordMaxHeight(self):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
-        if self.forceHeight is not None:
-            self.maxHeight = self.forceHeight
+        if self.__forceHeight is not None:
+            self.maxHeight = self.__forceHeight
         else:
         else:
             self.maxHeight = 0.0
             self.maxHeight = 0.0
             for item in self["items"]:
             for item in self["items"]:
@@ -130,24 +129,24 @@ class DirectScrolledList(DirectFrame):
     def setScrollSpeed(self):
     def setScrollSpeed(self):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
         # Items per second to move
         # Items per second to move
-        self.scrollSpeed = self["scrollSpeed"]
-        if self.scrollSpeed <= 0:
-            self.scrollSpeed = 1
+        self.__scrollSpeed = self["scrollSpeed"]
+        if self.__scrollSpeed <= 0:
+            self.__scrollSpeed = 1
 
 
     def setNumItemsVisible(self):
     def setNumItemsVisible(self):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
         # Items per second to move
         # Items per second to move
-        self.numItemsVisible = self["numItemsVisible"]
+        self.__numItemsVisible = self["numItemsVisible"]
 
 
     def destroy(self):
     def destroy(self):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
         taskMgr.remove(self.taskName("scroll"))
         taskMgr.remove(self.taskName("scroll"))
         if hasattr(self, "currentSelected"):
         if hasattr(self, "currentSelected"):
             del self.currentSelected
             del self.currentSelected
-        if self.incButtonCallback:
-            self.incButtonCallback = None
-        if self.decButtonCallback:
-            self.decButtonCallback = None
+        if self.__incButtonCallback:
+            self.__incButtonCallback = None
+        if self.__decButtonCallback:
+            self.__decButtonCallback = None
         self.incButton.destroy()
         self.incButton.destroy()
         self.decButton.destroy()
         self.decButton.destroy()
         DirectFrame.destroy(self)
         DirectFrame.destroy(self)
@@ -169,10 +168,10 @@ class DirectScrolledList(DirectFrame):
         #for i in range(len(self["items"])):
         #for i in range(len(self["items"])):
         #    print "buttontext[", i,"]", self["items"][i]["text"]
         #    print "buttontext[", i,"]", self["items"][i]["text"]
 
 
-        if(len(self["items"])==0):
+        if len(self["items"]) == 0:
             return 0
             return 0
 
 
-        if(type(self["items"][0])!=types.InstanceType):
+        if type(self["items"][0]) == type(''):
             self.notify.warning("getItemIndexForItemID: cant find itemID for non-class list items!")
             self.notify.warning("getItemIndexForItemID: cant find itemID for non-class list items!")
             return 0
             return 0
 
 
@@ -309,7 +308,7 @@ class DirectScrolledList(DirectFrame):
     def __incButtonDown(self, event):
     def __incButtonDown(self, event):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
         task = Task(self.__scrollByTask)
         task = Task(self.__scrollByTask)
-        task.setDelay(1.0 / self.scrollSpeed)
+        task.setDelay(1.0 / self.__scrollSpeed)
         task.prevTime = 0.0
         task.prevTime = 0.0
         task.delta = 1
         task.delta = 1
         taskName = self.taskName("scroll")
         taskName = self.taskName("scroll")
@@ -317,13 +316,13 @@ class DirectScrolledList(DirectFrame):
         taskMgr.add(task, taskName)
         taskMgr.add(task, taskName)
         self.scrollBy(task.delta)
         self.scrollBy(task.delta)
         messenger.send('wakeup')
         messenger.send('wakeup')
-        if self.incButtonCallback:
-            self.incButtonCallback()
+        if self.__incButtonCallback:
+            self.__incButtonCallback()
 
 
     def __decButtonDown(self, event):
     def __decButtonDown(self, event):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
         task = Task(self.__scrollByTask)
         task = Task(self.__scrollByTask)
-        task.setDelay(1.0 / self.scrollSpeed)
+        task.setDelay(1.0 / self.__scrollSpeed)
         task.prevTime = 0.0
         task.prevTime = 0.0
         task.delta = -1
         task.delta = -1
         taskName = self.taskName("scroll")
         taskName = self.taskName("scroll")
@@ -331,8 +330,8 @@ class DirectScrolledList(DirectFrame):
         taskMgr.add(task, taskName)
         taskMgr.add(task, taskName)
         self.scrollBy(task.delta)
         self.scrollBy(task.delta)
         messenger.send('wakeup')
         messenger.send('wakeup')
-        if self.decButtonCallback:
-            self.decButtonCallback()
+        if self.__decButtonCallback:
+            self.__decButtonCallback()
 
 
     def __buttonUp(self, event):
     def __buttonUp(self, event):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
@@ -345,7 +344,7 @@ class DirectScrolledList(DirectFrame):
         Add this string and extraArg to the list
         Add this string and extraArg to the list
         """
         """
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
-        if(type(item) == types.InstanceType):
+        if type(item) != type(''):
             # cant add attribs to non-classes (like strings & ints)
             # cant add attribs to non-classes (like strings & ints)
             item.itemID = self.nextItemID
             item.itemID = self.nextItemID
             self.nextItemID += 1
             self.nextItemID += 1
@@ -354,7 +353,7 @@ class DirectScrolledList(DirectFrame):
             item.reparentTo(self.itemFrame)
             item.reparentTo(self.itemFrame)
         if refresh:
         if refresh:
             self.refresh()
             self.refresh()
-        if(type(item) == types.InstanceType):
+        if type(item) != type(''):
             return item.itemID  # to pass to scrollToItemID
             return item.itemID  # to pass to scrollToItemID
 
 
     def removeItem(self, item, refresh=1):
     def removeItem(self, item, refresh=1):
@@ -466,11 +465,11 @@ class DirectScrolledList(DirectFrame):
 
 
     def setIncButtonCallback(self):
     def setIncButtonCallback(self):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
-        self.incButtonCallback = self["incButtonCallback"]
+        self.__incButtonCallback = self["incButtonCallback"]
 
 
     def setDecButtonCallback(self):
     def setDecButtonCallback(self):
         assert self.notify.debugStateCall(self)
         assert self.notify.debugStateCall(self)
-        self.decButtonCallback = self["decButtonCallback"]
+        self.__decButtonCallback = self["decButtonCallback"]
 
 
 
 
 """
 """

+ 1 - 2
direct/src/gui/OnscreenText.py

@@ -4,7 +4,6 @@ __all__ = ['OnscreenText', 'Plain', 'ScreenTitle', 'ScreenPrompt', 'NameConfirm'
 
 
 from panda3d.core import *
 from panda3d.core import *
 from . import DirectGuiGlobals as DGG
 from . import DirectGuiGlobals as DGG
-from direct.showbase.DirectObject import DirectObject
 import sys
 import sys
 
 
 ## These are the styles of text we might commonly see.  They set the
 ## These are the styles of text we might commonly see.  They set the
@@ -17,7 +16,7 @@ ScreenPrompt = 3
 NameConfirm = 4
 NameConfirm = 4
 BlackOnWhite = 5
 BlackOnWhite = 5
 
 
-class OnscreenText(DirectObject, NodePath):
+class OnscreenText(NodePath):
 
 
     def __init__(self, text = '',
     def __init__(self, text = '',
                  style = Plain,
                  style = Plain,

+ 16 - 16
direct/src/interval/MetaInterval.py

@@ -268,7 +268,7 @@ class MetaInterval(CMetaInterval):
             self.addInterval(ival, maxDuration - ival.getDuration(), TRACK_START)
             self.addInterval(ival, maxDuration - ival.getDuration(), TRACK_START)
         self.popLevel(duration)
         self.popLevel(duration)
 
 
-    def addTrack(self, list, name, relTime, relTo, duration):
+    def addTrack(self, trackList, name, relTime, relTo, duration):
         # Adds a "track list".  This is a list of tuples of the form:
         # Adds a "track list".  This is a list of tuples of the form:
         #
         #
         #   (<delay>, <Interval>,
         #   (<delay>, <Interval>,
@@ -281,19 +281,19 @@ class MetaInterval(CMetaInterval):
         # (TRACK_START).  If the relative code is omitted, the default
         # (TRACK_START).  If the relative code is omitted, the default
         # is TRACK_START.
         # is TRACK_START.
         self.pushLevel(name, relTime, relTo)
         self.pushLevel(name, relTime, relTo)
-        for tuple in list:
-            if isinstance(tuple, tuple) or \
-               isinstance(tuple, list):
-                relTime = tuple[0]
-                ival = tuple[1]
-                if len(tuple) >= 3:
-                    relTo = tuple[2]
+        for tupleObj in trackList:
+            if isinstance(tupleObj, tuple) or \
+               isinstance(tupleObj, list):
+                relTime = tupleObj[0]
+                ival = tupleObj[1]
+                if len(tupleObj) >= 3:
+                    relTo = tupleObj[2]
                 else:
                 else:
                     relTo = TRACK_START
                     relTo = TRACK_START
                 self.addInterval(ival, relTime, relTo)
                 self.addInterval(ival, relTime, relTo)
 
 
             else:
             else:
-                self.notify.error("Not a tuple in Track: %s" % (tuple,))
+                self.notify.error("Not a tuple in Track: %s" % (tupleObj,))
         self.popLevel(duration)
         self.popLevel(duration)
 
 
     def addInterval(self, ival, relTime, relTo):
     def addInterval(self, ival, relTime, relTo):
@@ -593,22 +593,22 @@ class Track(MetaInterval):
         meta.addTrack(self.ivals, self.getName(),
         meta.addTrack(self.ivals, self.getName(),
                       relTime, relTo, self.phonyDuration)
                       relTime, relTo, self.phonyDuration)
 
 
-    def validateComponent(self, tuple):
+    def validateComponent(self, tupleObj):
         # This is called only in debug mode to verify that the
         # This is called only in debug mode to verify that the
         # indicated component added to the MetaInterval is appropriate
         # indicated component added to the MetaInterval is appropriate
         # to this type of MetaInterval.  In most cases except Track,
         # to this type of MetaInterval.  In most cases except Track,
         # this is the same as asking that the component is itself an
         # this is the same as asking that the component is itself an
         # Interval.
         # Interval.
 
 
-        if not (isinstance(tuple, tuple) or \
-                isinstance(tuple, list)):
+        if not (isinstance(tupleObj, tuple) or \
+                isinstance(tupleObj, list)):
             # It's not a tuple.
             # It's not a tuple.
             return 0
             return 0
 
 
-        relTime = tuple[0]
-        ival = tuple[1]
-        if len(tuple) >= 3:
-            relTo = tuple[2]
+        relTime = tupleObj[0]
+        ival = tupleObj[1]
+        if len(tupleObj) >= 3:
+            relTo = tupleObj[2]
         else:
         else:
             relTo = TRACK_START
             relTo = TRACK_START
 
 

+ 1 - 0
direct/src/showbase/ShowBaseGlobal.py

@@ -20,6 +20,7 @@ def inspect(anObject):
     Inspector = importlib.import_module('direct.tkpanels.Inspector')
     Inspector = importlib.import_module('direct.tkpanels.Inspector')
     return Inspector.inspect(anObject)
     return Inspector.inspect(anObject)
 
 
+import sys
 if sys.version_info >= (3, 0):
 if sys.version_info >= (3, 0):
     import builtins
     import builtins
 else:
 else:

+ 0 - 374
direct/src/showbase/pandaSqueezeTool.py

@@ -1,374 +0,0 @@
-"""Undocumented Module"""
-
-__all__ = ['usage', 'Squeezer', 'Loader', 'boot', 'open', 'explode', 'getloader', 'squeeze', 'searchPath']
-
-#!/usr/bin/env python
-#
-# SQUEEZE
-#
-# squeeze a python program
-#
-# installation:
-# - use this script as is, or squeeze it using the following command:
-#
-# python squeezeTool.py -1su -o squeeze -b squeezeTool squeezeTool.py
-#
-# notes:
-# - this is pretty messy.  make sure to test everything carefully
-#   if you change anything
-#
-# - the name "squeeze" is taken from an ABC800 utility which did
-#   about the same thing with Basic II bytecodes.
-#
-# history:
-# 1.0   97-04-22 fl     Created
-# 1.1   97-05-25 fl     Added base64 embedding option (-1)
-#       97-05-25 fl     Check for broken package file
-# 1.2   97-05-26 fl     Support uncompressed packages (-u)
-# 1.3   97-05-27 fl     Check byte code magic, eliminated StringIO, etc.
-# 1.4   97-06-04 fl     Removed last bits of white space, removed try/except
-# 1.5   97-06-17 fl     Added squeeze archive capabilities (-x)
-# 1.6   98-05-04 fl     Minor fixes in preparation for public source release
-#
-# reviews:
-#       "Fredrik Lundh is a friggin genius"
-#       -- Aaron Watters, author of 'Internet Programming with Python'
-#
-#       "I agree ... this is a friggin Good Thing"
-#       -- Paul Everitt, Digital Creations
-#
-# Copyright (c) 1997 by Fredrik Lundh.
-# Copyright (c) 1997-1998 by Secret Labs AB
-#
-# [email protected]
-# http://www.pythonware.com
-#
-# --------------------------------------------------------------------
-# Permission to use, copy, modify, and distribute this software and
-# its associated documentation for any purpose and without fee is
-# hereby granted.  This software is provided as is.
-# --------------------------------------------------------------------
-
-VERSION = "1.6/98-05-04"
-MAGIC   = "[PANDASQUEEZE]"
-
-import base64, imp, marshal, os, sys
-
-# --------------------------------------------------------------------
-# usage
-
-def usage():
-        print("\nSQUEEZE", VERSION, "(c) 1997-1998 by Secret Labs AB")
-        print("""\
-Convert a Python application to a compressed module package.
-
-Usage: squeeze [-1ux] -o app [-b start] modules... [-d files...]
-
-This utility creates a compressed package file named "app.pyz", which
-contains the given module files.  It also creates a bootstrap script
-named "app.py", which loads the package and imports the given "start"
-module to get things going.  Example:
-
-        squeeze -o app -b appMain app*.py
-
-The -1 option tells squeeze to put the package file inside the boot-
-strap script using base64 encoding.  The result is a single text file
-containing the full application.
-
-The -u option disables compression.  Otherwise, the package will be
-compressed using zlib, and the user needs zlib to run the resulting
-application.
-
-The -d option can be used to put additional files in the package file.
-You can access these files via "__main__.open(filename)" (returns a
-StringIO file object).
-
-The -x option can be used with -d to create a self-extracting archive,
-instead of a package.  When the resulting script is executed, the
-data files are extracted.  Omit the -b option in this case.
-""")
-        sys.exit(1)
-
-
-# --------------------------------------------------------------------
-# squeezer -- collect squeezed modules
-
-class Squeezer:
-
-        def __init__(self):
-
-                self.rawbytes = self.bytes = 0
-                self.modules = {}
-
-        def addmodule(self, file):
-
-                if file[-1] == "c":
-                        file = file[:-1]
-
-                m = os.path.splitext(os.path.split(file)[1])[0]
-
-                # read sourcefile
-                f = open(file)
-                codestring = f.read()
-                f.close()
-
-                # dump to file
-                self.modules[m] = compile(codestring, m, "exec")
-
-        def adddata(self, file):
-
-                self.modules["+"+file] = open(file, "rb").read()
-
-        def getarchive(self):
-
-                # marshal our module dictionary
-                data = marshal.dumps(self.modules)
-                self.rawbytes = len(data)
-
-                # return (compressed) dictionary
-                data = zlib.compress(data, 9)
-                self.bytes = len(data)
-
-                return data
-
-        def getstatus(self):
-                return self.bytes, self.rawbytes
-
-
-# --------------------------------------------------------------------
-# loader (used in bootstrap code)
-
-loader = """
-import ihooks
-
-PYZ_MODULE = 64
-
-class Loader(ihooks.ModuleLoader):
-
-    def __init__(self, modules):
-        self.__modules = modules
-        return ihooks.ModuleLoader.__init__(self)
-
-    def find_module(self, name, path = None):
-        try:
-            self.__modules[name]
-            return None, None, (None, None, PYZ_MODULE)
-        except KeyError:
-            return ihooks.ModuleLoader.find_module(self, name, path)
-
-    def load_module(self, name, stuff):
-        file, filename, (suff, mode, type) = stuff
-        if type != PYZ_MODULE:
-            return ihooks.ModuleLoader.load_module(self, name, stuff)
-        #print "PYZ:", "import", name
-        basename = name.split('.')[-1]
-        code = self.__modules[basename]
-        del self.__modules[basename] # no need to keep this one around
-        m = self.hooks.add_module(name)
-        m.__file__ = filename
-        exec code in m.__dict__
-        return m
-
-def boot(name, fp, size, offset = 0):
-
-    global data
-
-    try:
-        import %(modules)s
-    except ImportError:
-        #print "PYZ:", "failed to load marshal and zlib libraries"
-        return # cannot boot from PYZ file
-    #print "PYZ:", "boot from", name+".PYZ"
-
-    # load archive and install import hook
-    if offset:
-        data = fp[offset:]
-    else:
-        data = fp.read(size)
-        fp.close()
-
-    if len(data) != size:
-        raise IOError, "package is truncated"
-
-    data = marshal.loads(%(data)s)
-
-    ihooks.install(ihooks.ModuleImporter(Loader(data)))
-"""
-
-loaderopen = """
-
-def open(name):
-    from io import StringIO
-    try:
-        return StringIO(data["+"+name])
-    except KeyError:
-        raise IOError, (0, "no such file")
-"""
-
-loaderexplode = """
-
-def explode():
-    for k, v in data.items():
-        if k[0] == "+":
-            try:
-                open(k[1:], "wb").write(v)
-                print k[1:], "extracted ok"
-            except IOError, v:
-                print k[1:], "failed:", "IOError", v
-
-"""
-
-def getloader(data, package):
-
-        s = loader
-
-        if data:
-                if explode:
-                        s = s + loaderexplode
-                else:
-                        s = s + loaderopen
-
-        dict = {
-                "modules": "marshal, zlib",
-                "data":    "zlib.decompress(data)",
-                }
-
-        s = s % dict
-
-        return marshal.dumps(compile(s, "<package>", "exec"))
-
-
-# --------------------------------------------------------------------
-# Main
-# --------------------------------------------------------------------
-
-#
-# parse options
-
-import sys
-import zlib
-
-embed = 0
-explode = 0
-
-def squeeze(app, start, filelist, outputDir):
-        localMagic = MAGIC
-        data = None
-
-        bootstrap = os.path.join(outputDir, app + ".py")
-        archiveBase = app + ".pyz"
-        archive   = os.path.join(outputDir, archiveBase)
-
-        archiveid = app
-
-        #
-        # avoid overwriting files not generated by squeeze
-
-        try:
-                fp = open(bootstrap)
-                s = fp.readline()
-                s.index(MAGIC)
-        except IOError:
-                pass
-        except ValueError:
-                print("%s was not created by squeeze.  You have to manually" % (bootstrap))
-                print("remove the file to proceed.")
-                sys.exit(1)
-
-        #
-        # collect modules
-
-        sq = Squeezer()
-        for file in filelist:
-                # print 'addmodule:', file
-                sq.addmodule(file)
-
-        package = sq.getarchive()
-        size = len(package)
-
-        #
-        # get loader
-
-        loader = getloader(data, package)
-
-        zbegin, zend = "zlib.decompress(", ")"
-        loader = zlib.compress(loader, 9)
-
-        loaderlen = len(loader)
-
-        magic = repr(imp.get_magic())
-        version = sys.version.split()[0]
-
-        #
-        # generate script and package files
-
-        if embed:
-
-                # embedded archive
-                data = base64.encodestring(loader + package)
-
-                fp = open(bootstrap, "w")
-                fp.write('''\
-#%(localMagic)s %(archiveid)s
-import ihooks, zlib, base64, marshal
-s=base64.decodestring("""
-%(data)s""")
-exec marshal.loads(%(zbegin)ss[:%(loaderlen)d]%(zend)s)
-boot("%(app)s", s, %(size)d, %(loaderlen)d)
-exec "import %(start)s"
-''' % locals())
-                bytes = fp.tell()
-
-        else:
-
-                # separate archive file
-
-                fp = open(archive, "wb")
-
-                fp.write(loader)
-                fp.write(package)
-
-                bytes = fp.tell()
-                fp.close()
-                #
-                # create bootstrap code
-
-                fp = open(bootstrap, "w")
-                # Note: David Rose adjusted the following to be panda-specific.
-                fp.write("""\
-#%(localMagic)s %(archiveid)s
-import ihooks, zlib, marshal, os, sys
-
-def searchPath(filename):
-  # Look along panda3d.__path__ for the indicated filename.  Returns
-  # the located pathname, or None if the filename is not found.
-  import panda3d
-
-  for dir in panda3d.__path__:
-    pathname = os.path.join(dir, filename)
-    if os.path.exists(pathname):
-      return pathname
-
-  return None
-
-# Look for %(archiveBase)s along panda3d.__path__.
-archiveName = "%(archiveBase)s"
-archivePath = searchPath(archiveName)
-if archivePath == None:
-  raise ImportError, "Could not locate panda3d.%%s." %% (archiveName)
-
-f=open(archivePath,"rb")
-exec marshal.loads(%(zbegin)sf.read(%(loaderlen)d)%(zend)s)
-boot("%(app)s", f, %(size)d)
-exec "from %(start)s import *"
-#exec "run()"
-""" % locals())
-                bytes = bytes + fp.tell()
-                fp.close()
-
-        #
-        # show statistics
-
-        dummy, rawbytes = sq.getstatus()
-
-        print("squeezed %s to %s (%d%%)" % (rawbytes, bytes, bytes * 100 / rawbytes))

+ 0 - 57
direct/src/showbase/pandaSqueezer.py

@@ -1,57 +0,0 @@
-"""Undocumented Module"""
-
-__all__ = []
-
-import os
-import sys
-import getopt
-from . import pandaSqueezeTool
-
-# Assumption: We will be squeezing the files from the current directory or the -d directory.
-
-if __name__ == "__main__":
-    try:
-        opts, pargs = getopt.getopt(sys.argv[1:], 'Od:')
-    except Exception as e:
-        # User passed in a bad option, print the error and the help, then exit
-        print(e)
-        print('Usage: pass in -O for optimized')
-        print('       pass in -d directory')
-        sys.exit()
-
-    fOptimized = 0
-    # Store the option values into our variables
-    for opt in opts:
-        flag, value = opt
-        if (flag == '-O'):
-            fOptimized = 1
-            print('Squeezing pyo files')
-        elif (flag == '-d'):
-            os.chdir(value)
-
-    def getSqueezeableFiles():
-        fileList = os.listdir(".")
-        newFileList = []
-        if fOptimized:
-            targetFileExtension = ".pyo"
-        else:
-            targetFileExtension = ".pyc"
-        for i in fileList:
-            base, ext = os.path.splitext(i)
-            if (ext == ".py"):
-                newFileList.append(i)
-        return newFileList
-
-    def squeezePandaFiles():
-        l = getSqueezeableFiles()
-        pandaSqueezeTool.squeeze("PandaModules", "PandaModulesUnsqueezed", l)
-
-        # Clean up the source files now that they've been squeezed.  If
-        # you don't like this behavior (e.g. if you want to inspect the
-        # generated files), use genPyCode -n to avoid squeezing
-        # altogether.
-        for i in l:
-            os.unlink(i)
-
-
-    squeezePandaFiles()

+ 17 - 3
dtool/src/dtoolbase/pvector.h

@@ -39,11 +39,25 @@ public:
   typedef vector<Type, allocator> base_class;
   typedef vector<Type, allocator> base_class;
   typedef TYPENAME base_class::size_type size_type;
   typedef TYPENAME base_class::size_type size_type;
 
 
-  pvector(TypeHandle type_handle = pvector_type_handle) : base_class(allocator(type_handle)) { }
+  explicit pvector(TypeHandle type_handle = pvector_type_handle) : base_class(allocator(type_handle)) { }
   pvector(const pvector<Type> &copy) : base_class(copy) { }
   pvector(const pvector<Type> &copy) : base_class(copy) { }
-  pvector(size_type n, TypeHandle type_handle = pvector_type_handle) : base_class(n, Type(), allocator(type_handle)) { }
-  pvector(size_type n, const Type &value, TypeHandle type_handle = pvector_type_handle) : base_class(n, value, allocator(type_handle)) { }
+  explicit pvector(size_type n, TypeHandle type_handle = pvector_type_handle) : base_class(n, Type(), allocator(type_handle)) { }
+  explicit pvector(size_type n, const Type &value, TypeHandle type_handle = pvector_type_handle) : base_class(n, value, allocator(type_handle)) { }
   pvector(const Type *begin, const Type *end, TypeHandle type_handle = pvector_type_handle) : base_class(begin, end, allocator(type_handle)) { }
   pvector(const Type *begin, const Type *end, TypeHandle type_handle = pvector_type_handle) : base_class(begin, end, allocator(type_handle)) { }
+
+#ifdef USE_MOVE_SEMANTICS
+  pvector(pvector<Type> &&from) NOEXCEPT : base_class(move(from)) {};
+
+  pvector<Type> &operator =(pvector<Type> &&from) NOEXCEPT {
+    base_class::operator =(move(from));
+    return *this;
+  }
+#endif
+
+  pvector<Type> &operator =(const pvector<Type> &copy) {
+    base_class::operator =(copy);
+    return *this;
+  }
 };
 };
 
 
 #endif  // USE_STL_ALLOCATOR
 #endif  // USE_STL_ALLOCATOR

+ 3 - 2
dtool/src/dtoolbase/typeRegistry.h

@@ -37,14 +37,15 @@ class EXPCL_DTOOL TypeRegistry : public MemoryBase {
 public:
 public:
   // User code shouldn't generally need to call TypeRegistry::register_type()
   // User code shouldn't generally need to call TypeRegistry::register_type()
   // or record_derivation() directly; instead, use the register_type
   // or record_derivation() directly; instead, use the register_type
-  // convenience function, defined below.
+  // convenience function, defined in register_type.h.
   bool register_type(TypeHandle &type_handle, const string &name);
   bool register_type(TypeHandle &type_handle, const string &name);
+
+PUBLISHED:
   TypeHandle register_dynamic_type(const string &name);
   TypeHandle register_dynamic_type(const string &name);
 
 
   void record_derivation(TypeHandle child, TypeHandle parent);
   void record_derivation(TypeHandle child, TypeHandle parent);
   void record_alternate_name(TypeHandle type, const string &name);
   void record_alternate_name(TypeHandle type, const string &name);
 
 
-PUBLISHED:
   TypeHandle find_type(const string &name) const;
   TypeHandle find_type(const string &name) const;
   TypeHandle find_type_by_id(int id) const;
   TypeHandle find_type_by_id(int id) const;
 
 

+ 31 - 4
dtool/src/interrogate/interfaceMakerPythonNative.cxx

@@ -1212,6 +1212,10 @@ write_sub_module(ostream &out, Object *obj) {
       out << "  assert(" << class_ptr << " != NULL);\n";
       out << "  assert(" << class_ptr << " != NULL);\n";
     } else {
     } else {
       class_ptr = "&Dtool_" + class_name;
       class_ptr = "&Dtool_" + class_name;
+
+      // If this is a typedef to a class defined in the same module, make sure
+      // that the class is initialized before we try to define the typedef.
+      out << "  Dtool_PyModuleClassInit_" << class_name << "(module);\n";
     }
     }
   }
   }
 
 
@@ -1734,7 +1738,7 @@ write_module_class(ostream &out, Object *obj) {
 
 
       switch (rfi->second._wrapper_type) {
       switch (rfi->second._wrapper_type) {
       case WT_no_params:
       case WT_no_params:
-      case WT_iter_next: // TODO: fix iter_next to return NULL instead of None
+      case WT_iter_next:
         // PyObject *func(PyObject *self)
         // PyObject *func(PyObject *self)
         {
         {
           out << "//////////////////\n";
           out << "//////////////////\n";
@@ -1747,9 +1751,15 @@ write_module_class(ostream &out, Object *obj) {
           out << "    return NULL;\n";
           out << "    return NULL;\n";
           out << "  }\n\n";
           out << "  }\n\n";
 
 
+          int return_flags = RF_pyobject | RF_err_null;
+          if (rfi->second._wrapper_type == WT_iter_next) {
+            // If the function returns NULL, we should return NULL to indicate
+            // a StopIteration, rather than returning None.
+            return_flags |= RF_preserve_null;
+          }
           string expected_params;
           string expected_params;
           write_function_forset(out, def._remaps, 0, 0, expected_params, 2, true, true,
           write_function_forset(out, def._remaps, 0, 0, expected_params, 2, true, true,
-                                AT_no_args, RF_pyobject | RF_err_null, false);
+                                AT_no_args, return_flags, false);
 
 
           out << "  if (!_PyErr_OCCURRED()) {\n";
           out << "  if (!_PyErr_OCCURRED()) {\n";
           out << "    return Dtool_Raise_BadArgumentsError(\n";
           out << "    return Dtool_Raise_BadArgumentsError(\n";
@@ -2692,6 +2702,12 @@ write_module_class(ostream &out, Object *obj) {
   out << "#if PY_VERSION_HEX >= 0x02050000\n";
   out << "#if PY_VERSION_HEX >= 0x02050000\n";
   write_function_slot(out, 2, slots, "nb_index");
   write_function_slot(out, 2, slots, "nb_index");
   out << "#endif\n";
   out << "#endif\n";
+
+  out << "#if PY_VERSION_HEX >= 0x03050000\n";
+  write_function_slot(out, 2, slots, "nb_matrix_multiply");
+  write_function_slot(out, 2, slots, "nb_inplace_matrix_multiply");
+  out << "#endif\n";
+
   out << "};\n\n";
   out << "};\n\n";
 
 
   // NB: it's tempting not to write this table when a class doesn't have them.
   // NB: it's tempting not to write this table when a class doesn't have them.
@@ -2938,6 +2954,10 @@ write_module_class(ostream &out, Object *obj) {
   out << "#if PY_VERSION_HEX >= 0x02060000\n";
   out << "#if PY_VERSION_HEX >= 0x02060000\n";
   out << "    0, // tp_version_tag\n";
   out << "    0, // tp_version_tag\n";
   out << "#endif\n";
   out << "#endif\n";
+  // destructor tp_finalize
+  out << "#if PY_VERSION_HEX >= 0x03040000\n";
+  out << "    0, // tp_finalize\n";
+  out << "#endif\n";
   out << "  },\n";
   out << "  },\n";
 
 
   // It's tempting to initialize the type handle here, but this causes static
   // It's tempting to initialize the type handle here, but this causes static
@@ -5842,8 +5862,15 @@ write_function_instance(ostream &out, FunctionRemap *remap,
       indent(out, indent_level) << "Py_INCREF(Py_None);\n";
       indent(out, indent_level) << "Py_INCREF(Py_None);\n";
       indent(out, indent_level) << "return Py_None;\n";
       indent(out, indent_level) << "return Py_None;\n";
 
 
+    } else if (return_flags & RF_preserve_null) {
+      indent(out, indent_level) << "if (" << return_expr << " == NULL) {\n";
+      indent(out, indent_level) << "  return NULL;\n";
+      indent(out, indent_level) << "} else {\n";
+      pack_return_value(out, indent_level + 2, remap, return_expr, return_flags);
+      indent(out, indent_level) << "}\n";
+
     } else {
     } else {
-      pack_return_value(out, indent_level, remap, return_expr);
+      pack_return_value(out, indent_level, remap, return_expr, return_flags);
     }
     }
 
 
   } else if (return_flags & RF_coerced) {
   } else if (return_flags & RF_coerced) {
@@ -6000,7 +6027,7 @@ error_raise_return(ostream &out, int indent_level, int return_flags,
  */
  */
 void InterfaceMakerPythonNative::
 void InterfaceMakerPythonNative::
 pack_return_value(ostream &out, int indent_level, FunctionRemap *remap,
 pack_return_value(ostream &out, int indent_level, FunctionRemap *remap,
-                  string return_expr) {
+                  string return_expr, int return_flags) {
 
 
   ParameterRemap *return_type = remap->_return_type;
   ParameterRemap *return_type = remap->_return_type;
   CPPType *orig_type = return_type->get_orig_type();
   CPPType *orig_type = return_type->get_orig_type();

+ 4 - 1
dtool/src/interrogate/interfaceMakerPythonNative.h

@@ -101,6 +101,9 @@ private:
     // Assign to the coerced argument, in the case of a coercion constructor.
     // Assign to the coerced argument, in the case of a coercion constructor.
     RF_coerced = 0x040,
     RF_coerced = 0x040,
 
 
+    // Don't automatically map NULL to None
+    RF_preserve_null = 0x080,
+
     // These indicate what should be returned on error.
     // These indicate what should be returned on error.
     RF_err_notimplemented = 0x002,
     RF_err_notimplemented = 0x002,
     RF_err_null = 0x004,
     RF_err_null = 0x004,
@@ -164,7 +167,7 @@ private:
                           const string &exc_type, const string &message,
                           const string &exc_type, const string &message,
                           const string &format_args = "");
                           const string &format_args = "");
   void pack_return_value(ostream &out, int indent_level, FunctionRemap *remap,
   void pack_return_value(ostream &out, int indent_level, FunctionRemap *remap,
-                         std::string return_expr);
+                         std::string return_expr, int return_flags);
 
 
   void write_make_seq(ostream &out, Object *obj, const std::string &ClassName,
   void write_make_seq(ostream &out, Object *obj, const std::string &ClassName,
                       const std::string &cClassName, MakeSeq *make_seq);
                       const std::string &cClassName, MakeSeq *make_seq);

+ 1 - 0
dtool/src/parser-inc/sys/time.h

@@ -2,3 +2,4 @@
 
 
 struct timeval;
 struct timeval;
 struct fd_set;
 struct fd_set;
+struct timezone;

+ 0 - 6
makepanda/installer.nsi

@@ -15,7 +15,6 @@
 ;   BUILT         - location of panda install tree.
 ;   BUILT         - location of panda install tree.
 ;   SOURCE        - location of the panda source-tree if available, OR location of panda install tree.
 ;   SOURCE        - location of the panda source-tree if available, OR location of panda install tree.
 ;   PYVER         - version of Python that Panda was built with (ie, "2.7")
 ;   PYVER         - version of Python that Panda was built with (ie, "2.7")
-;   PYEXTRAS      - directory containing python extras, if any.
 ;   REGVIEW       - either 32 or 64, depending on the build architecture.
 ;   REGVIEW       - either 32 or 64, depending on the build architecture.
 ;
 ;
 
 
@@ -372,11 +371,6 @@ SectionGroup "Python support"
         SetOutPath "$INSTDIR\python"
         SetOutPath "$INSTDIR\python"
         File /r "${BUILT}\python\*"
         File /r "${BUILT}\python\*"
 
 
-        !ifdef PYEXTRAS
-        SetOutPath "$INSTDIR\python\lib"
-        File /nonfatal /r "${PYEXTRAS}\*"
-        !endif
-
         SetDetailsPrint both
         SetDetailsPrint both
         DetailPrint "Adding registry keys for Python..."
         DetailPrint "Adding registry keys for Python..."
         SetDetailsPrint listonly
         SetDetailsPrint listonly

+ 23 - 12
makepanda/makepanda.py

@@ -561,6 +561,10 @@ if (COMPILER == "MSVC"):
                 LibName(pkg, 'dxerrVNUM.lib'.replace("VNUM", vnum))
                 LibName(pkg, 'dxerrVNUM.lib'.replace("VNUM", vnum))
             #LibName(pkg, 'ddraw.lib')
             #LibName(pkg, 'ddraw.lib')
             LibName(pkg, 'dxguid.lib')
             LibName(pkg, 'dxguid.lib')
+
+    if not PkgSkip("FREETYPE") and os.path.isdir(GetThirdpartyDir() + "freetype/include/freetype2"):
+        IncDirectory("FREETYPE", GetThirdpartyDir() + "freetype/include/freetype2")
+
     IncDirectory("ALWAYS", GetThirdpartyDir() + "extras/include")
     IncDirectory("ALWAYS", GetThirdpartyDir() + "extras/include")
     LibName("WINSOCK", "wsock32.lib")
     LibName("WINSOCK", "wsock32.lib")
     LibName("WINSOCK2", "wsock32.lib")
     LibName("WINSOCK2", "wsock32.lib")
@@ -587,17 +591,26 @@ if (COMPILER == "MSVC"):
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "quartz.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "quartz.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "odbc32.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "odbc32.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "odbccp32.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "odbccp32.lib")
-    if (PkgSkip("PNG")==0):      LibName("PNG",      GetThirdpartyDir() + "png/lib/libpng_static.lib")
+    if (PkgSkip("OPENSSL")==0):
+        LibName("OPENSSL", GetThirdpartyDir() + "openssl/lib/libpandassl.lib")
+        LibName("OPENSSL", GetThirdpartyDir() + "openssl/lib/libpandaeay.lib")
+    if (PkgSkip("PNG")==0):
+        if os.path.isfile(GetThirdpartyDir() + "png/lib/libpng16_static.lib"):
+            LibName("PNG", GetThirdpartyDir() + "png/lib/libpng16_static.lib")
+        else:
+            LibName("PNG", GetThirdpartyDir() + "png/lib/libpng_static.lib")
+    if (PkgSkip("TIFF")==0):
+        if os.path.isfile(GetThirdpartyDir() + "tiff/lib/libtiff.lib"):
+            LibName("TIFF", GetThirdpartyDir() + "tiff/lib/libtiff.lib")
+        else:
+            LibName("TIFF", GetThirdpartyDir() + "tiff/lib/tiff.lib")
     if (PkgSkip("JPEG")==0):     LibName("JPEG",     GetThirdpartyDir() + "jpeg/lib/jpeg-static.lib")
     if (PkgSkip("JPEG")==0):     LibName("JPEG",     GetThirdpartyDir() + "jpeg/lib/jpeg-static.lib")
-    if (PkgSkip("TIFF")==0):     LibName("TIFF",     GetThirdpartyDir() + "tiff/lib/libtiff.lib")
     if (PkgSkip("ZLIB")==0):     LibName("ZLIB",     GetThirdpartyDir() + "zlib/lib/zlibstatic.lib")
     if (PkgSkip("ZLIB")==0):     LibName("ZLIB",     GetThirdpartyDir() + "zlib/lib/zlibstatic.lib")
     if (PkgSkip("VRPN")==0):     LibName("VRPN",     GetThirdpartyDir() + "vrpn/lib/vrpn.lib")
     if (PkgSkip("VRPN")==0):     LibName("VRPN",     GetThirdpartyDir() + "vrpn/lib/vrpn.lib")
     if (PkgSkip("VRPN")==0):     LibName("VRPN",     GetThirdpartyDir() + "vrpn/lib/quat.lib")
     if (PkgSkip("VRPN")==0):     LibName("VRPN",     GetThirdpartyDir() + "vrpn/lib/quat.lib")
     if (PkgSkip("NVIDIACG")==0): LibName("CGGL",     GetThirdpartyDir() + "nvidiacg/lib/cgGL.lib")
     if (PkgSkip("NVIDIACG")==0): LibName("CGGL",     GetThirdpartyDir() + "nvidiacg/lib/cgGL.lib")
     if (PkgSkip("NVIDIACG")==0): LibName("CGDX9",    GetThirdpartyDir() + "nvidiacg/lib/cgD3D9.lib")
     if (PkgSkip("NVIDIACG")==0): LibName("CGDX9",    GetThirdpartyDir() + "nvidiacg/lib/cgD3D9.lib")
     if (PkgSkip("NVIDIACG")==0): LibName("NVIDIACG", GetThirdpartyDir() + "nvidiacg/lib/cg.lib")
     if (PkgSkip("NVIDIACG")==0): LibName("NVIDIACG", GetThirdpartyDir() + "nvidiacg/lib/cg.lib")
-    if (PkgSkip("OPENSSL")==0):  LibName("OPENSSL",  GetThirdpartyDir() + "openssl/lib/libpandassl.lib")
-    if (PkgSkip("OPENSSL")==0):  LibName("OPENSSL",  GetThirdpartyDir() + "openssl/lib/libpandaeay.lib")
     if (PkgSkip("FREETYPE")==0): LibName("FREETYPE", GetThirdpartyDir() + "freetype/lib/freetype.lib")
     if (PkgSkip("FREETYPE")==0): LibName("FREETYPE", GetThirdpartyDir() + "freetype/lib/freetype.lib")
     if (PkgSkip("FFTW")==0):     LibName("FFTW",     GetThirdpartyDir() + "fftw/lib/rfftw.lib")
     if (PkgSkip("FFTW")==0):     LibName("FFTW",     GetThirdpartyDir() + "fftw/lib/rfftw.lib")
     if (PkgSkip("FFTW")==0):     LibName("FFTW",     GetThirdpartyDir() + "fftw/lib/fftw.lib")
     if (PkgSkip("FFTW")==0):     LibName("FFTW",     GetThirdpartyDir() + "fftw/lib/fftw.lib")
@@ -703,7 +716,7 @@ if (COMPILER == "MSVC"):
         IncDirectory("SPEEDTREE", SDK["SPEEDTREE"] + "/Include")
         IncDirectory("SPEEDTREE", SDK["SPEEDTREE"] + "/Include")
     if (PkgSkip("BULLET")==0):
     if (PkgSkip("BULLET")==0):
         suffix = '.lib'
         suffix = '.lib'
-        if GetTargetArch() == 'x64':
+        if GetTargetArch() == 'x64' and os.path.isfile(GetThirdpartyDir() + "bullet/lib/BulletCollision_x64.lib"):
             suffix = '_x64.lib'
             suffix = '_x64.lib'
         LibName("BULLET", GetThirdpartyDir() + "bullet/lib/LinearMath" + suffix)
         LibName("BULLET", GetThirdpartyDir() + "bullet/lib/LinearMath" + suffix)
         LibName("BULLET", GetThirdpartyDir() + "bullet/lib/BulletCollision" + suffix)
         LibName("BULLET", GetThirdpartyDir() + "bullet/lib/BulletCollision" + suffix)
@@ -2237,6 +2250,7 @@ DTOOL_CONFIG=[
     ("HAVE_SOFTIMAGE_PIC",             '1',                      '1'),
     ("HAVE_SOFTIMAGE_PIC",             '1',                      '1'),
     ("HAVE_BMP",                       '1',                      '1'),
     ("HAVE_BMP",                       '1',                      '1'),
     ("HAVE_PNM",                       '1',                      '1'),
     ("HAVE_PNM",                       '1',                      '1'),
+    ("HAVE_STB_IMAGE",                 '1',                      '1'),
     ("HAVE_VORBIS",                    'UNDEF',                  'UNDEF'),
     ("HAVE_VORBIS",                    'UNDEF',                  'UNDEF'),
     ("HAVE_NVIDIACG",                  'UNDEF',                  'UNDEF'),
     ("HAVE_NVIDIACG",                  'UNDEF',                  'UNDEF'),
     ("HAVE_FREETYPE",                  'UNDEF',                  'UNDEF'),
     ("HAVE_FREETYPE",                  'UNDEF',                  'UNDEF'),
@@ -3467,8 +3481,7 @@ if (not RUNTIME):
   TargetAdd('libp3putil.in', opts=OPTS, input=IGATEFILES)
   TargetAdd('libp3putil.in', opts=OPTS, input=IGATEFILES)
   TargetAdd('libp3putil.in', opts=['IMOD:panda3d.core', 'ILIB:libp3putil', 'SRCDIR:panda/src/putil'])
   TargetAdd('libp3putil.in', opts=['IMOD:panda3d.core', 'ILIB:libp3putil', 'SRCDIR:panda/src/putil'])
   TargetAdd('libp3putil_igate.obj', input='libp3putil.in', opts=["DEPENDENCYONLY"])
   TargetAdd('libp3putil_igate.obj', input='libp3putil.in', opts=["DEPENDENCYONLY"])
-  TargetAdd('p3putil_typedWritable_ext.obj', opts=OPTS, input='typedWritable_ext.cxx')
-  TargetAdd('p3putil_pythonCallbackObject.obj', opts=OPTS, input='pythonCallbackObject.cxx')
+  TargetAdd('p3putil_ext_composite.obj', opts=OPTS, input='p3putil_ext_composite.cxx')
 
 
 #
 #
 # DIRECTORY: panda/src/audio/
 # DIRECTORY: panda/src/audio/
@@ -4077,8 +4090,7 @@ if (not RUNTIME):
   if PkgSkip("FREETYPE")==0:
   if PkgSkip("FREETYPE")==0:
     TargetAdd('core.pyd', input="libp3pnmtext_igate.obj")
     TargetAdd('core.pyd', input="libp3pnmtext_igate.obj")
 
 
-  TargetAdd('core.pyd', input='p3putil_typedWritable_ext.obj')
-  TargetAdd('core.pyd', input='p3putil_pythonCallbackObject.obj')
+  TargetAdd('core.pyd', input='p3putil_ext_composite.obj')
   TargetAdd('core.pyd', input='p3pnmimage_pfmFile_ext.obj')
   TargetAdd('core.pyd', input='p3pnmimage_pfmFile_ext.obj')
   TargetAdd('core.pyd', input='p3event_pythonTask.obj')
   TargetAdd('core.pyd', input='p3event_pythonTask.obj')
   TargetAdd('core.pyd', input='p3gobj_ext_composite.obj')
   TargetAdd('core.pyd', input='p3gobj_ext_composite.obj')
@@ -6600,7 +6612,6 @@ def MakeInstallerNSIS(file, title, installdir):
         'BUILT'       : panda,
         'BUILT'       : panda,
         'SOURCE'      : psource,
         'SOURCE'      : psource,
         'PYVER'       : SDK["PYTHONVERSION"][6:9],
         'PYVER'       : SDK["PYTHONVERSION"][6:9],
-        'PYEXTRAS'    : os.path.join(os.path.abspath(GetThirdpartyBase()), 'win-extras'),
         'REGVIEW'     : regview,
         'REGVIEW'     : regview,
     }
     }
 
 
@@ -6977,8 +6988,8 @@ def MakeInstallerOSX():
         oscmd("cp -R %s/pandac                dstroot/pythoncode/Developer/Panda3D/pandac" % GetOutputDir())
         oscmd("cp -R %s/pandac                dstroot/pythoncode/Developer/Panda3D/pandac" % GetOutputDir())
         oscmd("cp -R %s/direct                dstroot/pythoncode/Developer/Panda3D/direct" % GetOutputDir())
         oscmd("cp -R %s/direct                dstroot/pythoncode/Developer/Panda3D/direct" % GetOutputDir())
         oscmd("ln -s %s                       dstroot/pythoncode/usr/local/bin/ppython" % SDK["PYTHONEXEC"])
         oscmd("ln -s %s                       dstroot/pythoncode/usr/local/bin/ppython" % SDK["PYTHONEXEC"])
-        oscmd("cp -R %s/*.so                  dstroot/pythoncode/Developer/Panda3D/" % GetOutputDir())
-        oscmd("cp -R %s/*.py                  dstroot/pythoncode/Developer/Panda3D/" % GetOutputDir())
+        oscmd("cp -R %s/*.so                  dstroot/pythoncode/Developer/Panda3D/" % GetOutputDir(), True)
+        oscmd("cp -R %s/*.py                  dstroot/pythoncode/Developer/Panda3D/" % GetOutputDir(), True)
         if os.path.isdir(GetOutputDir()+"/Pmw"):
         if os.path.isdir(GetOutputDir()+"/Pmw"):
             oscmd("cp -R %s/Pmw               dstroot/pythoncode/Developer/Panda3D/Pmw" % GetOutputDir())
             oscmd("cp -R %s/Pmw               dstroot/pythoncode/Developer/Panda3D/Pmw" % GetOutputDir())
             compileall.compile_dir("dstroot/pythoncode/Developer/Panda3D/Pmw")
             compileall.compile_dir("dstroot/pythoncode/Developer/Panda3D/Pmw")

+ 49 - 19
makepanda/makepandacore.py

@@ -1133,12 +1133,12 @@ def GetThirdpartyDir():
     target_arch = GetTargetArch()
     target_arch = GetTargetArch()
 
 
     if (target == 'windows'):
     if (target == 'windows'):
+        vc = SDK["VISUALSTUDIO_VERSION"].split('.')[0]
+
         if target_arch == 'x64':
         if target_arch == 'x64':
-            THIRDPARTYDIR = base + "/win-libs-vc10-x64/"
-            if not os.path.isdir(THIRDPARTYDIR):
-                THIRDPARTYDIR = base + "/win-libs-vc10/"
+            THIRDPARTYDIR = base + "/win-libs-vc" + vc + "-x64/"
         else:
         else:
-            THIRDPARTYDIR = base + "/win-libs-vc10/"
+            THIRDPARTYDIR = base + "/win-libs-vc" + vc + "/"
 
 
     elif (target == 'darwin'):
     elif (target == 'darwin'):
         # OSX thirdparty binaries are universal, where possible.
         # OSX thirdparty binaries are universal, where possible.
@@ -1423,10 +1423,16 @@ def PkgConfigEnable(opt, pkgname, tool = "pkg-config"):
     for i, j in PkgConfigGetDefSymbols(pkgname, tool).items():
     for i, j in PkgConfigGetDefSymbols(pkgname, tool).items():
         DefSymbol(opt, i, j)
         DefSymbol(opt, i, j)
 
 
-def LocateLibrary(lib, lpath=[]):
-    """ Returns True if this library was found in the given search path, False otherwise. """
+def LocateLibrary(lib, lpath=[], prefer_static=False):
+    """Searches for the library in the search path, returning its path if found,
+    or None if it was not found."""
     target = GetTarget()
     target = GetTarget()
 
 
+    if prefer_static and target != 'windows':
+        for dir in lpath:
+            if os.path.isfile(os.path.join(dir, 'lib%s.a' % lib)):
+                return os.path.join(dir, 'lib%s.a' % lib)
+
     for dir in lpath:
     for dir in lpath:
         if target == 'darwin' and os.path.isfile(os.path.join(dir, 'lib%s.dylib' % lib)):
         if target == 'darwin' and os.path.isfile(os.path.join(dir, 'lib%s.dylib' % lib)):
             return os.path.join(dir, 'lib%s.dylib' % lib)
             return os.path.join(dir, 'lib%s.dylib' % lib)
@@ -1498,6 +1504,7 @@ def SmartPkgEnable(pkg, pkgconfig = None, libs = None, incs = None, defs = None,
         LibName(target_pkg, "-lswresample")
         LibName(target_pkg, "-lswresample")
         return
         return
 
 
+    # First check if the package is in the thirdparty directory.
     pkg_dir = os.path.join(GetThirdpartyDir(), pkg.lower())
     pkg_dir = os.path.join(GetThirdpartyDir(), pkg.lower())
     if not custom_loc and os.path.isdir(pkg_dir):
     if not custom_loc and os.path.isdir(pkg_dir):
         if framework and os.path.isdir(os.path.join(pkg_dir, framework + ".framework")):
         if framework and os.path.isdir(os.path.join(pkg_dir, framework + ".framework")):
@@ -1508,32 +1515,55 @@ def SmartPkgEnable(pkg, pkgconfig = None, libs = None, incs = None, defs = None,
         if os.path.isdir(os.path.join(pkg_dir, "include")):
         if os.path.isdir(os.path.join(pkg_dir, "include")):
             IncDirectory(target_pkg, os.path.join(pkg_dir, "include"))
             IncDirectory(target_pkg, os.path.join(pkg_dir, "include"))
 
 
-        if os.path.isdir(os.path.join(pkg_dir, "lib")):
-            LibDirectory(target_pkg, os.path.join(pkg_dir, "lib"))
+            # Handle cases like freetype2 where the include dir is a subdir under "include"
+            for i in incs:
+                if os.path.isdir(os.path.join(pkg_dir, "include", i)):
+                    IncDirectory(target_pkg, os.path.join(pkg_dir, "include", i))
+
+        lpath = [os.path.join(pkg_dir, "lib")]
 
 
-        if (PkgSkip("PYTHON") == 0):
+        if not PkgSkip("PYTHON"):
             py_lib_dir = os.path.join(pkg_dir, "lib", SDK["PYTHONVERSION"])
             py_lib_dir = os.path.join(pkg_dir, "lib", SDK["PYTHONVERSION"])
             if os.path.isdir(py_lib_dir):
             if os.path.isdir(py_lib_dir):
-                LibDirectory(target_pkg, py_lib_dir)
+                lpath.append(py_lib_dir)
 
 
-        # TODO: check for a .pc file in the lib/pkg-config/ dir
+        # TODO: check for a .pc file in the lib/pkgconfig/ dir
         if (tool != None and os.path.isfile(os.path.join(pkg_dir, "bin", tool))):
         if (tool != None and os.path.isfile(os.path.join(pkg_dir, "bin", tool))):
             tool = os.path.join(pkg_dir, "bin", tool)
             tool = os.path.join(pkg_dir, "bin", tool)
             for i in PkgConfigGetLibs(None, tool):
             for i in PkgConfigGetLibs(None, tool):
-                LibName(target_pkg, i)
+                if i.startswith('-l'):
+                    # To make sure we don't pick up the system copy, write out
+                    # the full path instead.
+                    libname = i[2:]
+                    location = LocateLibrary(libname, lpath, prefer_static=True)
+                    if location is not None:
+                        LibName(target_pkg, location)
+                    else:
+                        print(GetColor("cyan") + "Couldn't find library lib" + libname + " in thirdparty directory " + pkg.lower() + GetColor())
+                        LibName(target_pkg, i)
+                else:
+                    LibName(target_pkg, i)
             for i, j in PkgConfigGetDefSymbols(None, tool).items():
             for i, j in PkgConfigGetDefSymbols(None, tool).items():
                 DefSymbol(target_pkg, i, j)
                 DefSymbol(target_pkg, i, j)
             return
             return
 
 
+        # Now search for the libraries in the package's lib directories.
         for l in libs:
         for l in libs:
             libname = l
             libname = l
             if l.startswith("lib"):
             if l.startswith("lib"):
                 libname = l[3:]
                 libname = l[3:]
-            # This is for backward compatibility - in the thirdparty dir, we kept some libs with "panda" prefix, like libpandatiff.
-            if len(glob.glob(os.path.join(pkg_dir, "lib", "libpanda%s.*" % (libname)))) > 0 \
-               and len(glob.glob(os.path.join(pkg_dir, "lib", "lib%s.*" % (libname)))) == 0:
-                libname = "panda" + libname
-            LibName(target_pkg, "-l" + libname)
+
+            location = LocateLibrary(libname, lpath, prefer_static=True)
+            if location is not None:
+                LibName(target_pkg, location)
+            else:
+                # This is for backward compatibility - in the thirdparty dir,
+                # we kept some libs with "panda" prefix, like libpandatiff.
+                location = LocateLibrary("panda" + libname, lpath, prefer_static=True)
+                if location is not None:
+                    LibName(target_pkg, location)
+                else:
+                    print(GetColor("cyan") + "Couldn't find library lib" + libname + " in thirdparty directory " + pkg.lower() + GetColor())
 
 
         for d, v in defs.values():
         for d, v in defs.values():
             DefSymbol(target_pkg, d, v)
             DefSymbol(target_pkg, d, v)
@@ -2301,8 +2331,8 @@ def SetupVisualStudioEnviron():
         AddToPathEnv("PATH",    SDK["MSPLATFORM"] + "bin\\" + arch)
         AddToPathEnv("PATH",    SDK["MSPLATFORM"] + "bin\\" + arch)
 
 
         # Windows Kit 10 introduces the "universal CRT".
         # Windows Kit 10 introduces the "universal CRT".
-        inc_dir = SDK["MSPLATFORM"] + "Include\\10.0.10240.0\\"
-        lib_dir = SDK["MSPLATFORM"] + "Lib\\10.0.10240.0\\"
+        inc_dir = SDK["MSPLATFORM"] + "Include\\10.0.10586.0\\"
+        lib_dir = SDK["MSPLATFORM"] + "Lib\\10.0.10586.0\\"
         AddToPathEnv("INCLUDE", inc_dir + "shared")
         AddToPathEnv("INCLUDE", inc_dir + "shared")
         AddToPathEnv("INCLUDE", inc_dir + "ucrt")
         AddToPathEnv("INCLUDE", inc_dir + "ucrt")
         AddToPathEnv("INCLUDE", inc_dir + "um")
         AddToPathEnv("INCLUDE", inc_dir + "um")

+ 1 - 1
panda/src/bullet/bulletContactResult.I

@@ -86,7 +86,7 @@ get_num_contacts() const {
 /**
 /**
  *
  *
  */
  */
-INLINE BulletContact &BulletContactResult::
+INLINE BulletContact BulletContactResult::
 get_contact(int idx) {
 get_contact(int idx) {
 
 
   nassertr(idx >= 0 && idx < (int)_contacts.size(), _empty);
   nassertr(idx >= 0 && idx < (int)_contacts.size(), _empty);

+ 1 - 1
panda/src/bullet/bulletContactResult.h

@@ -62,7 +62,7 @@ struct EXPCL_PANDABULLET BulletContactResult : public btCollisionWorld::ContactR
 
 
 PUBLISHED:
 PUBLISHED:
   INLINE int get_num_contacts() const;
   INLINE int get_num_contacts() const;
-  INLINE BulletContact &get_contact(int idx);
+  INLINE BulletContact get_contact(int idx);
   MAKE_SEQ(get_contacts, get_num_contacts, get_contact);
   MAKE_SEQ(get_contacts, get_num_contacts, get_contact);
 
 
 public:
 public:

+ 1 - 1
panda/src/bullet/bulletHeightfieldShape.I

@@ -18,7 +18,7 @@ INLINE BulletHeightfieldShape::
 ~BulletHeightfieldShape() {
 ~BulletHeightfieldShape() {
 
 
   delete _shape;
   delete _shape;
-  delete _data;
+  delete [] _data;
 }
 }
 
 
 /**
 /**

+ 4 - 0
panda/src/bullet/bulletTriangleMesh.cxx

@@ -134,6 +134,8 @@ add_geom(const Geom *geom, bool remove_duplicate_vertices, const TransformState
       _mesh->addTriangle(v0, v1, v2, remove_duplicate_vertices);
       _mesh->addTriangle(v0, v1, v2, remove_duplicate_vertices);
     }
     }
   }
   }
+
+  delete [] vertices;
 }
 }
 
 
 /**
 /**
@@ -163,6 +165,8 @@ add_array(const PTA_LVecBase3 &points, const PTA_int &indices, bool remove_dupli
 
 
     _mesh->addTriangle(v0, v1, v2, remove_duplicate_vertices);
     _mesh->addTriangle(v0, v1, v2, remove_duplicate_vertices);
   }
   }
+
+  delete [] vertices;
 }
 }
 
 
 /**
 /**

+ 10 - 4
panda/src/chan/animChannelMatrixXfmTable.cxx

@@ -327,10 +327,16 @@ void AnimChannelMatrixXfmTable::
 write_datagram(BamWriter *manager, Datagram &me) {
 write_datagram(BamWriter *manager, Datagram &me) {
   AnimChannelMatrix::write_datagram(manager, me);
   AnimChannelMatrix::write_datagram(manager, me);
 
 
-  if (compress_channels && !FFTCompressor::is_compression_available()) {
-    chan_cat.error()
-      << "Compression is not available; writing uncompressed channels.\n";
-    compress_channels = false;
+  if (compress_channels) {
+    chan_cat.warning()
+      << "FFT compression of animations is deprecated.  For compatibility "
+         "with future versions of Panda3D, set compress-channels to false.\n";
+
+    if (!FFTCompressor::is_compression_available()) {
+      chan_cat.error()
+        << "Compression is not available; writing uncompressed channels.\n";
+      compress_channels = false;
+    }
   }
   }
 
 
   me.add_bool(compress_channels);
   me.add_bool(compress_channels);

+ 10 - 4
panda/src/chan/animChannelScalarTable.cxx

@@ -146,10 +146,16 @@ void AnimChannelScalarTable::
 write_datagram(BamWriter *manager, Datagram &me) {
 write_datagram(BamWriter *manager, Datagram &me) {
   AnimChannelScalar::write_datagram(manager, me);
   AnimChannelScalar::write_datagram(manager, me);
 
 
-  if (compress_channels && !FFTCompressor::is_compression_available()) {
-    chan_cat.error()
-      << "Compression is not available; writing uncompressed channels.\n";
-    compress_channels = false;
+  if (compress_channels) {
+    chan_cat.warning()
+      << "FFT compression of animations is deprecated.  For compatibility "
+         "with future versions of Panda3D, set compress-channels to false.\n";
+
+    if (!FFTCompressor::is_compression_available()) {
+      chan_cat.error()
+        << "Compression is not available; writing uncompressed channels.\n";
+      compress_channels = false;
+    }
   }
   }
 
 
   me.add_bool(compress_channels);
   me.add_bool(compress_channels);

+ 11 - 0
panda/src/chan/config_chan.cxx

@@ -138,4 +138,15 @@ ConfigureFn(config_chan) {
   AnimChannelScalarTable::register_with_read_factory();
   AnimChannelScalarTable::register_with_read_factory();
   AnimChannelScalarDynamic::register_with_read_factory();
   AnimChannelScalarDynamic::register_with_read_factory();
   AnimPreloadTable::register_with_read_factory();
   AnimPreloadTable::register_with_read_factory();
+
+  // For compatibility with old .bam files.
+#ifndef STDFLOAT_DOUBLE
+  TypeRegistry *reg = TypeRegistry::ptr();
+  reg->record_alternate_name(AnimChannelFixed<ACMatrixSwitchType>::get_class_type(),
+                             "AnimChannelFixed<LMatrix4f>");
+  reg->record_alternate_name(MovingPart<ACMatrixSwitchType>::get_class_type(),
+                             "MovingPart<LMatrix4f>");
+  reg->record_alternate_name(MovingPart<ACScalarSwitchType>::get_class_type(),
+                             "MovingPart<float>");
+#endif
 }
 }

+ 9 - 9
panda/src/display/drawableRegion.I

@@ -17,10 +17,10 @@
 INLINE DrawableRegion::
 INLINE DrawableRegion::
 DrawableRegion() :
 DrawableRegion() :
   _screenshot_buffer_type(RenderBuffer::T_front),
   _screenshot_buffer_type(RenderBuffer::T_front),
-  _draw_buffer_type(RenderBuffer::T_back)
+  _draw_buffer_type(RenderBuffer::T_back),
+  _clear_mask(0)
 {
 {
-  for (int i=0; i<RTP_COUNT; i++) {
-    _clear_active[i] = false;
+  for (int i = 0; i < RTP_COUNT; ++i) {
     _clear_value[i] = LColor(0.0f, 0.0f, 0.0f, 0.0f);
     _clear_value[i] = LColor(0.0f, 0.0f, 0.0f, 0.0f);
   }
   }
   _clear_value[RTP_depth] = LColor(1.0f,1.0f,1.0f,1.0f);
   _clear_value[RTP_depth] = LColor(1.0f,1.0f,1.0f,1.0f);
@@ -35,11 +35,11 @@ INLINE DrawableRegion::
 DrawableRegion(const DrawableRegion &copy) :
 DrawableRegion(const DrawableRegion &copy) :
   _screenshot_buffer_type(copy._screenshot_buffer_type),
   _screenshot_buffer_type(copy._screenshot_buffer_type),
   _draw_buffer_type(copy._draw_buffer_type),
   _draw_buffer_type(copy._draw_buffer_type),
+  _clear_mask(copy._clear_mask),
   _pixel_zoom(copy._pixel_zoom),
   _pixel_zoom(copy._pixel_zoom),
   _pixel_factor(copy._pixel_factor)
   _pixel_factor(copy._pixel_factor)
 {
 {
-  for (int i=0; i<RTP_COUNT; i++) {
-    _clear_active[i] = copy._clear_active[i];
+  for (int i = 0; i < RTP_COUNT; ++i) {
     _clear_value[i] = copy._clear_value[i];
     _clear_value[i] = copy._clear_value[i];
   }
   }
 }
 }
@@ -51,8 +51,8 @@ INLINE void DrawableRegion::
 operator = (const DrawableRegion &copy) {
 operator = (const DrawableRegion &copy) {
   _screenshot_buffer_type = copy._screenshot_buffer_type;
   _screenshot_buffer_type = copy._screenshot_buffer_type;
   _draw_buffer_type = copy._draw_buffer_type;
   _draw_buffer_type = copy._draw_buffer_type;
-  for (int i=0; i<RTP_COUNT; i++) {
-    _clear_active[i] = copy._clear_active[i];
+  _clear_mask = copy._clear_mask;
+  for (int i = 0; i < RTP_COUNT; ++i) {
     _clear_value[i] = copy._clear_value[i];
     _clear_value[i] = copy._clear_value[i];
   }
   }
   _pixel_zoom = copy._pixel_zoom;
   _pixel_zoom = copy._pixel_zoom;
@@ -64,8 +64,8 @@ operator = (const DrawableRegion &copy) {
  */
  */
 INLINE void DrawableRegion::
 INLINE void DrawableRegion::
 copy_clear_settings(const DrawableRegion &copy) {
 copy_clear_settings(const DrawableRegion &copy) {
-  for (int i=0; i<RTP_COUNT; i++) {
-    _clear_active[i] = copy._clear_active[i];
+  _clear_mask = copy._clear_mask;
+  for (int i = 0; i < RTP_COUNT; ++i) {
     _clear_value[i] = copy._clear_value[i];
     _clear_value[i] = copy._clear_value[i];
   }
   }
   update_pixel_factor();
   update_pixel_factor();

+ 10 - 13
panda/src/display/drawableRegion.cxx

@@ -27,8 +27,12 @@ DrawableRegion::
  */
  */
 void DrawableRegion::
 void DrawableRegion::
 set_clear_active(int n, bool clear_active) {
 set_clear_active(int n, bool clear_active) {
-  nassertv((n >= 0)&&(n < RTP_COUNT));
-  _clear_active[n] = clear_active;
+  nassertv(n >= 0 && n < RTP_COUNT);
+  if (clear_active) {
+    _clear_mask |= 1 << n;
+  } else {
+    _clear_mask &= ~(1 << n);
+  }
   update_pixel_factor();
   update_pixel_factor();
 }
 }
 
 
@@ -37,8 +41,8 @@ set_clear_active(int n, bool clear_active) {
  */
  */
 bool DrawableRegion::
 bool DrawableRegion::
 get_clear_active(int n) const {
 get_clear_active(int n) const {
-  nassertr((n >= 0)&&(n < RTP_COUNT), false);
-  return _clear_active[n];
+  nassertr(n >= 0 && n < RTP_COUNT, false);
+  return (_clear_mask & (1 << n)) != 0;
 }
 }
 
 
 /**
 /**
@@ -66,9 +70,7 @@ get_clear_value(int n) const {
  */
  */
 void DrawableRegion::
 void DrawableRegion::
 disable_clears() {
 disable_clears() {
-  for (int i = 0; i < RTP_COUNT; ++i) {
-    _clear_active[i] = false;
-  }
+  _clear_mask = 0;
   update_pixel_factor();
   update_pixel_factor();
 }
 }
 
 
@@ -79,12 +81,7 @@ disable_clears() {
  */
  */
 bool DrawableRegion::
 bool DrawableRegion::
 is_any_clear_active() const {
 is_any_clear_active() const {
-  for (int i = 0; i < RTP_COUNT; ++i) {
-    if (get_clear_active(i)) {
-      return true;
-    }
-  }
-  return false;
+  return (_clear_mask != 0);
 }
 }
 
 
 /**
 /**

+ 1 - 1
panda/src/display/drawableRegion.h

@@ -109,9 +109,9 @@ protected:
 protected:
 protected:
   int _screenshot_buffer_type;
   int _screenshot_buffer_type;
   int _draw_buffer_type;
   int _draw_buffer_type;
+  int _clear_mask;
 
 
 private:
 private:
-  bool    _clear_active[RTP_COUNT];
   LColor  _clear_value[RTP_COUNT];
   LColor  _clear_value[RTP_COUNT];
 
 
   PN_stdfloat _pixel_zoom;
   PN_stdfloat _pixel_zoom;

+ 7 - 0
panda/src/display/frameBufferProperties.cxx

@@ -480,6 +480,13 @@ get_quality(const FrameBufferProperties &reqs) const {
     quality -= 10000000;
     quality -= 10000000;
   }
   }
 
 
+  // Deduct for software-only renderers in absence of a special request.
+  // Cost: 2,000,000
+
+  if (get_force_software() && !reqs.get_force_software()) {
+    quality -= 2000000;
+  }
+
   // Deduct for missing depth, color, alpha, stencil, or accum.  Cost:
   // Deduct for missing depth, color, alpha, stencil, or accum.  Cost:
   // 1,000,000
   // 1,000,000
 
 

+ 3 - 0
panda/src/display/graphicsEngine.cxx

@@ -415,6 +415,9 @@ make_output(GraphicsPipe *pipe,
           if (flags & GraphicsPipe::BF_fb_props_optional) {
           if (flags & GraphicsPipe::BF_fb_props_optional) {
             display_cat.warning()
             display_cat.warning()
               << "FrameBufferProperties available less than requested.\n";
               << "FrameBufferProperties available less than requested.\n";
+            display_cat.warning(false)
+              << "  requested: " << fb_prop << "\n"
+              << "  got: " << window->get_fb_properties() << "\n";
             return window;
             return window;
           }
           }
           display_cat.error()
           display_cat.error()

+ 9 - 0
panda/src/display/graphicsStateGuardian.I

@@ -684,6 +684,15 @@ get_max_color_targets() const {
   return _max_color_targets;
   return _max_color_targets;
 }
 }
 
 
+/**
+ * Returns true if dual source (incoming1_color and incoming1_alpha) blend
+ * operands are supported by this GSG.
+ */
+INLINE bool GraphicsStateGuardian::
+get_supports_dual_source_blending() const {
+  return _supports_dual_source_blending;
+}
+
 /**
 /**
  * Deprecated.  Use get_max_color_targets() instead, which returns the exact
  * Deprecated.  Use get_max_color_targets() instead, which returns the exact
  * same value.
  * same value.

+ 5 - 1
panda/src/display/graphicsStateGuardian.cxx

@@ -246,6 +246,7 @@ GraphicsStateGuardian(CoordinateSystem internal_coordinate_system,
 
 
   // Assume a maximum of 1 render target in absence of MRT.
   // Assume a maximum of 1 render target in absence of MRT.
   _max_color_targets = 1;
   _max_color_targets = 1;
+  _supports_dual_source_blending = false;
 
 
   _supported_geom_rendering = 0;
   _supported_geom_rendering = 0;
 
 
@@ -2195,7 +2196,10 @@ begin_draw_primitives(const GeomPipelineReader *geom_reader,
                       bool force) {
                       bool force) {
   _munger = munger;
   _munger = munger;
   _data_reader = data_reader;
   _data_reader = data_reader;
-  return _data_reader->has_vertex();
+
+  // Always draw if we have a shader, since the shader might use a different
+  // mechanism for fetching vertex data.
+  return _data_reader->has_vertex() || (_target_shader && _target_shader->has_shader());
 }
 }
 
 
 /**
 /**

+ 3 - 0
panda/src/display/graphicsStateGuardian.h

@@ -172,6 +172,7 @@ PUBLISHED:
 
 
   INLINE int get_max_color_targets() const;
   INLINE int get_max_color_targets() const;
   INLINE int get_maximum_simultaneous_render_targets() const;
   INLINE int get_maximum_simultaneous_render_targets() const;
+  INLINE bool get_supports_dual_source_blending() const;
 
 
   MAKE_PROPERTY(max_vertices_per_array, get_max_vertices_per_array);
   MAKE_PROPERTY(max_vertices_per_array, get_max_vertices_per_array);
   MAKE_PROPERTY(max_vertices_per_primitive, get_max_vertices_per_primitive);
   MAKE_PROPERTY(max_vertices_per_primitive, get_max_vertices_per_primitive);
@@ -217,6 +218,7 @@ PUBLISHED:
   MAKE_PROPERTY(supports_timer_query, get_supports_timer_query);
   MAKE_PROPERTY(supports_timer_query, get_supports_timer_query);
   MAKE_PROPERTY(timer_queries_active, get_timer_queries_active);
   MAKE_PROPERTY(timer_queries_active, get_timer_queries_active);
   MAKE_PROPERTY(max_color_targets, get_max_color_targets);
   MAKE_PROPERTY(max_color_targets, get_max_color_targets);
+  MAKE_PROPERTY(supports_dual_source_blending, get_supports_dual_source_blending);
 
 
   INLINE ShaderModel get_shader_model() const;
   INLINE ShaderModel get_shader_model() const;
   INLINE void set_shader_model(ShaderModel shader_model);
   INLINE void set_shader_model(ShaderModel shader_model);
@@ -609,6 +611,7 @@ protected:
   bool _supports_indirect_draw;
   bool _supports_indirect_draw;
 
 
   int _max_color_targets;
   int _max_color_targets;
+  bool _supports_dual_source_blending;
 
 
   int  _supported_geom_rendering;
   int  _supported_geom_rendering;
   bool _color_scale_via_lighting;
   bool _color_scale_via_lighting;

+ 12 - 0
panda/src/downloader/socketStream.h

@@ -126,6 +126,10 @@ public:
   INLINE ISocketStream(streambuf *buf);
   INLINE ISocketStream(streambuf *buf);
   virtual ~ISocketStream();
   virtual ~ISocketStream();
 
 
+#if _MSC_VER >= 1800
+  INLINE ISocketStream(const ISocketStream &copy) = delete;
+#endif
+
 PUBLISHED:
 PUBLISHED:
   enum ReadState {
   enum ReadState {
     RS_initial,
     RS_initial,
@@ -155,6 +159,10 @@ class EXPCL_PANDAEXPRESS OSocketStream : public ostream, public SSWriter {
 public:
 public:
   INLINE OSocketStream(streambuf *buf);
   INLINE OSocketStream(streambuf *buf);
 
 
+#if _MSC_VER >= 1800
+  INLINE OSocketStream(const OSocketStream &copy) = delete;
+#endif
+
 PUBLISHED:
 PUBLISHED:
   virtual bool is_closed() = 0;
   virtual bool is_closed() = 0;
   virtual void close() = 0;
   virtual void close() = 0;
@@ -170,6 +178,10 @@ class EXPCL_PANDAEXPRESS SocketStream : public iostream, public SSReader, public
 public:
 public:
   INLINE SocketStream(streambuf *buf);
   INLINE SocketStream(streambuf *buf);
 
 
+#if _MSC_VER >= 1800
+  INLINE SocketStream(const SocketStream &copy) = delete;
+#endif
+
 PUBLISHED:
 PUBLISHED:
   virtual bool is_closed() = 0;
   virtual bool is_closed() = 0;
   virtual void close() = 0;
   virtual void close() = 0;

+ 5 - 0
panda/src/dxgsg9/config_dxgsg9.cxx

@@ -265,3 +265,8 @@ init_libdxgsg9() {
   PandaSystem *ps = PandaSystem::get_global_ptr();
   PandaSystem *ps = PandaSystem::get_global_ptr();
   ps->add_system("DirectX9");
   ps->add_system("DirectX9");
 }
 }
+
+// Necessary to allow use of dxerr from MSVC 2015
+#if _MSC_VER >= 1900
+int (WINAPIV * __vsnprintf)(char *, size_t, const char*, va_list) = _vsnprintf;
+#endif

+ 55 - 31
panda/src/dxgsg9/dxGraphicsStateGuardian9.cxx

@@ -3766,43 +3766,24 @@ do_issue_blending() {
     }
     }
   }
   }
 
 
-  const ColorBlendAttrib *target_color_blend = DCAST(ColorBlendAttrib, _target_rs->get_attrib_def(ColorBlendAttrib::get_class_slot()));
-  CPT(ColorBlendAttrib) color_blend = target_color_blend;
-  ColorBlendAttrib::Mode color_blend_mode = target_color_blend->get_mode();
+  const ColorBlendAttrib *color_blend;
+  _target_rs->get_attrib_def(color_blend);
+  ColorBlendAttrib::Mode color_blend_mode = color_blend->get_mode();
 
 
-  const TransparencyAttrib *target_transparency = DCAST(TransparencyAttrib, _target_rs->get_attrib_def(TransparencyAttrib::get_class_slot()));
+  const TransparencyAttrib *target_transparency;
+  _target_rs->get_attrib_def(target_transparency);
   TransparencyAttrib::Mode transparency_mode = target_transparency->get_mode();
   TransparencyAttrib::Mode transparency_mode = target_transparency->get_mode();
 
 
   // Is there a color blend set?
   // Is there a color blend set?
   if (color_blend_mode != ColorBlendAttrib::M_none) {
   if (color_blend_mode != ColorBlendAttrib::M_none) {
     set_render_state(D3DRS_ALPHABLENDENABLE, TRUE);
     set_render_state(D3DRS_ALPHABLENDENABLE, TRUE);
-
-    switch (color_blend_mode) {
-    case ColorBlendAttrib::M_add:
-      set_render_state(D3DRS_BLENDOP, D3DBLENDOP_ADD);
-      break;
-
-    case ColorBlendAttrib::M_subtract:
-      set_render_state(D3DRS_BLENDOP, D3DBLENDOP_SUBTRACT);
-      break;
-
-    case ColorBlendAttrib::M_inv_subtract:
-      set_render_state(D3DRS_BLENDOP, D3DBLENDOP_REVSUBTRACT);
-      break;
-
-    case ColorBlendAttrib::M_min:
-      set_render_state(D3DRS_BLENDOP, D3DBLENDOP_MIN);
-      break;
-
-    case ColorBlendAttrib::M_max:
-      set_render_state(D3DRS_BLENDOP, D3DBLENDOP_MAX);
-      break;
-    }
-
-    set_render_state(D3DRS_SRCBLEND,
-        get_blend_func(color_blend->get_operand_a()));
-    set_render_state(D3DRS_DESTBLEND,
-        get_blend_func(color_blend->get_operand_b()));
+    set_render_state(D3DRS_SEPARATEALPHABLENDENABLE, TRUE);
+    set_render_state(D3DRS_BLENDOP, get_blend_mode(color_blend_mode));
+    set_render_state(D3DRS_BLENDOPALPHA, get_blend_mode(color_blend->get_alpha_mode()));
+    set_render_state(D3DRS_SRCBLEND, get_blend_func(color_blend->get_operand_a()));
+    set_render_state(D3DRS_DESTBLEND, get_blend_func(color_blend->get_operand_b()));
+    set_render_state(D3DRS_SRCBLENDALPHA, get_blend_func(color_blend->get_alpha_operand_a()));
+    set_render_state(D3DRS_DESTBLENDALPHA, get_blend_func(color_blend->get_alpha_operand_b()));
     return;
     return;
   }
   }
 
 
@@ -3817,6 +3798,7 @@ do_issue_blending() {
   case TransparencyAttrib::M_multisample_mask:
   case TransparencyAttrib::M_multisample_mask:
   case TransparencyAttrib::M_dual:
   case TransparencyAttrib::M_dual:
     set_render_state(D3DRS_ALPHABLENDENABLE, TRUE);
     set_render_state(D3DRS_ALPHABLENDENABLE, TRUE);
+    set_render_state(D3DRS_SEPARATEALPHABLENDENABLE, FALSE);
     set_render_state(D3DRS_BLENDOP, D3DBLENDOP_ADD);
     set_render_state(D3DRS_BLENDOP, D3DBLENDOP_ADD);
     set_render_state(D3DRS_SRCBLEND, D3DBLEND_SRCALPHA);
     set_render_state(D3DRS_SRCBLEND, D3DBLEND_SRCALPHA);
     set_render_state(D3DRS_DESTBLEND, D3DBLEND_INVSRCALPHA);
     set_render_state(D3DRS_DESTBLEND, D3DBLEND_INVSRCALPHA);
@@ -3824,6 +3806,7 @@ do_issue_blending() {
 
 
   case TransparencyAttrib::M_premultiplied_alpha:
   case TransparencyAttrib::M_premultiplied_alpha:
     set_render_state(D3DRS_ALPHABLENDENABLE, TRUE);
     set_render_state(D3DRS_ALPHABLENDENABLE, TRUE);
+    set_render_state(D3DRS_SEPARATEALPHABLENDENABLE, FALSE);
     set_render_state(D3DRS_BLENDOP, D3DBLENDOP_ADD);
     set_render_state(D3DRS_BLENDOP, D3DBLENDOP_ADD);
     set_render_state(D3DRS_SRCBLEND, D3DBLEND_ONE);
     set_render_state(D3DRS_SRCBLEND, D3DBLEND_ONE);
     set_render_state(D3DRS_DESTBLEND, D3DBLEND_INVSRCALPHA);
     set_render_state(D3DRS_DESTBLEND, D3DBLEND_INVSRCALPHA);
@@ -4052,6 +4035,33 @@ get_light_color(Light *light) const {
   return *(D3DCOLORVALUE *)cf.get_data();
   return *(D3DCOLORVALUE *)cf.get_data();
 }
 }
 
 
+/**
+ * Maps from ColorBlendAttrib::Mode to D3DBLENDOP value.
+ */
+D3DBLENDOP DXGraphicsStateGuardian9::
+get_blend_mode(ColorBlendAttrib::Mode mode) {
+  switch (mode) {
+  case ColorBlendAttrib::M_add:
+    return D3DBLENDOP_ADD;
+
+  case ColorBlendAttrib::M_subtract:
+    return D3DBLENDOP_SUBTRACT;
+
+  case ColorBlendAttrib::M_inv_subtract:
+    return D3DBLENDOP_REVSUBTRACT;
+
+  case ColorBlendAttrib::M_min:
+    return D3DBLENDOP_MIN;
+
+  case ColorBlendAttrib::M_max:
+    return D3DBLENDOP_MAX;
+  }
+
+  dxgsg9_cat.error()
+    << "Unknown color blend mode " << (int)mode << endl;
+  return D3DBLENDOP_ADD;
+}
+
 /**
 /**
  * Maps from ColorBlendAttrib::Operand to D3DBLEND value.
  * Maps from ColorBlendAttrib::Operand to D3DBLEND value.
  */
  */
@@ -4106,6 +4116,20 @@ get_blend_func(ColorBlendAttrib::Operand operand) {
 
 
   case ColorBlendAttrib::O_incoming_color_saturate:
   case ColorBlendAttrib::O_incoming_color_saturate:
     return D3DBLEND_SRCALPHASAT;
     return D3DBLEND_SRCALPHASAT;
+
+  case ColorBlendAttrib::O_incoming1_color:
+    return (D3DBLEND)16; //D3DBLEND_SRCCOLOR2;
+
+  case ColorBlendAttrib::O_one_minus_incoming1_color:
+    return (D3DBLEND)17; //D3DBLEND_INVSRCCOLOR2;
+
+  case ColorBlendAttrib::O_incoming1_alpha:
+    // Not supported by DX9.
+    return (D3DBLEND)18;
+
+  case ColorBlendAttrib::O_one_minus_incoming1_alpha:
+    // Not supported by DX9.
+    return (D3DBLEND)19;
   }
   }
 
 
   dxgsg9_cat.error()
   dxgsg9_cat.error()

+ 1 - 0
panda/src/dxgsg9/dxGraphicsStateGuardian9.h

@@ -217,6 +217,7 @@ protected:
   const D3DCOLORVALUE &get_light_color(Light *light) const;
   const D3DCOLORVALUE &get_light_color(Light *light) const;
   INLINE static D3DTRANSFORMSTATETYPE get_tex_mat_sym(int stage_index);
   INLINE static D3DTRANSFORMSTATETYPE get_tex_mat_sym(int stage_index);
 
 
+  static D3DBLENDOP get_blend_mode(ColorBlendAttrib::Mode mode);
   static D3DBLEND get_blend_func(ColorBlendAttrib::Operand operand);
   static D3DBLEND get_blend_func(ColorBlendAttrib::Operand operand);
   void report_texmgr_stats();
   void report_texmgr_stats();
 
 

+ 53 - 0
panda/src/express/zStreamBuf.cxx

@@ -170,6 +170,59 @@ close_write() {
   }
   }
 }
 }
 
 
+/**
+ * Implements seeking within the stream.  ZStreamBuf only allows seeking back
+ * to the beginning of the stream.
+ */
+streampos ZStreamBuf::
+seekoff(streamoff off, ios_seekdir dir, ios_openmode which) {
+  // Necessary for tellg() to work after seeking to 0.
+  if (dir == ios::cur && off == 0) {
+    if (_source->tellg() == 0) {
+      return 0;
+    } else {
+      return -1;
+    }
+  }
+
+  if (off != 0 || dir != ios::beg) {
+    // We only know how to reposition to the beginning.
+    return -1;
+  }
+
+  if (which != ios::in) {
+    // We can only do this with the input stream.
+    return -1;
+  }
+
+  size_t n = egptr() - gptr();
+  gbump(n);
+
+  _source->seekg(0, ios::beg);
+  if (_source->tellg() == 0) {
+    _z_source.next_in = Z_NULL;
+    _z_source.avail_in = 0;
+    _z_source.next_out = Z_NULL;
+    _z_source.avail_out = 0;
+    int result = inflateReset(&_z_source);
+    if (result < 0) {
+      show_zlib_error("inflateReset", result, _z_source);
+    }
+    return 0;
+  }
+
+  return -1;
+}
+
+/**
+ * Implements seeking within the stream.  ZStreamBuf only allows seeking back
+ * to the beginning of the stream.
+ */
+streampos ZStreamBuf::
+seekpos(streampos pos, ios_openmode which) {
+  return seekoff(pos, ios::beg, which);
+}
+
 /**
 /**
  * Called by the system ostream implementation when its internal buffer is
  * Called by the system ostream implementation when its internal buffer is
  * filled, plus one character.
  * filled, plus one character.

+ 3 - 0
panda/src/express/zStreamBuf.h

@@ -35,6 +35,9 @@ public:
   void open_write(ostream *dest, bool owns_dest, int compression_level);
   void open_write(ostream *dest, bool owns_dest, int compression_level);
   void close_write();
   void close_write();
 
 
+  virtual streampos seekoff(streamoff off, ios_seekdir dir, ios_openmode which);
+  virtual streampos seekpos(streampos pos, ios_openmode which);
+
 protected:
 protected:
   virtual int overflow(int c);
   virtual int overflow(int c);
   virtual int sync();
   virtual int sync();

+ 15 - 5
panda/src/gles2gsg/gles2gsg.h

@@ -80,8 +80,6 @@ typedef char GLchar;
 #define GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_EXT GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS
 #define GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_EXT GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS
 #define GL_FRAMEBUFFER_INCOMPLETE_FORMATS_EXT GL_FRAMEBUFFER_INCOMPLETE_FORMATS
 #define GL_FRAMEBUFFER_INCOMPLETE_FORMATS_EXT GL_FRAMEBUFFER_INCOMPLETE_FORMATS
 #define GL_DEPTH_ATTACHMENT_EXT GL_DEPTH_ATTACHMENT
 #define GL_DEPTH_ATTACHMENT_EXT GL_DEPTH_ATTACHMENT
-#define GL_COLOR_ATTACHMENT0_EXT GL_COLOR_ATTACHMENT0
-#define GL_COLOR_ATTACHMENT1_EXT (GL_COLOR_ATTACHMENT0 + 1)
 #define GL_STENCIL_ATTACHMENT_EXT GL_STENCIL_ATTACHMENT
 #define GL_STENCIL_ATTACHMENT_EXT GL_STENCIL_ATTACHMENT
 #define GL_DEPTH_STENCIL GL_DEPTH_STENCIL_OES
 #define GL_DEPTH_STENCIL GL_DEPTH_STENCIL_OES
 #define GL_DEPTH_STENCIL_EXT GL_DEPTH_STENCIL_OES
 #define GL_DEPTH_STENCIL_EXT GL_DEPTH_STENCIL_OES
@@ -89,7 +87,6 @@ typedef char GLchar;
 #define GL_DEPTH24_STENCIL8_EXT GL_DEPTH24_STENCIL8_OES
 #define GL_DEPTH24_STENCIL8_EXT GL_DEPTH24_STENCIL8_OES
 #define GL_DEPTH_COMPONENT24 GL_DEPTH_COMPONENT24_OES
 #define GL_DEPTH_COMPONENT24 GL_DEPTH_COMPONENT24_OES
 #define GL_DEPTH_COMPONENT32 GL_DEPTH_COMPONENT32_OES
 #define GL_DEPTH_COMPONENT32 GL_DEPTH_COMPONENT32_OES
-#define GL_TEXTURE_3D GL_TEXTURE_3D_OES
 #define GL_MAX_3D_TEXTURE_SIZE GL_MAX_3D_TEXTURE_SIZE_OES
 #define GL_MAX_3D_TEXTURE_SIZE GL_MAX_3D_TEXTURE_SIZE_OES
 #define GL_SAMPLER_3D GL_SAMPLER_3D_OES
 #define GL_SAMPLER_3D GL_SAMPLER_3D_OES
 #define GL_BGRA GL_BGRA_EXT
 #define GL_BGRA GL_BGRA_EXT
@@ -121,8 +118,21 @@ typedef char GLchar;
 #define GL_COMPARE_R_TO_TEXTURE_ARB GL_COMPARE_REF_TO_TEXTURE_EXT
 #define GL_COMPARE_R_TO_TEXTURE_ARB GL_COMPARE_REF_TO_TEXTURE_EXT
 #define GL_SAMPLER_2D_SHADOW GL_SAMPLER_2D_SHADOW_EXT
 #define GL_SAMPLER_2D_SHADOW GL_SAMPLER_2D_SHADOW_EXT
 #define GL_MAX_DRAW_BUFFERS GL_MAX_DRAW_BUFFERS_NV
 #define GL_MAX_DRAW_BUFFERS GL_MAX_DRAW_BUFFERS_NV
-#define GL_COMPRESSED_RGBA_S3TC_DXT3_EXT GL_COMPRESSED_RGBA_S3TC_DXT3_ANGLE
-#define GL_COMPRESSED_RGBA_S3TC_DXT5_EXT GL_COMPRESSED_RGBA_S3TC_DXT5_ANGLE
+#define GL_SRC1_COLOR GL_SRC1_COLOR_EXT
+#define GL_ONE_MINUS_SRC1_COLOR GL_ONE_MINUS_SRC1_COLOR_EXT
+#define GL_SRC1_ALPHA GL_SRC1_ALPHA_EXT
+#define GL_ONE_MINUS_SRC1_ALPHA GL_ONE_MINUS_SRC1_ALPHA_EXT
+
+#define GL_DEBUG_OUTPUT_SYNCHRONOUS GL_DEBUG_OUTPUT_SYNCHRONOUS_KHR
+#define GL_DEBUG_TYPE_PERFORMANCE GL_DEBUG_TYPE_PERFORMANCE_KHR
+#define GL_DEBUG_SEVERITY_HIGH GL_DEBUG_SEVERITY_HIGH_KHR
+#define GL_DEBUG_SEVERITY_MEDIUM GL_DEBUG_SEVERITY_MEDIUM_KHR
+#define GL_DEBUG_SEVERITY_LOW GL_DEBUG_SEVERITY_LOW_KHR
+#define GL_DEBUG_SEVERITY_NOTIFICATION GL_DEBUG_SEVERITY_NOTIFICATION_KHR
+#define GL_BUFFER GL_BUFFER_KHR
+#define GL_SHADER GL_SHADER_KHR
+#define GL_PROGRAM GL_PROGRAM_KHR
+#define GL_DEBUG_OUTPUT GL_DEBUG_OUTPUT_KHR
 
 
 // For GLES 3 compat - need a better solution for this
 // For GLES 3 compat - need a better solution for this
 #define GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT 0x1
 #define GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT 0x1

File diff suppressed because it is too large
+ 760 - 1039
panda/src/gles2gsg/panda_esgl2ext.h


+ 4 - 5
panda/src/glstuff/glGraphicsBuffer_src.cxx

@@ -902,7 +902,7 @@ bind_slot(int layer, bool rb_resize, Texture **attach, RenderTexturePlane slot,
             }
             }
           } else {
           } else {
             if (_fb_properties.get_color_bits() > 16 * 3) {
             if (_fb_properties.get_color_bits() > 16 * 3) {
-              gl_format = GL_RGBA32F_ARB;
+              gl_format = GL_RGB32F_ARB;
             } else if (_fb_properties.get_color_bits() > 8 * 3) {
             } else if (_fb_properties.get_color_bits() > 8 * 3) {
               gl_format = GL_RGB16_EXT;
               gl_format = GL_RGB16_EXT;
             } else {
             } else {
@@ -920,11 +920,11 @@ bind_slot(int layer, bool rb_resize, Texture **attach, RenderTexturePlane slot,
             }
             }
           } else {
           } else {
             if (_fb_properties.get_color_bits() > 16 * 3) {
             if (_fb_properties.get_color_bits() > 16 * 3) {
-              gl_format = GL_RGB32F_ARB;
+              gl_format = GL_RGBA32F_ARB;
             } else if (_fb_properties.get_color_bits() > 8 * 3) {
             } else if (_fb_properties.get_color_bits() > 8 * 3) {
-              gl_format = GL_RGB16_EXT;
+              gl_format = GL_RGBA16_EXT;
             } else {
             } else {
-              gl_format = GL_RGB;
+              gl_format = GL_RGBA;
             }
             }
           }
           }
         }
         }
@@ -1090,7 +1090,6 @@ bind_slot_multisample(bool rb_resize, Texture **attach, RenderTexturePlane slot,
                                         GL_RENDERBUFFER_EXT, _rbm[slot]);
                                         GL_RENDERBUFFER_EXT, _rbm[slot]);
     }
     }
   } else {
   } else {
-    Texture *Tex = attach[slot];
     GLuint gl_format = GL_RGBA;
     GLuint gl_format = GL_RGBA;
 #ifndef OPENGLES
 #ifndef OPENGLES
     switch (slot) {
     switch (slot) {

+ 251 - 65
panda/src/glstuff/glGraphicsStateGuardian_src.cxx

@@ -129,16 +129,23 @@ null_glActiveTexture(GLenum gl_texture_stage) {
 
 
 #ifdef OPENGLES_2
 #ifdef OPENGLES_2
 #define _glBlendEquation glBlendEquation
 #define _glBlendEquation glBlendEquation
+#define _glBlendEquationSeparate glBlendEquationSeparate
+#define _glBlendFuncSeparate glBlendFuncSeparate
 #define _glBlendColor glBlendColor
 #define _glBlendColor glBlendColor
 #else
 #else
 static void APIENTRY
 static void APIENTRY
 null_glBlendEquation(GLenum) {
 null_glBlendEquation(GLenum) {
 }
 }
-#endif
+
+static void APIENTRY
+null_glBlendFuncSeparate(GLenum src, GLenum dest, GLenum, GLenum) {
+  glBlendFunc(src, dest);
+}
 
 
 static void APIENTRY
 static void APIENTRY
 null_glBlendColor(GLclampf, GLclampf, GLclampf, GLclampf) {
 null_glBlendColor(GLclampf, GLclampf, GLclampf, GLclampf) {
 }
 }
+#endif
 
 
 #ifndef OPENGLES_1
 #ifndef OPENGLES_1
 // We have a default shader that will be applied when there isn't any shader
 // We have a default shader that will be applied when there isn't any shader
@@ -1954,6 +1961,9 @@ reset() {
 #endif
 #endif
 
 
 #ifdef OPENGLES_1
 #ifdef OPENGLES_1
+  _supports_framebuffer_multisample = false;
+  _supports_framebuffer_blit = false;
+
   if (has_extension("GL_OES_framebuffer_object")) {
   if (has_extension("GL_OES_framebuffer_object")) {
     _supports_framebuffer_object = true;
     _supports_framebuffer_object = true;
     _glIsRenderbuffer = (PFNGLISRENDERBUFFEROESPROC)
     _glIsRenderbuffer = (PFNGLISRENDERBUFFEROESPROC)
@@ -2011,9 +2021,76 @@ reset() {
   _glGetFramebufferAttachmentParameteriv = glGetFramebufferAttachmentParameteriv;
   _glGetFramebufferAttachmentParameteriv = glGetFramebufferAttachmentParameteriv;
   _glGenerateMipmap = glGenerateMipmap;
   _glGenerateMipmap = glGenerateMipmap;
 
 
-#else
-  // TODO: add ARB3.0 version
-  if (has_extension("GL_EXT_framebuffer_object")) {
+  if (is_at_least_gles_version(3, 0)) {
+    _supports_framebuffer_multisample = true;
+    _supports_framebuffer_blit = true;
+
+    _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)
+      get_extension_func("glRenderbufferStorageMultisample");
+    _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFEREXTPROC)
+      get_extension_func("glBlitFramebuffer");
+  } else {
+    if (has_extension("GL_ANGLE_framebuffer_multisample")) {
+      _supports_framebuffer_multisample = true;
+      _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEANGLEPROC)
+        get_extension_func("glRenderbufferStorageMultisampleANGLE");
+    } else {
+      _supports_framebuffer_multisample = false;
+    }
+    if (has_extension("GL_ANGLE_framebuffer_blit")) {
+      _supports_framebuffer_blit = true;
+      _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFERANGLEPROC)
+        get_extension_func("glBlitFramebufferANGLE");
+    } else {
+      _supports_framebuffer_blit = false;
+    }
+  }
+#else  // Desktop OpenGL case.
+  if (is_at_least_gl_version(3, 0) || has_extension("GL_ARB_framebuffer_object")) {
+    _supports_framebuffer_object = true;
+    _supports_framebuffer_multisample = true;
+    _supports_framebuffer_blit = true;
+
+    _glIsRenderbuffer = (PFNGLISRENDERBUFFERPROC)
+      get_extension_func("glIsRenderbuffer");
+    _glBindRenderbuffer = (PFNGLBINDRENDERBUFFERPROC)
+      get_extension_func("glBindRenderbuffer");
+    _glDeleteRenderbuffers = (PFNGLDELETERENDERBUFFERSPROC)
+      get_extension_func("glDeleteRenderbuffers");
+    _glGenRenderbuffers = (PFNGLGENRENDERBUFFERSPROC)
+      get_extension_func("glGenRenderbuffers");
+    _glRenderbufferStorage = (PFNGLRENDERBUFFERSTORAGEPROC)
+      get_extension_func("glRenderbufferStorage");
+    _glGetRenderbufferParameteriv = (PFNGLGETRENDERBUFFERPARAMETERIVPROC)
+      get_extension_func("glGetRenderbufferParameteriv");
+    _glIsFramebuffer = (PFNGLISFRAMEBUFFERPROC)
+      get_extension_func("glIsFramebuffer");
+    _glBindFramebuffer = (PFNGLBINDFRAMEBUFFERPROC)
+      get_extension_func("glBindFramebuffer");
+    _glDeleteFramebuffers = (PFNGLDELETEFRAMEBUFFERSPROC)
+      get_extension_func("glDeleteFramebuffers");
+    _glGenFramebuffers = (PFNGLGENFRAMEBUFFERSPROC)
+      get_extension_func("glGenFramebuffers");
+    _glCheckFramebufferStatus = (PFNGLCHECKFRAMEBUFFERSTATUSPROC)
+      get_extension_func("glCheckFramebufferStatus");
+    _glFramebufferTexture1D = (PFNGLFRAMEBUFFERTEXTURE1DPROC)
+      get_extension_func("glFramebufferTexture1D");
+    _glFramebufferTexture2D = (PFNGLFRAMEBUFFERTEXTURE2DPROC)
+      get_extension_func("glFramebufferTexture2D");
+    _glFramebufferTexture3D = (PFNGLFRAMEBUFFERTEXTURE3DPROC)
+      get_extension_func("glFramebufferTexture3D");
+    _glFramebufferRenderbuffer = (PFNGLFRAMEBUFFERRENDERBUFFERPROC)
+      get_extension_func("glFramebufferRenderbuffer");
+    _glGetFramebufferAttachmentParameteriv = (PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC)
+      get_extension_func("glGetFramebufferAttachmentParameteriv");
+    _glGenerateMipmap = (PFNGLGENERATEMIPMAPPROC)
+      get_extension_func("glGenerateMipmap");
+    _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC)
+      get_extension_func("glRenderbufferStorageMultisampleEXT");
+    _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFERPROC)
+      get_extension_func("glBlitFramebuffer");
+
+  } else if (has_extension("GL_EXT_framebuffer_object")) {
     _supports_framebuffer_object = true;
     _supports_framebuffer_object = true;
     _glIsRenderbuffer = (PFNGLISRENDERBUFFEREXTPROC)
     _glIsRenderbuffer = (PFNGLISRENDERBUFFEREXTPROC)
       get_extension_func("glIsRenderbufferEXT");
       get_extension_func("glIsRenderbufferEXT");
@@ -2050,14 +2127,25 @@ reset() {
     _glGenerateMipmap = (PFNGLGENERATEMIPMAPEXTPROC)
     _glGenerateMipmap = (PFNGLGENERATEMIPMAPEXTPROC)
       get_extension_func("glGenerateMipmapEXT");
       get_extension_func("glGenerateMipmapEXT");
 
 
-  } else if (is_at_least_gl_version(3, 0)) {
-    // This case should go away when we support the ARB/3.0 version of FBOs.
-    _supports_framebuffer_object = false;
-    _glGenerateMipmap = (PFNGLGENERATEMIPMAPPROC)
-      get_extension_func("glGenerateMipmap");
+    if (has_extension("GL_EXT_framebuffer_multisample")) {
+      _supports_framebuffer_multisample = true;
+      _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)
+        get_extension_func("glRenderbufferStorageMultisampleEXT");
+    } else {
+      _supports_framebuffer_multisample = false;
+    }
+    if (has_extension("GL_EXT_framebuffer_blit")) {
+      _supports_framebuffer_blit = true;
+      _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFEREXTPROC)
+        get_extension_func("glBlitFramebufferEXT");
+    } else {
+      _supports_framebuffer_blit = false;
+    }
 
 
   } else {
   } else {
     _supports_framebuffer_object = false;
     _supports_framebuffer_object = false;
+    _supports_framebuffer_multisample = false;
+    _supports_framebuffer_blit = false;
     _glGenerateMipmap = NULL;
     _glGenerateMipmap = NULL;
   }
   }
 #endif
 #endif
@@ -2086,49 +2174,16 @@ reset() {
   }
   }
 #endif  // !OPENGLES_1
 #endif  // !OPENGLES_1
 
 
-  _supports_framebuffer_multisample = false;
-  if (is_at_least_gles_version(3, 0)) {
-    _supports_framebuffer_multisample = true;
-    _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)
-      get_extension_func("glRenderbufferStorageMultisample");
-
-#ifdef OPENGLES
-  } else if (has_extension("GL_APPLE_framebuffer_multisample")) {
-    _supports_framebuffer_multisample = true;
-    _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEAPPLEPROC)
-      get_extension_func("glRenderbufferStorageMultisampleAPPLE");
-#else
-  } else if (has_extension("GL_EXT_framebuffer_multisample")) {
-    _supports_framebuffer_multisample = true;
-    _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)
-      get_extension_func("glRenderbufferStorageMultisampleEXT");
-#endif
-  }
-
 #ifndef OPENGLES
 #ifndef OPENGLES
   _supports_framebuffer_multisample_coverage_nv = false;
   _supports_framebuffer_multisample_coverage_nv = false;
-  if (has_extension("GL_NV_framebuffer_multisample_coverage")) {
+  if (_supports_framebuffer_multisample &&
+      has_extension("GL_NV_framebuffer_multisample_coverage")) {
     _supports_framebuffer_multisample_coverage_nv = true;
     _supports_framebuffer_multisample_coverage_nv = true;
     _glRenderbufferStorageMultisampleCoverage = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLECOVERAGENVPROC)
     _glRenderbufferStorageMultisampleCoverage = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLECOVERAGENVPROC)
       get_extension_func("glRenderbufferStorageMultisampleCoverageNV");
       get_extension_func("glRenderbufferStorageMultisampleCoverageNV");
   }
   }
 #endif
 #endif
 
 
-#ifndef OPENGLES_1
-  _supports_framebuffer_blit = false;
-
-  if (is_at_least_gles_version(3, 0)) {
-    _supports_framebuffer_blit = true;
-    _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFEREXTPROC)
-      get_extension_func("glBlitFramebuffer");
-
-  } else if (has_extension("GL_EXT_framebuffer_blit")) {
-    _supports_framebuffer_blit = true;
-    _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFEREXTPROC)
-      get_extension_func("glBlitFramebufferEXT");
-  }
-#endif
-
 #if defined(OPENGLES_1)
 #if defined(OPENGLES_1)
   _glDrawBuffers = NULL;
   _glDrawBuffers = NULL;
   _max_color_targets = 1;
   _max_color_targets = 1;
@@ -2296,29 +2351,115 @@ reset() {
   }
   }
 #endif
 #endif
 
 
-  // In OpenGL ES 2.x, this is supported in the core.
-#ifndef OPENGLES_2
-  _glBlendEquation = NULL;
-  bool supports_blend_equation = false;
+#ifdef OPENGLES_1
+  // In OpenGL ES 1, blending is supported via extensions.
+  if (has_extension("GL_OES_blend_subtract")) {
+    _glBlendEquation = (PFNGLBLENDEQUATIONPROC)
+      get_extension_func("glBlendEquationOES");
+
+    if (_glBlendEquation == NULL) {
+      _glBlendEquation = null_glBlendEquation;
+      GLCAT.warning()
+        << "BlendEquationOES advertised as supported by OpenGL ES runtime, but "
+           "could not get pointer to extension function.\n";
+    }
+  } else {
+    _glBlendEquation = null_glBlendEquation;
+  }
+
+  // Separate RGB/alpha blend equations are optional in OpenGL ES 1 and are
+  // exposed through GL_OES_blend_equation_separate.
+  if (has_extension("GL_OES_blend_equation_separate")) {
+    _glBlendEquationSeparate = (PFNGLBLENDEQUATIONSEPARATEOESPROC)
+      get_extension_func("glBlendEquationSeparateOES");
+
+    // Test the pointer that was just fetched.  (_glBlendEquation has already
+    // been given a non-NULL fallback above, so checking it here would never
+    // detect a failed lookup and would leave the support flag set while the
+    // function pointer is NULL.)
+    if (_glBlendEquationSeparate == NULL) {
+      _supports_blend_equation_separate = false;
+      GLCAT.warning()
+        << "BlendEquationSeparateOES advertised as supported by OpenGL ES "
+           "runtime, but could not get pointer to extension function.\n";
+    } else {
+      _supports_blend_equation_separate = true;
+    }
+  } else {
+    _supports_blend_equation_separate = false;
+    _glBlendEquationSeparate = NULL;
+  }
+
+  if (has_extension("GL_OES_blend_func_separate")) {
+    _glBlendFuncSeparate = (PFNGLBLENDFUNCSEPARATEOESPROC)
+      get_extension_func("glBlendFuncSeparateOES");
+
+    if (_glBlendFuncSeparate == NULL) {
+      _glBlendFuncSeparate = null_glBlendFuncSeparate;
+      GLCAT.warning()
+        << "BlendFuncSeparateOES advertised as supported by OpenGL ES runtime, but "
+           "could not get pointer to extension function.\n";
+    }
+  } else {
+    _glBlendFuncSeparate = null_glBlendFuncSeparate;
+  }
+
+#elif defined(OPENGLES)
+  // In OpenGL ES 2.x and above, this is supported in the core.
+  _supports_blend_equation_separate = false;
+
+#else
   if (is_at_least_gl_version(1, 2)) {
   if (is_at_least_gl_version(1, 2)) {
-    supports_blend_equation = true;
     _glBlendEquation = (PFNGLBLENDEQUATIONPROC)
     _glBlendEquation = (PFNGLBLENDEQUATIONPROC)
       get_extension_func("glBlendEquation");
       get_extension_func("glBlendEquation");
-  } else if (has_extension("GL_OES_blend_subtract")) {
-    supports_blend_equation = true;
-    _glBlendEquation = (PFNGLBLENDEQUATIONPROC)
-      get_extension_func("glBlendEquationOES");
+
   } else if (has_extension("GL_EXT_blend_minmax")) {
   } else if (has_extension("GL_EXT_blend_minmax")) {
-    supports_blend_equation = true;
     _glBlendEquation = (PFNGLBLENDEQUATIONPROC)
     _glBlendEquation = (PFNGLBLENDEQUATIONPROC)
       get_extension_func("glBlendEquationEXT");
       get_extension_func("glBlendEquationEXT");
+
+  } else {
+    _glBlendEquation = null_glBlendEquation;
   }
   }
-  if (supports_blend_equation && _glBlendEquation == NULL) {
-    GLCAT.warning()
-      << "BlendEquation advertised as supported by OpenGL runtime, but could not get pointers to extension function.\n";
-  }
+
   if (_glBlendEquation == NULL) {
   if (_glBlendEquation == NULL) {
     _glBlendEquation = null_glBlendEquation;
     _glBlendEquation = null_glBlendEquation;
+    GLCAT.warning()
+      << "BlendEquation advertised as supported by OpenGL runtime, but could "
+         "not get pointer to extension function.\n";
+  }
+
+  if (is_at_least_gl_version(2, 0)) {
+    _supports_blend_equation_separate = true;
+    _glBlendEquationSeparate = (PFNGLBLENDEQUATIONSEPARATEPROC)
+      get_extension_func("glBlendEquationSeparate");
+
+  } else if (has_extension("GL_EXT_blend_equation_separate")) {
+    _supports_blend_equation_separate = true;
+    _glBlendEquationSeparate = (PFNGLBLENDEQUATIONSEPARATEEXTPROC)
+      get_extension_func("glBlendEquationSeparateEXT");
+
+  } else {
+    _supports_blend_equation_separate = false;
+    _glBlendEquationSeparate = NULL;
+  }
+
+  if (_supports_blend_equation_separate && _glBlendEquationSeparate == NULL) {
+    _supports_blend_equation_separate = false;
+    GLCAT.warning()
+      << "BlendEquationSeparate advertised as supported by OpenGL runtime, "
+         "but could not get pointer to extension function.\n";
+  }
+
+  if (is_at_least_gl_version(1, 4)) {
+    _glBlendFuncSeparate = (PFNGLBLENDFUNCSEPARATEPROC)
+      get_extension_func("glBlendFuncSeparate");
+
+  } else if (has_extension("GL_EXT_blend_func_separate")) {
+    _glBlendFuncSeparate = (PFNGLBLENDFUNCSEPARATEEXTPROC)
+      get_extension_func("glBlendFuncSeparateEXT");
+
+  } else {
+    _glBlendFuncSeparate = null_glBlendFuncSeparate;
+  }
+
+  if (_glBlendFuncSeparate == NULL) {
+    _glBlendFuncSeparate = null_glBlendFuncSeparate;
+    GLCAT.warning()
+      << "BlendFuncSeparate advertised as supported by OpenGL runtime, but could not get pointers to extension function.\n";
   }
   }
 #endif
 #endif
 
 
@@ -2344,6 +2485,15 @@ reset() {
   }
   }
 #endif
 #endif
 
 
+#ifdef OPENGLES_1
+  // OpenGL ES 1 doesn't support dual-source blending.
+#elif defined(OPENGLES)
+  _supports_dual_source_blending = has_extension("GL_EXT_blend_func_extended");
+#else
+  _supports_dual_source_blending =
+    is_at_least_gl_version(3, 3) || has_extension("GL_ARB_blend_func_extended");
+#endif
+
 #ifdef OPENGLES
 #ifdef OPENGLES
   _edge_clamp = GL_CLAMP_TO_EDGE;
   _edge_clamp = GL_CLAMP_TO_EDGE;
 #else
 #else
@@ -6914,6 +7064,7 @@ do_issue_blending() {
   _target_rs->get_attrib_def(target_color_blend);
   _target_rs->get_attrib_def(target_color_blend);
   CPT(ColorBlendAttrib) color_blend = target_color_blend;
   CPT(ColorBlendAttrib) color_blend = target_color_blend;
   ColorBlendAttrib::Mode color_blend_mode = target_color_blend->get_mode();
   ColorBlendAttrib::Mode color_blend_mode = target_color_blend->get_mode();
+  ColorBlendAttrib::Mode alpha_blend_mode = target_color_blend->get_alpha_mode();
 
 
   const TransparencyAttrib *target_transparency;
   const TransparencyAttrib *target_transparency;
   _target_rs->get_attrib_def(target_transparency);
   _target_rs->get_attrib_def(target_transparency);
@@ -6926,9 +7077,17 @@ do_issue_blending() {
     enable_multisample_alpha_one(false);
     enable_multisample_alpha_one(false);
     enable_multisample_alpha_mask(false);
     enable_multisample_alpha_mask(false);
     enable_blend(true);
     enable_blend(true);
-    _glBlendEquation(get_blend_equation_type(color_blend_mode));
-    glBlendFunc(get_blend_func(color_blend->get_operand_a()),
-                get_blend_func(color_blend->get_operand_b()));
+
+    if (_supports_blend_equation_separate) {
+      _glBlendEquationSeparate(get_blend_equation_type(color_blend_mode),
+                               get_blend_equation_type(alpha_blend_mode));
+    } else {
+      _glBlendEquation(get_blend_equation_type(color_blend_mode));
+    }
+    _glBlendFuncSeparate(get_blend_func(color_blend->get_operand_a()),
+                         get_blend_func(color_blend->get_operand_b()),
+                         get_blend_func(color_blend->get_alpha_operand_a()),
+                         get_blend_func(color_blend->get_alpha_operand_b()));
 
 
 #ifndef OPENGLES_1
 #ifndef OPENGLES_1
     LColor c;
     LColor c;
@@ -6943,9 +7102,17 @@ do_issue_blending() {
 #endif
 #endif
 
 
     if (GLCAT.is_spam()) {
     if (GLCAT.is_spam()) {
-      GLCAT.spam() << "glBlendEquation(" << color_blend_mode << ")\n";
-      GLCAT.spam() << "glBlendFunc(" << color_blend->get_operand_a()
-                                     << color_blend->get_operand_b() << ")\n";
+      if (_supports_blend_equation_separate) {
+        GLCAT.spam() << "glBlendEquationSeparate(" << color_blend_mode << ", "
+                                                   << alpha_blend_mode << ")\n";
+      } else {
+        GLCAT.spam() << "glBlendEquation(" << color_blend_mode << ")\n";
+      }
+      GLCAT.spam() << "glBlendFuncSeparate("
+                   << color_blend->get_operand_a() << ", "
+                   << color_blend->get_operand_b() << ", "
+                   << color_blend->get_alpha_operand_a() << ", "
+                   << color_blend->get_alpha_operand_b() << ")\n";
 #ifndef OPENGLES_1
 #ifndef OPENGLES_1
       GLCAT.spam() << "glBlendColor(" << c << ")\n";
       GLCAT.spam() << "glBlendColor(" << c << ")\n";
 #endif
 #endif
@@ -9325,6 +9492,13 @@ get_blend_func(ColorBlendAttrib::Operand operand) {
   case ColorBlendAttrib::O_one_minus_constant_alpha:
   case ColorBlendAttrib::O_one_minus_constant_alpha:
   case ColorBlendAttrib::O_one_minus_alpha_scale:
   case ColorBlendAttrib::O_one_minus_alpha_scale:
     break;
     break;
+
+  // No dual-source blending, either.
+  case ColorBlendAttrib::O_incoming1_color:
+  case ColorBlendAttrib::O_one_minus_incoming1_color:
+  case ColorBlendAttrib::O_incoming1_alpha:
+  case ColorBlendAttrib::O_one_minus_incoming1_alpha:
+    break;
 #else
 #else
   case ColorBlendAttrib::O_constant_color:
   case ColorBlendAttrib::O_constant_color:
   case ColorBlendAttrib::O_color_scale:
   case ColorBlendAttrib::O_color_scale:
@@ -9341,6 +9515,18 @@ get_blend_func(ColorBlendAttrib::Operand operand) {
   case ColorBlendAttrib::O_one_minus_constant_alpha:
   case ColorBlendAttrib::O_one_minus_constant_alpha:
   case ColorBlendAttrib::O_one_minus_alpha_scale:
   case ColorBlendAttrib::O_one_minus_alpha_scale:
     return GL_ONE_MINUS_CONSTANT_ALPHA;
     return GL_ONE_MINUS_CONSTANT_ALPHA;
+
+  case ColorBlendAttrib::O_incoming1_color:
+    return GL_SRC1_COLOR;
+
+  case ColorBlendAttrib::O_one_minus_incoming1_color:
+    return GL_ONE_MINUS_SRC1_COLOR;
+
+  case ColorBlendAttrib::O_incoming1_alpha:
+    return GL_SRC1_ALPHA;
+
+  case ColorBlendAttrib::O_one_minus_incoming1_alpha:
+    return GL_ONE_MINUS_SRC1_ALPHA;
 #endif
 #endif
 
 
   case ColorBlendAttrib::O_incoming_color_saturate:
   case ColorBlendAttrib::O_incoming_color_saturate:
@@ -11337,7 +11523,7 @@ upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps) {
   }
   }
 
 
   if (needs_reload && gtc->_immutable) {
   if (needs_reload && gtc->_immutable) {
-    GLCAT.warning() << "Attempt to modify texture with immutable storage, recreating texture.\n";
+    GLCAT.info() << "Attempt to modify texture with immutable storage, recreating texture.\n";
     gtc->reset_data();
     gtc->reset_data();
     glBindTexture(target, gtc->_index);
     glBindTexture(target, gtc->_index);
 
 

+ 6 - 0
panda/src/glstuff/glGraphicsStateGuardian_src.h

@@ -141,6 +141,8 @@ typedef void (APIENTRYP PFNGLTEXSTORAGE3DPROC) (GLenum target, GLsizei levels, G
 typedef void (APIENTRYP PFNGLBINDVERTEXARRAYPROC) (GLuint array);
 typedef void (APIENTRYP PFNGLBINDVERTEXARRAYPROC) (GLuint array);
 typedef void (APIENTRYP PFNGLDELETEVERTEXARRAYSPROC) (GLsizei n, const GLuint *arrays);
 typedef void (APIENTRYP PFNGLDELETEVERTEXARRAYSPROC) (GLsizei n, const GLuint *arrays);
 typedef void (APIENTRYP PFNGLGENVERTEXARRAYSPROC) (GLsizei n, GLuint *arrays);
 typedef void (APIENTRYP PFNGLGENVERTEXARRAYSPROC) (GLsizei n, GLuint *arrays);
+typedef void (APIENTRYP PFNGLBLENDEQUATIONSEPARATEPROC) (GLenum modeRGB, GLenum modeAlpha);
+typedef void (APIENTRYP PFNGLBLENDFUNCSEPARATEPROC) (GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorAlpha, GLenum dfactorAlpha);
 
 
 #ifndef OPENGLES_1
 #ifndef OPENGLES_1
 // GLSL shader functions
 // GLSL shader functions
@@ -817,8 +819,12 @@ public:
   PFNGLBUFFERSTORAGEPROC _glBufferStorage;
   PFNGLBUFFERSTORAGEPROC _glBufferStorage;
 #endif
 #endif
 
 
+  bool _supports_blend_equation_separate;
 #ifndef OPENGLES_2
 #ifndef OPENGLES_2
+  // OpenGL ES 2+ has these in the core.
   PFNGLBLENDEQUATIONPROC _glBlendEquation;
   PFNGLBLENDEQUATIONPROC _glBlendEquation;
+  PFNGLBLENDEQUATIONSEPARATEPROC _glBlendEquationSeparate;
+  PFNGLBLENDFUNCSEPARATEPROC _glBlendFuncSeparate;
 #endif
 #endif
 #ifndef OPENGLES
 #ifndef OPENGLES
   PFNGLBLENDCOLORPROC _glBlendColor;
   PFNGLBLENDCOLORPROC _glBlendColor;

+ 9 - 8
panda/src/glstuff/glShaderContext_src.cxx

@@ -677,23 +677,24 @@ reflect_uniform(int i, char *name_buffer, GLsizei name_buflen) {
   _glgsg->_glGetActiveUniform(_glsl_program, i, name_buflen, NULL, &param_size, &param_type, name_buffer);
   _glgsg->_glGetActiveUniform(_glsl_program, i, name_buflen, NULL, &param_size, &param_type, name_buffer);
   GLint p = _glgsg->_glGetUniformLocation(_glsl_program, name_buffer);
   GLint p = _glgsg->_glGetUniformLocation(_glsl_program, name_buffer);
 
 
+  if (GLCAT.is_debug()) {
+    GLCAT.debug()
+      << "Active uniform " << name_buffer << " with size " << param_size
+      << " and type 0x" << hex << param_type << dec
+      << " is bound to location " << p << "\n";
+  }
 
 
   // Some NVidia drivers (361.43 for example) (incorrectly) include "internal"
   // Some NVidia drivers (361.43 for example) (incorrectly) include "internal"
   // uniforms in the list starting with "_main_" (for example,
   // uniforms in the list starting with "_main_" (for example,
   // "_main_0_gp5fp[0]") we need to skip those, because we don't know anything
   // "_main_0_gp5fp[0]") we need to skip those, because we don't know anything
   // about them
   // about them
   if (strncmp(name_buffer, "_main_", 6) == 0) {
   if (strncmp(name_buffer, "_main_", 6) == 0) {
-    GLCAT.warning() << "Ignoring uniform " << name_buffer << " which may be generated by buggy Nvidia driver.\n";
+    if (GLCAT.is_debug()) {
+      GLCAT.debug() << "Ignoring uniform " << name_buffer << " which may be generated by buggy Nvidia driver.\n";
+    }
     return;
     return;
   }
   }
 
 
-  if (GLCAT.is_debug()) {
-    GLCAT.debug()
-      << "Active uniform " << name_buffer << " with size " << param_size
-      << " and type 0x" << hex << param_type << dec
-      << " is bound to location " << p << "\n";
-  }
-
   if (p < 0) {
   if (p < 0) {
     // Special meaning, or it's in a uniform block.  Let it go.
     // Special meaning, or it's in a uniform block.  Let it go.
     return;
     return;

+ 2 - 1
panda/src/gobj/geomPrimitive.cxx

@@ -2231,7 +2231,8 @@ get_num_primitives() const {
  */
  */
 bool GeomPrimitivePipelineReader::
 bool GeomPrimitivePipelineReader::
 check_valid(const GeomVertexDataPipelineReader *data_reader) const {
 check_valid(const GeomVertexDataPipelineReader *data_reader) const {
-  if (get_num_vertices() != 0  &&
+  if (get_num_vertices() != 0 &&
+      data_reader->get_num_arrays() > 0 &&
       get_max_vertex() >= data_reader->get_num_rows()) {
       get_max_vertex() >= data_reader->get_num_rows()) {
 
 
 #ifndef NDEBUG
 #ifndef NDEBUG

+ 9 - 0
panda/src/gobj/geomVertexFormat.I

@@ -235,6 +235,15 @@ get_morph_delta(size_t n) const {
   return _morphs[n]._delta;
   return _morphs[n]._delta;
 }
 }
 
 
+/**
+ * Returns a standard vertex format containing no arrays at all, useful for
+ * pull-style vertex rendering.
+ *
+ * The returned pointer is the format held in the registry's _empty member; a
+ * new format is not created on each call.
+ */
+INLINE const GeomVertexFormat *GeomVertexFormat::
+get_empty() {
+  return get_registry()->_empty;
+}
+
 /**
 /**
  * Returns a standard vertex format with just a 3-component vertex position.
  * Returns a standard vertex format with just a 3-component vertex position.
  */
  */

+ 2 - 4
panda/src/gobj/geomVertexFormat.cxx

@@ -890,6 +890,8 @@ Registry() {
  */
  */
 void GeomVertexFormat::Registry::
 void GeomVertexFormat::Registry::
 make_standard_formats() {
 make_standard_formats() {
+  _empty = register_format(new GeomVertexFormat);
+
   _v3 = register_format(new GeomVertexArrayFormat
   _v3 = register_format(new GeomVertexArrayFormat
                         (InternalName::get_vertex(), 3,
                         (InternalName::get_vertex(), 3,
                          NT_stdfloat, C_point));
                          NT_stdfloat, C_point));
@@ -1011,10 +1013,6 @@ register_format(GeomVertexFormat *format) {
     new_format = (*fi);
     new_format = (*fi);
     if (!new_format->is_registered()) {
     if (!new_format->is_registered()) {
       new_format->do_register();
       new_format->do_register();
-      if (new_format->get_num_arrays() == 0) {
-        gobj_cat.warning()
-          << "Empty GeomVertexFormat registered.\n";
-      }
     }
     }
   }
   }
 
 

+ 4 - 0
panda/src/gobj/geomVertexFormat.h

@@ -125,6 +125,8 @@ PUBLISHED:
   void write_with_data(ostream &out, int indent_level,
   void write_with_data(ostream &out, int indent_level,
                        const GeomVertexData *data) const;
                        const GeomVertexData *data) const;
 
 
+  INLINE static const GeomVertexFormat *get_empty();
+
   // Some standard vertex formats.  No particular requirement to use one of
   // Some standard vertex formats.  No particular requirement to use one of
   // these, but the DirectX renderers can use these formats directly, whereas
   // these, but the DirectX renderers can use these formats directly, whereas
   // any other format will have to be converted first.
   // any other format will have to be converted first.
@@ -227,6 +229,8 @@ private:
     Formats _formats;
     Formats _formats;
     LightReMutex _lock;
     LightReMutex _lock;
 
 
+    CPT(GeomVertexFormat) _empty;
+
     CPT(GeomVertexFormat) _v3;
     CPT(GeomVertexFormat) _v3;
     CPT(GeomVertexFormat) _v3n3;
     CPT(GeomVertexFormat) _v3n3;
     CPT(GeomVertexFormat) _v3t2;
     CPT(GeomVertexFormat) _v3t2;

+ 5 - 3
panda/src/gobj/shader.cxx

@@ -2362,15 +2362,14 @@ r_preprocess_source(ostream &out, const Filename &fn,
   bool had_include = false;
   bool had_include = false;
   int lineno = 0;
   int lineno = 0;
   while (getline(*source, line)) {
   while (getline(*source, line)) {
-    // We always forward the actual line - the GLSL compiler will silently
-    // ignore #pragma lines anyway.
     ++lineno;
     ++lineno;
-    out << line << "\n";
 
 
     // Check if this line contains a #pragma.
     // Check if this line contains a #pragma.
     char pragma[64];
     char pragma[64];
     if (line.size() < 8 ||
     if (line.size() < 8 ||
         sscanf(line.c_str(), " # pragma %63s", pragma) != 1) {
         sscanf(line.c_str(), " # pragma %63s", pragma) != 1) {
+      // Just pass the line through unmodified.
+      out << line << "\n";
 
 
       // One exception: check for an #endif after an include.  We have to
       // One exception: check for an #endif after an include.  We have to
       // restore the line number in case the include happened under an #if
       // restore the line number in case the include happened under an #if
@@ -2435,8 +2434,11 @@ r_preprocess_source(ostream &out, const Filename &fn,
 
 
     } else if (strcmp(pragma, "optionNV") == 0) {
     } else if (strcmp(pragma, "optionNV") == 0) {
       // This is processed by NVIDIA drivers.  Don't touch it.
       // This is processed by NVIDIA drivers.  Don't touch it.
+      out << line << "\n";
 
 
     } else {
     } else {
+      // Forward it, the driver will ignore it if it doesn't know it.
+      out << line << "\n";
       shader_cat.warning()
       shader_cat.warning()
         << "Ignoring unknown pragma directive \"" << pragma << "\" at line "
         << "Ignoring unknown pragma directive \"" << pragma << "\" at line "
         << lineno << " of file " << fn << ":\n  " << line << "\n";
         << lineno << " of file " << fn << ":\n  " << line << "\n";

+ 174 - 78
panda/src/gobj/texture.cxx

@@ -830,8 +830,11 @@ set_ram_image_as(CPTA_uchar image, const string &supplied_format) {
         } else if (format.at(s) == 'R') {
         } else if (format.at(s) == 'R') {
           component = 2;
           component = 2;
         } else if (format.at(s) == 'A') {
         } else if (format.at(s) == 'A') {
-          nassertv(cdata->_num_components != 3);
-          component = cdata->_num_components - 1;
+          if (cdata->_num_components != 3) {
+            component = cdata->_num_components - 1;
+          } else {
+            // Ignore.
+          }
         } else if (format.at(s) == '0') {
         } else if (format.at(s) == '0') {
           // Ignore.
           // Ignore.
         } else if (format.at(s) == '1') {
         } else if (format.at(s) == '1') {
@@ -859,8 +862,11 @@ set_ram_image_as(CPTA_uchar image, const string &supplied_format) {
       } else if (format.at(s) == 'R') {
       } else if (format.at(s) == 'R') {
         component = 2;
         component = 2;
       } else if (format.at(s) == 'A') {
       } else if (format.at(s) == 'A') {
-        nassertv(cdata->_num_components != 3);
-        component = cdata->_num_components - 1;
+        if (cdata->_num_components != 3) {
+          component = cdata->_num_components - 1;
+        } else {
+          // Ignore.
+        }
       } else if (format.at(s) == '0') {
       } else if (format.at(s) == '0') {
         // Ignore.
         // Ignore.
       } else if (format.at(s) == '1') {
       } else if (format.at(s) == '1') {
@@ -6088,18 +6094,23 @@ do_get_uncompressed_ram_image(CData *cdata) {
  * Rather than just returning a pointer to the data, like
  * Rather than just returning a pointer to the data, like
  * get_uncompressed_ram_image, this function first processes the data and
  * get_uncompressed_ram_image, this function first processes the data and
  * reorders the components using the specified format string, and places these
  * reorders the components using the specified format string, and places these
- * into a new char array.  The 'format' argument should specify in which order
- * the components of the texture must be.  For example, valid format strings
- * are "RGBA", "GA", "ABRG" or "AAA". A component can also be written as "0"
- * or "1", which means an empty/black or a full/white channel, respectively.
+ * into a new char array.
+ *
+ * The 'format' argument should specify in which order the components of the
+ * texture must be.  For example, valid format strings are "RGBA", "GA",
+ * "ABRG" or "AAA".  A component can also be written as "0" or "1", which
+ * means an empty/black or a full/white channel, respectively.
+ *
  * This function is particularly useful to copy an image in-memory to a
  * This function is particularly useful to copy an image in-memory to a
  * different library (for example, PIL or wxWidgets) that require a different
  * different library (for example, PIL or wxWidgets) that require a different
  * component order than Panda's internal format, BGRA. Note, however, that
  * component order than Panda's internal format, BGRA. Note, however, that
  * this conversion can still be too slow if you want to do it every frame, and
  * this conversion can still be too slow if you want to do it every frame, and
- * should thus be avoided for that purpose.  The only requirement for the
- * reordering is that an uncompressed image must be available.  If the RAM
- * image is compressed, it will attempt to re-load the texture from disk, if
- * it doesn't find an uncompressed image there, it will return NULL.
+ * should thus be avoided for that purpose.
+ *
+ * The only requirement for the reordering is that an uncompressed image must
+ * be available.  If the RAM image is compressed, it will attempt to re-load
+ * the texture from disk, if it doesn't find an uncompressed image there, it
+ * will return NULL.
  */
  */
 CPTA_uchar Texture::
 CPTA_uchar Texture::
 get_ram_image_as(const string &requested_format) {
 get_ram_image_as(const string &requested_format) {
@@ -6125,92 +6136,177 @@ get_ram_image_as(const string &requested_format) {
     return CPTA_uchar(data);
     return CPTA_uchar(data);
   }
   }
 
 
+  // Check if we have an alpha channel, and remember which channel we use.
+  int alpha = -1;
+  if (Texture::has_alpha(cdata->_format)) {
+    alpha = cdata->_num_components - 1;
+  }
+
+  // Validate the format beforehand.
+  for (size_t i = 0; i < format.size(); ++i) {
+    if (format[i] != 'B' && format[i] != 'G' && format[i] != 'R' &&
+        format[i] != 'A' && format[i] != '0' && format[i] != '1') {
+      gobj_cat.error() << "Unexpected component character '"
+        << format[i] << "', expected one of RGBA01!\n";
+      return CPTA_uchar(get_class_type());
+    }
+  }
+
   // Create a new empty array that can hold our image.
   // Create a new empty array that can hold our image.
   PTA_uchar newdata = PTA_uchar::empty_array(imgsize * format.size() * cdata->_component_width, get_class_type());
   PTA_uchar newdata = PTA_uchar::empty_array(imgsize * format.size() * cdata->_component_width, get_class_type());
 
 
   // These ifs are for optimization of commonly used image types.
   // These ifs are for optimization of commonly used image types.
-  if (format == "RGBA" && cdata->_num_components == 4 && cdata->_component_width == 1) {
-    imgsize *= 4;
-    for (int p = 0; p < imgsize; p += 4) {
-      newdata[p    ] = data[p + 2];
-      newdata[p + 1] = data[p + 1];
-      newdata[p + 2] = data[p    ];
-      newdata[p + 3] = data[p + 3];
+  if (cdata->_component_width == 1) {
+    if (format == "RGBA" && cdata->_num_components == 4) {
+      const PN_uint32 *src = (const PN_uint32 *)data.p();
+      PN_uint32 *dst = (PN_uint32 *)newdata.p();
+
+      for (int p = 0; p < imgsize; ++p) {
+        PN_uint32 v = *src++;
+        *dst++ = ((v & 0xff00ff00u)) |
+                 ((v & 0x00ff0000u) >> 16) |
+                 ((v & 0x000000ffu) << 16);
+      }
+      return newdata;
+    }
+    if (format == "RGB" && cdata->_num_components == 4) {
+      const PN_uint32 *src = (const PN_uint32 *)data.p();
+      PN_uint32 *dst = (PN_uint32 *)newdata.p();
+
+      // Convert blocks of 4 pixels at a time, so that we can treat both the
+      // source and destination as 32-bit integers.
+      int blocks = imgsize >> 2;
+      for (int i = 0; i < blocks; ++i) {
+        PN_uint32 v0 = *src++;
+        PN_uint32 v1 = *src++;
+        PN_uint32 v2 = *src++;
+        PN_uint32 v3 = *src++;
+        *dst++ = ((v0 & 0x00ff0000u) >> 16) |
+                 ((v0 & 0x0000ff00u)) |
+                 ((v0 & 0x000000ffu) << 16) |
+                 ((v1 & 0x00ff0000u) << 8);
+        *dst++ = ((v1 & 0x0000ff00u) >> 8) |
+                 ((v1 & 0x000000ffu) << 8) |
+                 ((v2 & 0x00ff0000u)) |
+                 ((v2 & 0x0000ff00u) << 16);
+        *dst++ = ((v2 & 0x000000ffu)) |
+                 ((v3 & 0x00ff0000u) >> 8) |
+                 ((v3 & 0x0000ff00u) << 8) |
+                 ((v3 & 0x000000ffu) << 24);
+      }
+
+      // If the image size wasn't a multiple of 4, we may have a handful of
+      // pixels left over.  Convert those the slower way.
+      PN_uint8 *tail = (PN_uint8 *)dst;
+      for (int i = (imgsize & ~0x3); i < imgsize; ++i) {
+        PN_uint32 v = *src++;
+        *tail++ = (v & 0x00ff0000u) >> 16;
+        *tail++ = (v & 0x0000ff00u) >> 8;
+        *tail++ = (v & 0x000000ffu);
+      }
+      return newdata;
+    }
+    if (format == "BGR" && cdata->_num_components == 4) {
+      const PN_uint32 *src = (const PN_uint32 *)data.p();
+      PN_uint32 *dst = (PN_uint32 *)newdata.p();
+
+      // Convert blocks of 4 pixels at a time, so that we can treat both the
+      // source and destination as 32-bit integers.
+      int blocks = imgsize >> 2;
+      for (int i = 0; i < blocks; ++i) {
+        PN_uint32 v0 = *src++;
+        PN_uint32 v1 = *src++;
+        PN_uint32 v2 = *src++;
+        PN_uint32 v3 = *src++;
+        *dst++ = (v0 & 0x00ffffffu) | ((v1 & 0x000000ffu) << 24);
+        *dst++ = ((v1 & 0x00ffff00u) >> 8) |  ((v2 & 0x0000ffffu) << 16);
+        *dst++ = ((v2 & 0x00ff0000u) >> 16) | ((v3 & 0x00ffffffu) << 8);
+      }
+
+      // If the image size wasn't a multiple of 4, we may have a handful of
+      // pixels left over.  Convert those the slower way.
+      PN_uint8 *tail = (PN_uint8 *)dst;
+      for (int i = (imgsize & ~0x3); i < imgsize; ++i) {
+        PN_uint32 v = *src++;
+        *tail++ = (v & 0x000000ffu);
+        *tail++ = (v & 0x0000ff00u) >> 8;
+        *tail++ = (v & 0x00ff0000u) >> 16;
+      }
+      return newdata;
     }
     }
-    return newdata;
-  }
-  if (format == "RGB" && cdata->_num_components == 3 && cdata->_component_width == 1) {
-    imgsize *= 3;
-    for (int p = 0; p < imgsize; p += 3) {
-      newdata[p    ] = data[p + 2];
-      newdata[p + 1] = data[p + 1];
-      newdata[p + 2] = data[p    ];
+    const PN_uint8 *src = (const PN_uint8 *)data.p();
+    PN_uint8 *dst = (PN_uint8 *)newdata.p();
+
+    if (format == "RGB" && cdata->_num_components == 3) {
+      for (int i = 0; i < imgsize; ++i) {
+        *dst++ = src[2];
+        *dst++ = src[1];
+        *dst++ = src[0];
+        src += 3;
+      }
+      return newdata;
     }
     }
-    return newdata;
-  }
-  if (format == "A" && cdata->_component_width == 1 && cdata->_num_components != 3) {
-    // We can generally rely on alpha to be the last component.
-    int component = cdata->_num_components - 1;
-    for (int p = 0; p < imgsize; ++p) {
-      newdata[p] = data[component];
+    if (format == "A" && cdata->_num_components != 3) {
+      // We can generally rely on alpha to be the last component.
+      for (int p = 0; p < imgsize; ++p) {
+        dst[p] = src[alpha];
+        src += cdata->_num_components;
+      }
+      return newdata;
     }
     }
-    return newdata;
-  }
-  if (cdata->_component_width == 1) {
+    // Fallback case for other 8-bit-per-channel formats.
     for (int p = 0; p < imgsize; ++p) {
     for (int p = 0; p < imgsize; ++p) {
-      for (uchar s = 0; s < format.size(); ++s) {
-        signed char component = -1;
-        if (format.at(s) == 'B' || (cdata->_num_components <= 2 && format.at(s) != 'A')) {
-          component = 0;
-        } else if (format.at(s) == 'G') {
-          component = 1;
-        } else if (format.at(s) == 'R') {
-          component = 2;
-        } else if (format.at(s) == 'A') {
-          nassertr(cdata->_num_components != 3, CPTA_uchar(get_class_type()));
-          component = cdata->_num_components - 1;
-        } else if (format.at(s) == '0') {
-          newdata[p * format.size() + s] = 0x00;
-        } else if (format.at(s) == '1') {
-          newdata[p * format.size() + s] = 0xff;
+      for (size_t i = 0; i < format.size(); ++i) {
+        if (format[i] == 'B' || (cdata->_num_components <= 2 && format[i] != 'A')) {
+          *dst++ = src[0];
+        } else if (format[i] == 'G') {
+          *dst++ = src[1];
+        } else if (format[i] == 'R') {
+          *dst++ = src[2];
+        } else if (format[i] == 'A') {
+          if (alpha >= 0) {
+            *dst++ = src[alpha];
+          } else {
+            *dst++ = 0xff;
+          }
+        } else if (format[i] == '1') {
+          *dst++ = 0xff;
         } else {
         } else {
-          gobj_cat.error() << "Unexpected component character '"
-            << format.at(s) << "', expected one of RGBA!\n";
-          return CPTA_uchar(get_class_type());
-        }
-        if (component >= 0) {
-          newdata[p * format.size() + s] = data[p * cdata->_num_components + component];
+          *dst++ = 0x00;
         }
         }
       }
       }
+      src += cdata->_num_components;
     }
     }
     return newdata;
     return newdata;
   }
   }
+
+  // The slow and general case.
   for (int p = 0; p < imgsize; ++p) {
   for (int p = 0; p < imgsize; ++p) {
-    for (uchar s = 0; s < format.size(); ++s) {
-      signed char component = -1;
-      if (format.at(s) == 'B' || (cdata->_num_components <= 2 && format.at(s) != 'A')) {
+    for (size_t i = 0; i < format.size(); ++i) {
+      int component = 0;
+      if (format[i] == 'B' || (cdata->_num_components <= 2 && format[i] != 'A')) {
         component = 0;
         component = 0;
-      } else if (format.at(s) == 'G') {
+      } else if (format[i] == 'G') {
         component = 1;
         component = 1;
-      } else if (format.at(s) == 'R') {
+      } else if (format[i] == 'R') {
         component = 2;
         component = 2;
-      } else if (format.at(s) == 'A') {
-        nassertr(cdata->_num_components != 3, CPTA_uchar(get_class_type()));
-        component = cdata->_num_components - 1;
-      } else if (format.at(s) == '0') {
-        memset((void*)(newdata + (p * format.size() + s) * cdata->_component_width),  0, cdata->_component_width);
-      } else if (format.at(s) == '1') {
-        memset((void*)(newdata + (p * format.size() + s) * cdata->_component_width), -1, cdata->_component_width);
+      } else if (format[i] == 'A') {
+        if (alpha >= 0) {
+          component = alpha;
+        } else {
+          memset((void*)(newdata + (p * format.size() + i) * cdata->_component_width), -1, cdata->_component_width);
+          continue;
+        }
+      } else if (format[i] == '1') {
+        memset((void*)(newdata + (p * format.size() + i) * cdata->_component_width), -1, cdata->_component_width);
+        continue;
       } else {
       } else {
-        gobj_cat.error() << "Unexpected component character '"
-          << format.at(s) << "', expected one of RGBA!\n";
-        return CPTA_uchar(get_class_type());
-      }
-      if (component >= 0) {
-        memcpy((void*)(newdata + (p * format.size() + s) * cdata->_component_width),
-               (void*)(data + (p * cdata->_num_components + component) * cdata->_component_width),
-               cdata->_component_width);
+        memset((void*)(newdata + (p * format.size() + i) * cdata->_component_width),  0, cdata->_component_width);
+        continue;
       }
       }
+      memcpy((void*)(newdata + (p * format.size() + i) * cdata->_component_width),
+             (void*)(data + (p * cdata->_num_components + component) * cdata->_component_width),
+             cdata->_component_width);
     }
     }
   }
   }
   return newdata;
   return newdata;

+ 2 - 0
panda/src/grutil/config_grutil.cxx

@@ -23,6 +23,7 @@
 #include "nodeVertexTransform.h"
 #include "nodeVertexTransform.h"
 #include "rigidBodyCombiner.h"
 #include "rigidBodyCombiner.h"
 #include "pipeOcclusionCullTraverser.h"
 #include "pipeOcclusionCullTraverser.h"
+#include "shaderTerrainMesh.h"
 
 
 #include "dconfig.h"
 #include "dconfig.h"
 
 
@@ -123,6 +124,7 @@ init_libgrutil() {
   RigidBodyCombiner::init_type();
   RigidBodyCombiner::init_type();
   PipeOcclusionCullTraverser::init_type();
   PipeOcclusionCullTraverser::init_type();
   SceneGraphAnalyzerMeter::init_type();
   SceneGraphAnalyzerMeter::init_type();
+  ShaderTerrainMesh::init_type();
 
 
 #ifdef HAVE_AUDIO
 #ifdef HAVE_AUDIO
   MovieTexture::init_type();
   MovieTexture::init_type();

+ 1 - 0
panda/src/grutil/p3grutil_composite1.cxx

@@ -1,6 +1,7 @@
 #include "cardMaker.cxx"
 #include "cardMaker.cxx"
 #include "heightfieldTesselator.cxx"
 #include "heightfieldTesselator.cxx"
 #include "geoMipTerrain.cxx"
 #include "geoMipTerrain.cxx"
+#include "shaderTerrainMesh.cxx"
 #include "config_grutil.cxx"
 #include "config_grutil.cxx"
 #include "lineSegs.cxx"
 #include "lineSegs.cxx"
 #include "fisheyeMaker.cxx"
 #include "fisheyeMaker.cxx"

+ 191 - 0
panda/src/grutil/shaderTerrainMesh.I

@@ -0,0 +1,191 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file shaderTerrainMesh.I
+ * @author tobspr
+ * @date 2016-02-16
+ */
+
+/**
+ * @brief Sets the path to the heightfield
+ * @details This sets the path to the terrain heightfield. It should be 16bit
+ *   single channel, square, and have a power-of-two resolution of at least 32.
+ *   Common sizes are 2048x2048 or 4096x4096.
+ *
+ *   This only stores the filename; nothing is read from disk here.
+ *
+ * @param filename Path to the heightfield
+ */
+INLINE void ShaderTerrainMesh::set_heightfield_filename(const Filename& filename) {
+  _heightfield_source = filename;
+}
+
+/**
+ * @brief Returns the heightfield path
+ * @details This returns the path of the terrain heightfield, previously set with
+ *   set_heightfield_filename()
+ *
+ * @return Path to the heightfield
+ */
+INLINE const Filename& ShaderTerrainMesh::get_heightfield_filename() const {
+  return _heightfield_source;
+}
+
+/**
+ * @brief Sets the chunk size
+ * @details This sets the chunk size of the terrain. A chunk is basically the
+ *   smallest unit in LOD. If the chunk size is too small, the terrain will
+ *   perform badly, since there will be way too many chunks. If the chunk size
+ *   is too big, you will not get proper LOD, and might also get bad performance.
+ *
+ *   For terrains of the size 4096x4096 or 8192x8192, a chunk size of 32 seems
+ *   to produce good results. For smaller resolutions, you should try out a
+ *   size of 16 or even 8 for very small terrains.
+ *
+ *   The number of chunks generated for the last level is equal to
+ *   (heightfield_size / chunk_size) ** 2.
+ *
+ *   The value is only stored here and not validated. generate() rejects chunk
+ *   sizes which are smaller than 8, not a power of two, or greater than a
+ *   quarter of the heightfield size.
+ *
+ * @param chunk_size Size of the chunks, has to be a power of two
+ */
+INLINE void ShaderTerrainMesh::set_chunk_size(size_t chunk_size) {
+  _chunk_size = chunk_size;
+}
+
+/**
+ * @brief Returns the chunk size
+ * @details This returns the chunk size, previously set with set_chunk_size().
+ *   A newly constructed terrain uses a chunk size of 32.
+ * @return Chunk size
+ */
+INLINE size_t ShaderTerrainMesh::get_chunk_size() const {
+  return _chunk_size;
+}
+
+/**
+ * @brief Sets whether to generate patches
+ * @details If this option is set to true, GeomPatches will be used instead of
+ *   GeomTriangles. This is required when the terrain is used with tesselation
+ *   shaders, since patches are required for tesselation, whereas triangles
+ *   are required for regular rendering.
+ *
+ *   If this option is set to true while not using a tesselation shader, the
+ *   terrain will not get rendered, or even produce errors. The same applies
+ *   when this option is not set, but the terrain is used with tesselation
+ *   shaders.
+ *
+ * @param generate_patches true to build the chunk geometry out of patches,
+ *   false to build it out of triangles
+ */
+INLINE void ShaderTerrainMesh::set_generate_patches(bool generate_patches) {
+  _generate_patches = generate_patches;
+}
+
+/**
+ * @brief Returns whether to generate patches
+ * @details This returns whether patches are generated, previously set with
+ *   set_generate_patches(). A newly constructed terrain has this set to false.
+ *
+ * @return Whether to generate patches
+ */
+INLINE bool ShaderTerrainMesh::get_generate_patches() const {
+  return _generate_patches;
+}
+
+
+/**
+ * @brief Sets the desired triangle width
+ * @details This sets the desired width a triangle should have in pixels.
+ *   A value of 10.0 for example will make the terrain tesselate everything
+ *   in a way that each triangle edge is roughly 10 pixels wide.
+ *   Of course this will not always match exactly, however you can use this
+ *   setting to control the LOD algorithm of the terrain.
+ *
+ * @param target_triangle_width Desired triangle width in pixels
+ */
+INLINE void ShaderTerrainMesh::set_target_triangle_width(PN_stdfloat target_triangle_width) {
+  _target_triangle_width = target_triangle_width;
+}
+
+/**
+ * @brief Returns the target triangle width
+ * @details This returns the target triangle width, previously set with
+ *   ShaderTerrainMesh::set_target_triangle_width(). A newly constructed
+ *   terrain uses a target triangle width of 10.0.
+ *
+ * @return Target triangle width
+ */
+INLINE PN_stdfloat ShaderTerrainMesh::get_target_triangle_width() const {
+  return _target_triangle_width;
+}
+
+
+/**
+ * @brief Sets whether to enable terrain updates
+ * @details This flag controls whether the terrain should be updated. If this
+ *   value is set to false, no updating of the terrain will happen. This can be
+ *   useful to debug the culling algorithm used by the terrain. Updates are
+ *   enabled on a newly constructed terrain.
+ *
+ * @param update_enabled Whether to update the terrain
+ */
+INLINE void ShaderTerrainMesh::set_update_enabled(bool update_enabled) {
+  _update_enabled = update_enabled;
+}
+
+/**
+ * @brief Returns whether the terrain is getting updated
+ * @details This returns whether the terrain is getting updated, previously set
+ *   with set_update_enabled()
+ *
+ * @return Whether to update the terrain
+ */
+INLINE bool ShaderTerrainMesh::get_update_enabled() const {
+  return _update_enabled;
+}
+
+/**
+ * @brief Returns a handle to the heightfield texture
+ * @details This returns a handle to the internally used heightfield texture. This
+ *   can be used to set the heightfield as a shader input.
+ *
+ *   The texture is created while generate() runs; before that, the handle is
+ *   NULL.
+ *
+ * @return Handle to the heightfield texture, or NULL if generate() has not
+ *   created it yet
+ */
+INLINE Texture* ShaderTerrainMesh::get_heightfield_tex() const {
+  return _heightfield_tex;
+}
+
+/**
+ * @brief Clears all children
+ * @details This deletes all four children of the chunk and sets the pointers
+ *   to NULL. Since the destructor of a chunk calls clear_children() again,
+ *   this recursively frees the whole subtree below this chunk. Slots which are
+ *   already NULL are unaffected, so calling this on a leaf is harmless.
+ */
+INLINE void ShaderTerrainMesh::Chunk::clear_children() {
+  for (size_t i = 0; i < 4; ++i) {
+    delete children[i];
+    children[i] = NULL;
+  }
+}
+
+/**
+ * @brief Chunk constructor
+ * @details This constructs a new chunk, and sets all four children to NULL, so
+ *   that clear_children() and the destructor are safe to run on it. No other
+ *   member is assigned here; callers are presumably expected to fill in the
+ *   remaining fields themselves (TODO confirm against the class declaration).
+ */
+INLINE ShaderTerrainMesh::Chunk::Chunk() {
+  for (size_t i = 0; i < 4; ++i)
+    children[i] = NULL;
+}
+
+/**
+ * @brief Chunk destructor
+ * @details This destructs the chunk and deletes all of its children by calling
+ *   clear_children().
+ */
+INLINE ShaderTerrainMesh::Chunk::~Chunk() {
+  clear_children();
+}
+
+/**
+ * @brief Convenience overload taking separate u and v components
+ * @details Packs the two components into a LTexCoord and forwards to the
+ *   LTexCoord overload.
+ *
+ * @param u First texture coordinate component
+ * @param v Second texture coordinate component
+ * @return Result of the LTexCoord overload for (u, v)
+ * @see ShaderTerrainMesh::uv_to_world(LTexCoord)
+ */
+INLINE LPoint3 ShaderTerrainMesh::uv_to_world(PN_stdfloat u, PN_stdfloat v) const {
+  return uv_to_world(LTexCoord(u, v));
+}

+ 715 - 0
panda/src/grutil/shaderTerrainMesh.cxx

@@ -0,0 +1,715 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file shaderTerrainMesh.cxx
+ * @author tobspr
+ * @date 2016-02-16
+ */
+
+
+#include "shaderTerrainMesh.h"
+#include "geom.h"
+#include "geomVertexFormat.h"
+#include "geomVertexData.h"
+#include "geomVertexWriter.h"
+#include "geomNode.h"
+#include "geomTriangles.h"
+#include "geomPatches.h"
+#include "omniBoundingVolume.h"
+#include "cullableObject.h"
+#include "cullTraverser.h"
+#include "cullHandler.h"
+#include "cullTraverserData.h"
+#include "clockObject.h"
+#include "shaderAttrib.h"
+#include "renderAttrib.h"
+#include "shaderInput.h"
+#include "boundingBox.h"
+#include "samplerState.h"
+#include "config_grutil.h"
+#include "typeHandle.h"
+
+ConfigVariableBool stm_use_hexagonal_layout
+("stm-use-hexagonal-layout", true,
+ PRC_DESC("Set this to true to use a hexagonal vertex layout. This approximates "
+          "the heightfield in a better way, however the CLOD transitions might be "
+          "visible due to the vertices not matching exactly."));
+
+ConfigVariableInt stm_max_chunk_count
+("stm-max-chunk-count", 2048,
+ PRC_DESC("Controls the maximum amount of chunks the Terrain can display. If you use "
+          "a high LOD, you might have to increment this value. The lower this value is "
+          "the less data has to be transferred to the GPU."));
+
+ConfigVariableInt stm_max_views
+("stm-max-views", 8,
+ PRC_DESC("Controls the maximum amount of different views the Terrain can be rendered "
+          "with. Each camera rendering the terrain corresponds to a view. Lowering this "
+          "value will reduce the data that has to be transferred to the GPU."));
+
+PStatCollector ShaderTerrainMesh::_basic_collector("Cull:ShaderTerrainMesh:Setup");
+PStatCollector ShaderTerrainMesh::_lod_collector("Cull:ShaderTerrainMesh:CollectLOD");
+
+NotifyCategoryDef(shader_terrain, "");
+
+TypeHandle ShaderTerrainMesh::_type_handle;
+
+/**
+ * @brief Helper function to check for a power of two
+ * @details This function checks for a power of two by using bitmasks.
+ *   (~x + 1) is the two's complement negation of x, so x & (~x + 1) isolates
+ *   the lowest set bit of x. That bit equals x only if x has exactly one bit
+ *   set. Zero is rejected explicitly.
+ *
+ * @param x Number to check
+ * @return 1 if x is a power of two, 0 otherwise (the result of the boolean
+ *   expression, returned as int)
+ */
+int check_power_of_two(size_t x)
+{
+  return ((x != 0) && ((x & (~x + 1)) == x));
+}
+
+/**
+ * @brief Constructs a new Terrain Mesh
+ * @details This constructs a new terrain mesh. By default, no transform is set
+ *   on the mesh, causing it to range over the unit box from (0, 0, 0) to
+ *   (1, 1, 1). Usually you want to set a custom transform with NodePath::set_scale()
+ *
+ *   The defaults are a chunk size of 32, an empty heightfield filename,
+ *   triangles instead of patches, a target triangle width of 10 and updates
+ *   enabled. The data texture, chunk geom and heightfield texture start out as
+ *   NULL.
+ *
+ *   The node is marked as final and gets an omni bounding volume assigned.
+ */
+ShaderTerrainMesh::ShaderTerrainMesh() :
+  PandaNode("ShaderTerrainMesh"),
+  _size(0),
+  _chunk_size(32),
+  _heightfield_source(""),
+  _generate_patches(false),
+  _data_texture(NULL),
+  _chunk_geom(NULL),
+  _current_view_index(0),
+  _last_frame_count(-1),
+  _target_triangle_width(10.0f),
+  _update_enabled(true),
+  _heightfield_tex(NULL)
+{
+  set_final(true);
+  set_bounds(new OmniBoundingVolume());
+}
+
+/**
+ * @brief Generates the terrain mesh
+ * @details This generates the terrain mesh, initializing all chunks of the
+ *   internally used quadtree. At this point, a heightfield and a chunk size
+ *   should have been set.
+ *
+ *   If anything goes wrong, then an error is printed and false is returned.
+ *   This happens if the heightfield cannot be loaded or does not meet the
+ *   requirements, if the chunk size is smaller than 8 or not a power of two,
+ *   or if the chunk size is greater than a quarter of the heightfield size.
+ *
+ *   On success the quadtree is built, the chunk bounds are computed, the chunk
+ *   geom and the data texture are created, and finally the heightfield image
+ *   is converted into a texture.
+ *
+ * @return true if the terrain was initialized, false if an error occurred
+ */
+bool ShaderTerrainMesh::generate() {
+  if (!do_load_heightfield())
+    return false;
+
+  if (_chunk_size < 8 || !check_power_of_two(_chunk_size)) {
+    shader_terrain_cat.error() << "Invalid chunk size! Has to be >= 8 and a power of two!" << endl;
+    return false;
+  }
+
+  if (_chunk_size > _size / 4) {
+    shader_terrain_cat.error() << "Chunk size too close or greater than the actual terrain size!" << endl;
+    return false;
+  }
+
+  do_create_chunks();
+  do_compute_bounds(&_base_chunk);
+  do_create_chunk_geom();
+  do_init_data_texture();
+  do_convert_heightfield();
+
+  return true;
+}
+
+/**
+ * @brief Converts the internally used PNMImage to a Texture
+ * @details This converts the internally used PNMImage to a texture object. The
+ *   reason for this is, that we need the PNMImage for computing the chunk
+ *   bounds, but don't need it afterwards. However, since we have it in ram,
+ *   we can just put its contents into a Texture object, which enables the
+ *   user to call get_heightfield_tex() instead of manually loading the texture
+ *   from disk again to set it as shader input (Panda does not cache PNMImages)
+ *
+ *   The texture keeps its ram image and uses linear filtering. If the image
+ *   has a maxval of 65535, the texture format is set to F_r16, otherwise a
+ *   warning is printed and the format is left as loaded. The PNMImage is
+ *   cleared at the end.
+ */
+void ShaderTerrainMesh::do_convert_heightfield() {
+  _heightfield_tex = new Texture();
+  _heightfield_tex->load(_heightfield);
+  _heightfield_tex->set_keep_ram_image(true);
+
+  if (_heightfield.get_maxval() != 65535) {
+    shader_terrain_cat.warning() << "Using non 16-bit heightfield!" << endl;
+  } else {
+    _heightfield_tex->set_format(Texture::F_r16);
+  }
+  _heightfield_tex->set_minfilter(SamplerState::FT_linear);
+  _heightfield_tex->set_magfilter(SamplerState::FT_linear);
+  _heightfield.clear();
+}
+
+/**
+ * @brief Internal method to load the heightfield
+ * @details This method loads the heightfield from the heightfield path,
+ *   and performs some basic checks: the image has to be square, and its size
+ *   has to be a power of two which is at least 32. On success the size of the
+ *   heightfield is stored in _size. On failure an error is printed.
+ *
+ * @return true if the heightfield was loaded and meets the requirements
+ */
+bool ShaderTerrainMesh::do_load_heightfield() {
+
+  if (!_heightfield.read(_heightfield_source)) {
+    shader_terrain_cat.error() << "Could not load heightfield from " << _heightfield_source << endl;
+    return false;
+  }
+
+  if (_heightfield.get_x_size() != _heightfield.get_y_size()) {
+    // Terminate the message like all other errors do, so that the next log
+    // line does not end up on the same line.
+    shader_terrain_cat.error() << "Only square heightfields are supported!" << endl;
+    return false;
+  }
+
+  _size = _heightfield.get_x_size();
+
+  if (_size < 32 || !check_power_of_two(_size)) {
+    shader_terrain_cat.error() << "Invalid heightfield! Needs to be >= 32 and a power of two (was: "
+         << _size << ")!" << endl;
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * @brief Internal method to init the terrain data texture
+ * @details This method creates the data texture, used to store all chunk data.
+ *   The data texture is set as a shader input later on, and stores the position
+ *   and scale of each chunk. Every row in the data texture denotes a view on
+ *   the terrain.
+ *
+ *   The texture is a 2D float texture in the F_rgba32 format, which is
+ *   stm-max-chunk-count pixels wide and stm-max-views pixels high. It is
+ *   cleared to zero after creation.
+ */
+void ShaderTerrainMesh::do_init_data_texture() {
+  _data_texture = new Texture("TerrainDataTexture");
+  _data_texture->setup_2d_texture(stm_max_chunk_count, stm_max_views, Texture::T_float, Texture::F_rgba32);
+  _data_texture->set_clear_color(LVector4(0));
+  _data_texture->clear_image();
+}
+
+/**
+ * @brief Internal method to init the quadtree
+ * @details This method sets up the base chunk, which covers the whole
+ *   heightfield, and then inits all chunks recursively by using
+ *   ShaderTerrainMesh::do_init_chunk(). Any children left over from a previous
+ *   call are released first.
+ */
+void ShaderTerrainMesh::do_create_chunks() {
+
+  // Release any previously stored children
+  _base_chunk.clear_children();
+
+  // Set up the base chunk. The height values and edges set here are only
+  // initial values.
+  _base_chunk.depth = 0;
+  _base_chunk.x = 0;
+  _base_chunk.y = 0;
+  _base_chunk.size = _size;
+  _base_chunk.edges.set(0, 0, 0, 0);
+  _base_chunk.avg_height = 0.5;
+  _base_chunk.min_height = 0.0;
+  _base_chunk.max_height = 1.0;
+  _base_chunk.last_clod = 0.0;
+  do_init_chunk(&_base_chunk);
+}
+
+/**
+ * @brief Internal method to recursively init the quadtree
+ * @details This method inits the quadtree. Starting from a given node, it
+ *   first examines if that node should be subdivided, which is the case as
+ *   long as the node is bigger than the chunk size.
+ *
+ *   If the node should be subdivided, four children of half the size are
+ *   created and this method is called on the children again. The children are
+ *   stored in the order (0, 0) (1, 0) (0, 1) (1, 1). If the node is a leaf, all
+ *   children are set to NULL and nothing else happens.
+ *
+ *   Only the size, depth and position of the children are assigned here.
+ *
+ *   The chunk parameter must not be NULL, otherwise undefined behaviour occurs.
+ *
+ * @param chunk The parent chunk
+ */
+void ShaderTerrainMesh::do_init_chunk(Chunk* chunk) {
+  if (chunk->size > _chunk_size) {
+
+    // Compute children chunk size
+    size_t child_chunk_size = chunk->size / 2;
+
+    // Subdivide chunk into 4 children
+    for (size_t y = 0; y < 2; ++y) {
+      for (size_t x = 0; x < 2; ++x) {
+        Chunk* child = new Chunk();
+        child->size = child_chunk_size;
+        child->depth = chunk->depth + 1;
+        child->x = chunk->x + x * child_chunk_size;
+        child->y = chunk->y + y * child_chunk_size;
+        do_init_chunk(child);
+        chunk->children[x + 2*y] = child;
+      }
+    }
+  } else {
+    // Final chunk, initialize all children to NULL
+    for (size_t i = 0; i < 4; ++i) {
+      chunk->children[i] = NULL;
+    }
+  }
+}
+
+/**
+ * @brief Recursively computes the bounds for a given chunk
+ * @details This method takes a parent chunk, and computes the bounds recursively,
+ *   depending on whether the chunk is a leaf or a node.
+ *
+ *   If the chunk is a leaf, then the average, min and max values for that chunk
+ *   are computed by iterating over the heightfield region of that chunk. In
+ *   addition, the heights at the four corner pixels of the chunk are stored as
+ *   its edges.
+ *
+ *   If the chunk is a node, this method is called recursively on all children
+ *   first, and after that, the average, min and max values for that chunk
+ *   are computed by merging those values of the children. The edges are taken
+ *   from the matching outer corner of each child.
+ *
+ *   A chunk counts as a leaf if its size equals the chunk size.
+ *
+ *   NOTE(review): heights are normalized by PGM_MAXMAXVAL instead of the maxval
+ *   of the image, which presumably assumes a 16-bit heightfield - confirm.
+ *
+ *   If chunk is NULL, undefined behaviour occurs.
+ *
+ * @param chunk The parent chunk
+ */
+void ShaderTerrainMesh::do_compute_bounds(Chunk* chunk) {
+
+  // Final chunk (Leaf)
+  if (chunk->size == _chunk_size) {
+
+    // Get a pointer to the PNMImage data, this is faster than using get_xel()
+    // for all pixels, since get_xel() also includes bounds checks and so on.
+    xel* data = _heightfield.get_array();
+
+    // Pixel getter function. Note that we have to flip the Y-component, since
+    // panda itself also flips it. Only the blue channel of the pixel is read.
+    // auto get_xel = [&](size_t x, size_t y){ return data[x + (_size - 1 - y) * _size].b / (PN_stdfloat)PGM_MAXMAXVAL; };
+    #define get_xel(x, y) (data[(x) + (_size - 1 - (y)) * _size].b / (PN_stdfloat)PGM_MAXMAXVAL)
+
+    // Iterate over all pixels of the chunk. The min and max start at the
+    // opposite ends of the [0, 1] range, so the first pixel replaces them.
+    PN_stdfloat avg_height = 0.0, min_height = 1.0, max_height = 0.0;
+    for (size_t x = 0; x < _chunk_size; ++x) {
+      for (size_t y = 0; y < _chunk_size; ++y) {
+
+        // Access data directly, to improve performance
+        PN_stdfloat height = get_xel(chunk->x + x, chunk->y + y);
+        avg_height += height;
+        min_height = min(min_height, height);
+        max_height = max(max_height, height);
+      }
+    }
+
+    // Normalize average height by the number of pixels in the chunk
+    avg_height /= _chunk_size * _chunk_size;
+
+    // Store values
+    chunk->min_height = min_height;
+    chunk->max_height = max_height;
+    chunk->avg_height = avg_height;
+
+    // Get edges in the order (0, 0) (1, 0) (0, 1) (1, 1), sampled at the
+    // first and last pixel of the chunk along each axis
+    for (size_t y = 0; y < 2; ++y) {
+      for (size_t x = 0; x < 2; ++x) {
+        chunk->edges.set_cell(x + 2 * y, get_xel(
+            chunk->x + x * (_chunk_size - 1),
+            chunk->y + y * (_chunk_size - 1)
+          ));
+      }
+    }
+
+    #undef get_xel
+
+  } else {
+
+    // Reset heights
+    chunk->avg_height = 0.0;
+    chunk->min_height = 1.0;
+    chunk->max_height = 0.0;
+
+    // Perform bounds computation for every child and merge the children values.
+    // Each child contributes a quarter of its average to the parent average.
+    for (size_t i = 0; i < 4; ++i) {
+      do_compute_bounds(chunk->children[i]);
+      chunk->avg_height += chunk->children[i]->avg_height / 4.0;
+      chunk->min_height = min(chunk->min_height, chunk->children[i]->min_height);
+      chunk->max_height = max(chunk->max_height, chunk->children[i]->max_height);
+    }
+
+    // Also take the edge points from the children: each child provides the
+    // corner it shares with this chunk
+    chunk->edges.set_x(chunk->children[0]->edges.get_x());
+    chunk->edges.set_y(chunk->children[1]->edges.get_y());
+    chunk->edges.set_z(chunk->children[2]->edges.get_z());
+    chunk->edges.set_w(chunk->children[3]->edges.get_w());
+  }
+}
+
+/**
+ * @brief Internal method to create the chunk geom
+ * @details This method generates the internally used base chunk. The base chunk
+ *   geom is used to render the actual terrain, and will get instanced for
+ *   every chunk.
+ *
+ *   The chunk has a size of (size+3) * (size+3) vertices, since additional
+ *   triangles are inserted at the borders to prevent holes between chunks of a
+ *   different LOD. Those border vertices are clamped to the unit square and
+ *   moved below the surface, forming a skirt around the chunk.
+ *
+ *   If the generate patches option is set, patches will be generated instead
+ *   of triangles, which allows the terrain to use a tesselation shader.
+ */
+void ShaderTerrainMesh::do_create_chunk_geom() {
+
+  // Convert chunk size to an integer, because we operate on integers and get
+  // signed/unsigned mismatches otherwise
+  int size = (int)_chunk_size;
+
+  // Create vertex data
+  PT(GeomVertexData) gvd = new GeomVertexData("vertices", GeomVertexFormat::get_v3(), Geom::UH_static);
+  gvd->reserve_num_rows( (size + 3) * (size + 3) );
+  GeomVertexWriter vertex_writer(gvd, "vertex");
+
+  // Create primitive
+  PT(GeomPrimitive) triangles = NULL;
+  if (_generate_patches) {
+    triangles = new GeomPatches(3, Geom::UH_static);
+  } else {
+    triangles = new GeomTriangles(Geom::UH_static);
+  }
+
+  // Insert chunk vertices
+  for (int y = -1; y <= size + 1; ++y) {
+    for (int x = -1; x <= size + 1; ++x) {
+      LVector3 vtx_pos(x / (PN_stdfloat)size, y / (PN_stdfloat)size, 0.0f);
+      // Stitched vertices at the borders
+      if (x == -1 || y == -1 || x == size + 1 || y == size + 1) {
+        vtx_pos.set_z(-1.0f / (PN_stdfloat)size);
+        // Use PN_stdfloat bounds, so that min/max get arguments of the same
+        // type, regardless of whether PN_stdfloat is float or double.
+        vtx_pos.set_x(max((PN_stdfloat)0, min((PN_stdfloat)1, vtx_pos.get_x())));
+        vtx_pos.set_y(max((PN_stdfloat)0, min((PN_stdfloat)1, vtx_pos.get_y())));
+      }
+      // add_data3 takes the PN_stdfloat based vector type directly
+      vertex_writer.add_data3(vtx_pos);
+    }
+  }
+
+  // Its important to use int and not size_t here, since we do store negative values
+  // auto get_point_index = [&size](int x, int y){ return (x + 1) + (size + 3) * (y + 1); };
+  #define get_point_index(x, y) (((x) + 1) + (size + 3) * ((y) + 1))
+
+  // Create triangles
+  for (int y = -1; y <= size; ++y) {
+    for (int x = -1; x <= size; ++x) {
+      // Get point indices of the quad vertices
+      int tl = get_point_index(x, y);
+      int tr = get_point_index(x + 1, y);
+      int bl = get_point_index(x, y + 1);
+      int br = get_point_index(x + 1, y + 1);
+
+      // Vary triangle scheme on each uneven quad
+      if (stm_use_hexagonal_layout && (x + y) % 2 == 0 ) {
+        triangles->add_vertices(tl, tr, br);
+        triangles->add_vertices(tl, br, bl);
+      } else {
+        triangles->add_vertices(tl, tr, bl);
+        triangles->add_vertices(bl, tr, br);
+      }
+    }
+  }
+
+  #undef get_point_index
+
+  // Construct geom
+  PT(Geom) geom = new Geom(gvd);
+  geom->add_primitive(triangles);
+
+  // Do not set any bounds, we do culling ourself
+  geom->clear_bounds();
+  geom->set_bounds(new OmniBoundingVolume());
+  _chunk_geom = geom;
+}
+
+/**
+ * @copydoc PandaNode::is_renderable()
+ *
+ * Always returns true for the terrain node.
+ */
+bool ShaderTerrainMesh::is_renderable() const {
+  return true;
+}
+
+/**
+ * @copydoc PandaNode::safe_to_flatten()
+ *
+ * Always returns false for the terrain node.
+ */
+bool ShaderTerrainMesh::safe_to_flatten() const {
+  return false;
+}
+
+/**
+ * @copydoc PandaNode::safe_to_combine()
+ *
+ * Always returns false for the terrain node.
+ */
+bool ShaderTerrainMesh::safe_to_combine() const {
+  return false;
+}
+
+/**
+ * @copydoc PandaNode::add_for_draw()
+ *
+ * @details Runs the LOD traversal for the current view (unless updates are
+ *   disabled), which writes one entry per emitted chunk into this view's row
+ *   of the data texture. Afterwards a single instanced object using the chunk
+ *   geom is recorded on the cull handler.
+ *
+ * @param trav Cull traverser of the current render pass
+ * @param data Traverser data of this node
+ */
+void ShaderTerrainMesh::add_for_draw(CullTraverser *trav, CullTraverserData &data) {
+
+  // Make sure the terrain was properly initialized, and the geom was created
+  // successfully
+  nassertv(_data_texture != NULL);
+  nassertv(_chunk_geom != NULL);
+
+  _basic_collector.start();
+
+  // Get current frame count
+  int frame_count = ClockObject::get_global_clock()->get_frame_count();
+
+  if (_last_frame_count != frame_count) {
+    // Frame count changed, this means we are at the beginning of a new frame.
+    // In this case, update the frame count and reset the view index.
+    _last_frame_count = frame_count;
+    _current_view_index = 0;
+  }
+
+  // Every view writes into its own row of the data texture. Check this before
+  // touching the texture memory, otherwise a view without a row would write
+  // past the end of the ram image.
+  if (_current_view_index >= (size_t)_data_texture->get_y_size()) {
+    shader_terrain_cat.error() << "More views than supported! Increase the stm-max-views config variable!" << endl;
+    _basic_collector.stop();
+    return;
+  }
+
+  // Get transform and render state for this render pass
+  CPT(TransformState) modelview_transform = data.get_internal_transform(trav);
+  CPT(RenderState) state = data._state->compose(get_state());
+
+  // Store a handle to the scene setup
+  const SceneSetup* scene = trav->get_scene();
+
+  // Get the MVP matrix, this is required for the LOD
+  const Lens* current_lens = scene->get_lens();
+  const LMatrix4& projection_mat = current_lens->get_projection_mat();
+
+  // Get the current lens bounds. Work on a copy, since the bounds get
+  // transformed below and the original volume belongs to the scene setup.
+  PT(BoundingVolume) cam_bounds = scene->get_cull_bounds()->make_copy();
+
+  // Transform the camera bounds with the main camera transform
+  DCAST(GeometricBoundingVolume, cam_bounds)->xform(scene->get_camera_transform()->get_mat());
+
+  TraversalData traversal_data;
+  traversal_data.cam_bounds = cam_bounds;
+  traversal_data.model_mat = get_transform()->get_mat();
+  traversal_data.mvp_mat = modelview_transform->get_mat() * projection_mat;
+  traversal_data.emitted_chunks = 0;
+  traversal_data.storage_ptr = (ChunkDataEntry*)_data_texture->modify_ram_image().p();
+  traversal_data.screen_size.set(scene->get_viewport_width(), scene->get_viewport_height());
+
+  // Move write pointer so it points to the beginning of the current view
+  traversal_data.storage_ptr += _data_texture->get_x_size() * _current_view_index;
+
+  if (_update_enabled) {
+    // Traverse recursively
+    _lod_collector.start();
+    do_traverse(&_base_chunk, &traversal_data);
+    _lod_collector.stop();
+  } else {
+    // Do a rough guess of the emitted chunks, we don't know the actual count
+    // (we would have to store it). This is only for debugging anyways, so
+    // its not important we get an accurate count here.
+    traversal_data.emitted_chunks = _data_texture->get_x_size();
+  }
+
+  // do_emit_chunk() bumps the counter one past the capacity to remember that
+  // it already reported an overflow, so never draw more instances than there
+  // are entries in one row of the data texture
+  int instance_count = traversal_data.emitted_chunks;
+  if (instance_count > _data_texture->get_x_size()) {
+    instance_count = _data_texture->get_x_size();
+  }
+
+  // Set shader inputs
+  CPT(RenderAttrib) current_shader_attrib = state->get_attrib_def(ShaderAttrib::get_class_slot());
+
+  // Should never happen, but make sure of it before the attrib is used
+  if (current_shader_attrib == NULL) {
+    _basic_collector.stop();
+    nassertv(current_shader_attrib != NULL);
+    return;
+  }
+
+  // Make sure the user didn't forget to set a shader
+  if (!DCAST(ShaderAttrib, current_shader_attrib)->has_shader()) {
+    shader_terrain_cat.warning() << "No shader set on the terrain! You need to set the appropriate shader!" << endl;
+  }
+
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_shader_input(
+    new ShaderInput("ShaderTerrainMesh.terrain_size", LVecBase2i(_size)) );
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_shader_input(
+    new ShaderInput("ShaderTerrainMesh.chunk_size", LVecBase2i(_chunk_size)));
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_shader_input(
+    new ShaderInput("ShaderTerrainMesh.view_index", LVecBase2i(_current_view_index)));
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_shader_input(
+    new ShaderInput("ShaderTerrainMesh.data_texture", _data_texture));
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_shader_input(
+    new ShaderInput("ShaderTerrainMesh.heightfield", _heightfield_tex));
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_instance_count(
+    instance_count);
+
+  state = state->set_attrib(current_shader_attrib, 10000);
+
+  // Emit chunk
+  CullableObject *object = new CullableObject(_chunk_geom, state, modelview_transform);
+  trav->get_cull_handler()->record_object(object, trav);
+
+  // After rendering, increment the view index
+  ++_current_view_index;
+
+  _basic_collector.stop();
+}
+
+/**
+ * @brief Traverses the quadtree
+ * @details Visits the given chunk and decides whether it is culled, rendered
+ *   as-is, or split up further.
+ *
+ *   A chunk whose bounds do not intersect the camera bounds is dropped. If the
+ *   LOD check passes, or the chunk is already a leaf, it is handed to
+ *   ShaderTerrainMesh::do_emit_chunk(). Otherwise this method recurses into
+ *   all four children.
+ *
+ * @param chunk Chunk to traverse
+ * @param data Traversal data
+ * @param fully_visible Whether a parent chunk was already found to be
+ *   entirely inside the camera bounds, in which case no culling is performed
+ */
+void ShaderTerrainMesh::do_traverse(Chunk* chunk, TraversalData* data, bool fully_visible) {
+
+  // Culling is only required as long as no parent was entirely visible
+  if (!fully_visible) {
+
+    // Build the bounding box of the chunk in the unit space of the terrain
+    const PN_stdfloat inv_size = 1.0 / (PN_stdfloat)_size;
+    LPoint3 lower(chunk->x * inv_size, chunk->y * inv_size, chunk->min_height);
+    LPoint3 upper((chunk->x + chunk->size) * inv_size, (chunk->y + chunk->size) * inv_size, chunk->max_height);
+
+    // Move it by the model matrix and test it against the camera bounds
+    BoundingBox chunk_bounds(lower, upper);
+    DCAST(GeometricBoundingVolume, &chunk_bounds)->xform(data->model_mat);
+    const int flags = data->cam_bounds->contains(&chunk_bounds);
+
+    // Entirely outside of the camera bounds, nothing to do
+    if (flags == BoundingVolume::IF_no_intersection) {
+      return;
+    }
+
+    // Once a chunk is entirely inside, its children do not have to be tested
+    // anymore
+    fully_visible = (flags & BoundingVolume::IF_all) != 0;
+  }
+
+  // The LOD check always runs, even for leaf chunks, because it also stores
+  // the CLOD factor on the chunk
+  const bool lod_matches = do_check_lod_matches(chunk, data);
+  if (lod_matches || chunk->size == _chunk_size) {
+    do_emit_chunk(chunk, data);
+    return;
+  }
+
+  // Not detailed enough, descend into the children
+  for (size_t child = 0; child < 4; ++child) {
+    do_traverse(chunk->children[child], data, fully_visible);
+  }
+}
+
+/**
+ * @brief Checks whether a chunk should get subdivided
+ * @details Projects the four corners of the chunk with the MVP matrix, scales
+ *   them by the screen size and measures the longest of the four outer edges.
+ *   From that a tessellation factor is derived, which decides whether the
+ *   chunk is detailed enough.
+ *
+ *   As a side effect the CLOD factor is stored in chunk->last_clod.
+ *
+ * @param chunk Chunk to check
+ * @param data Traversal data
+ *
+ * @return true if the chunk is sufficient, false if the chunk should be subdivided
+ */
+bool ShaderTerrainMesh::do_check_lod_matches(Chunk* chunk, TraversalData* data) {
+
+  // Corners are stored in the order (0, 0) (1, 0) (0, 1) (1, 1)
+  LVector2 corners[4];
+  for (size_t i = 0; i < 4; ++i) {
+    const size_t x = i % 2;
+    const size_t y = i / 2;
+
+    // Corner position in model space (0,0,0 to 1,1,1)
+    LVector3 corner_pos = LVector3(
+      (PN_stdfloat)(chunk->x + x * (chunk->size - 1)) / (PN_stdfloat)_size,
+      (PN_stdfloat)(chunk->y + y * (chunk->size - 1)) / (PN_stdfloat)_size,
+      chunk->edges.get_cell(x + 2 * y)
+    );
+
+    // Project it, substituting a fixed point when the division by w is not
+    // possible
+    LVector4 clip_pos = data->mvp_mat.xform(LVector4(corner_pos, 1.0));
+    if (clip_pos.get_w() == 0.0) {
+      clip_pos.set(0.0, 0.0, -1.0, 1.0f);
+    }
+    clip_pos *= 1.0 / clip_pos.get_w();
+
+    corners[x + 2 * y].set(
+      clip_pos.get_x() * data->screen_size.get_x(),
+      clip_pos.get_y() * data->screen_size.get_y());
+  }
+
+  // Squared lengths of the four outer edges of the chunk
+  PN_stdfloat edge_13 = (corners[1] - corners[3]).length_squared();
+  PN_stdfloat edge_02 = (corners[0] - corners[2]).length_squared();
+  PN_stdfloat edge_23 = (corners[2] - corners[3]).length_squared();
+  PN_stdfloat edge_01 = (corners[0] - corners[1]).length_squared();
+
+  // Only the longest edge matters, so a single csqrt on the biggest squared
+  // length is enough
+  PN_stdfloat longest_edge = csqrt(max(edge_13, max(edge_02, max(edge_23, edge_01))));
+
+  PN_stdfloat tesselation_factor = (longest_edge / _target_triangle_width) / (PN_stdfloat)_chunk_size;
+
+  // Remember the CLOD factor on the chunk, clamped to 0 .. 1
+  chunk->last_clod = max(0.0, min(1.0, 2.0 - tesselation_factor));
+
+  return tesselation_factor <= 2.0;
+}
+
+/**
+ * @brief Internal method to spawn a chunk
+ * @details Called by the traversal for every chunk which should get rendered.
+ *   Writes position, relative size and CLOD factor of the chunk to the current
+ *   write position of the data texture and advances the write pointer.
+ *
+ *   If the row of the data texture is already full, the chunk is dropped and
+ *   an error is printed the first time this happens during a traversal.
+ *
+ * @param chunk Chunk to spawn
+ * @param data Traversal data
+ */
+void ShaderTerrainMesh::do_emit_chunk(Chunk* chunk, TraversalData* data) {
+  const int capacity = _data_texture->get_x_size();
+
+  if (data->emitted_chunks >= capacity) {
+    // The counter is pushed one past the capacity after reporting, so the
+    // message is only printed once
+    if (data->emitted_chunks == capacity) {
+      shader_terrain_cat.error() << "Too many chunks in the terrain! Consider lowering the desired LOD, or increase the stm-max-chunk-count variable." << endl;
+      ++data->emitted_chunks;
+    }
+    return;
+  }
+
+  // Fill the next free entry and move on to the following one
+  ChunkDataEntry* entry = data->storage_ptr;
+  entry->x = chunk->x;
+  entry->y = chunk->y;
+  entry->size = chunk->size / _chunk_size;
+  entry->clod = chunk->last_clod;
+
+  ++data->storage_ptr;
+  ++data->emitted_chunks;
+}
+
+/**
+ * @brief Transforms a texture coordinate to world space
+ * @details This transforms a texture coordinate from uv-space (0 to 1) to
+ *   world space. The height is sampled from the heightfield texture, and the
+ *   resulting point is transformed with the transform of the terrain. This
+ *   method should be called after generate().
+ *
+ * @param coord Coordinate in uv-space from 0, 0 to 1, 1
+ * @return World-Space point, or the origin if the lookup failed
+ */
+LPoint3 ShaderTerrainMesh::uv_to_world(const LTexCoord& coord) const {
+  nassertr(_heightfield_tex != NULL, LPoint3(0));
+  PT(TexturePeeker) peeker = _heightfield_tex->peek();
+  nassertr(peeker != NULL, LPoint3(0));
+
+  // Sample the heightfield at the requested coordinate
+  LColor sample;
+  const bool in_range = peeker->lookup_bilinear(sample, coord.get_x(), coord.get_y());
+  if (!in_range) {
+    shader_terrain_cat.error() << "UV out of range, cant transform to world!" << endl;
+    return LPoint3(0);
+  }
+
+  // The first component of the sample is used as height
+  LPoint3 local_point(coord.get_x(), coord.get_y(), sample.get_x());
+  return get_transform()->get_mat().xform_point_general(local_point);
+}

+ 205 - 0
panda/src/grutil/shaderTerrainMesh.h

@@ -0,0 +1,205 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file shaderTerrainMesh.h
+ * @author tobspr
+ * @date 2016-02-16
+ */
+
+#ifndef SHADER_TERRAIN_MESH_H
+#define SHADER_TERRAIN_MESH_H
+
+#include "pandabase.h"
+#include "luse.h"
+#include "pnmImage.h"
+#include "geom.h"
+#include "pandaNode.h"
+#include "texture.h"
+#include "texturePeeker.h"
+#include "configVariableBool.h"
+#include "configVariableInt.h"
+#include "pStatCollector.h"
+#include "filename.h"
+#include <stdint.h>
+
+extern ConfigVariableBool stm_use_hexagonal_layout;
+extern ConfigVariableInt stm_max_chunk_count;
+extern ConfigVariableInt stm_max_views;
+
+
+NotifyCategoryDecl(shader_terrain, EXPCL_PANDA_GRUTIL, EXPTP_PANDA_GRUTIL);
+
+
+/**
+ * @brief Terrain Renderer class utilizing the GPU
+ * @details This class provides functionality to render heightfields of large
+ *   sizes utilizing the GPU. Internally a quadtree is used to generate the LODs.
+ *   The final terrain is then rendered using instancing on the GPU. This makes
+ *   it possible to use very large heightfields (8192+) with very reasonable
+ *   performance. The terrain provides options to control the LOD using a
+ *   target triangle width, see ShaderTerrainMesh::set_target_triangle_width().
+ *
+ *   Because the terrain is rendered entirely on the GPU, it needs a special
+ *   vertex shader. There is a default vertex shader available, which you can
+ *   use in your own shaders. IMPORTANT: If you don't set an appropriate shader
+ *   on the terrain, nothing will be visible.
+ *
+ *   The terrain has to be set up with generate() before it can be rendered.
+ */
+class EXPCL_PANDA_GRUTIL ShaderTerrainMesh : public PandaNode {
+
+PUBLISHED:
+
+  ShaderTerrainMesh();
+
+  INLINE void set_heightfield_filename(const Filename& filename);
+  INLINE const Filename& get_heightfield_filename() const;
+  MAKE_PROPERTY(heightfield_filename, get_heightfield_filename, set_heightfield_filename);
+
+  INLINE void set_chunk_size(size_t chunk_size);
+  INLINE size_t get_chunk_size() const;
+  MAKE_PROPERTY(chunk_size, get_chunk_size, set_chunk_size);
+
+  INLINE void set_generate_patches(bool generate_patches);
+  INLINE bool get_generate_patches() const;
+  MAKE_PROPERTY(generate_patches, get_generate_patches, set_generate_patches);
+
+  INLINE void set_update_enabled(bool update_enabled);
+  INLINE bool get_update_enabled() const;
+  MAKE_PROPERTY(update_enabled, get_update_enabled, set_update_enabled);
+
+  INLINE void set_target_triangle_width(PN_stdfloat target_triangle_width);
+  INLINE PN_stdfloat get_target_triangle_width() const;
+  MAKE_PROPERTY(target_triangle_width, get_target_triangle_width, set_target_triangle_width);
+
+  INLINE Texture* get_heightfield_tex() const;
+  MAKE_PROPERTY(heightfield_tex, get_heightfield_tex);
+
+  LPoint3 uv_to_world(const LTexCoord& coord) const;
+  INLINE LPoint3 uv_to_world(PN_stdfloat u, PN_stdfloat v) const;
+
+  bool generate();
+
+public:
+
+  // Methods overridden from PandaNode
+  virtual bool is_renderable() const;
+  virtual bool safe_to_flatten() const;
+  virtual bool safe_to_combine() const;
+  virtual void add_for_draw(CullTraverser *trav, CullTraverserData &data);
+
+private:
+
+  // A single node of the quadtree
+  struct Chunk {
+    // Depth, starting at 0
+    size_t depth;
+
+    // Chunk position in heightfield space
+    size_t x, y;
+
+    // Chunk size in heightfield space
+    size_t size;
+
+    // Children, in the order (0, 0) (1, 0) (0, 1) (1, 1)
+    Chunk* children[4];
+
+    // Chunk heights, used for culling
+    PN_stdfloat avg_height, min_height, max_height;
+
+    // Edge heights, used for lod computation, in the same order as the children
+    LVector4 edges;
+
+    // Last CLOD factor, stored while computing LOD, used for seamless transitions between lods
+    PN_stdfloat last_clod;
+
+    INLINE void clear_children();
+    INLINE Chunk();
+    INLINE ~Chunk();
+  };
+
+
+  // Single entry in the data block, one is written per emitted chunk
+  struct ChunkDataEntry {
+    // Layout as it ends up in texture memory: float x, y, size, clod;
+
+    // Panda uses BGRA, so the members below are declared with the first and
+    // third component swapped to make the layout above work out.
+    PN_float32 size, y, x, clod;
+  };
+
+  // Data used while traversing all chunks
+  struct TraversalData {
+    // Global MVP used for LOD
+    LMatrix4 mvp_mat;
+
+    // Local model matrix used for culling
+    LMatrix4 model_mat;
+
+    // Camera bounds in world space
+    BoundingVolume* cam_bounds;
+
+    // Number of chunks emitted so far
+    int emitted_chunks;
+
+    // Screen resolution, used for LOD
+    LVector2i screen_size;
+
+    // Pointer to the texture memory, where each chunk is written to
+    ChunkDataEntry* storage_ptr;
+  };
+
+  bool do_load_heightfield();
+  void do_convert_heightfield();
+  void do_init_data_texture();
+  void do_create_chunks();
+  void do_init_chunk(Chunk* chunk);
+  void do_compute_bounds(Chunk* chunk);
+  void do_create_chunk_geom();
+  void do_traverse(Chunk* chunk, TraversalData* data, bool fully_visible = false);
+  void do_emit_chunk(Chunk* chunk, TraversalData* data);
+  bool do_check_lod_matches(Chunk* chunk, TraversalData* data);
+
+  Chunk _base_chunk;
+  Filename _heightfield_source;
+  size_t _size;
+  size_t _chunk_size;
+  bool _generate_patches;
+  PNMImage _heightfield;
+  PT(Texture) _heightfield_tex;
+  PT(Geom) _chunk_geom;
+  PT(Texture) _data_texture;
+  size_t _current_view_index;
+  int _last_frame_count;
+  PN_stdfloat _target_triangle_width;
+  bool _update_enabled;
+
+  // PStats collectors
+  static PStatCollector _lod_collector;
+  static PStatCollector _basic_collector;
+
+
+// Type handle boilerplate
+public:
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    PandaNode::init_type();
+    register_type(_type_handle, "ShaderTerrainMesh", PandaNode::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#include "shaderTerrainMesh.I"
+
+#endif // SHADER_TERRAIN_MESH_H

+ 5 - 0
panda/src/movies/config_movies.cxx

@@ -13,6 +13,8 @@
 
 
 #include "config_movies.h"
 #include "config_movies.h"
 #include "dconfig.h"
 #include "dconfig.h"
+#include "flacAudio.h"
+#include "flacAudioCursor.h"
 #include "inkblotVideo.h"
 #include "inkblotVideo.h"
 #include "inkblotVideoCursor.h"
 #include "inkblotVideoCursor.h"
 #include "microphoneAudio.h"
 #include "microphoneAudio.h"
@@ -75,6 +77,8 @@ init_libmovies() {
   }
   }
   initialized = true;
   initialized = true;
 
 
+  FlacAudio::init_type();
+  FlacAudioCursor::init_type();
   InkblotVideo::init_type();
   InkblotVideo::init_type();
   InkblotVideoCursor::init_type();
   InkblotVideoCursor::init_type();
   MicrophoneAudio::init_type();
   MicrophoneAudio::init_type();
@@ -93,6 +97,7 @@ init_libmovies() {
 #endif
 #endif
 
 
   MovieTypeRegistry *reg = MovieTypeRegistry::get_global_ptr();
   MovieTypeRegistry *reg = MovieTypeRegistry::get_global_ptr();
+  reg->register_audio_type(&FlacAudio::make, "flac");
   reg->register_audio_type(&WavAudio::make, "wav wave");
   reg->register_audio_type(&WavAudio::make, "wav wave");
 
 
 #ifdef HAVE_VORBIS
 #ifdef HAVE_VORBIS

+ 2976 - 0
panda/src/movies/dr_flac.h

@@ -0,0 +1,2976 @@
+// Public domain. See "unlicense" statement at the end of this file.
+//NB: modified by rdb to use 16-bit instead of 32-bit samples.
+
+// ABOUT
+//
+// This is a simple library for decoding FLAC files.
+//
+//
+//
+// USAGE
+//
+// This is a single-file library. To use it, do something like the following in one .c file.
+//   #define DR_FLAC_IMPLEMENTATION
+//   #include "dr_flac.h"
+//
+// You can then #include this file in other parts of the program as you would with any other header file. To decode audio data,
+// do something like the following:
+//
+//     drflac* pFlac = drflac_open_file("MySong.flac");
+//     if (pFlac == NULL) {
+//         ... Failed to open FLAC file ...
+//     }
+//
+//     int16_t* pSamples = malloc(pFlac->totalSampleCount * sizeof(int16_t));
+//     uint64_t numberOfSamplesActuallyRead = drflac_read_s16(pFlac, pFlac->totalSampleCount, pSamples);
+//
+//     ... pSamples now contains the decoded samples as interleaved signed 16-bit PCM ...
+//
+// The drflac object represents the decoder. It is a transparent type so all the information you need, such as the number of
+// channels and the bits per sample, should be directly accessible - just make sure you don't change their values.
+//
+// You do not need to decode the entire stream in one go - you just specify how many samples you'd like at any given time and
+// the decoder will give you as many samples as it can, up to the amount requested. Later on when you need the next batch of
+// samples, just call it again. Example:
+//
+//     while (drflac_read_s16(pFlac, chunkSize, pChunkSamples) > 0) {
+//         do_something();
+//     }
+//
+// You can seek to a specific sample with drflac_seek_to_sample(). The given sample is based on interleaving. So for example,
+// if you were to seek to the sample at index 0 in a stereo stream, you'll be seeking to the first sample of the left channel.
+// The sample at index 1 will be the first sample of the right channel. The sample at index 2 will be the second sample of the
+// left channel, etc.
+//
+//
+//
+// OPTIONS
+// #define these options before including this file.
+//
+// #define DR_FLAC_NO_STDIO
+//   Disable drflac_open_file().
+//
+// #define DR_FLAC_NO_WIN32_IO
+//   Don't use the Win32 API internally for drflac_open_file(). Setting this will force stdio FILE APIs instead. This is
+//   mainly for testing, but it's left here in case somebody might find use for it. dr_flac will use the Win32 API by
+//   default. Ignored when DR_FLAC_NO_STDIO is #defined.
+//
+// #define DR_FLAC_BUFFER_SIZE <number>
+//   Defines the size of the internal buffer to store data from onRead(). This buffer is used to reduce the number of calls
+//   back to the client for more data. Larger values means more memory, but better performance. My tests show diminishing
+//   returns after about 4KB (which is the default). Consider reducing this if you have a very efficient implementation of
+//   onRead(), or increase it if it's very inefficient.
+//
+//
+//
+// QUICK NOTES
+//
+// - Based on my own tests, the 32-bit build is about 1.1x-1.25x slower than the reference implementation. The 64-bit
+//   build is at about parity.
+// - This should work fine with valid native FLAC files, but it won't work very well when the STREAMINFO block is unavailable
+//   and when a stream starts in the middle of a frame. This is something I plan on addressing.
+// - Audio data is retrieved as signed 16-bit PCM, regardless of the bits per sample the FLAC stream is encoded as.
+// - This has not been tested on big-endian architectures.
+// - Rice codes in unencoded binary form (see https://xiph.org/flac/format.html#rice_partition) have not been tested. If anybody
+//   knows where I can find some test files for this, let me know.
+// - Perverse and erroneous files have not been tested. Again, if you know where I can get some test files let me know.
+// - dr_flac is not thread-safe, but its APIs can be called from any thread so long as you do your own synchronization.
+// - dr_flac does not currently do any CRC checks.
+// - Ogg encapsulation is not supported, but I want to add it at some point.
+//
+//
+//
+// TODO
+// - Implement a proper test suite.
+// - Add support for initializing the decoder without a STREAMINFO block. Build a synthetic test to get support working at least at
+//   a basic level.
+// - Add support for retrieving metadata blocks so applications can retrieve the album art or whatnot.
+// - Add support for Ogg encapsulation.
+
+#ifndef dr_flac_h
+#define dr_flac_h
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+// As data is read from the client it is placed into an internal buffer for fast access. This controls the
+// size of that buffer. Larger values means more speed, but also more memory. In my testing there is diminishing
+// returns after about 4KB, but you can fiddle with this to suit your own needs. Must be a multiple of 8.
+#ifndef DR_FLAC_BUFFER_SIZE
+#define DR_FLAC_BUFFER_SIZE   4096
+#endif
+
+// Check if we can enable 64-bit optimizations.
+#if defined(_WIN64)
+#define DRFLAC_64BIT
+#endif
+
+#if defined(__GNUC__)
+#if defined(__x86_64__) || defined(__ppc64__)
+#define DRFLAC_64BIT
+#endif
+#endif
+
+#ifdef DRFLAC_64BIT
+typedef uint64_t drflac_cache_t;
+#else
+typedef uint32_t drflac_cache_t;
+#endif
+
+
+
+// Callback for when data is read. Return value is the number of bytes actually read.
+typedef size_t (* drflac_read_proc)(void* userData, void* bufferOut, size_t bytesToRead);
+
+// Callback for when data needs to be seeked. Offset is always relative to the current position. Return value is false on failure, true success.
+typedef bool (* drflac_seek_proc)(void* userData, int offset);
+
+
+typedef struct
+{
+    // The absolute position of the first byte of the block's data, which is just past the block's header.
+    long long pos;
+
+    // The size of the block's data, in bytes.
+    unsigned int sizeInBytes;
+
+} drflac_block;
+
+typedef struct
+{
+    // The type of the subframe: SUBFRAME_CONSTANT, SUBFRAME_VERBATIM, SUBFRAME_FIXED or SUBFRAME_LPC.
+    unsigned char subframeType;
+
+    // The number of wasted bits per sample as specified by the sub-frame header.
+    unsigned char wastedBitsPerSample;
+
+    // The order to use for the prediction stage for SUBFRAME_FIXED and SUBFRAME_LPC.
+    unsigned char lpcOrder;
+
+    // The number of bits per sample for this subframe. This is not always equal to the current frame's bits per sample because
+    // an extra bit is required for side channels when interchannel decorrelation is being used.
+    int bitsPerSample;
+
+    // A pointer to the buffer containing the decoded samples in the subframe, or NULL if the heap is not being used. Each value
+    // is a signed 32-bit integer. NOTE(review): drflac::pHeap is not declared in this header - presumably drflac::pExtraData is meant; confirm.
+    int32_t* pDecodedSamples;
+
+} drflac_subframe;
+
+typedef struct
+{
+    // If the stream uses variable block sizes, this will be set to the index of the first sample. If fixed block sizes are used, this will
+    // always be set to 0.
+    unsigned long long sampleNumber;
+
+    // If the stream uses fixed block sizes, this will be set to the frame number. If variable block sizes are used, this will always be 0.
+    unsigned int frameNumber;
+
+    // The sample rate of this frame.
+    unsigned int sampleRate;
+
+    // The number of samples in each sub-frame within this frame.
+    unsigned short blockSize;
+
+    // The channel assignment of this frame. This is not always set to the channel count. If interchannel decorrelation is being used this
+    // will be set to DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE, DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE or DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE.
+    unsigned char channelAssignment;
+
+    // The number of bits per sample within this frame.
+    unsigned char bitsPerSample;
+
+    // The frame's CRC. This is set, but currently unused.
+    unsigned char crc8;
+
+    // The number of samples left to be read in this frame. This is initially set to the block size multiplied by the channel count. As samples
+    // are read, this will be decremented. When it reaches 0, the decoder will see this frame as fully consumed and load the next frame.
+    unsigned int samplesRemaining;
+
+    // The list of sub-frames within the frame. There is one sub-frame for each channel, and there's a maximum of 8 channels.
+    drflac_subframe subframes[8];
+
+} drflac_frame;
+
+typedef struct
+{
+    // The function to call when more data needs to be read. This is set by drflac_open().
+    drflac_read_proc onRead;
+
+    // The function to call when the current read position needs to be moved.
+    drflac_seek_proc onSeek;
+
+    // The user data to pass around to onRead and onSeek.
+    void* pUserData;
+
+
+    // The sample rate. Will be set to something like 44100.
+    unsigned int sampleRate;
+
+    // The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. Maximum 8. This is set based on the
+    // value specified in the STREAMINFO block.
+    unsigned char channels;
+
+    // The bits per sample. Will be set to something like 16, 24, etc.
+    unsigned char bitsPerSample;
+
+    // The maximum block size, in samples. This number represents the number of samples in each channel (not combined).
+    unsigned short maxBlockSize;
+
+    // The total number of samples making up the stream. This includes every channel. For example, if the stream has 2 channels,
+    // with each channel having a total of 4096, this value will be set to 2*4096 = 8192.
+    uint64_t totalSampleCount;
+
+
+    // The location and size of the APPLICATION block.
+    drflac_block applicationBlock;
+
+    // The location and size of the SEEKTABLE block.
+    drflac_block seektableBlock;
+
+    // The location and size of the VORBIS_COMMENT block.
+    drflac_block vorbisCommentBlock;
+
+    // The location and size of the CUESHEET block.
+    drflac_block cuesheetBlock;
+
+    // The location and size of the PICTURE block.
+    drflac_block pictureBlock;
+
+
+    // Information about the frame the decoder is currently sitting on.
+    drflac_frame currentFrame;
+
+    // The position of the first frame in the stream. This is only ever used for seeking.
+    unsigned long long firstFramePos;
+
+
+
+    // The current byte position in the client's data stream.
+    uint64_t currentBytePos;
+
+    // The index of the next valid cache line in the "L2" cache.
+    size_t nextL2Line;
+
+    // The number of bits that have been consumed by the cache. This is used to determine how many valid bits are remaining.
+    size_t consumedBits;
+
+    // Unused L2 lines. This will always be 0 until the end of the stream is hit. Used for correctly calculating the current byte
+    // position of the read pointer in the stream.
+    size_t unusedL2Lines;
+
+    // The cached data which was most recently read from the client. When data is read from the client, it is placed within this
+    // variable. As data is read, it's bit-shifted such that the next valid bit is sitting on the most significant bit.
+    drflac_cache_t cache;
+    drflac_cache_t cacheL2[DR_FLAC_BUFFER_SIZE/sizeof(drflac_cache_t)];
+
+
+    // A pointer to the decoded sample data. This is an offset of pExtraData.
+    int32_t* pDecodedSamples;
+
+    // Variable length extra data. We attach this to the end of the object so we avoid unnecessary mallocs.
+    char pExtraData[1];
+
+} drflac;
+
+
+
+
+// Opens a FLAC decoder.
+//
+// This is the lowest level function for opening a FLAC stream. You can also use drflac_open_file() and drflac_open_memory()
+// to open the stream from a file or from a block of memory respectively.
+//
+// At the moment the STREAMINFO block must be present for this to succeed.
+//
+// The onRead and onSeek callbacks are used to read and seek data provided by the client.
+static drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData);
+
+// Closes the given FLAC decoder.
+static void drflac_close(drflac* pFlac);
+
+// Reads sample data from the given FLAC decoder, output as interleaved signed 16-bit PCM.
+//
+// Returns the number of samples actually read.
+static uint64_t drflac_read_s16(drflac* pFlac, uint64_t samplesToRead, int16_t* pBufferOut);
+
+// Seeks to the sample at the given index.
+static bool drflac_seek_to_sample(drflac* pFlac, uint64_t sampleIndex);
+
+
+
+#ifndef DR_FLAC_NO_STDIO
+// Opens a flac decoder from the file at the given path.
+static drflac* drflac_open_file(const char* pFile);
+#endif
+
+// Helper for opening a FLAC decoder from a pre-allocated memory buffer.
+//
+// This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for
+// the lifetime of the decoder.
+static drflac* drflac_open_memory(const void* data, size_t dataSize);
+
+#endif  //dr_flac_h
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// IMPLEMENTATION
+//
+///////////////////////////////////////////////////////////////////////////////
+#ifdef DR_FLAC_IMPLEMENTATION
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#ifdef _MSC_VER
+#include <intrin.h>     // For _byteswap_ulong and _byteswap_uint64
+#endif
+
+#ifdef __linux__
+#define _BSD_SOURCE
+#include <endian.h>
+#endif
+
+#ifdef _MSC_VER
+#define DRFLAC_INLINE __forceinline
+#else
+#define DRFLAC_INLINE inline
+#endif
+
+#define DRFLAC_BLOCK_TYPE_STREAMINFO                    0
+#define DRFLAC_BLOCK_TYPE_PADDING                       1
+#define DRFLAC_BLOCK_TYPE_APPLICATION                   2
+#define DRFLAC_BLOCK_TYPE_SEEKTABLE                     3
+#define DRFLAC_BLOCK_TYPE_VORBIS_COMMENT                4
+#define DRFLAC_BLOCK_TYPE_CUESHEET                      5
+#define DRFLAC_BLOCK_TYPE_PICTURE                       6
+#define DRFLAC_BLOCK_TYPE_INVALID                       127
+
+#define DRFLAC_SUBFRAME_CONSTANT                        0
+#define DRFLAC_SUBFRAME_VERBATIM                        1
+#define DRFLAC_SUBFRAME_FIXED                           8
+#define DRFLAC_SUBFRAME_LPC                             32
+#define DRFLAC_SUBFRAME_RESERVED                        255
+
+#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE  0
+#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2 1
+
+#define DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT           0
+#define DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE             8
+#define DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE            9
+#define DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE              10
+
+// A single seek point.
+// NOTE(review): the fields look like they mirror one entry of the FLAC SEEKTABLE metadata block - confirm against
+// the code that fills this structure in, which is not visible in this part of the file.
+typedef struct
+{
+    uint64_t firstSample;   // Presumably the index of the first sample in the target frame - TODO confirm.
+    uint64_t frameOffset;   // The offset from the first byte of the header of the first frame.
+    uint16_t sampleCount;   // Presumably the number of samples in the target frame - TODO confirm.
+} drflac_seekpoint;
+
+#ifndef DR_FLAC_NO_STDIO
+#if defined(DR_FLAC_NO_WIN32_IO) || !defined(_WIN32)
+#include <stdio.h>
+
+// onRead callback for stdio-backed streams. pUserData is the FILE* that drflac_open_file() passes to drflac_open().
+// Returns the number of bytes fread() delivered, which may be fewer than bytesToRead.
+static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    FILE* pStream = (FILE*)pUserData;
+    size_t bytesDelivered = fread(bufferOut, 1, bytesToRead, pStream);
+    return bytesDelivered;
+}
+
+// onSeek callback for stdio-backed streams. Moves the FILE* in pUserData by offset bytes relative to its current
+// position and reports whether fseek() succeeded.
+static bool drflac__on_seek_stdio(void* pUserData, int offset)
+{
+    FILE* pStream = (FILE*)pUserData;
+    if (fseek(pStream, offset, SEEK_CUR) != 0) {
+        return false;
+    }
+
+    return true;
+}
+
+// Opens a FLAC decoder that reads from the file at the given path (stdio implementation).
+//
+// Returns NULL if the file cannot be opened or if drflac_open() fails.
+drflac* drflac_open_file(const char* filename)
+{
+    FILE* pFile;
+#ifdef _MSC_VER
+    if (fopen_s(&pFile, filename, "rb") != 0) {
+        return NULL;
+    }
+#else
+    pFile = fopen(filename, "rb");
+    if (pFile == NULL) {
+        return NULL;
+    }
+#endif
+
+    drflac* pFlac = drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, pFile);
+    if (pFlac == NULL) {
+        // No decoder was created, so nothing will ever close the file for us.
+        // NOTE(review): assumes drflac_open() does not close pUserData itself on failure - confirm.
+        fclose(pFile);
+        return NULL;
+    }
+
+    return pFlac;
+}
+#else
+#include <windows.h>
+
+// onRead callback for Win32 file handles. pUserData is the HANDLE that drflac_open_file() passes to drflac_open().
+// Returns the number of bytes read, or 0 if ReadFile() reports a failure.
+static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    assert(bytesToRead < 0xFFFFFFFF);   // dr_flac will never request huge amounts of data at a time. This is a safe assertion.
+
+    // Initialised so that a failed call can never surface an indeterminate byte count.
+    DWORD bytesRead = 0;
+    if (!ReadFile((HANDLE)pUserData, bufferOut, (DWORD)bytesToRead, &bytesRead, NULL)) {
+        return 0;
+    }
+
+    return (size_t)bytesRead;
+}
+
+// onSeek callback for Win32 file handles. Moves the file pointer of the HANDLE in pUserData by offset bytes relative
+// to its current position and reports whether SetFilePointer() succeeded.
+static bool drflac__on_seek_stdio(void* pUserData, int offset)
+{
+    HANDLE hFile = (HANDLE)pUserData;
+    DWORD newPosition = SetFilePointer(hFile, offset, NULL, FILE_CURRENT);
+    return newPosition != INVALID_SET_FILE_POINTER;
+}
+
+// Opens a FLAC decoder that reads from the file at the given path (Win32 implementation).
+//
+// Returns NULL if the file cannot be opened or if drflac_open() fails.
+static drflac* drflac_open_file(const char* filename)
+{
+    HANDLE hFile = CreateFileA(filename, FILE_GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+    if (hFile == INVALID_HANDLE_VALUE) {
+        return NULL;
+    }
+
+    drflac* pFlac = drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, (void*)hFile);
+    if (pFlac == NULL) {
+        // No decoder was created, so nothing will ever close the handle for us.
+        // NOTE(review): assumes drflac_open() does not close pUserData itself on failure - confirm.
+        CloseHandle(hFile);
+        return NULL;
+    }
+
+    return pFlac;
+}
+#endif
+#endif  //DR_FLAC_NO_STDIO
+
+
+// State of a read-only, in-memory stream. drflac_open_memory() allocates one of these and passes it as pUserData
+// to the drflac__on_read_memory() and drflac__on_seek_memory() callbacks. The data itself is not copied.
+typedef struct
+{
+    /// A pointer to the beginning of the data. We use a char as the type here for easy offsetting.
+    const unsigned char* data;
+
+    /// The size of the data.
+    size_t dataSize;
+
+    /// The position we're currently sitting at.
+    size_t currentReadPos;
+
+} drflac_memory;
+
+// onRead callback for in-memory streams. Copies up to bytesToRead bytes from the current read position of the
+// drflac_memory in pUserData into bufferOut and advances the read position by the amount copied.
+//
+// Returns the number of bytes copied, which is smaller than bytesToRead once the end of the buffer is reached.
+static size_t drflac__on_read_memory(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    drflac_memory* pStream = (drflac_memory*)pUserData;
+    assert(pStream != NULL);
+    assert(pStream->dataSize >= pStream->currentReadPos);
+
+    // Never hand out more than what is left in the buffer.
+    const size_t bytesAvailable = pStream->dataSize - pStream->currentReadPos;
+    const size_t bytesToCopy = (bytesToRead > bytesAvailable) ? bytesAvailable : bytesToRead;
+    if (bytesToCopy == 0) {
+        return 0;
+    }
+
+    memcpy(bufferOut, pStream->data + pStream->currentReadPos, bytesToCopy);
+    pStream->currentReadPos += bytesToCopy;
+    return bytesToCopy;
+}
+
+// onSeek callback for in-memory streams. Moves the read position of the drflac_memory in pUserData by offset bytes
+// relative to the current position, clamping the result to the range [0, dataSize].
+//
+// Always returns true; seeking out of range is clamped rather than reported as a failure.
+static bool drflac__on_seek_memory(void* pUserData, int offset)
+{
+    drflac_memory* memory = (drflac_memory*)pUserData;
+    assert(memory != NULL);
+    assert(memory->dataSize >= memory->currentReadPos);
+
+    if (offset > 0) {
+        // Compare against the bytes that are left instead of adding first, so the check itself cannot overflow.
+        size_t bytesAhead = memory->dataSize - memory->currentReadPos;
+        size_t forward = (size_t)offset;
+        if (forward > bytesAhead) {
+            forward = bytesAhead;                       // Trying to seek too far forward.
+        }
+
+        memory->currentReadPos += forward;
+    } else {
+        // The magnitude is computed in unsigned arithmetic because negating INT_MIN as an int is undefined
+        // behaviour, and drflac__seek_to_byte() does pass (int)0x80000000 to this callback.
+        size_t backward = (size_t)0 - (size_t)offset;
+        if (backward > memory->currentReadPos) {
+            backward = memory->currentReadPos;          // Trying to seek too far backwards.
+        }
+
+        memory->currentReadPos -= backward;
+    }
+
+    return true;
+}
+
+// Opens a FLAC decoder that reads from a caller-owned memory buffer. The data is not copied.
+//
+// Returns NULL if the stream state cannot be allocated or if drflac_open() fails.
+static drflac* drflac_open_memory(const void* data, size_t dataSize)
+{
+    drflac_memory* pUserData = (drflac_memory*)malloc(sizeof(*pUserData));
+    if (pUserData == NULL) {
+        return NULL;
+    }
+
+    pUserData->data = (const unsigned char*)data;
+    pUserData->dataSize = dataSize;
+    pUserData->currentReadPos = 0;
+
+    drflac* pFlac = drflac_open(drflac__on_read_memory, drflac__on_seek_memory, pUserData);
+    if (pFlac == NULL) {
+        // No decoder was created, so nothing will ever release the stream state for us.
+        // NOTE(review): assumes drflac_open() does not free pUserData itself on failure - confirm.
+        free(pUserData);
+        return NULL;
+    }
+
+    return pFlac;
+}
+
+
+//// Endian Management ////
+// Returns true when the host stores the least significant byte of an int at the lowest address.
+static DRFLAC_INLINE bool drflac__is_little_endian()
+{
+    const int probe = 1;
+    const char* pLowestByte = (const char*)&probe;
+    return pLowestByte[0] == 1;
+}
+
+// Reverses the byte order of a 32-bit value.
+static DRFLAC_INLINE uint32_t drflac__swap_endian_uint32(uint32_t n)
+{
+#ifdef _MSC_VER
+    return _byteswap_ulong(n);
+#elif defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
+    // __builtin_bswap32 only exists from GCC 4.3 onwards, so the minor version has to be checked. clang reports
+    // itself as GCC 4.2 but does provide the builtin, hence the explicit __clang__ test.
+    return __builtin_bswap32(n);
+#else
+    return ((n & 0xFF000000) >> 24) |
+           ((n & 0x00FF0000) >>  8) |
+           ((n & 0x0000FF00) <<  8) |
+           ((n & 0x000000FF) << 24);
+#endif
+}
+
+// Reverses the byte order of a 64-bit value.
+static DRFLAC_INLINE uint64_t drflac__swap_endian_uint64(uint64_t n)
+{
+#ifdef _MSC_VER
+    return _byteswap_uint64(n);
+#elif defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
+    // __builtin_bswap64 only exists from GCC 4.3 onwards, so the minor version has to be checked. clang reports
+    // itself as GCC 4.2 but does provide the builtin, hence the explicit __clang__ test.
+    return __builtin_bswap64(n);
+#else
+    return ((n & 0xFF00000000000000ULL) >> 56) |
+           ((n & 0x00FF000000000000ULL) >> 40) |
+           ((n & 0x0000FF0000000000ULL) >> 24) |
+           ((n & 0x000000FF00000000ULL) >>  8) |
+           ((n & 0x00000000FF000000ULL) <<  8) |
+           ((n & 0x0000000000FF0000ULL) << 24) |
+           ((n & 0x000000000000FF00ULL) << 40) |
+           ((n & 0x00000000000000FFULL) << 56);
+#endif
+}
+
+
+// Converts a 32-bit big-endian value to the byte order of the host.
+static DRFLAC_INLINE uint32_t drflac__be2host_32(uint32_t n)
+{
+#ifdef __linux__
+    return be32toh(n);
+#else
+    // Big-endian hosts already have the bytes in the right order.
+    return drflac__is_little_endian() ? drflac__swap_endian_uint32(n) : n;
+#endif
+}
+
+// Converts a 64-bit big-endian value to the byte order of the host.
+static DRFLAC_INLINE uint64_t drflac__be2host_64(uint64_t n)
+{
+#ifdef __linux__
+    return be64toh(n);
+#else
+    // Big-endian hosts already have the bytes in the right order.
+    return drflac__is_little_endian() ? drflac__swap_endian_uint64(n) : n;
+#endif
+}
+
+#ifdef DRFLAC_64BIT
+#define drflac__be2host__cache_line drflac__be2host_64
+#else
+#define drflac__be2host__cache_line drflac__be2host_32
+#endif
+
+
+// BIT READING ATTEMPT #2
+//
+// This uses a 32- or 64-bit bit-shifted cache - as bits are read, the cache is shifted such that the first valid bit is sitting
+// on the most significant bit. It uses the notion of an L1 and L2 cache (borrowed from CPU architecture), where the L1 cache
+// is a 32- or 64-bit unsigned integer (depending on whether or not a 32- or 64-bit build is being compiled) and the L2 is an
+// array of "cache lines", with each cache line being the same size as the L1. The L2 is a buffer of about 4KB and is where data
+// from onRead() is read into.
+#define DRFLAC_CACHE_L1_SIZE_BYTES                  (sizeof(pFlac->cache))
+#define DRFLAC_CACHE_L1_SIZE_BITS                   (sizeof(pFlac->cache)*8)
+#define DRFLAC_CACHE_L1_BITS_REMAINING              (DRFLAC_CACHE_L1_SIZE_BITS - (pFlac->consumedBits))
+#ifdef DRFLAC_64BIT
+#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)   (~(((uint64_t)-1LL) >> (_bitCount)))
+#else
+#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)   (~(((uint32_t)-1) >> (_bitCount)))
+#endif
+#define DRFLAC_CACHE_L1_SELECTION_SHIFT(_bitCount)  (DRFLAC_CACHE_L1_SIZE_BITS - (_bitCount))
+#define DRFLAC_CACHE_L1_SELECT(_bitCount)           ((pFlac->cache) & DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount))
+#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT(_bitCount) (DRFLAC_CACHE_L1_SELECT(_bitCount) >> DRFLAC_CACHE_L1_SELECTION_SHIFT(_bitCount))
+#define DRFLAC_CACHE_L2_SIZE_BYTES                  (sizeof(pFlac->cacheL2))
+#define DRFLAC_CACHE_L2_LINE_COUNT                  (DRFLAC_CACHE_L2_SIZE_BYTES / sizeof(pFlac->cacheL2[0]))
+#define DRFLAC_CACHE_L2_LINES_REMAINING             (DRFLAC_CACHE_L2_LINE_COUNT - pFlac->nextL2Line)
+
+// Copies the next line of the L2 cache into the L1 cache (pFlac->cache), refilling the L2 through the client's
+// onRead callback when every line has been used up.
+//
+// The line is copied as-is: no byte-order conversion is done here and pFlac->consumedBits is not touched, both are
+// left to the caller (see drflac__reload_cache()).
+//
+// Returns false when not even one whole L1-sized line could be read. In that case any bytes that were read are
+// handed back to the client by seeking backwards and the L2 is left marked as empty.
+static DRFLAC_INLINE bool drflac__reload_l1_cache_from_l2(drflac* pFlac)
+{
+    // Fast path. Try loading straight from L2.
+    if (pFlac->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT) {
+        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
+        return true;
+    }
+
+    // If we get here it means we've run out of data in the L2 cache. We'll need to fetch more from the client.
+    size_t bytesRead = pFlac->onRead(pFlac->pUserData, pFlac->cacheL2, DRFLAC_CACHE_L2_SIZE_BYTES);
+    pFlac->currentBytePos += bytesRead;
+
+    pFlac->nextL2Line = 0;
+    if (bytesRead == DRFLAC_CACHE_L2_SIZE_BYTES) {
+        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
+        return true;
+    }
+
+
+    // If we get here it means we were unable to retrieve enough data to fill the entire L2 cache. It probably
+    // means we've just reached the end of the file. We need to move the valid data down to the end of the buffer
+    // and adjust the index of the next line accordingly. Also keep in mind that the L2 cache must be aligned to
+    // the size of the L1 so we'll need to seek backwards by any misaligned bytes.
+    size_t alignedL1LineCount = bytesRead / DRFLAC_CACHE_L1_SIZE_BYTES;
+    if (alignedL1LineCount > 0)
+    {
+        // Copy from the last line to the first so that overlapping source and destination lines are not clobbered
+        // before they have been moved.
+        size_t offset = DRFLAC_CACHE_L2_LINE_COUNT - alignedL1LineCount;
+        for (size_t i = alignedL1LineCount; i > 0; --i) {
+            pFlac->cacheL2[i-1 + offset] = pFlac->cacheL2[i-1];
+        }
+
+        // The lines below "offset" hold no data; drflac__tell() uses unusedL2Lines to account for them.
+        pFlac->nextL2Line = offset;
+        pFlac->unusedL2Lines = offset;
+
+        // At this point there may be some leftover unaligned bytes. We need to seek backwards so we don't lose
+        // those bytes.
+        size_t unalignedBytes = bytesRead - (alignedL1LineCount * DRFLAC_CACHE_L1_SIZE_BYTES);
+        if (unalignedBytes > 0) {
+            pFlac->onSeek(pFlac->pUserData, -(int)unalignedBytes);
+            pFlac->currentBytePos -= unalignedBytes;
+        }
+
+        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
+        return true;
+    }
+    else
+    {
+        // If we get into this branch it means we weren't able to load any L1-aligned data. We just need to seek
+        // backwards by the leftover bytes and return false.
+        if (bytesRead > 0) {
+            pFlac->onSeek(pFlac->pUserData, -(int)bytesRead);
+            pFlac->currentBytePos -= bytesRead;
+        }
+
+        // Mark the L2 as fully consumed so that the next call goes straight back to the client.
+        pFlac->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT;
+        return false;
+    }
+}
+
+// Refills the L1 cache (pFlac->cache) with the next chunk of the stream, converted to host byte order, and resets
+// pFlac->consumedBits to match.
+//
+// Returns false when the client has no more data to give.
+static bool drflac__reload_cache(drflac* pFlac)
+{
+    if (!drflac__reload_l1_cache_from_l2(pFlac)) {
+        // Slow path. The L2 could not supply a whole line, most likely because the stream ends with fewer bytes
+        // than one L1 line. Read whatever is left straight from the client into the L1. This should only really
+        // happen once per stream so efficiency is not important.
+        size_t tailBytes = pFlac->onRead(pFlac->pUserData, &pFlac->cache, DRFLAC_CACHE_L1_SIZE_BYTES);
+        if (tailBytes == 0) {
+            return false;
+        }
+
+        pFlac->currentBytePos += tailBytes;
+
+        // Only part of the line holds data, so the missing bytes are accounted for as already-consumed bits.
+        assert(tailBytes < DRFLAC_CACHE_L1_SIZE_BYTES);
+        pFlac->consumedBits = (DRFLAC_CACHE_L1_SIZE_BYTES - tailBytes) * 8;
+
+        // The bits that do not hold data must always read as zero; other parts of the library depend on this.
+        pFlac->cache = drflac__be2host__cache_line(pFlac->cache);
+        pFlac->cache &= DRFLAC_CACHE_L1_SELECTION_MASK(DRFLAC_CACHE_L1_SIZE_BITS - pFlac->consumedBits);
+        return true;
+    }
+
+    // Fast path. A whole line came out of the L2; it only needs converting to host byte order.
+    pFlac->cache = drflac__be2host__cache_line(pFlac->cache);
+    pFlac->consumedBits = 0;
+    return true;
+}
+
+// Skips bitsToSeek bits of the stream without returning them.
+//
+// Returns false if the end of the stream is reached before all bits have been skipped.
+static bool drflac__seek_bits(drflac* pFlac, size_t bitsToSeek)
+{
+    if (bitsToSeek <= DRFLAC_CACHE_L1_BITS_REMAINING) {
+        pFlac->consumedBits += bitsToSeek;
+
+        // Shifting by the full width of the cache is undefined behaviour, so a seek that swallows a whole,
+        // untouched L1 line clears the cache explicitly instead.
+        if (bitsToSeek < DRFLAC_CACHE_L1_SIZE_BITS) {
+            pFlac->cache <<= bitsToSeek;
+        } else {
+            pFlac->cache = 0;
+        }
+        return true;
+    } else {
+        // It straddles the cached data. This function isn't called too frequently so I'm favouring simplicity here.
+        bitsToSeek -= DRFLAC_CACHE_L1_BITS_REMAINING;
+        pFlac->consumedBits += DRFLAC_CACHE_L1_BITS_REMAINING;
+        pFlac->cache = 0;
+
+        size_t wholeBytesRemaining = bitsToSeek/8;
+        if (wholeBytesRemaining > 0)
+        {
+            // The next bytes to seek will be located in the L2 cache. The problem is that the L2 cache is not byte aligned,
+            // but rather DRFLAC_CACHE_L1_SIZE_BYTES aligned (usually 4 or 8). If, for example, the number of bytes to seek is
+            // 3, we'll need to handle it in a special way.
+            size_t wholeCacheLinesRemaining = wholeBytesRemaining / DRFLAC_CACHE_L1_SIZE_BYTES;
+            if (wholeCacheLinesRemaining < DRFLAC_CACHE_L2_LINES_REMAINING)
+            {
+                // Everything that can be skipped line-wise is still sitting in the L2.
+                wholeBytesRemaining -= wholeCacheLinesRemaining * DRFLAC_CACHE_L1_SIZE_BYTES;
+                bitsToSeek -= wholeCacheLinesRemaining * DRFLAC_CACHE_L1_SIZE_BITS;
+                pFlac->nextL2Line += wholeCacheLinesRemaining;
+            }
+            else
+            {
+                // The seek runs past the end of the L2: drop what is left of it and move the client's stream
+                // forward by the remaining whole bytes.
+                wholeBytesRemaining -= DRFLAC_CACHE_L2_LINES_REMAINING * DRFLAC_CACHE_L1_SIZE_BYTES;
+                bitsToSeek -= DRFLAC_CACHE_L2_LINES_REMAINING * DRFLAC_CACHE_L1_SIZE_BITS;
+                pFlac->nextL2Line += DRFLAC_CACHE_L2_LINES_REMAINING;
+
+                pFlac->onSeek(pFlac->pUserData, (int)wholeBytesRemaining);
+                pFlac->currentBytePos += wholeBytesRemaining;
+                bitsToSeek -= wholeBytesRemaining*8;
+            }
+        }
+
+
+        // Whatever is left is skipped out of a freshly loaded L1 line.
+        if (bitsToSeek > 0) {
+            if (!drflac__reload_cache(pFlac)) {
+                return false;
+            }
+
+            return drflac__seek_bits(pFlac, bitsToSeek);
+        }
+
+        return true;
+    }
+}
+
+// Reads the next bitCount bits (1 to 32) of the stream as an unsigned integer into *pResultOut.
+//
+// Returns false if the end of the stream is reached before the value could be read.
+static bool drflac__read_uint32(drflac* pFlac, unsigned int bitCount, uint32_t* pResultOut)
+{
+    assert(pFlac != NULL);
+    assert(pResultOut != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 32);
+
+    // Start from a cache that holds at least some unread bits.
+    if (pFlac->consumedBits == DRFLAC_CACHE_L1_SIZE_BITS) {
+        if (!drflac__reload_cache(pFlac)) {
+            return false;
+        }
+    }
+
+    if (bitCount > DRFLAC_CACHE_L1_BITS_REMAINING) {
+        // The value straddles the cached data. It never covers more than the next chunk, so it is read as a high
+        // part from the current line and a low part from the next one.
+        size_t hiBits = DRFLAC_CACHE_L1_BITS_REMAINING;
+        size_t loBits = bitCount - hiBits;
+        uint32_t hiPart = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(hiBits);
+
+        if (!drflac__reload_cache(pFlac)) {
+            return false;
+        }
+
+        *pResultOut = (hiPart << loBits) | DRFLAC_CACHE_L1_SELECT_AND_SHIFT(loBits);
+        pFlac->consumedBits += loBits;
+        pFlac->cache <<= loBits;
+        return true;
+    }
+
+    // The whole value is available in the current line.
+    if (bitCount >= DRFLAC_CACHE_L1_SIZE_BITS) {
+        // The request takes the entire line. Shifting by the full width is not allowed, so take it wholesale.
+        *pResultOut = (uint32_t)pFlac->cache;
+        pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
+        pFlac->cache = 0;
+    } else {
+        *pResultOut = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCount);
+        pFlac->consumedBits += bitCount;
+        pFlac->cache <<= bitCount;
+    }
+
+    return true;
+}
+
+// Reads the next bitCount bits (1 to 32) of the stream as a two's complement signed integer into *pResult.
+//
+// Returns false if the end of the stream is reached before the value could be read.
+static bool drflac__read_int32(drflac* pFlac, unsigned int bitCount, int32_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 32);
+
+    uint32_t result;
+    if (!drflac__read_uint32(pFlac, bitCount, &result)) {
+        return false;
+    }
+
+    // Branch-free sign extension, done entirely in unsigned arithmetic. Flipping the sign bit and then subtracting
+    // it again leaves non-negative values untouched and fills the upper bits of negative ones. Unlike shifting -1
+    // to the left, this is well defined for every bitCount, including 32.
+    const uint32_t signBit = (uint32_t)1 << (bitCount - 1);
+    result = (result ^ signBit) - signBit;
+
+    *pResult = (int32_t)result;
+    return true;
+}
+
+// Reads the next bitCount bits (33 to 64) of the stream as an unsigned integer into *pResultOut. The value is read
+// as two 32-bit-or-smaller pieces, most significant piece first.
+//
+// Returns false if the end of the stream is reached before the value could be read.
+static bool drflac__read_uint64(drflac* pFlac, unsigned int bitCount, uint64_t* pResultOut)
+{
+    assert(bitCount <= 64);
+    assert(bitCount >  32);
+
+    uint32_t upperPart;
+    uint32_t lowerPart;
+
+    // Everything above the low 32 bits comes first in the stream.
+    if (!drflac__read_uint32(pFlac, bitCount - 32, &upperPart)) {
+        return false;
+    }
+
+    if (!drflac__read_uint32(pFlac, 32, &lowerPart)) {
+        return false;
+    }
+
+    uint64_t combined = (uint64_t)upperPart;
+    combined <<= 32;
+    combined |= (uint64_t)lowerPart;
+
+    *pResultOut = combined;
+    return true;
+}
+
+// Reads the next bitCount bits of the stream as a two's complement signed integer into *pResultOut. The read is
+// delegated to drflac__read_uint64(), which asserts that bitCount is greater than 32.
+//
+// Returns false if the end of the stream is reached before the value could be read.
+static bool drflac__read_int64(drflac* pFlac, unsigned int bitCount, int64_t* pResultOut)
+{
+    assert(bitCount <= 64);
+
+    uint64_t result;
+    if (!drflac__read_uint64(pFlac, bitCount, &result)) {
+        return false;
+    }
+
+    // Branch-free sign extension, done entirely in unsigned arithmetic. Flipping the sign bit and then subtracting
+    // it again leaves non-negative values untouched and fills the upper bits of negative ones. Unlike shifting -1
+    // to the left, this is well defined for every bitCount, including 64.
+    const uint64_t signBit = (uint64_t)1 << (bitCount - 1);
+    result = (result ^ signBit) - signBit;
+
+    *pResultOut = (int64_t)result;
+    return true;
+}
+
+// Reads the next bitCount bits (1 to 16) of the stream as an unsigned integer into *pResult.
+//
+// Returns false, leaving *pResult untouched, if the underlying 32-bit read fails.
+static bool drflac__read_uint16(drflac* pFlac, unsigned int bitCount, uint16_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 16);
+
+    uint32_t wideValue;
+    const bool wasRead = drflac__read_uint32(pFlac, bitCount, &wideValue);
+    if (wasRead) {
+        *pResult = (uint16_t)wideValue;
+    }
+
+    return wasRead;
+}
+
+// Reads the next bitCount bits (1 to 16) of the stream as a signed integer into *pResult.
+//
+// Returns false, leaving *pResult untouched, if the underlying 32-bit read fails.
+static bool drflac__read_int16(drflac* pFlac, unsigned int bitCount, int16_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 16);
+
+    int32_t wideValue;
+    const bool wasRead = drflac__read_int32(pFlac, bitCount, &wideValue);
+    if (wasRead) {
+        *pResult = (int16_t)wideValue;
+    }
+
+    return wasRead;
+}
+
+// Reads the next bitCount bits (1 to 8) of the stream as an unsigned integer into *pResult.
+//
+// Returns false, leaving *pResult untouched, if the underlying 32-bit read fails.
+static bool drflac__read_uint8(drflac* pFlac, unsigned int bitCount, uint8_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 8);
+
+    uint32_t wideValue;
+    const bool wasRead = drflac__read_uint32(pFlac, bitCount, &wideValue);
+    if (wasRead) {
+        *pResult = (uint8_t)wideValue;
+    }
+
+    return wasRead;
+}
+
+// Reads the next bitCount bits (1 to 8) of the stream as a signed integer into *pResult.
+//
+// Returns false, leaving *pResult untouched, if the underlying 32-bit read fails.
+static bool drflac__read_int8(drflac* pFlac, unsigned int bitCount, int8_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 8);
+
+    int32_t wideValue;
+    const bool wasRead = drflac__read_int32(pFlac, bitCount, &wideValue);
+    if (wasRead) {
+        *pResult = (int8_t)wideValue;
+    }
+
+    return wasRead;
+}
+
+
+// Skips over every zero bit up to and including the next set bit in the stream.
+//
+// On success *pOffsetOut receives the number of zero bits that preceded the set bit. Returns false if the end of
+// the stream is reached before a set bit is found.
+static inline bool drflac__seek_past_next_set_bit(drflac* pFlac, unsigned int* pOffsetOut)
+{
+    // Lines that are entirely zero are counted and skipped whole.
+    unsigned int zeroCounter = 0;
+    while (pFlac->cache == 0) {
+        zeroCounter += (unsigned int)DRFLAC_CACHE_L1_BITS_REMAINING;
+        if (!drflac__reload_cache(pFlac)) {
+            return false;
+        }
+    }
+
+    // At this point the cache should not be zero, in which case we know the first set bit should be somewhere in here. There is
+    // no need for us to perform any cache reloading logic here which should make things much faster.
+    assert(pFlac->cache != 0);
+
+    // Maps the top four bits of the cache to the one-based position of the first set bit among them, or to 0 when
+    // none of the four is set. Static so that it is not rebuilt on the stack on every call.
+    static const unsigned int bitOffsetTable[] = {
+        0,
+        4,
+        3, 3,
+        2, 2, 2, 2,
+        1, 1, 1, 1, 1, 1, 1, 1
+    };
+
+    unsigned int setBitOffsetPlus1 = bitOffsetTable[DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4)];
+    if (setBitOffsetPlus1 == 0) {
+        if (pFlac->cache == 1) {
+            // Only the very last bit of the line is set.
+            setBitOffsetPlus1 = (unsigned int)DRFLAC_CACHE_L1_SIZE_BITS;
+        } else {
+            // Widen the window one bit at a time until it contains a set bit. This terminates before the window
+            // covers the whole line because the cache is neither 0 nor 1 here.
+            setBitOffsetPlus1 = 5;
+            for (;;)
+            {
+                if ((pFlac->cache & DRFLAC_CACHE_L1_SELECT(setBitOffsetPlus1))) {
+                    break;
+                }
+
+                setBitOffsetPlus1 += 1;
+            }
+        }
+    }
+
+    pFlac->consumedBits += setBitOffsetPlus1;
+
+    // Shifting by the full width of the cache is undefined behaviour (and leaves the cache unchanged on common
+    // hardware), so a line that has been consumed completely is cleared explicitly.
+    if (setBitOffsetPlus1 < DRFLAC_CACHE_L1_SIZE_BITS) {
+        pFlac->cache <<= setBitOffsetPlus1;
+    } else {
+        pFlac->cache = 0;
+    }
+
+    *pOffsetOut = zeroCounter + setBitOffsetPlus1 - 1;
+    return true;
+}
+
+
+
+// Moves the client's stream to the given byte offset, measured from the start of the stream, and discards all
+// cached data so that the next read starts exactly at that offset.
+//
+// The onSeek callback takes a signed 32-bit relative offset, so long distances are covered in several steps.
+// Returns false if any onSeek call fails.
+static bool drflac__seek_to_byte(drflac* pFlac, long long offsetFromStart)
+{
+    assert(pFlac != NULL);
+
+    long long bytesToMove = offsetFromStart - pFlac->currentBytePos;
+
+    // Cover distances beyond the range of an int in maximum-sized steps. At most one of these loops runs.
+    while (bytesToMove > 0x7FFFFFFF) {
+        if (!pFlac->onSeek(pFlac->pUserData, 0x7FFFFFFF)) {
+            return false;
+        }
+
+        pFlac->currentBytePos += 0x7FFFFFFF;
+        bytesToMove -= 0x7FFFFFFF;
+    }
+
+    while (bytesToMove < (int)0x80000000) {
+        if (!pFlac->onSeek(pFlac->pUserData, (int)0x80000000)) {
+            return false;
+        }
+
+        pFlac->currentBytePos += (int)0x80000000;
+        bytesToMove -= (int)0x80000000;
+    }
+
+    assert(bytesToMove <= 0x7FFFFFFF && bytesToMove >= (int)0x80000000);
+
+    bool result = true;
+    if (bytesToMove != 0) {
+        result = pFlac->onSeek(pFlac->pUserData, (int)bytesToMove);    // <-- Safe cast as per the assert above.
+        pFlac->currentBytePos += (int)bytesToMove;
+    }
+
+    // The caches hold data that was read ahead of the decoder, i.e. from before currentBytePos. They have to be
+    // dropped even when the client's stream is already sitting on the target, otherwise the decoder would carry
+    // on reading the stale bytes that precede it.
+    pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
+    pFlac->cache = 0;
+    pFlac->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT; // <-- This clears the L2 cache.
+
+    return result;
+}
+
+// Returns the byte position of the decoder's read pointer: the position of the client's stream minus the bytes
+// that have been read into the L1 and L2 caches but not consumed yet.
+static long long drflac__tell(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+
+    // Whole bytes already consumed from the L1, and lines already taken out of the L2. Lines of the L2 that never
+    // held data (unusedL2Lines) do not count as taken.
+    const size_t consumedL1Bytes = pFlac->consumedBits/8;
+    const size_t consumedL2Lines = pFlac->nextL2Line - pFlac->unusedL2Lines;
+
+    const size_t pendingL1Bytes = DRFLAC_CACHE_L1_SIZE_BYTES - consumedL1Bytes;
+    const size_t pendingL2Bytes = DRFLAC_CACHE_L2_SIZE_BYTES - (consumedL2Lines*DRFLAC_CACHE_L1_SIZE_BYTES);
+
+    return pFlac->currentBytePos - pendingL1Bytes - pendingL2Bytes;
+}
+
+
+
+// Reads a number stored with the UTF-8 style variable-length coding (1 to 7 bytes) into *pNumberOut.
+//
+// The number of leading one bits in the first byte gives the total length of the sequence; every following byte
+// contributes its low six bits. On failure *pNumberOut is set to 0 and false is returned. This happens when the
+// stream ends early or when the first byte is not a valid lead byte.
+static bool drflac__read_utf8_coded_number(drflac* pFlac, unsigned long long* pNumberOut)
+{
+    assert(pFlac != NULL);
+    assert(pNumberOut != NULL);
+
+    // The coded number is expected to start on a byte boundary.
+    assert((pFlac->consumedBits & 7) == 0);
+
+    unsigned char leadByte = 0;
+    if (!drflac__read_uint8(pFlac, 8, &leadByte)) {
+        *pNumberOut = 0;
+        return false;
+    }
+
+    // A clear top bit means the value is stored in this single byte.
+    if ((leadByte & 0x80) == 0) {
+        *pNumberOut = leadByte;
+        return true;
+    }
+
+    // Count the leading one bits. Two to seven of them announce a sequence of that many bytes.
+    int byteCount = 0;
+    for (unsigned int probe = 0x80; probe != 0 && (leadByte & probe) != 0; probe >>= 1) {
+        byteCount += 1;
+    }
+
+    if (byteCount < 2 || byteCount > 7) {
+        *pNumberOut = 0;
+        return false;     // Bad UTF-8 encoding.
+    }
+
+    // The lead byte carries whatever bits are left below its length marker.
+    unsigned long long number = ((long long)(leadByte & (0xFF >> (byteCount + 1))));
+
+    // Read extra bytes.
+    for (int i = 1; i < byteCount; ++i) {
+        unsigned char extraByte = 0;
+        if (!drflac__read_uint8(pFlac, 8, &extraByte)) {
+            *pNumberOut = 0;
+            return false;
+        }
+
+        number = (number << 6) | (extraByte & 0x3F);
+    }
+
+    *pNumberOut = number;
+    return true;
+}
+
+
+
+// Skips over one Rice-coded value without decoding it: first the run of zero bits together with the set bit that
+// ends it, then the m bits that follow.
+//
+// Returns false if the end of the stream is reached first.
+static DRFLAC_INLINE bool drflac__read_and_seek_rice(drflac* pFlac, unsigned char m)
+{
+    unsigned int ignoredZeroCount;
+    if (!drflac__seek_past_next_set_bit(pFlac, &ignoredZeroCount)) {
+        return false;
+    }
+
+    // Nothing more to skip when m is zero.
+    return (m == 0) || drflac__seek_bits(pFlac, m);
+}
+
+
+// The next two functions are responsible for calculating the prediction.
+//
+// When the bits per sample is >16 we need to use 64-bit integer arithmetic because otherwise we'll run out of precision. It's
+// safe to assume this will be slower on 32-bit platforms so we use a more optimal solution when the bits per sample is <=16.
+//
+//
+// Optimization Experiment #1
+//
+// The first optimization experiment I'm trying here is a loop unroll for the most common LPC orders. I've done a little test
+// and the results are as follows, in order of most common:
+// 1)  order = 8  : 93.1M
+// 2)  order = 7  : 36.6M
+// 3)  order = 3  : 33.2M
+// 4)  order = 6  : 20.9M
+// 5)  order = 5  : 18.1M
+// 6)  order = 4  : 15.8M
+// 7)  order = 12 : 10.8M
+// 8)  order = 2  :  9.8M
+// 9)  order = 1  :  1.6M
+// 10) order = 10 :  1.0M
+// 11) order = 9  :  0.8M
+// 12) order = 11 :  0.8M
+//
+// We'll experiment with unrolling the top 8 most common ones. We'll ignore the least common ones since there seems to be a
+// large drop off there.
+//
+// Result: There's a tiny improvement in some cases, but it could just be within margin of error so unsure if it's worthwhile
+// just yet.
+static DRFLAC_INLINE int32_t drflac__calculate_prediction_32(unsigned int order, int shift, const short* coefficients, int32_t* pDecodedSamples)
+{
+    assert(order <= 32);
+
+    // 32-bit version.
+
+    // This method is slower on both 32- and 64-bit builds with VC++. Leaving this here for now just in case we need it later
+    // for whatever reason.
+#if 0
+    int prediction;
+    if (order == 8)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+        prediction += coefficients[5] * pDecodedSamples[-6];
+        prediction += coefficients[6] * pDecodedSamples[-7];
+        prediction += coefficients[7] * pDecodedSamples[-8];
+    }
+    else if (order == 7)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+        prediction += coefficients[5] * pDecodedSamples[-6];
+        prediction += coefficients[6] * pDecodedSamples[-7];
+    }
+    else if (order == 3)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+    }
+    else if (order == 6)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+        prediction += coefficients[5] * pDecodedSamples[-6];
+    }
+    else if (order == 5)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+    }
+    else if (order == 4)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+    }
+    else if (order == 12)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+        prediction += coefficients[9]  * pDecodedSamples[-10];
+        prediction += coefficients[10] * pDecodedSamples[-11];
+        prediction += coefficients[11] * pDecodedSamples[-12];
+    }
+    else if (order == 2)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+    }
+    else if (order == 1)
+    {
+        prediction = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+    }
+    else if (order == 10)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+        prediction += coefficients[9]  * pDecodedSamples[-10];
+    }
+    else if (order == 9)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+    }
+    else if (order == 11)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+        prediction += coefficients[9]  * pDecodedSamples[-10];
+        prediction += coefficients[10] * pDecodedSamples[-11];
+    }
+    else
+    {
+        prediction = 0;
+        for (int j = 0; j < (int)order; ++j) {
+            prediction += coefficients[j] * pDecodedSamples[-j-1];
+        }
+    }
+#endif
+
+    // Experiment #2. See if we can use a switch and let the compiler optimize it to a jump table.
+    // Result: VC++ definitely optimizes this to a single jmp as expected. I expect other compilers should do the same, but I've
+    // not verified yet.
+#if 1
+    int prediction = 0;
+
+    switch (order)
+    {
+    case 32: prediction += coefficients[31] * pDecodedSamples[-32];
+    case 31: prediction += coefficients[30] * pDecodedSamples[-31];
+    case 30: prediction += coefficients[29] * pDecodedSamples[-30];
+    case 29: prediction += coefficients[28] * pDecodedSamples[-29];
+    case 28: prediction += coefficients[27] * pDecodedSamples[-28];
+    case 27: prediction += coefficients[26] * pDecodedSamples[-27];
+    case 26: prediction += coefficients[25] * pDecodedSamples[-26];
+    case 25: prediction += coefficients[24] * pDecodedSamples[-25];
+    case 24: prediction += coefficients[23] * pDecodedSamples[-24];
+    case 23: prediction += coefficients[22] * pDecodedSamples[-23];
+    case 22: prediction += coefficients[21] * pDecodedSamples[-22];
+    case 21: prediction += coefficients[20] * pDecodedSamples[-21];
+    case 20: prediction += coefficients[19] * pDecodedSamples[-20];
+    case 19: prediction += coefficients[18] * pDecodedSamples[-19];
+    case 18: prediction += coefficients[17] * pDecodedSamples[-18];
+    case 17: prediction += coefficients[16] * pDecodedSamples[-17];
+    case 16: prediction += coefficients[15] * pDecodedSamples[-16];
+    case 15: prediction += coefficients[14] * pDecodedSamples[-15];
+    case 14: prediction += coefficients[13] * pDecodedSamples[-14];
+    case 13: prediction += coefficients[12] * pDecodedSamples[-13];
+    case 12: prediction += coefficients[11] * pDecodedSamples[-12];
+    case 11: prediction += coefficients[10] * pDecodedSamples[-11];
+    case 10: prediction += coefficients[ 9] * pDecodedSamples[-10];
+    case  9: prediction += coefficients[ 8] * pDecodedSamples[- 9];
+    case  8: prediction += coefficients[ 7] * pDecodedSamples[- 8];
+    case  7: prediction += coefficients[ 6] * pDecodedSamples[- 7];
+    case  6: prediction += coefficients[ 5] * pDecodedSamples[- 6];
+    case  5: prediction += coefficients[ 4] * pDecodedSamples[- 5];
+    case  4: prediction += coefficients[ 3] * pDecodedSamples[- 4];
+    case  3: prediction += coefficients[ 2] * pDecodedSamples[- 3];
+    case  2: prediction += coefficients[ 1] * pDecodedSamples[- 2];
+    case  1: prediction += coefficients[ 0] * pDecodedSamples[- 1];
+    }
+#endif
+
+    return (int32_t)(prediction >> shift);
+}
+
+// Calculates the LPC prediction for the sample at pDecodedSamples[0] using a 64-bit accumulator.
+//
+// The result is the sum of coefficients[k] * pDecodedSamples[-(k+1)] for k in [0, order), shifted right by <shift> bits and
+// truncated to 32 bits. Only the <order> samples immediately before pDecodedSamples[0] are read. <order> must not exceed 32.
+static DRFLAC_INLINE int32_t drflac__calculate_prediction(unsigned int order, int shift, const short* coefficients, int32_t* pDecodedSamples)
+{
+    assert(order <= 32);
+
+    // A single filter tap: the n-th coefficient (1-based) times the sample n positions back, both widened to 64 bits.
+#define DRFLAC__TAP(n) ((long long)coefficients[(n) - 1] * (long long)pDecodedSamples[-(n)])
+
+    long long prediction = 0;
+
+#ifndef DRFLAC_64BIT
+    // 32-bit build. Orders 1 through 12 each get a hand-unrolled sum, tested in the sequence 8, 7, 3, 6, 5, 4, 12, 2, 1, 10, 9, 11,
+    // and every other order goes through the generic loop. According to the author's notes this form is faster than the switch
+    // below when compiling the 32-bit build with VC++.
+    if (order == 8) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2) + DRFLAC__TAP(3) + DRFLAC__TAP(4)
+                   + DRFLAC__TAP(5) + DRFLAC__TAP(6) + DRFLAC__TAP(7) + DRFLAC__TAP(8);
+    } else if (order == 7) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2) + DRFLAC__TAP(3) + DRFLAC__TAP(4)
+                   + DRFLAC__TAP(5) + DRFLAC__TAP(6) + DRFLAC__TAP(7);
+    } else if (order == 3) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2) + DRFLAC__TAP(3);
+    } else if (order == 6) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2) + DRFLAC__TAP(3) + DRFLAC__TAP(4)
+                   + DRFLAC__TAP(5) + DRFLAC__TAP(6);
+    } else if (order == 5) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2) + DRFLAC__TAP(3) + DRFLAC__TAP(4)
+                   + DRFLAC__TAP(5);
+    } else if (order == 4) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2) + DRFLAC__TAP(3) + DRFLAC__TAP(4);
+    } else if (order == 12) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2)  + DRFLAC__TAP(3)  + DRFLAC__TAP(4)
+                   + DRFLAC__TAP(5) + DRFLAC__TAP(6)  + DRFLAC__TAP(7)  + DRFLAC__TAP(8)
+                   + DRFLAC__TAP(9) + DRFLAC__TAP(10) + DRFLAC__TAP(11) + DRFLAC__TAP(12);
+    } else if (order == 2) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2);
+    } else if (order == 1) {
+        prediction = DRFLAC__TAP(1);
+    } else if (order == 10) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2) + DRFLAC__TAP(3) + DRFLAC__TAP(4)
+                   + DRFLAC__TAP(5) + DRFLAC__TAP(6) + DRFLAC__TAP(7) + DRFLAC__TAP(8)
+                   + DRFLAC__TAP(9) + DRFLAC__TAP(10);
+    } else if (order == 9) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2) + DRFLAC__TAP(3) + DRFLAC__TAP(4)
+                   + DRFLAC__TAP(5) + DRFLAC__TAP(6) + DRFLAC__TAP(7) + DRFLAC__TAP(8)
+                   + DRFLAC__TAP(9);
+    } else if (order == 11) {
+        prediction = DRFLAC__TAP(1) + DRFLAC__TAP(2)  + DRFLAC__TAP(3)  + DRFLAC__TAP(4)
+                   + DRFLAC__TAP(5) + DRFLAC__TAP(6)  + DRFLAC__TAP(7)  + DRFLAC__TAP(8)
+                   + DRFLAC__TAP(9) + DRFLAC__TAP(10) + DRFLAC__TAP(11);
+    } else {
+        for (int tap = 1; tap <= (int)order; ++tap) {
+            prediction += DRFLAC__TAP(tap);
+        }
+    }
+#else
+    // 64-bit build. Every case deliberately falls through to the one below it, so entering at <order> accumulates exactly
+    // <order> taps. According to the author's notes VC++ turns this into a single jmp on the 64-bit build.
+    switch (order)
+    {
+    case 32: prediction += DRFLAC__TAP(32);
+    case 31: prediction += DRFLAC__TAP(31);
+    case 30: prediction += DRFLAC__TAP(30);
+    case 29: prediction += DRFLAC__TAP(29);
+    case 28: prediction += DRFLAC__TAP(28);
+    case 27: prediction += DRFLAC__TAP(27);
+    case 26: prediction += DRFLAC__TAP(26);
+    case 25: prediction += DRFLAC__TAP(25);
+    case 24: prediction += DRFLAC__TAP(24);
+    case 23: prediction += DRFLAC__TAP(23);
+    case 22: prediction += DRFLAC__TAP(22);
+    case 21: prediction += DRFLAC__TAP(21);
+    case 20: prediction += DRFLAC__TAP(20);
+    case 19: prediction += DRFLAC__TAP(19);
+    case 18: prediction += DRFLAC__TAP(18);
+    case 17: prediction += DRFLAC__TAP(17);
+    case 16: prediction += DRFLAC__TAP(16);
+    case 15: prediction += DRFLAC__TAP(15);
+    case 14: prediction += DRFLAC__TAP(14);
+    case 13: prediction += DRFLAC__TAP(13);
+    case 12: prediction += DRFLAC__TAP(12);
+    case 11: prediction += DRFLAC__TAP(11);
+    case 10: prediction += DRFLAC__TAP(10);
+    case  9: prediction += DRFLAC__TAP(9);
+    case  8: prediction += DRFLAC__TAP(8);
+    case  7: prediction += DRFLAC__TAP(7);
+    case  6: prediction += DRFLAC__TAP(6);
+    case  5: prediction += DRFLAC__TAP(5);
+    case  4: prediction += DRFLAC__TAP(4);
+    case  3: prediction += DRFLAC__TAP(3);
+    case  2: prediction += DRFLAC__TAP(2);
+    case  1: prediction += DRFLAC__TAP(1);
+    }
+#endif
+
+#undef DRFLAC__TAP
+
+    return (int32_t)(prediction >> shift);
+}
+
+
+// Reads and decodes a string of <count> residual values as Rice codes and writes the reconstructed samples (residual plus prediction) to pSamplesOut[0..count-1]. The decoder should be sitting on the first bit of the Rice codes.
+// <riceParam> is the number of low bits stored after the unary part of each code. <order>, <shift> and <coefficients> are passed to the predictor, which reads the <order> samples stored just before each output slot. Returns false if the cache cannot be reloaded.
+// This is the most frequently called function in the library. It does both the Rice decoding and the prediction in a single loop
+// iteration.
+static bool drflac__decode_samples_with_residual__rice(drflac* pFlac, unsigned int count, unsigned char riceParam, unsigned int order, int shift, const short* coefficients, int* pSamplesOut)
+{
+    assert(pFlac != NULL);
+    assert(count > 0);
+    assert(pSamplesOut != NULL);
+    // Maps a 4-bit value to the 1-based position, counted from its most significant bit, of the first set bit (0 when no bit is set).
+    static unsigned int bitOffsetTable[] = {
+        0,
+        4,
+        3, 3,
+        2, 2, 2, 2,
+        1, 1, 1, 1, 1, 1, 1, 1
+    };
+
+    drflac_cache_t riceParamMask = DRFLAC_CACHE_L1_SELECTION_MASK(riceParam);
+    drflac_cache_t resultHiShift = DRFLAC_CACHE_L1_SIZE_BITS - riceParam;
+
+    for (int i = 0; i < (int)count; ++i)
+    {
+        unsigned int zeroCounter = 0;  // Zero bits of the unary part that were consumed by emptying whole caches.
+        while (pFlac->cache == 0) {
+            zeroCounter += (unsigned int)DRFLAC_CACHE_L1_BITS_REMAINING;
+            if (!drflac__reload_cache(pFlac)) {
+                return false;
+            }
+        }
+
+        // At this point the cache should not be zero, in which case we know the first set bit should be somewhere in here. There is
+        // no need for us to perform any cache reloading logic here which should make things much faster.
+        assert(pFlac->cache != 0);
+        unsigned int decodedRice;
+        // Try the table on 4 bits selected from the cache first; a result of 0 means the set bit has to be searched for further along.
+        unsigned int setBitOffsetPlus1 = bitOffsetTable[DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4)];
+        if (setBitOffsetPlus1 > 0) {
+            decodedRice = (zeroCounter + (setBitOffsetPlus1-1)) << riceParam;
+        } else {
+            if (pFlac->cache == 1) {
+                setBitOffsetPlus1 = DRFLAC_CACHE_L1_SIZE_BITS;
+                decodedRice = (zeroCounter + (DRFLAC_CACHE_L1_SIZE_BITS-1)) << riceParam;
+            } else {
+                setBitOffsetPlus1 = 5;
+                for (;;)  // Terminates because the cache is known to be non-zero here.
+                {
+                    if ((pFlac->cache & DRFLAC_CACHE_L1_SELECT(setBitOffsetPlus1))) {
+                        decodedRice = (zeroCounter + (setBitOffsetPlus1-1)) << riceParam;
+                        break;
+                    }
+
+                    setBitOffsetPlus1 += 1;
+                }
+            }
+        }
+
+        // decodedRice now holds the unary count shifted up by <riceParam>; next read the <riceParam> low bits that follow it.
+        unsigned int bitsLo = 0;
+        unsigned int riceLength = setBitOffsetPlus1 + riceParam;
+        if (riceLength < DRFLAC_CACHE_L1_BITS_REMAINING)
+        {
+            bitsLo = (unsigned int)((pFlac->cache & (riceParamMask >> setBitOffsetPlus1)) >> (DRFLAC_CACHE_L1_SIZE_BITS - riceLength));
+
+            pFlac->consumedBits += riceLength;
+            pFlac->cache <<= riceLength;
+        }
+        else
+        {
+            pFlac->consumedBits += riceLength;
+            pFlac->cache <<= setBitOffsetPlus1;
+
+            // It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them.
+            size_t bitCountLo = pFlac->consumedBits - DRFLAC_CACHE_L1_SIZE_BITS;
+            drflac_cache_t resultHi = pFlac->cache & riceParamMask;    // <-- This mask is OK because all bits after the first bits are always zero.
+
+
+            if (pFlac->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT) {
+                pFlac->cache = drflac__be2host__cache_line(pFlac->cacheL2[pFlac->nextL2Line++]);
+            } else {
+                // Slow path. We need to fetch more data from the client.
+                if (!drflac__reload_cache(pFlac)) {
+                    return false;
+                }
+            }
+
+            bitsLo = (unsigned int)((resultHi >> resultHiShift) | DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountLo));
+            pFlac->consumedBits = bitCountLo;
+            pFlac->cache <<= bitCountLo;
+        }
+
+        // Merge both parts, then undo the sign folding: an odd value v becomes ~(v >> 1), an even value becomes (v >> 1).
+        decodedRice |= bitsLo;
+        if ((decodedRice & 0x01)) {
+            decodedRice = ~(decodedRice >> 1);
+        } else {
+            decodedRice = (decodedRice >> 1);
+        }
+
+
+        // In order to properly calculate the prediction when the bits per sample is >16 we need to do it using 64-bit arithmetic. We can assume this
+        // is probably going to be slower on 32-bit systems so we'll do a more optimized 32-bit version when the bits per sample is low enough.
+        if (pFlac->currentFrame.bitsPerSample > 16) {
+            pSamplesOut[i] = ((int)decodedRice + drflac__calculate_prediction(order, shift, coefficients, pSamplesOut + i));
+        } else {
+            pSamplesOut[i] = ((int)decodedRice + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + i));
+        }
+    }
+
+    return true;
+}
+
+
+// Skips over <count> Rice-coded residual values without decoding them into samples. The decoder should be sitting on the
+// first bit of the Rice codes. Returns false as soon as one of the codes cannot be read.
+static bool drflac__read_and_seek_residual__rice(drflac* pFlac, unsigned int count, unsigned char riceParam)
+{
+    assert(pFlac != NULL);
+    assert(count > 0);
+
+    unsigned int codesRemaining = count;
+    while (codesRemaining > 0) {
+        const bool skipped = drflac__read_and_seek_rice(pFlac, riceParam);
+        if (!skipped) {
+            return false;
+        }
+
+        codesRemaining -= 1;
+    }
+
+    return true;
+}
+
+// Decodes <count> residual values that are stored verbatim, each as a signed integer of <unencodedBitsPerSample> bits, and
+// adds the (64-bit) prediction to each one. The reconstructed samples are written to pSamplesOut[0..count-1]. Returns false
+// if a residual cannot be read.
+static bool drflac__decode_samples_with_residual__unencoded(drflac* pFlac, unsigned int count, unsigned char unencodedBitsPerSample, unsigned int order, int shift, const short* coefficients, int* pSamplesOut)
+{
+    assert(pFlac != NULL);
+    assert(count > 0);
+    assert(unencodedBitsPerSample > 0 && unencodedBitsPerSample <= 32);
+    assert(pSamplesOut != NULL);
+
+    int* pSample = pSamplesOut;
+    int* const pSamplesEnd = pSamplesOut + count;
+    while (pSample != pSamplesEnd) {
+        // The raw residual is read straight into the output slot...
+        if (!drflac__read_int32(pFlac, unencodedBitsPerSample, pSample)) {
+            return false;
+        }
+
+        // ...and then turned into the final sample by adding the prediction made from the samples before it.
+        *pSample += drflac__calculate_prediction(order, shift, coefficients, pSample);
+        pSample += 1;
+    }
+
+    return true;
+}
+
+
+// Reads and decodes the residual for the sub-frame the decoder is currently sitting on. This function should be called
+// when the decoder is sitting at the very start of the RESIDUAL block. The first <order> entries of pDecodedSamples are
+// skipped (they must already hold the warm-up samples); the remaining entries up to <blockSize> are filled with the
+// reconstructed samples. <shift> and <coefficients> describe the predictor.
+//
+// Returns false if the stream cannot be read, if the residual coding method is not one of the two partitioned Rice methods,
+// or if a partition is too small to hold the <order> warm-up samples.
+static bool drflac__decode_samples_with_residual(drflac* pFlac, unsigned int blockSize, unsigned int order, int shift, const short* coefficients, int* pDecodedSamples)
+{
+    assert(pFlac != NULL);
+    assert(blockSize != 0);
+    assert(pDecodedSamples != NULL);       // <-- Should we allow NULL, in which case we just seek past the residual rather than do a full decode?
+
+    unsigned char residualMethod;
+    if (!drflac__read_uint8(pFlac, 2, &residualMethod)) {
+        return false;
+    }
+
+    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+        return false;    // Unknown or unsupported residual coding method.
+    }
+
+    unsigned char partitionOrder;
+    if (!drflac__read_uint8(pFlac, 4, &partitionOrder)) {
+        return false;
+    }
+
+    const unsigned int partitionCount      = 1u << partitionOrder;
+    const unsigned int samplesPerPartition = blockSize / partitionCount;
+
+    // The first partition is shortened by the warm-up samples. Without this check the unsigned subtraction below would wrap
+    // around for a malformed stream and the decode loop would write far past the end of the output buffer.
+    if (samplesPerPartition < order) {
+        return false;
+    }
+
+    // Ignore the first <order> values.
+    pDecodedSamples += order;
+
+    unsigned int samplesInPartition  = samplesPerPartition - order;
+    unsigned int partitionsRemaining = partitionCount;
+    for (;;)
+    {
+        // The all-ones parameter value is the escape code (15 for the 4-bit method, 31 for the 5-bit method). It marks a
+        // partition whose residuals are stored unencoded and is normalized to 0xFF here.
+        unsigned char riceParam = 0;
+        if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
+            if (!drflac__read_uint8(pFlac, 4, &riceParam)) {
+                return false;
+            }
+            if (riceParam == 15) {
+                riceParam = 0xFF;
+            }
+        } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+            if (!drflac__read_uint8(pFlac, 5, &riceParam)) {
+                return false;
+            }
+            if (riceParam == 31) {
+                riceParam = 0xFF;
+            }
+        }
+
+        if (riceParam != 0xFF) {
+            if (!drflac__decode_samples_with_residual__rice(pFlac, samplesInPartition, riceParam, order, shift, coefficients, pDecodedSamples)) {
+                return false;
+            }
+        } else {
+            unsigned char unencodedBitsPerSample = 0;
+            if (!drflac__read_uint8(pFlac, 5, &unencodedBitsPerSample)) {
+                return false;
+            }
+
+            if (!drflac__decode_samples_with_residual__unencoded(pFlac, samplesInPartition, unencodedBitsPerSample, order, shift, coefficients, pDecodedSamples)) {
+                return false;
+            }
+        }
+
+        pDecodedSamples += samplesInPartition;
+
+
+        if (partitionsRemaining == 1) {
+            break;
+        }
+
+        // Every partition after the first one is full-sized.
+        partitionsRemaining -= 1;
+        samplesInPartition = samplesPerPartition;
+    }
+
+    return true;
+}
+
+// Reads and seeks past the residual for the sub-frame the decoder is currently sitting on. This function should be called
+// when the decoder is sitting at the very start of the RESIDUAL block. No samples are produced; the <blockSize> and <order>
+// parameters are only used to determine how many residual values need to be skipped.
+//
+// Returns false if the stream cannot be read, if the residual coding method is not one of the two partitioned Rice methods,
+// or if a partition is too small to hold the <order> warm-up samples.
+static bool drflac__read_and_seek_residual(drflac* pFlac, unsigned int blockSize, unsigned int order)
+{
+    assert(pFlac != NULL);
+    assert(blockSize != 0);
+
+    unsigned char residualMethod;
+    if (!drflac__read_uint8(pFlac, 2, &residualMethod)) {
+        return false;
+    }
+
+    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+        return false;    // Unknown or unsupported residual coding method.
+    }
+
+    unsigned char partitionOrder;
+    if (!drflac__read_uint8(pFlac, 4, &partitionOrder)) {
+        return false;
+    }
+
+    const unsigned int partitionCount      = 1u << partitionOrder;
+    const unsigned int samplesPerPartition = blockSize / partitionCount;
+
+    // The first partition is shortened by the warm-up samples. Without this check the unsigned subtraction below would wrap
+    // around for a malformed stream and produce an enormous residual count.
+    if (samplesPerPartition < order) {
+        return false;
+    }
+
+    unsigned int samplesInPartition  = samplesPerPartition - order;
+    unsigned int partitionsRemaining = partitionCount;
+    for (;;)
+    {
+        // The all-ones parameter value is the escape code (15 for the 4-bit method, 31 for the 5-bit method). It marks a
+        // partition whose residuals are stored unencoded and is normalized to 0xFF here.
+        unsigned char riceParam = 0;
+        if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
+            if (!drflac__read_uint8(pFlac, 4, &riceParam)) {
+                return false;
+            }
+            if (riceParam == 15) {
+                riceParam = 0xFF;
+            }
+        } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+            if (!drflac__read_uint8(pFlac, 5, &riceParam)) {
+                return false;
+            }
+            if (riceParam == 31) {
+                riceParam = 0xFF;
+            }
+        }
+
+        if (riceParam != 0xFF) {
+            if (!drflac__read_and_seek_residual__rice(pFlac, samplesInPartition, riceParam)) {
+                return false;
+            }
+        } else {
+            unsigned char unencodedBitsPerSample = 0;
+            if (!drflac__read_uint8(pFlac, 5, &unencodedBitsPerSample)) {
+                return false;
+            }
+
+            // Unencoded residuals have a fixed width, so the whole partition can be skipped in one go.
+            if (!drflac__seek_bits(pFlac, unencodedBitsPerSample * samplesInPartition)) {
+                return false;
+            }
+        }
+
+
+        if (partitionsRemaining == 1) {
+            break;
+        }
+
+        // Every partition after the first one is full-sized.
+        partitionsRemaining -= 1;
+        samplesInPartition = samplesPerPartition;
+    }
+
+    return true;
+}
+
+
+// Decodes a CONSTANT sub-frame: a single sample value is read from the stream and replicated across the whole block.
+// Returns false if the value cannot be read.
+static bool drflac__decode_samples__constant(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    // The stream stores just one sample for this kind of sub-frame.
+    int constantValue;
+    if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &constantValue)) {
+        return false;
+    }
+
+    // Expanding the value is not strictly required, but it keeps the sample reading code simple. Should this ever show up
+    // as a performance problem (unlikely) a smarter representation can be considered.
+    const unsigned int sampleCount = pFlac->currentFrame.blockSize;
+    unsigned int iSample = 0;
+    while (iSample < sampleCount) {
+        pSubframe->pDecodedSamples[iSample] = constantValue;
+        iSample += 1;
+    }
+
+    return true;
+}
+
+// Decodes a VERBATIM sub-frame: every sample of the block is stored uncompressed in the stream and is copied out as is.
+// Returns false as soon as a sample cannot be read; samples decoded up to that point stay in the output buffer.
+static bool drflac__decode_samples__verbatim(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    unsigned int iSample = 0;
+    while (iSample < pFlac->currentFrame.blockSize) {
+        // Read into a temporary so the output slot is only touched when the read succeeded.
+        int rawSample;
+        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &rawSample)) {
+            return false;
+        }
+
+        pSubframe->pDecodedSamples[iSample] = rawSample;
+        iSample += 1;
+    }
+
+    return true;
+}
+
+// Decodes a FIXED-predictor sub-frame: the warm-up samples are read verbatim and the rest of the block is reconstructed
+// from the residual, using the predictor coefficients that belong to the sub-frame's order and a shift of 0.
+// Returns false if the warm-up samples or the residual cannot be read.
+static bool drflac__decode_samples__fixed(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    // One row of coefficients per predictor order (row index == order).
+    short fixedCoefficients[5][4] = {
+        {0,  0, 0,  0},
+        {1,  0, 0,  0},
+        {2, -1, 0,  0},
+        {3, -3, 1,  0},
+        {4, -6, 4, -1}
+    };
+
+    // Warm-up samples: the first <lpcOrder> samples are stored unencoded.
+    unsigned int iWarmup = 0;
+    while (iWarmup < pSubframe->lpcOrder) {
+        int warmupSample;
+        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &warmupSample)) {
+            return false;
+        }
+
+        pSubframe->pDecodedSamples[iWarmup] = warmupSample;
+        iWarmup += 1;
+    }
+
+    // Everything after the warm-up samples comes from the residual.
+    return drflac__decode_samples_with_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder, 0, fixedCoefficients[pSubframe->lpcOrder], pSubframe->pDecodedSamples);
+}
+
+// Decodes an LPC sub-frame. The layout read here is: <lpcOrder> warm-up samples, the coefficient precision (4 bits, stored
+// minus one), the prediction shift (5 bits, signed), <lpcOrder> coefficients of that precision, and finally the residual.
+//
+// Returns false if anything cannot be read, if the precision field holds the invalid value 15, or if the shift is negative.
+static bool drflac__decode_samples__lpc(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    // Warm up samples.
+    for (unsigned int i = 0; i < pSubframe->lpcOrder; ++i) {
+        int sample;
+        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &sample)) {
+            return false;
+        }
+
+        pSubframe->pDecodedSamples[i] = sample;
+    }
+
+    unsigned char lpcPrecision;
+    if (!drflac__read_uint8(pFlac, 4, &lpcPrecision)) {
+        return false;
+    }
+    if (lpcPrecision == 15) {
+        return false;    // Invalid.
+    }
+    lpcPrecision += 1;
+
+
+    signed char lpcShift;
+    if (!drflac__read_int8(pFlac, 5, &lpcShift)) {
+        return false;
+    }
+
+    // The field is signed, but the predictor applies it with a plain right shift and shifting by a negative amount is
+    // undefined behaviour. Negative shifts are not supported, so such a sub-frame is rejected instead of being decoded.
+    if (lpcShift < 0) {
+        return false;
+    }
+
+
+    short coefficients[32];
+    for (unsigned int i = 0; i < pSubframe->lpcOrder; ++i) {
+        if (!drflac__read_int16(pFlac, lpcPrecision, coefficients + i)) {
+            return false;
+        }
+    }
+
+    if (!drflac__decode_samples_with_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder, lpcShift, coefficients, pSubframe->pDecodedSamples)) {
+        return false;
+    }
+
+    return true;
+}
+
+
+// Reads the header of the next frame into pFlac->currentFrame and clears the frame's sub-frame table.
+//
+// Returns false if the stream cannot be read, if the sync code does not match, or if the block size, sample rate or
+// bits-per-sample field holds a reserved/invalid value.
+static bool drflac__read_next_frame_header(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+    assert(pFlac->onRead != NULL);
+
+    // At the moment the sync code is used as a form of basic validation. The CRC is stored, but is unused at the moment. This
+    // should probably be handled better in the future.
+
+    const int sampleRateTable[12]       = {0, 88200, 176400, 192000, 8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000};
+    const uint8_t bitsPerSampleTable[8] = {0, 8, 12, (uint8_t)-1, 16, 20, 24, (uint8_t)-1};   // -1 = reserved.
+
+    unsigned short syncCode = 0;
+    if (!drflac__read_uint16(pFlac, 14, &syncCode)) {
+        return false;
+    }
+
+    if (syncCode != 0x3FFE) {
+        // TODO: Try and recover by attempting to seek to and read the next frame?
+        return false;
+    }
+
+    // The reserved bits are read to keep the stream position right, but their value is not checked.
+    unsigned char reserved;
+    if (!drflac__read_uint8(pFlac, 1, &reserved)) {
+        return false;
+    }
+
+    unsigned char blockingStrategy = 0;
+    if (!drflac__read_uint8(pFlac, 1, &blockingStrategy)) {
+        return false;
+    }
+
+
+
+    unsigned char blockSize = 0;
+    if (!drflac__read_uint8(pFlac, 4, &blockSize)) {
+        return false;
+    }
+
+    unsigned char sampleRate = 0;
+    if (!drflac__read_uint8(pFlac, 4, &sampleRate)) {
+        return false;
+    }
+
+    unsigned char channelAssignment = 0;
+    if (!drflac__read_uint8(pFlac, 4, &channelAssignment)) {
+        return false;
+    }
+
+    unsigned char bitsPerSample = 0;
+    if (!drflac__read_uint8(pFlac, 3, &bitsPerSample)) {
+        return false;
+    }
+
+    if (!drflac__read_uint8(pFlac, 1, &reserved)) {
+        return false;
+    }
+
+    // Block size code 0 is reserved. It must be rejected here because it would otherwise reach the power-of-two branch
+    // below and shift by a negative amount.
+    if (blockSize == 0) {
+        return false;
+    }
+
+    // These two codes are the entries marked as reserved in bitsPerSampleTable.
+    if (bitsPerSample == 3 || bitsPerSample == 7) {
+        return false;
+    }
+
+
+    unsigned char isVariableBlockSize = blockingStrategy == 1;
+    if (isVariableBlockSize) {
+        pFlac->currentFrame.frameNumber = 0;
+        if (!drflac__read_utf8_coded_number(pFlac, &pFlac->currentFrame.sampleNumber)) {
+            return false;
+        }
+    } else {
+        unsigned long long frameNumber = 0;
+        if (!drflac__read_utf8_coded_number(pFlac, &frameNumber)) {
+            return false;
+        }
+        pFlac->currentFrame.frameNumber  = (unsigned int)frameNumber;   // <-- Safe cast.
+        pFlac->currentFrame.sampleNumber = 0;
+    }
+
+
+    if (blockSize == 1) {
+        pFlac->currentFrame.blockSize = 192;
+    } else if (blockSize >= 2 && blockSize <= 5) {
+        pFlac->currentFrame.blockSize = 576 * (1 << (blockSize - 2));
+    } else if (blockSize == 6) {
+        // The stream stores (block size - 1) in 8 bits.
+        if (!drflac__read_uint16(pFlac, 8, &pFlac->currentFrame.blockSize)) {
+            return false;
+        }
+        pFlac->currentFrame.blockSize += 1;
+    } else if (blockSize == 7) {
+        // The stream stores (block size - 1) in 16 bits.
+        if (!drflac__read_uint16(pFlac, 16, &pFlac->currentFrame.blockSize)) {
+            return false;
+        }
+        if (pFlac->currentFrame.blockSize == 0xFFFF) {
+            return false;    // Adding 1 would wrap the 16-bit block size around to 0.
+        }
+        pFlac->currentFrame.blockSize += 1;
+    } else {
+        // Codes 8 through 15.
+        pFlac->currentFrame.blockSize = 256 * (1 << (blockSize - 8));
+    }
+
+
+    if (sampleRate <= 11) {
+        pFlac->currentFrame.sampleRate = sampleRateTable[sampleRate];
+    } else if (sampleRate == 12) {
+        // 8-bit value in kHz.
+        if (!drflac__read_uint32(pFlac, 8, &pFlac->currentFrame.sampleRate)) {
+            return false;
+        }
+        pFlac->currentFrame.sampleRate *= 1000;
+    } else if (sampleRate == 13) {
+        // 16-bit value in Hz.
+        if (!drflac__read_uint32(pFlac, 16, &pFlac->currentFrame.sampleRate)) {
+            return false;
+        }
+    } else if (sampleRate == 14) {
+        // 16-bit value in tens of Hz.
+        if (!drflac__read_uint32(pFlac, 16, &pFlac->currentFrame.sampleRate)) {
+            return false;
+        }
+        pFlac->currentFrame.sampleRate *= 10;
+    } else {
+        return false;  // Invalid.
+    }
+
+
+    pFlac->currentFrame.channelAssignment = channelAssignment;
+
+    // A table value of 0 means the frame does not carry its own bit depth, so the stream-level value is used instead.
+    pFlac->currentFrame.bitsPerSample = bitsPerSampleTable[bitsPerSample];
+    if (pFlac->currentFrame.bitsPerSample == 0) {
+        pFlac->currentFrame.bitsPerSample = pFlac->bitsPerSample;
+    }
+
+    if (!drflac__read_uint8(pFlac, 8, &pFlac->currentFrame.crc8)) {
+        return false;
+    }
+
+    memset(pFlac->currentFrame.subframes, 0, sizeof(pFlac->currentFrame.subframes));
+
+    return true;
+}
+
+// Reads the one-byte subframe header, plus the unary wasted-bits value when flagged, and fills in the subframe's type,
+// predictor order and wasted-bits count. Returns false on a read failure, a set padding bit or a reserved subframe type.
+static bool drflac__read_subframe_header(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    unsigned char headerByte;
+    if (!drflac__read_uint8(pFlac, 8, &headerByte)) {
+        return false;
+    }
+
+    // The most significant bit is padding and must be clear.
+    if ((headerByte & 0x80) != 0) {
+        return false;
+    }
+
+    // Bits 1..6 carry the type code, bit 0 is the wasted-bits flag.
+    int typeCode = (headerByte & 0x7E) >> 1;
+    switch (typeCode)
+    {
+        case 0:
+        {
+            pSubframe->subframeType = DRFLAC_SUBFRAME_CONSTANT;
+        } break;
+
+        case 1:
+        {
+            pSubframe->subframeType = DRFLAC_SUBFRAME_VERBATIM;
+        } break;
+
+        default:
+        {
+            if ((typeCode & 0x20) != 0) {
+                // LPC: the low five bits store (order - 1).
+                pSubframe->subframeType = DRFLAC_SUBFRAME_LPC;
+                pSubframe->lpcOrder = (typeCode & 0x1F) + 1;
+            } else if ((typeCode & 0x08) != 0) {
+                // Fixed predictor: the low three bits store the order, of which only 0..4 are accepted.
+                int fixedOrder = typeCode & 0x07;
+                if (fixedOrder > 4) {
+                    pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
+                    pSubframe->lpcOrder = 0;
+                } else {
+                    pSubframe->subframeType = DRFLAC_SUBFRAME_FIXED;
+                    pSubframe->lpcOrder = fixedOrder;
+                }
+            } else {
+                pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
+            }
+        } break;
+    }
+
+    if (pSubframe->subframeType == DRFLAC_SUBFRAME_RESERVED) {
+        return false;
+    }
+
+    // Wasted bits per sample. When the flag is set the count follows as a unary value, stored as (count - 1).
+    pSubframe->wastedBitsPerSample = 0;
+    if ((headerByte & 0x01) != 0) {
+        unsigned int unaryValue;
+        if (!drflac__seek_past_next_set_bit(pFlac, &unaryValue)) {
+            return false;
+        }
+        pSubframe->wastedBitsPerSample = (unsigned char)unaryValue + 1;
+    }
+
+    return true;
+}
+
+// Decodes subframe <subframeIndex> of the current frame. Parses the subframe header, works out the effective sample width
+// and dispatches to the decoder matching the subframe type. The output lands in this subframe's slice of
+// pFlac->pDecodedSamples. Returns false when the header cannot be read or the type is not one of the four handled here.
+static bool drflac__decode_subframe(drflac* pFlac, int subframeIndex)
+{
+    assert(pFlac != NULL);
+
+    drflac_subframe* pSub = &pFlac->currentFrame.subframes[subframeIndex];
+    if (!drflac__read_subframe_header(pFlac, pSub)) {
+        return false;
+    }
+
+    // Start from the frame's sample width. The side channel of a decorrelated pair needs one extra bit per sample.
+    pSub->bitsPerSample = pFlac->currentFrame.bitsPerSample;
+    if (subframeIndex == 1) {
+        if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) {
+            pSub->bitsPerSample += 1;
+        }
+    } else if (subframeIndex == 0) {
+        if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE) {
+            pSub->bitsPerSample += 1;
+        }
+    }
+
+    // Wasted bits are not stored in the stream, so they do not count towards the encoded width.
+    pSub->bitsPerSample -= pSub->wastedBitsPerSample;
+
+    // Each subframe owns a blockSize-long run of the shared decode buffer.
+    pSub->pDecodedSamples = pFlac->pDecodedSamples + (pFlac->currentFrame.blockSize * subframeIndex);
+
+    if (pSub->subframeType == DRFLAC_SUBFRAME_CONSTANT) {
+        drflac__decode_samples__constant(pFlac, pSub);
+    } else if (pSub->subframeType == DRFLAC_SUBFRAME_VERBATIM) {
+        drflac__decode_samples__verbatim(pFlac, pSub);
+    } else if (pSub->subframeType == DRFLAC_SUBFRAME_FIXED) {
+        drflac__decode_samples__fixed(pFlac, pSub);
+    } else if (pSub->subframeType == DRFLAC_SUBFRAME_LPC) {
+        drflac__decode_samples__lpc(pFlac, pSub);
+    } else {
+        return false;
+    }
+
+    return true;
+}
+
+// Skips over subframe <subframeIndex> of the current frame without decoding its samples. The subframe header is still
+// parsed because the amount of data to skip depends on the subframe type, predictor order and sample width.
+// Returns false on any read/seek failure or when the subframe type is not one of the four handled here.
+static bool drflac__seek_subframe(drflac* pFlac, int subframeIndex)
+{
+    assert(pFlac != NULL);
+
+    drflac_subframe* pSub = &pFlac->currentFrame.subframes[subframeIndex];
+    if (!drflac__read_subframe_header(pFlac, pSub)) {
+        return false;
+    }
+
+    // Start from the frame's sample width. The side channel of a decorrelated pair needs one extra bit per sample.
+    pSub->bitsPerSample = pFlac->currentFrame.bitsPerSample;
+    if (subframeIndex == 1) {
+        if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) {
+            pSub->bitsPerSample += 1;
+        }
+    } else if (subframeIndex == 0) {
+        if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE) {
+            pSub->bitsPerSample += 1;
+        }
+    }
+
+    // Wasted bits are not stored in the stream, so they do not count towards the encoded width.
+    pSub->bitsPerSample -= pSub->wastedBitsPerSample;
+    pSub->pDecodedSamples = pFlac->pDecodedSamples + (pFlac->currentFrame.blockSize * subframeIndex);
+
+    switch (pSub->subframeType)
+    {
+        case DRFLAC_SUBFRAME_CONSTANT:
+        {
+            // A single stored value.
+            if (!drflac__seek_bits(pFlac, pSub->bitsPerSample)) {
+                return false;
+            }
+        } break;
+
+        case DRFLAC_SUBFRAME_VERBATIM:
+        {
+            // One stored value per sample in the block.
+            unsigned int verbatimBits = pFlac->currentFrame.blockSize * pSub->bitsPerSample;
+            if (!drflac__seek_bits(pFlac, verbatimBits)) {
+                return false;
+            }
+        } break;
+
+        case DRFLAC_SUBFRAME_FIXED:
+        {
+            // <order> warm-up samples followed by the residual.
+            unsigned int warmupBits = pSub->lpcOrder * pSub->bitsPerSample;
+            if (!drflac__seek_bits(pFlac, warmupBits)) {
+                return false;
+            }
+
+            if (!drflac__read_and_seek_residual(pFlac, pFlac->currentFrame.blockSize, pSub->lpcOrder)) {
+                return false;
+            }
+        } break;
+
+        case DRFLAC_SUBFRAME_LPC:
+        {
+            // <order> warm-up samples first.
+            unsigned int warmupBits = pSub->lpcOrder * pSub->bitsPerSample;
+            if (!drflac__seek_bits(pFlac, warmupBits)) {
+                return false;
+            }
+
+            // Coefficient precision is stored as (precision - 1) in 4 bits; the all-ones value is invalid.
+            unsigned char storedPrecision;
+            if (!drflac__read_uint8(pFlac, 4, &storedPrecision)) {
+                return false;
+            }
+            if (storedPrecision == 15) {
+                return false;    // Invalid.
+            }
+
+            // 5 bits of shift followed by <order> coefficients of <precision> bits each.
+            unsigned int coefficientBits = (pSub->lpcOrder * (storedPrecision + 1)) + 5;
+            if (!drflac__seek_bits(pFlac, coefficientBits)) {
+                return false;
+            }
+
+            if (!drflac__read_and_seek_residual(pFlac, pFlac->currentFrame.blockSize, pSub->lpcOrder)) {
+                return false;
+            }
+        } break;
+
+        default: return false;
+    }
+
+    return true;
+}
+
+
+// Maps a frame's channel assignment code to the number of channels (and therefore subframes) in that frame.
+// Codes 0..7 map to 1..8 channels and codes 8..10 map to two channels.
+//
+// Returns 0 for any code outside 0..10. The frame header reader stores the raw 4-bit field from the stream, so a malformed
+// stream can deliver 11..15; without the range check that would index past the end of the table whenever assert() is
+// compiled out.
+static DRFLAC_INLINE int drflac__get_channel_count_from_channel_assignment(int channelAssignment)
+{
+    static const int lookup[] = {1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2};
+
+    assert(channelAssignment <= 10);
+    if (channelAssignment < 0 || channelAssignment > 10) {
+        return 0;   // Invalid/reserved assignment: report an empty frame rather than reading out of bounds.
+    }
+
+    return lookup[channelAssignment];
+}
+
+// Decodes every subframe of the current frame and then skips the padding and CRC that close the frame. Must be called while
+// the stream is sitting on the first byte after the frame header. On success the frame's remaining-sample counter is set to
+// blockSize * channel count. Returns false if a subframe fails to decode or the trailer cannot be skipped.
+static bool drflac__decode_frame(drflac* pFlac)
+{
+    int subframeCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+
+    int iSubframe = 0;
+    while (iSubframe < subframeCount)
+    {
+        if (!drflac__decode_subframe(pFlac, iSubframe)) {
+            return false;
+        }
+        iSubframe += 1;
+    }
+
+    // The frame ends with padding up to the next byte boundary and a CRC. Neither is used, so both are skipped.
+    bool skippedTrailer = drflac__seek_bits(pFlac, (DRFLAC_CACHE_L1_BITS_REMAINING & 7) + 16);
+    if (!skippedTrailer) {
+        return false;
+    }
+
+    pFlac->currentFrame.samplesRemaining = pFlac->currentFrame.blockSize * subframeCount;
+    return true;
+}
+
+// Skips the body of the current frame (all subframes plus the trailing padding and CRC) without decoding any samples.
+// Returns false as soon as any subframe or the trailer cannot be skipped.
+static bool drflac__seek_frame(drflac* pFlac)
+{
+    int subframeCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+
+    int iSubframe = 0;
+    while (iSubframe < subframeCount)
+    {
+        if (!drflac__seek_subframe(pFlac, iSubframe)) {
+            return false;
+        }
+        iSubframe += 1;
+    }
+
+    // Padding up to the next byte boundary, then the CRC.
+    if (!drflac__seek_bits(pFlac, (DRFLAC_CACHE_L1_BITS_REMAINING & 7) + 16)) {
+        return false;
+    }
+
+    return true;
+}
+
+// Reads the next frame header from the stream and, if that succeeds, decodes the frame that follows it.
+// Returns true only when both steps succeed.
+static bool drflac__read_and_decode_next_frame(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+
+    if (drflac__read_next_frame_header(pFlac)) {
+        return drflac__decode_frame(pFlac);
+    }
+
+    return false;
+}
+
+// Reads a metadata block header: a 1-bit "last block" flag, a 7-bit block type and a 24-bit block size.
+// Reading stops at the first field that fails. Fields that were not read keep their defaults (last = 1,
+// type = DRFLAC_BLOCK_TYPE_INVALID, size = 0). Either output pointer may be NULL.
+static unsigned int drflac__read_block_header(drflac* pFlac, unsigned int* pBlockSizeOut, bool* pIsLastBlockOut)    // Returns the block type.
+{
+    assert(pFlac != NULL);
+
+    unsigned char lastFlag = 1;
+    unsigned char type = DRFLAC_BLOCK_TYPE_INVALID;
+    unsigned int sizeInBytes = 0;
+
+    // Each field is only attempted when the one before it was read successfully.
+    if (drflac__read_uint8(pFlac, 1, &lastFlag)) {
+        if (drflac__read_uint8(pFlac, 7, &type)) {
+            (void)drflac__read_uint32(pFlac, 24, &sizeInBytes);
+        }
+    }
+
+    if (pBlockSizeOut != NULL) {
+        *pBlockSizeOut = sizeInBytes;
+    }
+
+    if (pIsLastBlockOut != NULL) {
+        *pIsLastBlockOut = lastFlag;
+    }
+
+    return type;
+}
+
+
+// Computes the zero-based index range [first, last] of the interleaved samples covered by the current frame, using the values
+// parsed from its header. Either output pointer may be NULL.
+//
+// When the header carried no sample number (sampleNumber == 0) the start is derived from the frame number as
+// frameNumber * maxBlockSize * channelCount.
+//
+// NOTE(review): the header's sampleNumber is used as-is while the frame-number path is scaled by channelCount. Confirm both
+// are expressed in the same unit.
+static void drflac__get_current_frame_sample_range(drflac* pFlac, uint64_t* pFirstSampleInFrameOut, uint64_t* pLastSampleInFrameOut)
+{
+    assert(pFlac != NULL);
+
+    unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+
+    uint64_t firstSampleInFrame = pFlac->currentFrame.sampleNumber;
+    if (firstSampleInFrame == 0) {
+        // Widen before multiplying. Done in 32-bit arithmetic this product wraps on long streams and yields a bogus range.
+        firstSampleInFrame = (uint64_t)pFlac->currentFrame.frameNumber * pFlac->maxBlockSize * channelCount;
+    }
+
+    uint64_t lastSampleInFrame = firstSampleInFrame + ((uint64_t)pFlac->currentFrame.blockSize * channelCount);
+    if (lastSampleInFrame > 0) {
+        lastSampleInFrame -= 1; // Needs to be zero based.
+    }
+
+
+    if (pFirstSampleInFrameOut) {
+        *pFirstSampleInFrameOut = firstSampleInFrame;
+    }
+    if (pLastSampleInFrameOut) {
+        *pLastSampleInFrameOut = lastSampleInFrame;
+    }
+}
+
+// Rewinds the stream to the position recorded for the first audio frame and discards all decoder state tied to the old
+// position: the bit cache is marked fully consumed and the current-frame record is zeroed.
+// Returns the result of the underlying byte seek. The state reset happens regardless of that result.
+static bool drflac__seek_to_first_frame(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+
+    bool seekSucceeded = drflac__seek_to_byte(pFlac, (long long)pFlac->firstFramePos);
+
+    // Invalidate the bit cache so the next read refills it from the new position.
+    pFlac->cache = 0;
+    pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
+
+    // No frame is current any more.
+    memset(&pFlac->currentFrame, 0, sizeof(pFlac->currentFrame));
+
+    return seekSucceeded;
+}
+
+// Skips the remainder of the current frame so the stream ends up at the start of the next one. Only valid while the decoder
+// is sitting on the first byte past the FRAME_HEADER section.
+static DRFLAC_INLINE bool drflac__seek_to_next_frame(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+
+    bool skipped = drflac__seek_frame(pFlac);
+    return skipped;
+}
+
+// Rewinds to the first frame and walks forward frame by frame, reading only the headers, until it reaches the frame whose
+// sample range contains <sampleIndex>. On success the stream is left just past that frame's header. Returns false if the
+// rewind fails, a header cannot be read or a frame cannot be skipped before the sample is found.
+static bool drflac__seek_to_frame_containing_sample(drflac* pFlac, uint64_t sampleIndex)
+{
+    assert(pFlac != NULL);
+
+    if (!drflac__seek_to_first_frame(pFlac)) {
+        return false;
+    }
+
+    // The header has to be read for every frame because it is what tells us which samples the frame holds.
+    while (drflac__read_next_frame_header(pFlac))
+    {
+        uint64_t rangeFirst = 0;
+        uint64_t rangeLast = 0;
+        drflac__get_current_frame_sample_range(pFlac, &rangeFirst, &rangeLast);
+
+        if (rangeFirst <= sampleIndex && sampleIndex <= rangeLast) {
+            return true;    // The sample is in this frame.
+        }
+
+        if (!drflac__seek_to_next_frame(pFlac)) {
+            return false;
+        }
+    }
+
+    // Ran out of readable frame headers before finding the sample.
+    return false;
+}
+
+// Seeks to <sampleIndex> without a seek table: walks the frame headers from the start of the stream to find the frame that
+// contains the sample, decodes that frame and then discards the samples that precede the target.
+// Returns true only when the decoder ends up positioned exactly on <sampleIndex>.
+static bool drflac__seek_to_sample__brute_force(drflac* pFlac, uint64_t sampleIndex)
+{
+    if (!drflac__seek_to_frame_containing_sample(pFlac, sampleIndex)) {
+        return false;
+    }
+
+    // At this point we should be sitting on the first byte after the header of the frame containing the sample. We need to
+    // decode every sample up to (but not including) the sample we're seeking to.
+    uint64_t firstSampleInFrame = 0;
+    drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, NULL);
+
+    assert(firstSampleInFrame <= sampleIndex);
+    size_t samplesToDecode = (size_t)(sampleIndex - firstSampleInFrame);    // <-- Bounded by the number of samples in a single frame.
+
+    // The frame must be decoded even when the target is its very first sample. The header has already been consumed, so
+    // returning here would leave the stream in the middle of the frame with no decoded samples available, and the next read
+    // would try to parse a frame header out of subframe data.
+    if (!drflac__decode_frame(pFlac)) {
+        return false;
+    }
+
+    // Succeed only when every preceding sample was skipped. A zero-length skip reads 0 and compares equal.
+    return drflac_read_s16(pFlac, samplesToDecode, NULL) == samplesToDecode;
+}
+
+// Seeks to <sampleIndex> using the stream's SEEKTABLE block. Scans the seek points for the last one that does not lie past
+// the target, jumps to the frame it refers to, walks forward frame by frame to the frame containing the sample, decodes it
+// and discards the samples preceding the target.
+// Returns false when there is no usable seek table, when any seek/read/decode step fails, or when the preceding samples
+// could not all be skipped.
+static bool drflac__seek_to_sample__seek_table(drflac* pFlac, uint64_t sampleIndex)
+{
+    assert(pFlac != NULL);
+
+    if (pFlac->seektableBlock.pos == 0) {
+        return false;
+    }
+
+    if (!drflac__seek_to_byte(pFlac, pFlac->seektableBlock.pos)) {
+        return false;
+    }
+
+    // The number of seek points is derived from the size of the SEEKTABLE block.
+    unsigned int seekpointCount = pFlac->seektableBlock.sizeInBytes / 18;   // 18 = the size of each seek point.
+    if (seekpointCount == 0) {
+        return false;   // Would this ever happen?
+    }
+
+
+    // Zero-initialized so that, if no seek point qualifies, we fall back to offset 0, i.e. the first frame.
+    drflac_seekpoint closestSeekpoint = {0};
+
+    unsigned int seekpointsRemaining = seekpointCount;
+    while (seekpointsRemaining > 0)
+    {
+        drflac_seekpoint seekpoint;
+        if (!drflac__read_uint64(pFlac, 64, &seekpoint.firstSample)) {
+            break;
+        }
+        if (!drflac__read_uint64(pFlac, 64, &seekpoint.frameOffset)) {
+            break;
+        }
+        if (!drflac__read_uint16(pFlac, 16, &seekpoint.sampleCount)) {
+            break;
+        }
+
+        // Seek points are scaled by the channel count before being compared with the requested sample index.
+        if (seekpoint.firstSample * pFlac->channels > sampleIndex) {
+            break;
+        }
+
+        closestSeekpoint = seekpoint;
+        seekpointsRemaining -= 1;
+    }
+
+    // At this point we should have found the seekpoint closest to our sample. We need to seek to it using basically the same
+    // technique as we use with the brute force method. If the seek itself fails the stream position is unknown, so bail out
+    // rather than trying to parse a frame header from wherever we happen to be.
+    if (!drflac__seek_to_byte(pFlac, pFlac->firstFramePos + closestSeekpoint.frameOffset)) {
+        return false;
+    }
+
+    uint64_t firstSampleInFrame = 0;
+    uint64_t lastSampleInFrame = 0;
+    for (;;)
+    {
+        // We need to read the frame's header in order to determine the range of samples it contains.
+        if (!drflac__read_next_frame_header(pFlac)) {
+            return false;
+        }
+
+        drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, &lastSampleInFrame);
+        if (sampleIndex >= firstSampleInFrame && sampleIndex <= lastSampleInFrame) {
+            break;  // The sample is in this frame.
+        }
+
+        if (!drflac__seek_to_next_frame(pFlac)) {
+            return false;
+        }
+    }
+
+    assert(firstSampleInFrame <= sampleIndex);
+
+    // At this point we are just sitting on the byte after the frame header. We need to decode the frame before reading anything from it.
+    if (!drflac__decode_frame(pFlac)) {
+        return false;
+    }
+
+    size_t samplesToDecode = (size_t)(sampleIndex - firstSampleInFrame);    // <-- Bounded by the number of samples in a single frame.
+    return drflac_read_s16(pFlac, samplesToDecode, NULL) == samplesToDecode;
+}
+
+
+// Opens a FLAC stream through the given callbacks and returns a heap-allocated decoder, or NULL on failure.
+//
+//   onRead    - callback used to pull bytes from the stream; must not be NULL.
+//   onSeek    - callback used to reposition the stream; must not be NULL.
+//   pUserData - opaque pointer stored in the decoder and handed to the read callback.
+//
+// The function checks the "fLaC" marker, parses the mandatory STREAMINFO block, records the position and size of the other
+// metadata blocks it recognizes and leaves the stream at the start of the first audio frame. The returned object has the
+// per-frame decode buffer (maxBlockSize * channels 32-bit samples) allocated in the same block and must be released with
+// drflac_close().
+static drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData)
+{
+    if (onRead == NULL || onSeek == NULL) {
+        return NULL;
+    }
+
+    unsigned char id[4];
+    if (onRead(pUserData, id, 4) != 4 || id[0] != 'f' || id[1] != 'L' || id[2] != 'a' || id[3] != 'C') {
+        return NULL;    // Not a FLAC stream.
+    }
+
+    // Parse into a stack object first; the heap object can only be sized once STREAMINFO has been read.
+    drflac tempFlac;
+    memset(&tempFlac, 0, sizeof(tempFlac));
+    tempFlac.onRead         = onRead;
+    tempFlac.onSeek         = onSeek;
+    tempFlac.pUserData      = pUserData;
+    tempFlac.currentBytePos = 4;
+    tempFlac.nextL2Line     = sizeof(tempFlac.cacheL2) / sizeof(tempFlac.cacheL2[0]); // <-- Initialize to this to force a client-side data retrieval right from the start.
+    tempFlac.consumedBits   = sizeof(tempFlac.cache)*8;
+
+    // The first metadata block must be a STREAMINFO block of exactly 34 bytes. Reject the stream if either the type or the
+    // size is wrong; the fixed-layout reads below rely on both.
+    unsigned int blockSize;
+    bool isLastBlock;
+    int blockType = drflac__read_block_header(&tempFlac, &blockSize, &isLastBlock);
+    if (blockType != DRFLAC_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
+        return NULL;
+    }
+
+    // We don't care about everything in STREAMINFO; the unused fields are skipped.
+    if (!drflac__seek_bits(&tempFlac, 16)) {   // minBlockSize
+        return NULL;
+    }
+    if (!drflac__read_uint16(&tempFlac, 16, &tempFlac.maxBlockSize)) {
+        return NULL;
+    }
+    if (!drflac__seek_bits(&tempFlac, 48)) {   // minFrameSize + maxFrameSize
+        return NULL;
+    }
+    if (!drflac__read_uint32(&tempFlac, 20, &tempFlac.sampleRate)) {
+        return NULL;
+    }
+    if (!drflac__read_uint8(&tempFlac, 3, &tempFlac.channels)) {
+        return NULL;
+    }
+    if (!drflac__read_uint8(&tempFlac, 5, &tempFlac.bitsPerSample)) {
+        return NULL;
+    }
+    if (!drflac__read_uint64(&tempFlac, 36, &tempFlac.totalSampleCount)) {
+        return NULL;
+    }
+    if (!drflac__seek_bits(&tempFlac, 128)) {  // MD5
+        return NULL;
+    }
+
+    // Channel count and bit depth are stored minus one. The total sample count is converted to interleaved samples.
+    tempFlac.channels += 1;
+    tempFlac.bitsPerSample += 1;
+    tempFlac.totalSampleCount *= tempFlac.channels;
+
+    // Walk the remaining metadata blocks, remembering where the interesting ones live, then skip their payload.
+    while (!isLastBlock)
+    {
+        blockType = drflac__read_block_header(&tempFlac, &blockSize, &isLastBlock);
+
+        switch (blockType)
+        {
+            case DRFLAC_BLOCK_TYPE_APPLICATION:
+            {
+                tempFlac.applicationBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.applicationBlock.sizeInBytes = blockSize;
+            } break;
+
+            case DRFLAC_BLOCK_TYPE_SEEKTABLE:
+            {
+                tempFlac.seektableBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.seektableBlock.sizeInBytes = blockSize;
+            } break;
+
+            case DRFLAC_BLOCK_TYPE_VORBIS_COMMENT:
+            {
+                tempFlac.vorbisCommentBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.vorbisCommentBlock.sizeInBytes = blockSize;
+            } break;
+
+            case DRFLAC_BLOCK_TYPE_CUESHEET:
+            {
+                tempFlac.cuesheetBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.cuesheetBlock.sizeInBytes = blockSize;
+            } break;
+
+            case DRFLAC_BLOCK_TYPE_PICTURE:
+            {
+                tempFlac.pictureBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.pictureBlock.sizeInBytes = blockSize;
+            } break;
+
+
+            // These blocks we either don't care about or aren't supporting.
+            case DRFLAC_BLOCK_TYPE_PADDING:
+            case DRFLAC_BLOCK_TYPE_INVALID:
+            default: break;
+        }
+
+        if (!drflac__seek_bits(&tempFlac, blockSize*8)) {
+            return NULL;
+        }
+    }
+
+
+    // At this point we should be sitting right at the start of the very first frame.
+    tempFlac.firstFramePos = drflac__tell(&tempFlac);
+
+    // Allocate the decoder together with its decode buffer; pExtraData marks where the buffer begins.
+    drflac* pFlac = (drflac*)malloc(sizeof(*pFlac) - sizeof(pFlac->pExtraData) + (tempFlac.maxBlockSize * tempFlac.channels * sizeof(int32_t)));
+    if (pFlac == NULL) {
+        return NULL;    // Out of memory.
+    }
+
+    memcpy(pFlac, &tempFlac, sizeof(tempFlac) - sizeof(pFlac->pExtraData));
+    pFlac->pDecodedSamples = (int32_t*)pFlac->pExtraData;
+
+    return pFlac;
+}
+
+// Releases a decoder. Closes the underlying file handle when the decoder's read callback is the stdio one, frees the user
+// data when the read callback is the memory one, and finally frees the decoder itself. Passing NULL is a no-op.
+static void drflac_close(drflac* pFlac)
+{
+    if (pFlac != NULL) {
+#ifndef DR_FLAC_NO_STDIO
+        // The read callback tells us how the stream was opened: the stdio callback means pUserData is a file handle that
+        // has to be closed here.
+        // NOTE(review): the macro tested below is spelled DR_OPUS_NO_WIN32_IO; confirm it matches the macro used where the
+        // file is opened.
+        if (pFlac->onRead == drflac__on_read_stdio) {
+#if defined(DR_OPUS_NO_WIN32_IO) || !defined(_WIN32)
+            fclose((FILE*)pFlac->pUserData);
+#else
+            CloseHandle((HANDLE)pFlac->pUserData);
+#endif
+        }
+#endif
+
+        // With the memory read callback the user data is released with free().
+        if (pFlac->onRead == drflac__on_read_memory) {
+            free(pFlac->pUserData);
+        }
+
+        free(pFlac);
+    }
+}
+
+static uint64_t drflac__read_s16__misaligned(drflac* pFlac, uint64_t samplesToRead, int16_t* bufferOut)    // Emits <samplesToRead> samples one at a time from the already-decoded current frame; returns how many were produced. <bufferOut> may be NULL.
+{
+    unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+
+    // We should never be calling this when the number of samples to read is >= the channel count.
+    assert(samplesToRead < channelCount);
+    assert(pFlac->currentFrame.samplesRemaining > 0 && samplesToRead <= pFlac->currentFrame.samplesRemaining);
+
+
+    uint64_t samplesRead = 0;
+    while (samplesToRead > 0)
+    {
+        uint64_t totalSamplesInFrame = pFlac->currentFrame.blockSize * channelCount;
+        uint64_t samplesReadFromFrameSoFar = totalSamplesInFrame - pFlac->currentFrame.samplesRemaining;
+        unsigned int channelIndex = samplesReadFromFrameSoFar % channelCount;   // Channel of the next interleaved sample.
+
+        unsigned long long nextSampleInFrame = samplesReadFromFrameSoFar / channelCount;   // Index of that sample within each subframe.
+
+        int decodedSample = 0;
+        switch (pFlac->currentFrame.channelAssignment)
+        {
+            case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:   // Subframe 0 = left, subframe 1 = side; right = left - side.
+            {
+                if (channelIndex == 0) {
+                    decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
+                } else {
+                    int side = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
+                    int left = pFlac->currentFrame.subframes[channelIndex - 1].pDecodedSamples[nextSampleInFrame];
+                    decodedSample = left - side;
+                }
+
+            } break;
+
+            case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:  // Subframe 0 = side, subframe 1 = right; left = side + right.
+            {
+                if (channelIndex == 0) {
+                    int side  = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
+                    int right = pFlac->currentFrame.subframes[channelIndex + 1].pDecodedSamples[nextSampleInFrame];
+                    decodedSample = side + right;
+                } else {
+                    decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
+                }
+
+            } break;
+
+            case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:    // Subframe 0 = mid, subframe 1 = side.
+            {
+                int mid;
+                int side;
+                if (channelIndex == 0) {
+                    mid  = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
+                    side = pFlac->currentFrame.subframes[channelIndex + 1].pDecodedSamples[nextSampleInFrame];
+
+                    mid = (((unsigned int)mid) << 1) | (side & 0x01);   // Doubles mid and takes its low bit from side.
+                    decodedSample = (mid + side) >> 1;                  // First channel of the pair.
+                } else {
+                    mid  = pFlac->currentFrame.subframes[channelIndex - 1].pDecodedSamples[nextSampleInFrame];
+                    side = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
+
+                    mid = (((unsigned int)mid) << 1) | (side & 0x01);   // Same reconstruction as above.
+                    decodedSample = (mid - side) >> 1;                  // Second channel of the pair.
+                }
+
+            } break;
+
+            case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+            default:    // Channels are stored as-is; no reconstruction needed.
+            {
+                decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
+            } break;
+        }
+
+        int shift = (16 - pFlac->bitsPerSample) + pFlac->currentFrame.subframes[channelIndex].wastedBitsPerSample;   // Scale from the stream's bit depth to 16 bits and restore wasted bits.
+        if (shift >= 0) {
+            decodedSample <<= shift;
+        } else {
+            decodedSample >>= -shift;
+        }
+
+        if (bufferOut) {
+            *bufferOut++ = decodedSample;
+        }
+
+        samplesRead += 1;
+        pFlac->currentFrame.samplesRemaining -= 1;
+        samplesToRead -= 1;
+    }
+
+    return samplesRead;
+}
+
+// Skips up to <samplesToRead> interleaved samples without producing any output, decoding further frames as required.
+// Returns the number of samples actually skipped, which is less than requested when the next frame cannot be read.
+static uint64_t drflac__seek_forward_by_samples(drflac* pFlac, uint64_t samplesToRead)
+{
+    uint64_t samplesRead = 0;
+    while (samplesToRead > 0)
+    {
+        if (pFlac->currentFrame.samplesRemaining == 0)
+        {
+            if (!drflac__read_and_decode_next_frame(pFlac)) {
+                break;  // Couldn't read the next frame, so just break from the loop and return.
+            }
+        }
+        else
+        {
+            // Take everything we need from the current frame in one step instead of one sample per iteration.
+            uint64_t samplesToSkip = pFlac->currentFrame.samplesRemaining;
+            if (samplesToSkip > samplesToRead) {
+                samplesToSkip = samplesToRead;
+            }
+
+            samplesRead += samplesToSkip;
+            pFlac->currentFrame.samplesRemaining -= samplesToSkip;
+            samplesToRead -= samplesToSkip;
+        }
+    }
+
+    return samplesRead;
+}
+
+static uint64_t drflac_read_s16(drflac* pFlac, uint64_t samplesToRead, int16_t* bufferOut)
+{
+    // Note that <bufferOut> is allowed to be null, in which case this will be treated as something like a seek.
+    if (pFlac == NULL || samplesToRead == 0) {
+        return 0;
+    }
+
+    if (bufferOut == NULL) {
+        return drflac__seek_forward_by_samples(pFlac, samplesToRead);
+    }
+
+
+    uint64_t samplesRead = 0;
+    while (samplesToRead > 0)
+    {
+        // If we've run out of samples in this frame, go to the next.
+        if (pFlac->currentFrame.samplesRemaining == 0)
+        {
+            if (!drflac__read_and_decode_next_frame(pFlac)) {
+                break;  // Couldn't read the next frame, so just break from the loop and return.
+            }
+        }
+        else
+        {
+            // Here is where we grab the samples and interleave them.
+
+            unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+            uint64_t totalSamplesInFrame = pFlac->currentFrame.blockSize * channelCount;
+            uint64_t samplesReadFromFrameSoFar = totalSamplesInFrame - pFlac->currentFrame.samplesRemaining;
+
+            int misalignedSampleCount = samplesReadFromFrameSoFar % channelCount;
+            if (misalignedSampleCount > 0) {
+                uint64_t misalignedSamplesRead = drflac__read_s16__misaligned(pFlac, misalignedSampleCount, bufferOut);
+                samplesRead   += misalignedSamplesRead;
+                samplesReadFromFrameSoFar += misalignedSamplesRead;
+                bufferOut     += misalignedSamplesRead;
+                samplesToRead -= misalignedSamplesRead;
+            }
+
+
+            uint64_t alignedSampleCountPerChannel = samplesToRead / channelCount;
+            if (alignedSampleCountPerChannel > pFlac->currentFrame.samplesRemaining / channelCount) {
+                alignedSampleCountPerChannel = pFlac->currentFrame.samplesRemaining / channelCount;
+            }
+
+            uint64_t firstAlignedSampleInFrame = samplesReadFromFrameSoFar / channelCount;
+            int unusedBitsPerSample = 16 - pFlac->bitsPerSample;
+
+            if (unusedBitsPerSample >= 0) {
+                int lshift0 = unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
+                int lshift1 = unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+
+                switch (pFlac->currentFrame.channelAssignment)
+                {
+                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int left  = pDecodedSamples0[i];
+                            int side  = pDecodedSamples1[i];
+                            int right = left - side;
+
+                            bufferOut[i*2+0] = left  << lshift0;
+                            bufferOut[i*2+1] = right << lshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side  = pDecodedSamples0[i];
+                            int right = pDecodedSamples1[i];
+                            int left  = right + side;
+
+                            bufferOut[i*2+0] = left  << lshift0;
+                            bufferOut[i*2+1] = right << lshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side = pDecodedSamples1[i];
+                            int mid  = (((uint32_t)pDecodedSamples0[i]) << 1) | (side & 0x01);
+
+                            bufferOut[i*2+0] = ((mid + side) >> 1) << lshift0;
+                            bufferOut[i*2+1] = ((mid - side) >> 1) << lshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+                    default:
+                    {
+                        if (pFlac->currentFrame.channelAssignment == 1) // 1 = Stereo
+                        {
+                            // Stereo optimized inner loop unroll.
+                            const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                            const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                bufferOut[i*2+0] = pDecodedSamples0[i] << lshift0;
+                                bufferOut[i*2+1] = pDecodedSamples1[i] << lshift1;
+                            }
+                        }
+                        else
+                        {
+                            // Generic interleaving.
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                for (unsigned int j = 0; j < channelCount; ++j) {
+                                    bufferOut[(i*channelCount)+j] = (pFlac->currentFrame.subframes[j].pDecodedSamples[firstAlignedSampleInFrame + i]) << (unusedBitsPerSample + pFlac->currentFrame.subframes[j].wastedBitsPerSample);
+                                }
+                            }
+                        }
+                    } break;
+                }
+            } else {
+                int rshift0 = -unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
+                int rshift1 = -unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+
+                switch (pFlac->currentFrame.channelAssignment)
+                {
+                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int left  = pDecodedSamples0[i];
+                            int side  = pDecodedSamples1[i];
+                            int right = left - side;
+
+                            bufferOut[i*2+0] = left  >> rshift0;
+                            bufferOut[i*2+1] = right >> rshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side  = pDecodedSamples0[i];
+                            int right = pDecodedSamples1[i];
+                            int left  = right + side;
+
+                            bufferOut[i*2+0] = left  >> rshift0;
+                            bufferOut[i*2+1] = right >> rshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side = pDecodedSamples1[i];
+                            int mid  = (((uint32_t)pDecodedSamples0[i]) << 1) | (side & 0x01);
+
+                            bufferOut[i*2+0] = ((mid + side) >> 1) >> rshift0;
+                            bufferOut[i*2+1] = ((mid - side) >> 1) >> rshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+                    default:
+                    {
+                        if (pFlac->currentFrame.channelAssignment == 1) // 1 = Stereo
+                        {
+                            // Stereo optimized inner loop unroll.
+                            const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                            const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                bufferOut[i*2+0] = pDecodedSamples0[i] >> rshift0;
+                                bufferOut[i*2+1] = pDecodedSamples1[i] >> rshift1;
+                            }
+                        }
+                        else
+                        {
+                            // Generic interleaving.
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                for (unsigned int j = 0; j < channelCount; ++j) {
+                                    bufferOut[(i*channelCount)+j] = (pFlac->currentFrame.subframes[j].pDecodedSamples[firstAlignedSampleInFrame + i]) >> (pFlac->currentFrame.subframes[j].wastedBitsPerSample - unusedBitsPerSample);
+                                }
+                            }
+                        }
+                    } break;
+                }
+            }
+
+            uint64_t alignedSamplesRead = alignedSampleCountPerChannel * channelCount;
+            samplesRead   += alignedSamplesRead;
+            samplesReadFromFrameSoFar += alignedSamplesRead;
+            bufferOut     += alignedSamplesRead;
+            samplesToRead -= alignedSamplesRead;
+            pFlac->currentFrame.samplesRemaining -= (unsigned int)alignedSamplesRead;
+
+
+
+            // At this point we may still have some excess samples left to read.
+            if (samplesToRead > 0 && pFlac->currentFrame.samplesRemaining > 0)
+            {
+                uint64_t excessSamplesRead = 0;
+                if (samplesToRead < pFlac->currentFrame.samplesRemaining) {
+                    excessSamplesRead = drflac__read_s16__misaligned(pFlac, samplesToRead, bufferOut);
+                } else {
+                    excessSamplesRead = drflac__read_s16__misaligned(pFlac, pFlac->currentFrame.samplesRemaining, bufferOut);
+                }
+
+                samplesRead   += excessSamplesRead;
+                samplesReadFromFrameSoFar += excessSamplesRead;
+                bufferOut     += excessSamplesRead;
+                samplesToRead -= excessSamplesRead;
+            }
+        }
+    }
+
+    return samplesRead;
+}
+
+// Seeks the decoder to the given sample index.
+//
+// pFlac       - The decoder to seek. Passing NULL makes the call fail.
+// sampleIndex - The sample to seek to. Index 0 rewinds to the first frame. When the stream
+//               reports a non-zero total sample count, an index at or past that total is
+//               clamped to the last sample.
+//
+// Returns true if the seek succeeded, false otherwise.
+static bool drflac_seek_to_sample(drflac* pFlac, uint64_t sampleIndex)
+{
+    if (pFlac == NULL) {
+        return false;
+    }
+
+    if (sampleIndex == 0) {
+        return drflac__seek_to_first_frame(pFlac);
+    }
+
+    // Clamp the sample to the end. A total of zero (which the FLAC format uses for "length unknown")
+    // has no last sample to clamp to, and "totalSampleCount - 1" would wrap around to UINT64_MAX, so
+    // the requested index is left untouched in that case.
+    if (pFlac->totalSampleCount > 0 && sampleIndex >= pFlac->totalSampleCount) {
+        sampleIndex = pFlac->totalSampleCount - 1;
+    }
+
+
+    // First try seeking via the seek table. If this fails, fall back to a brute force seek which is much slower.
+    if (!drflac__seek_to_sample__seek_table(pFlac, sampleIndex)) {
+        return drflac__seek_to_sample__brute_force(pFlac, sampleIndex);
+    }
+
+    return true;
+}
+
+
+#endif  //DR_FLAC_IMPLEMENTATION
+
+
+/*
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
+*/

+ 12 - 0
panda/src/movies/flacAudio.I

@@ -0,0 +1,12 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudio.I
+ * @author rdb
+ * @date 2016-04-27
+ */

+ 64 - 0
panda/src/movies/flacAudio.cxx

@@ -0,0 +1,64 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudio.cxx
+ * @author rdb
+ * @date 2016-04-27
+ */
+
+#include "flacAudio.h"
+#include "flacAudioCursor.h"
+#include "virtualFileSystem.h"
+#include "dcast.h"
+
+TypeHandle FlacAudio::_type_handle;
+
+/**
+ * Constructs a FlacAudio that refers to the indicated file.  The name is
+ * passed on to the MovieAudio base class and is also stored in _filename,
+ * which open() reads when the file is actually opened.
+ */
+FlacAudio::
+FlacAudio(const Filename &name) :
+  MovieAudio(name)
+{
+  _filename = name;
+}
+
+/**
+ * Destructor.  The body is empty; this class releases nothing here.
+ */
+FlacAudio::
+~FlacAudio() {
+}
+
+/**
+ * Opens this audio file through the VirtualFileSystem and returns a
+ * MovieAudioCursor that decodes it.  Returns NULL if the file cannot be
+ * opened, or if the cursor reports itself as invalid.
+ */
+PT(MovieAudioCursor) FlacAudio::
+open() {
+  VirtualFileSystem *vfs = VirtualFileSystem::get_global_ptr();
+  istream *stream = vfs->open_read_file(_filename, true);
+
+  if (stream == NULL) {
+    return NULL;
+  }
+
+  PT(FlacAudioCursor) cursor = new FlacAudioCursor(this, stream);
+  if (cursor == NULL || !cursor->_is_valid) {
+    // An invalid cursor has no decoder attached to the stream, and nothing
+    // else holds on to it, so release the cursor and close the stream here
+    // rather than leaking the open file.
+    cursor.clear();
+    vfs->close_read_file(stream);
+    return NULL;
+  }
+
+  return DCAST(MovieAudioCursor, cursor);
+}
+
+/**
+ * Factory function: creates a new FlacAudio for the named file and returns
+ * it as a generic MovieAudio pointer.
+ */
+PT(MovieAudio) FlacAudio::
+make(const Filename &name) {
+  FlacAudio *audio = new FlacAudio(name);
+  return DCAST(MovieAudio, audio);
+}

+ 54 - 0
panda/src/movies/flacAudio.h

@@ -0,0 +1,54 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudio.h
+ * @author rdb
+ * @date 2016-04-27
+ */
+
+#ifndef FLACAUDIO_H
+#define FLACAUDIO_H
+
+#include "pandabase.h"
+#include "movieAudio.h"
+
+class FlacAudioCursor;
+
+/**
+ * Reads FLAC audio files.  Ogg-encapsulated FLAC files are not supported.
+ *
+ * This is the MovieAudio subclass for FLAC: open() hands back a
+ * MovieAudioCursor for the file, and the static make() creates an instance
+ * for a given filename, returned as a MovieAudio pointer.  The remaining
+ * members are the TypeHandle registration hooks, which register this type
+ * as "FlacAudio" with MovieAudio as its parent type.
+ */
+class EXPCL_PANDA_MOVIES FlacAudio : public MovieAudio {
+PUBLISHED:
+  FlacAudio(const Filename &name);
+  virtual ~FlacAudio();
+  virtual PT(MovieAudioCursor) open();
+
+  static PT(MovieAudio) make(const Filename &name);
+
+private:
+  friend class FlacAudioCursor;
+
+public:
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    MovieAudio::init_type();
+    register_type(_type_handle, "FlacAudio",
+                  MovieAudio::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#endif // FLACAUDIO_H

+ 12 - 0
panda/src/movies/flacAudioCursor.I

@@ -0,0 +1,12 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudioCursor.I
+ * @author rdb
+ * @date 2013-08-23
+ */

+ 120 - 0
panda/src/movies/flacAudioCursor.cxx

@@ -0,0 +1,120 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudioCursor.cxx
+ * @author rdb
+ * @date 2013-08-23
+ */
+
+#include "flacAudioCursor.h"
+#include "virtualFileSystem.h"
+
+#define DR_FLAC_IMPLEMENTATION
+#define DR_FLAC_NO_STDIO
+extern "C" {
+  #include "dr_flac.h"
+}
+
+/**
+ * Read callback handed to dr_flac.  The user pointer is interpreted as an
+ * istream; up to size bytes are read from it into buffer.  Returns the number
+ * of bytes that were actually read, which may be less than size.
+ */
+static size_t cb_read_proc(void *user, void *buffer, size_t size) {
+  istream *stream = (istream *)user;
+  nassertr(stream != NULL, 0);
+
+  stream->read((char *)buffer, size);
+  const size_t num_read = stream->gcount();
+
+  // A short read at the end of the file leaves the stream flagged as EOF;
+  // reset the state flags so that later reads and seeks are not refused.
+  if (stream->eof()) {
+    stream->clear();
+  }
+
+  return num_read;
+}
+
+/**
+ * Seek callback handed to dr_flac.  The user pointer is interpreted as an
+ * istream, whose read position is moved by offset bytes relative to where it
+ * currently is.  Returns true if the stream is not in a failed state
+ * afterwards.
+ */
+static bool cb_seek_proc(void *user, int offset) {
+  istream *stream = (istream *)user;
+  nassertr(stream != NULL, false);
+
+  stream->seekg(offset, ios::cur);
+  if (stream->fail()) {
+    return false;
+  }
+  return true;
+}
+
+TypeHandle FlacAudioCursor::_type_handle;
+
+/**
+ * Opens a FLAC decoder on the indicated stream and fills in the length,
+ * channel count and sample rate from what the decoder reports.  If the stream
+ * cannot be opened as FLAC, an error is logged and the cursor is left with
+ * _is_valid set to false; callers must check _is_valid before using it.
+ */
+FlacAudioCursor::
+FlacAudioCursor(FlacAudio *src, istream *stream) :
+  MovieAudioCursor(src),
+  _is_valid(false),
+  _drflac(NULL)
+{
+  nassertv(stream != NULL);
+  nassertv(stream->good());
+
+  _drflac = drflac_open(&cb_read_proc, &cb_seek_proc, (void *)stream);
+
+  if (_drflac == NULL) {
+    movies_cat.error()
+      << "Failed to open FLAC file.\n";
+    _is_valid = false;
+    // Everything below dereferences _drflac, so stop here instead of
+    // crashing on the NULL decoder and then marking the cursor as valid.
+    return;
+  }
+
+  // The total sample count is divided by the channel count to get the number
+  // of samples per channel, and from that the duration in seconds.
+  _length = (_drflac->totalSampleCount / _drflac->channels) / (double)_drflac->sampleRate;
+
+  _audio_channels = _drflac->channels;
+  _audio_rate = _drflac->sampleRate;
+
+  _can_seek = true;
+  _can_seek_fast = _can_seek;
+
+  _is_valid = true;
+}
+
+/**
+ * Destructor.  Closes the dr_flac decoder, if one was successfully opened.
+ */
+FlacAudioCursor::
+~FlacAudioCursor() {
+  if (_drflac != NULL) {
+    drflac_close(_drflac);
+  }
+}
+
+/**
+ * Moves the read position to the given time, in seconds.  Negative times are
+ * treated as zero.  If the decoder reports that the seek succeeded,
+ * _last_seek is set to the time of the sample that was seeked to and the
+ * count of samples read is reset; otherwise the cursor state is left alone.
+ */
+void FlacAudioCursor::
+seek(double t) {
+  if (t < 0.0) {
+    t = 0.0;
+  }
+
+  // Convert the time to a per-channel sample number, then scale it by the
+  // channel count to get the index that is passed to the decoder.
+  const uint64_t sample = t * _drflac->sampleRate;
+  const uint64_t target = sample * _drflac->channels;
+
+  if (!drflac_seek_to_sample(_drflac, target)) {
+    return;
+  }
+
+  _last_seek = sample / (double)_drflac->sampleRate;
+  _samples_read = 0;
+}
+
+/**
+ * Reads audio samples from the stream into data.  n is the number of samples
+ * requested per channel, so the buffer must have room for n * channels
+ * values; multi-channel audio is interleaved.  The running count of samples
+ * read is advanced by the number of per-channel samples the decoder actually
+ * delivered.
+ */
+void FlacAudioCursor::
+read_samples(int n, PN_int16 *data) {
+  int desired = n * _audio_channels;
+  const uint64_t num_decoded = drflac_read_s16(_drflac, desired, data);
+  _samples_read += num_decoded / _audio_channels;
+}

+ 65 - 0
panda/src/movies/flacAudioCursor.h

@@ -0,0 +1,65 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudioCursor.h
+ * @author rdb
+ * @date 2013-08-23
+ */
+
+#ifndef FLACAUDIOCURSOR_H
+#define FLACAUDIOCURSOR_H
+
+#include "pandabase.h"
+#include "movieAudioCursor.h"
+
+#define DR_FLAC_NO_STDIO
+extern "C" {
+  #include "dr_flac.h"
+}
+
+class FlacAudio;
+
+/**
+ * Interfaces with the dr_flac library to implement decoding of FLAC audio
+ * files.  The cursor holds the dr_flac decoder handle in _drflac, and
+ * _is_valid tells whether the cursor can be used.
+ */
+class EXPCL_PANDA_MOVIES FlacAudioCursor : public MovieAudioCursor {
+PUBLISHED:
+  FlacAudioCursor(FlacAudio *src, istream *stream);
+  virtual ~FlacAudioCursor();
+  virtual void seek(double offset);
+
+public:
+  virtual void read_samples(int n, PN_int16 *data);
+
+  bool _is_valid;
+
+protected:
+  drflac *_drflac;
+
+public:
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    MovieAudioCursor::init_type();
+    register_type(_type_handle, "FlacAudioCursor",
+                  MovieAudioCursor::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#include "flacAudioCursor.I"
+
+#endif // FLACAUDIOCURSOR_H

+ 2 - 0
panda/src/movies/p3movies_composite1.cxx

@@ -1,4 +1,6 @@
 #include "config_movies.cxx"
 #include "config_movies.cxx"
+#include "flacAudio.cxx"
+#include "flacAudioCursor.cxx"
 #include "inkblotVideo.cxx"
 #include "inkblotVideo.cxx"
 #include "inkblotVideoCursor.cxx"
 #include "inkblotVideoCursor.cxx"
 #include "microphoneAudio.cxx"
 #include "microphoneAudio.cxx"

+ 4 - 0
panda/src/pgraph/alphaTestAttrib.h

@@ -36,6 +36,10 @@ PUBLISHED:
   INLINE PN_stdfloat get_reference_alpha() const;
   INLINE PN_stdfloat get_reference_alpha() const;
   INLINE PandaCompareFunc get_mode() const;
   INLINE PandaCompareFunc get_mode() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(reference_alpha, get_reference_alpha);
+  MAKE_PROPERTY(mode, get_mode);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 5 - 0
panda/src/pgraph/antialiasAttrib.h

@@ -52,6 +52,11 @@ PUBLISHED:
   INLINE unsigned short get_mode_type() const;
   INLINE unsigned short get_mode_type() const;
   INLINE unsigned short get_mode_quality() const;
   INLINE unsigned short get_mode_quality() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_mode);
+  MAKE_PROPERTY(mode_type, get_mode_type);
+  MAKE_PROPERTY(mode_quality, get_mode_quality);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 3 - 0
panda/src/pgraph/audioVolumeAttrib.h

@@ -40,6 +40,9 @@ PUBLISHED:
   INLINE PN_stdfloat get_volume() const;
   INLINE PN_stdfloat get_volume() const;
   CPT(RenderAttrib) set_volume(PN_stdfloat volume) const;
   CPT(RenderAttrib) set_volume(PN_stdfloat volume) const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY2(volume, has_volume, get_volume);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 3 - 0
panda/src/pgraph/auxBitplaneAttrib.h

@@ -63,6 +63,9 @@ PUBLISHED:
 
 
   INLINE int get_outputs() const;
   INLINE int get_outputs() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(outputs, get_outputs);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 8 - 3
panda/src/pgraph/camera.cxx

@@ -272,8 +272,10 @@ write_datagram(BamWriter *manager, Datagram &dg) {
   dg.add_bool(_active);
   dg.add_bool(_active);
   dg.add_uint32(_camera_mask.get_word());
   dg.add_uint32(_camera_mask.get_word());
 
 
-  manager->write_pointer(dg, _initial_state);
-  dg.add_stdfloat(_lod_scale);
+  if (manager->get_file_minor_ver() >= 41) {
+    manager->write_pointer(dg, _initial_state);
+    dg.add_stdfloat(_lod_scale);
+  }
 }
 }
 
 
 ////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////
@@ -286,7 +288,10 @@ write_datagram(BamWriter *manager, Datagram &dg) {
 int Camera::
 int Camera::
 complete_pointers(TypedWritable **p_list, BamReader *manager) {
 complete_pointers(TypedWritable **p_list, BamReader *manager) {
   int pi = LensNode::complete_pointers(p_list, manager);
   int pi = LensNode::complete_pointers(p_list, manager);
-  _initial_state = DCAST(RenderState, p_list[pi++]);
+
+  if (manager->get_file_minor_ver() >= 41) {
+    _initial_state = DCAST(RenderState, p_list[pi++]);
+  }
   return pi;
   return pi;
 }
 }
 
 

+ 4 - 0
panda/src/pgraph/colorAttrib.h

@@ -42,6 +42,10 @@ PUBLISHED:
   INLINE Type get_color_type() const;
   INLINE Type get_color_type() const;
   INLINE const LColor &get_color() const;
   INLINE const LColor &get_color() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(color_type, get_color_type);
+  MAKE_PROPERTY(color, get_color);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 44 - 15
panda/src/pgraph/colorBlendAttrib.I

@@ -19,6 +19,9 @@ ColorBlendAttrib() :
   _mode(M_none),
   _mode(M_none),
   _a(O_one),
   _a(O_one),
   _b(O_one),
   _b(O_one),
+  _alpha_mode(M_none),
+  _alpha_a(O_one),
+  _alpha_b(O_one),
   _color(LColor::zero()),
   _color(LColor::zero()),
   _involves_constant_color(false),
   _involves_constant_color(false),
   _involves_color_scale(false)
   _involves_color_scale(false)
@@ -31,18 +34,29 @@ ColorBlendAttrib() :
 INLINE ColorBlendAttrib::
 INLINE ColorBlendAttrib::
 ColorBlendAttrib(ColorBlendAttrib::Mode mode,
 ColorBlendAttrib(ColorBlendAttrib::Mode mode,
                  ColorBlendAttrib::Operand a, ColorBlendAttrib::Operand b,
                  ColorBlendAttrib::Operand a, ColorBlendAttrib::Operand b,
+                 ColorBlendAttrib::Mode alpha_mode,
+                 ColorBlendAttrib::Operand alpha_a, ColorBlendAttrib::Operand alpha_b,
                  const LColor &color) :
                  const LColor &color) :
   _mode(mode),
   _mode(mode),
   _a(a),
   _a(a),
   _b(b),
   _b(b),
+  _alpha_mode(alpha_mode),
+  _alpha_a(alpha_a),
+  _alpha_b(alpha_b),
   _color(color),
   _color(color),
-  _involves_constant_color(involves_constant_color(a) || involves_constant_color(b)),
-  _involves_color_scale(involves_color_scale(a) || involves_color_scale(b))
+  _involves_constant_color(involves_constant_color(a) ||
+                           involves_constant_color(b) ||
+                           involves_constant_color(alpha_a) ||
+                           involves_constant_color(alpha_b)),
+  _involves_color_scale(involves_color_scale(a) ||
+                        involves_color_scale(b) ||
+                        involves_color_scale(alpha_a) ||
+                        involves_color_scale(alpha_b))
 {
 {
 }
 }
 
 
 /**
 /**
- * Returns the colorBlend mode.
+ * Returns the blending mode for the RGB channels.
  */
  */
 INLINE ColorBlendAttrib::Mode ColorBlendAttrib::
 INLINE ColorBlendAttrib::Mode ColorBlendAttrib::
 get_mode() const {
 get_mode() const {
@@ -50,7 +64,7 @@ get_mode() const {
 }
 }
 
 
 /**
 /**
- * Returns the multiplier for the first component.
+ * Returns the RGB multiplier for the first component.
  */
  */
 INLINE ColorBlendAttrib::Operand ColorBlendAttrib::
 INLINE ColorBlendAttrib::Operand ColorBlendAttrib::
 get_operand_a() const {
 get_operand_a() const {
@@ -58,13 +72,37 @@ get_operand_a() const {
 }
 }
 
 
 /**
 /**
- * Returns the multiplier for the second component.
+ * Returns the RGB multiplier for the second component.
  */
  */
 INLINE ColorBlendAttrib::Operand ColorBlendAttrib::
 INLINE ColorBlendAttrib::Operand ColorBlendAttrib::
 get_operand_b() const {
 get_operand_b() const {
   return _b;
   return _b;
 }
 }
 
 
+/**
+ * Returns the blending mode for the alpha channel.
+ */
+INLINE ColorBlendAttrib::Mode ColorBlendAttrib::
+get_alpha_mode() const {
+  return _alpha_mode;
+}
+
+/**
+ * Returns the alpha multiplier for the first component.
+ */
+INLINE ColorBlendAttrib::Operand ColorBlendAttrib::
+get_alpha_operand_a() const {
+  return _alpha_a;
+}
+
+/**
+ * Returns the alpha multiplier for the second component.
+ */
+INLINE ColorBlendAttrib::Operand ColorBlendAttrib::
+get_alpha_operand_b() const {
+  return _alpha_b;
+}
+
 /**
 /**
  * Returns the constant color associated with the attrib.
  * Returns the constant color associated with the attrib.
  */
  */
@@ -114,14 +152,5 @@ involves_constant_color(ColorBlendAttrib::Operand operand) {
  */
  */
 INLINE bool ColorBlendAttrib::
 INLINE bool ColorBlendAttrib::
 involves_color_scale(ColorBlendAttrib::Operand operand) {
 involves_color_scale(ColorBlendAttrib::Operand operand) {
-  switch (operand) {
-  case O_color_scale:
-  case O_one_minus_color_scale:
-  case O_alpha_scale:
-  case O_one_minus_alpha_scale:
-    return true;
-
-  default:
-    return false;
-  }
+  return (operand >= O_color_scale);
 }
 }

+ 67 - 5
panda/src/pgraph/colorBlendAttrib.cxx

@@ -39,19 +39,38 @@ make_off() {
 CPT(RenderAttrib) ColorBlendAttrib::
 CPT(RenderAttrib) ColorBlendAttrib::
 make(ColorBlendAttrib::Mode mode) {
 make(ColorBlendAttrib::Mode mode) {
   ColorBlendAttrib *attrib = new ColorBlendAttrib(mode, O_one, O_one,
   ColorBlendAttrib *attrib = new ColorBlendAttrib(mode, O_one, O_one,
+                                                  mode, O_one, O_one,
                                                   LColor::zero());
                                                   LColor::zero());
   return return_new(attrib);
   return return_new(attrib);
 }
 }
 
 
 /**
 /**
  * Constructs a new ColorBlendAttrib object that enables special-effect
  * Constructs a new ColorBlendAttrib object that enables special-effect
- * blending.  This supercedes transparency.
+ * blending.  This supersedes transparency.  The given mode and operands are
+ * used for both the RGB and alpha channels.
  */
  */
 CPT(RenderAttrib) ColorBlendAttrib::
 CPT(RenderAttrib) ColorBlendAttrib::
 make(ColorBlendAttrib::Mode mode,
 make(ColorBlendAttrib::Mode mode,
      ColorBlendAttrib::Operand a, ColorBlendAttrib::Operand b,
      ColorBlendAttrib::Operand a, ColorBlendAttrib::Operand b,
      const LColor &color) {
      const LColor &color) {
-  ColorBlendAttrib *attrib = new ColorBlendAttrib(mode, a, b, color);
+  ColorBlendAttrib *attrib = new ColorBlendAttrib(mode, a, b, mode, a, b, color);
+  return return_new(attrib);
+}
+
+/**
+ * Constructs a new ColorBlendAttrib object that enables special-effect
+ * blending.  This supersedes transparency.  This form is used to specify
+ * separate blending parameters for the RGB and alpha channels.
+ */
+CPT(RenderAttrib) ColorBlendAttrib::
+make(ColorBlendAttrib::Mode mode,
+     ColorBlendAttrib::Operand a, ColorBlendAttrib::Operand b,
+     ColorBlendAttrib::Mode alpha_mode,
+     ColorBlendAttrib::Operand alpha_a, ColorBlendAttrib::Operand alpha_b,
+     const LColor &color) {
+  ColorBlendAttrib *attrib = new ColorBlendAttrib(mode, a, b,
+                                                  alpha_mode, alpha_a, alpha_b,
+                                                  color);
   return return_new(attrib);
   return return_new(attrib);
 }
 }
 
 
@@ -156,6 +175,13 @@ write_datagram(BamWriter *manager, Datagram &dg) {
   dg.add_uint8(_mode);
   dg.add_uint8(_mode);
   dg.add_uint8(_a);
   dg.add_uint8(_a);
   dg.add_uint8(_b);
   dg.add_uint8(_b);
+
+  if (manager->get_file_minor_ver() >= 42) {
+    dg.add_uint8(_alpha_mode);
+    dg.add_uint8(_alpha_a);
+    dg.add_uint8(_alpha_b);
+  }
+
   _color.write_datagram(dg);
   _color.write_datagram(dg);
 }
 }
 
 
@@ -187,10 +213,34 @@ fillin(DatagramIterator &scan, BamReader *manager) {
   _mode = (Mode)scan.get_uint8();
   _mode = (Mode)scan.get_uint8();
   _a = (Operand)scan.get_uint8();
   _a = (Operand)scan.get_uint8();
   _b = (Operand)scan.get_uint8();
   _b = (Operand)scan.get_uint8();
+
+  if (manager->get_file_minor_ver() >= 42) {
+    _alpha_mode = (Mode)scan.get_uint8();
+    _alpha_a = (Operand)scan.get_uint8();
+    _alpha_b = (Operand)scan.get_uint8();
+  } else {
+    // Before bam 6.42, these were shifted by four.
+    if (_a >= O_incoming1_color) {
+      _a = (Operand)(_a + 4);
+    }
+    if (_b >= O_incoming1_color) {
+      _b = (Operand)(_b + 4);
+    }
+
+    // And there was only one set of blend constants for both RGB and alpha.
+    _alpha_mode = _mode;
+    _alpha_a = _a;
+    _alpha_b = _b;
+  }
+
   _color.read_datagram(scan);
   _color.read_datagram(scan);
 
 
-  _involves_constant_color = involves_constant_color(_a) || involves_constant_color(_b);
-  _involves_color_scale = involves_color_scale(_a) || involves_color_scale(_b);
+  _involves_constant_color =
+    involves_constant_color(_a) || involves_constant_color(_alpha_a) ||
+    involves_constant_color(_b) || involves_constant_color(_alpha_b);
+  _involves_color_scale =
+    involves_color_scale(_a) || involves_color_scale(_alpha_a) ||
+    involves_color_scale(_b) || involves_color_scale(_alpha_b);
 }
 }
 
 
 /**
 /**
@@ -234,7 +284,7 @@ operator << (ostream &out, ColorBlendAttrib::Operand operand) {
     return out << "one";
     return out << "one";
 
 
   case ColorBlendAttrib::O_incoming_color:
   case ColorBlendAttrib::O_incoming_color:
-    return out << "incomfing_color";
+    return out << "incoming_color";
 
 
   case ColorBlendAttrib::O_one_minus_incoming_color:
   case ColorBlendAttrib::O_one_minus_incoming_color:
     return out << "one_minus_incoming_color";
     return out << "one_minus_incoming_color";
@@ -283,6 +333,18 @@ operator << (ostream &out, ColorBlendAttrib::Operand operand) {
 
 
   case ColorBlendAttrib::O_one_minus_alpha_scale:
   case ColorBlendAttrib::O_one_minus_alpha_scale:
     return out << "one_minus_alpha_scale";
     return out << "one_minus_alpha_scale";
+
+  case ColorBlendAttrib::O_incoming1_color:
+    return out << "incoming1_color";
+
+  case ColorBlendAttrib::O_one_minus_incoming1_color:
+    return out << "one_minus_incoming1_color";
+
+  case ColorBlendAttrib::O_incoming1_alpha:
+    return out << "incoming1_alpha";
+
+  case ColorBlendAttrib::O_one_minus_incoming1_alpha:
+    return out << "one_minus_incoming1_alpha";
   }
   }
 
 
   return out << "**invalid ColorBlendAttrib::Operand(" << (int)operand << ")**";
   return out << "**invalid ColorBlendAttrib::Operand(" << (int)operand << ")**";

+ 36 - 5
panda/src/pgraph/colorBlendAttrib.h

@@ -52,11 +52,20 @@ PUBLISHED:
     O_one_minus_constant_alpha,
     O_one_minus_constant_alpha,
     O_incoming_color_saturate,  // valid only for operand a
     O_incoming_color_saturate,  // valid only for operand a
 
 
-    // If you set either of the operands to any of the below, the blend color
-    // is taken from the current ColorScaleAttrib.  This also inhibits the
-    // normal behavior of the ColorScaleAttrib; it no longer directly scales
-    // the vertex colors, on the assumption that you will instead take care of
-    // the scale here, in the blend mode.
+    // The following are used for dual-source blending, where the fragment
+    // shader outputs a second color that will be used for blending.
+    O_incoming1_color,
+    O_one_minus_incoming1_color,
+    O_incoming1_alpha,
+    O_one_minus_incoming1_alpha,
+
+    // If you set any of the operands to any of the below, the blend color is
+    // taken from the current ColorScaleAttrib.  This also inhibits the normal
+    // behavior of the ColorScaleAttrib; it no longer directly scales the
+    // vertex colors, on the assumption that you will instead take care of the
+    // scale here, in the blend mode.
+    //
+    // These modes are being considered for deprecation.
     O_color_scale,
     O_color_scale,
     O_one_minus_color_scale,
     O_one_minus_color_scale,
     O_alpha_scale,
     O_alpha_scale,
@@ -66,6 +75,7 @@ PUBLISHED:
 private:
 private:
   INLINE ColorBlendAttrib();
   INLINE ColorBlendAttrib();
   INLINE ColorBlendAttrib(Mode mode, Operand a, Operand b,
   INLINE ColorBlendAttrib(Mode mode, Operand a, Operand b,
+                          Mode alpha_mode, Operand alpha_a, Operand alpha_b,
                           const LColor &color);
                           const LColor &color);
 
 
 PUBLISHED:
 PUBLISHED:
@@ -73,11 +83,19 @@ PUBLISHED:
   static CPT(RenderAttrib) make(Mode mode);
   static CPT(RenderAttrib) make(Mode mode);
   static CPT(RenderAttrib) make(Mode mode, Operand a, Operand b,
   static CPT(RenderAttrib) make(Mode mode, Operand a, Operand b,
                                 const LColor &color = LColor::zero());
                                 const LColor &color = LColor::zero());
+  static CPT(RenderAttrib) make(Mode rgb_mode, Operand rgb_a, Operand rgb_b,
+                                Mode alpha_mode, Operand alpha_a, Operand alpha_b,
+                                const LColor &color = LColor::zero());
   static CPT(RenderAttrib) make_default();
   static CPT(RenderAttrib) make_default();
 
 
   INLINE Mode get_mode() const;
   INLINE Mode get_mode() const;
   INLINE Operand get_operand_a() const;
   INLINE Operand get_operand_a() const;
   INLINE Operand get_operand_b() const;
   INLINE Operand get_operand_b() const;
+
+  INLINE Mode get_alpha_mode() const;
+  INLINE Operand get_alpha_operand_a() const;
+  INLINE Operand get_alpha_operand_b() const;
+
   INLINE LColor get_color() const;
   INLINE LColor get_color() const;
 
 
   INLINE bool involves_constant_color() const;
   INLINE bool involves_constant_color() const;
@@ -86,6 +104,17 @@ PUBLISHED:
   INLINE static bool involves_constant_color(Operand operand);
   INLINE static bool involves_constant_color(Operand operand);
   INLINE static bool involves_color_scale(Operand operand);
   INLINE static bool involves_color_scale(Operand operand);
 
 
+PUBLISHED:
+  MAKE_PROPERTY(rgb_mode, get_mode);
+  MAKE_PROPERTY(rgb_operand_a, get_operand_a);
+  MAKE_PROPERTY(rgb_operand_b, get_operand_b);
+
+  MAKE_PROPERTY(alpha_mode, get_alpha_mode);
+  MAKE_PROPERTY(alpha_operand_a, get_alpha_operand_a);
+  MAKE_PROPERTY(alpha_operand_b, get_alpha_operand_b);
+
+  MAKE_PROPERTY(color, get_color);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 
@@ -97,6 +126,8 @@ protected:
 private:
 private:
   Mode _mode;
   Mode _mode;
   Operand _a, _b;
   Operand _a, _b;
+  Mode _alpha_mode;
+  Operand _alpha_a, _alpha_b;
   LColor _color;
   LColor _color;
   bool _involves_constant_color;
   bool _involves_constant_color;
   bool _involves_color_scale;
   bool _involves_color_scale;

+ 3 - 0
panda/src/pgraph/colorScaleAttrib.h

@@ -43,6 +43,9 @@ PUBLISHED:
   INLINE const LVecBase4 &get_scale() const;
   INLINE const LVecBase4 &get_scale() const;
   CPT(RenderAttrib) set_scale(const LVecBase4 &scale) const;
   CPT(RenderAttrib) set_scale(const LVecBase4 &scale) const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY2(scale, has_scale, get_scale);
+
 public:
 public:
   virtual bool lower_attrib_can_override() const;
   virtual bool lower_attrib_can_override() const;
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;

+ 3 - 0
panda/src/pgraph/colorWriteAttrib.h

@@ -48,6 +48,9 @@ PUBLISHED:
 
 
   INLINE unsigned int get_channels() const;
   INLINE unsigned int get_channels() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(channels, get_channels);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 4 - 0
panda/src/pgraph/cullBinAttrib.h

@@ -35,6 +35,10 @@ PUBLISHED:
   INLINE const string &get_bin_name() const;
   INLINE const string &get_bin_name() const;
   INLINE int get_draw_order() const;
   INLINE int get_draw_order() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(bin_name, get_bin_name);
+  MAKE_PROPERTY(draw_order, get_draw_order);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 5 - 0
panda/src/pgraph/cullFaceAttrib.h

@@ -44,6 +44,11 @@ PUBLISHED:
   INLINE bool get_reverse() const;
   INLINE bool get_reverse() const;
   Mode get_effective_mode() const;
   Mode get_effective_mode() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_actual_mode);
+  MAKE_PROPERTY(reverse, get_reverse);
+  MAKE_PROPERTY(effective_mode, get_effective_mode);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 5 - 0
panda/src/pgraph/depthOffsetAttrib.h

@@ -60,6 +60,11 @@ PUBLISHED:
   INLINE PN_stdfloat get_min_value() const;
   INLINE PN_stdfloat get_min_value() const;
   INLINE PN_stdfloat get_max_value() const;
   INLINE PN_stdfloat get_max_value() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(offset, get_offset);
+  MAKE_PROPERTY(min_value, get_min_value);
+  MAKE_PROPERTY(max_value, get_max_value);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 3 - 0
panda/src/pgraph/depthTestAttrib.h

@@ -33,6 +33,9 @@ PUBLISHED:
 
 
   INLINE PandaCompareFunc get_mode() const;
   INLINE PandaCompareFunc get_mode() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_mode);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 3 - 0
panda/src/pgraph/depthWriteAttrib.h

@@ -39,6 +39,9 @@ PUBLISHED:
 
 
   INLINE Mode get_mode() const;
   INLINE Mode get_mode() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_mode);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 3 - 0
panda/src/pgraph/fogAttrib.h

@@ -34,6 +34,9 @@ PUBLISHED:
   INLINE bool is_off() const;
   INLINE bool is_off() const;
   INLINE Fog *get_fog() const;
   INLINE Fog *get_fog() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(fog, get_fog);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 3 - 0
panda/src/pgraph/lightRampAttrib.h

@@ -52,6 +52,9 @@ PUBLISHED:
   INLINE PN_stdfloat get_level(int n) const;
   INLINE PN_stdfloat get_level(int n) const;
   INLINE PN_stdfloat get_threshold(int n) const;
   INLINE PN_stdfloat get_threshold(int n) const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_mode);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 3 - 0
panda/src/pgraph/materialAttrib.h

@@ -36,6 +36,9 @@ PUBLISHED:
   INLINE bool is_off() const;
   INLINE bool is_off() const;
   INLINE Material *get_material() const;
   INLINE Material *get_material() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(material, get_material);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 6 - 1
panda/src/pgraph/renderModeAttrib.h

@@ -63,9 +63,14 @@ PUBLISHED:
   INLINE PN_stdfloat get_thickness() const;
   INLINE PN_stdfloat get_thickness() const;
   INLINE bool get_perspective() const;
   INLINE bool get_perspective() const;
   INLINE const LColor &get_wireframe_color() const;
   INLINE const LColor &get_wireframe_color() const;
-
   INLINE int get_geom_rendering(int geom_rendering) const;
   INLINE int get_geom_rendering(int geom_rendering) const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_mode);
+  MAKE_PROPERTY(thickness, get_thickness);
+  MAKE_PROPERTY(perspective, get_perspective);
+  MAKE_PROPERTY(wireframe_color, get_wireframe_color);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 1 - 0
panda/src/pgraph/rescaleNormalAttrib.h

@@ -49,6 +49,7 @@ PUBLISHED:
   INLINE static CPT(RenderAttrib) make_default();
   INLINE static CPT(RenderAttrib) make_default();
 
 
   INLINE Mode get_mode() const;
   INLINE Mode get_mode() const;
+  MAKE_PROPERTY(mode, get_mode);
 
 
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;

+ 3 - 0
panda/src/pgraph/scissorAttrib.h

@@ -47,6 +47,9 @@ PUBLISHED:
 
 
   INLINE const LVecBase4 &get_frame() const;
   INLINE const LVecBase4 &get_frame() const;
 
 
+PUBLISHED:
+  MAKE_PROPERTY(frame, get_frame);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 1 - 0
panda/src/pgraph/shadeModelAttrib.h

@@ -39,6 +39,7 @@ PUBLISHED:
   static CPT(RenderAttrib) make_default();
   static CPT(RenderAttrib) make_default();
 
 
   INLINE Mode get_mode() const;
   INLINE Mode get_mode() const;
+  MAKE_PROPERTY(mode, get_mode);
 
 
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;

+ 4 - 0
panda/src/pgraph/shaderAttrib.h

@@ -114,6 +114,10 @@ PUBLISHED:
 
 
   static void register_with_read_factory();
   static void register_with_read_factory();
 
 
+PUBLISHED:
+  MAKE_PROPERTY(shader, get_shader);
+  MAKE_PROPERTY(instance_count, get_instance_count);
+
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;
 
 

+ 1 - 0
panda/src/pgraph/transparencyAttrib.h

@@ -51,6 +51,7 @@ PUBLISHED:
   static CPT(RenderAttrib) make_default();
   static CPT(RenderAttrib) make_default();
 
 
   INLINE Mode get_mode() const;
   INLINE Mode get_mode() const;
+  MAKE_PROPERTY(mode, get_mode);
 
 
 public:
 public:
   virtual void output(ostream &out) const;
   virtual void output(ostream &out) const;

+ 3 - 0
panda/src/pgraphnodes/config_pgraphnodes.cxx

@@ -29,6 +29,7 @@
 #include "selectiveChildNode.h"
 #include "selectiveChildNode.h"
 #include "sequenceNode.h"
 #include "sequenceNode.h"
 #include "shaderGenerator.h"
 #include "shaderGenerator.h"
+#include "sphereLight.h"
 #include "spotlight.h"
 #include "spotlight.h"
 #include "switchNode.h"
 #include "switchNode.h"
 #include "uvScrollNode.h"
 #include "uvScrollNode.h"
@@ -123,6 +124,7 @@ init_libpgraphnodes() {
   SelectiveChildNode::init_type();
   SelectiveChildNode::init_type();
   SequenceNode::init_type();
   SequenceNode::init_type();
   ShaderGenerator::init_type();
   ShaderGenerator::init_type();
+  SphereLight::init_type();
   Spotlight::init_type();
   Spotlight::init_type();
   SwitchNode::init_type();
   SwitchNode::init_type();
   UvScrollNode::init_type();
   UvScrollNode::init_type();
@@ -137,6 +139,7 @@ init_libpgraphnodes() {
   PointLight::register_with_read_factory();
   PointLight::register_with_read_factory();
   SelectiveChildNode::register_with_read_factory();
   SelectiveChildNode::register_with_read_factory();
   SequenceNode::register_with_read_factory();
   SequenceNode::register_with_read_factory();
+  SphereLight::register_with_read_factory();
   Spotlight::register_with_read_factory();
   Spotlight::register_with_read_factory();
   SwitchNode::register_with_read_factory();
   SwitchNode::register_with_read_factory();
   UvScrollNode::register_with_read_factory();
   UvScrollNode::register_with_read_factory();

+ 2 - 0
panda/src/pgraphnodes/p3pgraphnodes_composite1.cxx

@@ -7,3 +7,5 @@
 #include "fadeLodNodeData.cxx"
 #include "fadeLodNodeData.cxx"
 #include "lightLensNode.cxx"
 #include "lightLensNode.cxx"
 #include "lightNode.cxx"
 #include "lightNode.cxx"
+#include "lodNode.cxx"
+#include "lodNodeType.cxx"

+ 1 - 2
panda/src/pgraphnodes/p3pgraphnodes_composite2.cxx

@@ -1,11 +1,10 @@
-#include "lodNode.cxx"
-#include "lodNodeType.cxx"
 #include "nodeCullCallbackData.cxx"
 #include "nodeCullCallbackData.cxx"
 #include "pointLight.cxx"
 #include "pointLight.cxx"
 #include "sceneGraphAnalyzer.cxx"
 #include "sceneGraphAnalyzer.cxx"
 #include "selectiveChildNode.cxx"
 #include "selectiveChildNode.cxx"
 #include "sequenceNode.cxx"
 #include "sequenceNode.cxx"
 #include "shaderGenerator.cxx"
 #include "shaderGenerator.cxx"
+#include "sphereLight.cxx"
 #include "spotlight.cxx"
 #include "spotlight.cxx"
 #include "switchNode.cxx"
 #include "switchNode.cxx"
 #include "uvScrollNode.cxx"
 #include "uvScrollNode.cxx"

+ 48 - 0
panda/src/pgraphnodes/sphereLight.I

@@ -0,0 +1,48 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file sphereLight.I
+ * @author rdb
+ * @date 2016-04-15
+ */
+
+/**
+ * Constructs the cycled data with a default radius of 0.01.
+ */
+INLINE SphereLight::CData::
+CData() :
+  _radius(0.01f)
+{
+}
+
+/**
+ * Copy constructor; duplicates the radius stored in the other CData.
+ */
+INLINE SphereLight::CData::
+CData(const SphereLight::CData &copy) :
+  _radius(copy._radius)
+{
+}
+
+/**
+ * Returns the radius of the sphere, as read from the cycler through a
+ * CDReader.
+ */
+INLINE PN_stdfloat SphereLight::
+get_radius() const {
+  CDReader cdata(_cycler);
+  return cdata->_radius;
+}
+
+/**
+ * Sets the radius of the sphere.  The value is stored exactly as given; no
+ * range check is performed here.
+ */
+INLINE void SphereLight::
+set_radius(PN_stdfloat radius) {
+  CDWriter cdata(_cycler);
+  cdata->_radius = radius;
+}

+ 146 - 0
panda/src/pgraphnodes/sphereLight.cxx

@@ -0,0 +1,146 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file sphereLight.cxx
+ * @author rdb
+ * @date 2016-04-15
+ */
+
+#include "sphereLight.h"
+#include "graphicsStateGuardianBase.h"
+#include "bamWriter.h"
+#include "bamReader.h"
+#include "datagram.h"
+#include "datagramIterator.h"
+
+TypeHandle SphereLight::_type_handle;
+
+/**
+ * Returns a newly allocated copy of this CData, created through the copy
+ * constructor.
+ */
+CycleData *SphereLight::CData::
+make_copy() const {
+  return new CData(*this);
+}
+
+/**
+ * Writes the contents of this object to the datagram for shipping out to a
+ * Bam file.  The radius is the only value written; the manager argument is
+ * not used.
+ */
+void SphereLight::CData::
+write_datagram(BamWriter *manager, Datagram &dg) const {
+  dg.add_stdfloat(_radius);
+}
+
+/**
+ * This internal function is called by make_from_bam to read in all of the
+ * relevant data from the BamFile for the new Light.  It reads back the single
+ * stdfloat radius that write_datagram() stores.
+ */
+void SphereLight::CData::
+fillin(DatagramIterator &scan, BamReader *manager) {
+  _radius = scan.get_stdfloat();
+}
+
+/**
+ * Constructs a SphereLight with the given name, which is forwarded to the
+ * PointLight base class.
+ */
+SphereLight::
+SphereLight(const string &name) :
+  PointLight(name)
+{
+}
+
+/**
+ * Do not call the copy constructor directly; instead, use make_copy() or
+ * copy_subgraph() to make a copy of a node.  Copies the PointLight base and
+ * the cycler that holds the radius.
+ */
+SphereLight::
+SphereLight(const SphereLight &copy) :
+  PointLight(copy),
+  _cycler(copy._cycler)
+{
+}
+
+/**
+ * Returns a newly-allocated PandaNode that is a shallow copy of this one.  It
+ * will be a different pointer, but its internal data may or may not be shared
+ * with that of the original PandaNode.  No children will be copied.
+ */
+PandaNode *SphereLight::
+make_copy() const {
+  return new SphereLight(*this);  // goes through the copy constructor
+}
+
+/**
+ * Applies the indicated matrix to this light.  The PointLight part is
+ * transformed first; the radius is then replaced by the length that a vector
+ * of the current radius along the Z axis has after being transformed by the
+ * matrix.  Finally the visualization is marked stale.
+ */
+void SphereLight::
+xform(const LMatrix4 &mat) {
+  PointLight::xform(mat);
+
+  CDWriter cdata(_cycler);
+  const LVector3 radius_axis(0, 0, cdata->_radius);
+  cdata->_radius = mat.xform_vec(radius_axis).length();
+
+  mark_viz_stale();
+}
+
+/**
+ * Writes a multi-line description of this light to the indicated stream:
+ * whatever PointLight::write() emits, then a header line for this object and
+ * an indented "radius" line.
+ *
+ * NOTE(review): if PointLight::write() already prints the "*this:" header
+ * line, that header is printed twice here -- confirm against
+ * PointLight::write().
+ */
+void SphereLight::
+write(ostream &out, int indent_level) const {
+  PointLight::write(out, indent_level);
+  indent(out, indent_level) << *this << ":\n";
+  indent(out, indent_level + 2)
+    << "radius " << get_radius() << "\n";
+}
+
+/**
+ * Tells the BamReader how to create objects of type SphereLight, by
+ * registering make_from_bam as the factory function for this class type.
+ */
+void SphereLight::
+register_with_read_factory() {
+  BamReader::get_factory()->register_factory(get_class_type(), make_from_bam);
+}
+
+/**
+ * Writes the contents of this object to the datagram for shipping out to a
+ * Bam file.  The PointLight data is written first, followed by the cycled
+ * data of this class.
+ */
+void SphereLight::
+write_datagram(BamWriter *manager, Datagram &dg) {
+  PointLight::write_datagram(manager, dg);
+  manager->write_cdata(dg, _cycler);
+}
+
+/**
+ * Factory callback invoked by the BamReader whenever an object of type
+ * SphereLight is found in a Bam file.  Allocates an unnamed SphereLight,
+ * fills it in from the datagram, and returns it.
+ */
+TypedWritable *SphereLight::
+make_from_bam(const FactoryParams &params) {
+  DatagramIterator scan;
+  BamReader *manager;
+  parse_params(params, scan, manager);
+
+  SphereLight *light = new SphereLight("");
+  light->fillin(scan, manager);
+  return light;
+}
+
+/**
+ * This internal function is called by make_from_bam to read in all of the
+ * relevant data from the BamFile for the new SphereLight.  The data is read
+ * in the same order that write_datagram() writes it: the PointLight part
+ * first, then the cycled data.
+ */
+void SphereLight::
+fillin(DatagramIterator &scan, BamReader *manager) {
+  PointLight::fillin(scan, manager);
+
+  manager->read_cdata(scan, _cycler);
+}

+ 90 - 0
panda/src/pgraphnodes/sphereLight.h

@@ -0,0 +1,90 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file sphereLight.h
+ * @author rdb
+ * @date 2016-04-15
+ */
+
+#ifndef SPHERELIGHT_H
+#define SPHERELIGHT_H
+
+#include "pandabase.h"
+
+#include "lightLensNode.h"
+// SphereLight derives from PointLight, so its declaration must be visible
+// here; do not rely on the includer having pulled it in first.
+#include "pointLight.h"
+
+/**
+ * A sphere light is like a point light, except that it represents a sphere
+ * with a radius, rather than being an infinitely thin point in space.
+ *
+ * The radius is kept in pipeline-cycled data and is written to and read from
+ * Bam files after the PointLight data.
+ */
+class EXPCL_PANDA_PGRAPHNODES SphereLight : public PointLight {
+PUBLISHED:
+  SphereLight(const string &name);
+
+protected:
+  // Used by make_copy(); not meant to be called directly.
+  SphereLight(const SphereLight &copy);
+
+public:
+  virtual PandaNode *make_copy() const;
+  virtual void xform(const LMatrix4 &mat);
+  virtual void write(ostream &out, int indent_level) const;
+
+PUBLISHED:
+  INLINE PN_stdfloat get_radius() const;
+  INLINE void set_radius(PN_stdfloat radius);
+  MAKE_PROPERTY(radius, get_radius, set_radius);
+
+private:
+  // This is the data that must be cycled between pipeline stages.
+  class EXPCL_PANDA_PGRAPHNODES CData : public CycleData {
+  public:
+    INLINE CData();
+    INLINE CData(const CData &copy);
+    virtual CycleData *make_copy() const;
+    virtual void write_datagram(BamWriter *manager, Datagram &dg) const;
+    virtual void fillin(DatagramIterator &scan, BamReader *manager);
+    virtual TypeHandle get_parent_type() const {
+      return SphereLight::get_class_type();
+    }
+
+    // Radius of the sphere.
+    PN_stdfloat _radius;
+  };
+
+  PipelineCycler<CData> _cycler;
+  typedef CycleDataReader<CData> CDReader;
+  typedef CycleDataWriter<CData> CDWriter;
+
+public:
+  static void register_with_read_factory();
+  virtual void write_datagram(BamWriter *manager, Datagram &dg);
+
+protected:
+  static TypedWritable *make_from_bam(const FactoryParams &params);
+  void fillin(DatagramIterator &scan, BamReader *manager);
+
+public:
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    PointLight::init_type();
+    register_type(_type_handle, "SphereLight",
+                  PointLight::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#include "sphereLight.I"
+
+#endif

+ 7 - 0
panda/src/pnmimagetypes/config_pnmimagetypes.cxx

@@ -22,6 +22,7 @@
 #include "pnmFileTypePNM.h"
 #include "pnmFileTypePNM.h"
 #include "pnmFileTypePfm.h"
 #include "pnmFileTypePfm.h"
 #include "pnmFileTypeTIFF.h"
 #include "pnmFileTypeTIFF.h"
+#include "pnmFileTypeStbImage.h"
 #include "sgi.h"
 #include "sgi.h"
 
 
 #include "config_pnmimage.h"
 #include "config_pnmimage.h"
@@ -240,6 +241,12 @@ init_libpnmimagetypes() {
   tr->register_type(new PNMFileTypeTIFF);
   tr->register_type(new PNMFileTypeTIFF);
 #endif
 #endif
 
 
+#ifdef HAVE_STB_IMAGE
+  PNMFileTypeStbImage::init_type();
+  PNMFileTypeStbImage::register_with_read_factory();
+  tr->register_type(new PNMFileTypeStbImage);
+#endif
+
   // And register with the PandaSystem.
   // And register with the PandaSystem.
   PandaSystem *ps = PandaSystem::get_global_ptr();
   PandaSystem *ps = PandaSystem::get_global_ptr();
 
 

+ 1 - 0
panda/src/pnmimagetypes/p3pnmimagetypes_composite2.cxx

@@ -6,5 +6,6 @@
 #include "pnmFileTypeSGIReader.cxx"
 #include "pnmFileTypeSGIReader.cxx"
 #include "pnmFileTypeSGIWriter.cxx"
 #include "pnmFileTypeSGIWriter.cxx"
 #include "pnmFileTypeSoftImage.cxx"
 #include "pnmFileTypeSoftImage.cxx"
+#include "pnmFileTypeStbImage.cxx"
 #include "pnmFileTypeTGA.cxx"
 #include "pnmFileTypeTGA.cxx"
 
 

+ 509 - 0
panda/src/pnmimagetypes/pnmFileTypeStbImage.cxx

@@ -0,0 +1,509 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file pnmFileTypeStbImage.cxx
+ * @author rdb
+ * @date 2016-03-31
+ */
+
+#include "pnmFileTypeStbImage.h"
+
+#ifdef HAVE_STB_IMAGE
+
+#include "config_pnmimagetypes.h"
+#include "pnmFileTypeRegistry.h"
+#include "bamReader.h"
+
+// We use the public domain stb_image library for loading images.  Define the
+// stb_image implementation.  We only use it in this unit.
+#define STB_IMAGE_STATIC
+#define STB_IMAGE_IMPLEMENTATION
+
+// Disable the stb_image implementation of these formats if we already support
+// it through different loaders.
+#ifndef HAVE_JPEG
+#define STBI_ONLY_JPEG
+#endif
+#ifndef HAVE_PNG
+#define STBI_ONLY_PNG
+#endif
+#ifndef HAVE_BMP
+#define STBI_ONLY_BMP
+#endif
+#ifndef HAVE_TGA
+#define STBI_ONLY_TGA
+#endif
+#ifndef HAVE_SOFTIMAGE_PIC
+#define STBI_ONLY_PIC
+#endif
+#ifndef HAVE_PNM
+#define STBI_ONLY_PNM
+#endif
+
+// These are always enabled because we don't support these via other means.
+#define STBI_ONLY_PSD
+#define STBI_ONLY_HDR
+#define STBI_ONLY_GIF
+
+#ifndef NDEBUG
+// Get friendlier error messages in development builds.
+#define STBI_FAILURE_USERMSG
+#endif
+
+// We read via callbacks, so no need for stbi_load_from_file.
+#define STBI_NO_STDIO
+
+#include "stb_image.h"
+
+static const char *const stb_extensions[] = {
+  // Expose the extensions that we don't already expose through other loaders.
+#ifndef HAVE_JPEG
+  "jpg", "jpeg",
+#endif
+#ifndef HAVE_PNG
+  "png",
+#endif
+#ifndef HAVE_BMP
+  "bmp",
+#endif
+#ifndef HAVE_TGA
+  "tga",
+#endif
+#ifndef HAVE_SOFTIMAGE_PIC
+  "pic",
+#endif
+#ifndef HAVE_PNM
+  "ppm", "pgm",
+#endif
+
+  // We don't have other loaders for these, so add them unconditionally.
+  "psd",
+  "hdr",
+  "gif",
+};
+static const int num_stb_extensions = sizeof(stb_extensions) / sizeof(const char *);  // element count of the table above
+
+// Callbacks to allow stb_image to read from VFS.
+// Read callback for stb_image: fills data with up to size bytes taken from
+// the istream passed as user, and returns how many bytes were actually read.
+static int cb_read(void *user, char *data, int size) {
+  istream *in = (istream *)user;
+  nassertr(in != NULL, 0);
+
+  in->read(data, size);
+  const int num_read = (int)in->gcount();
+
+  // Gracefully handle EOF: reset the state flags so the stream stays usable.
+  if (in->eof()) {
+    in->clear();
+  }
+
+  return num_read;
+}
+
+// Skip callback for stb_image: advances the istream passed as user by n
+// bytes.  A relative seek is tried first; if the stream reports failure, the
+// error state is cleared and the bytes are read and discarded instead.
+static void cb_skip(void *user, int n) {
+  istream *in = (istream *)user;
+  nassertv(in != NULL);
+
+  in->seekg(n, ios::cur);
+
+  if (in->fail()) {
+    in->clear();
+
+    // Implement skip by just reading and discarding the result, one chunk at
+    // a time.  Stop as soon as the stream goes bad (e.g. at end of file)
+    // instead of issuing further reads that cannot succeed.
+    static const int chunk_size = 4096;
+    char data[chunk_size];
+    while (n > 0 && in->good()) {
+      const int amount = (n < chunk_size) ? n : chunk_size;
+      in->read(data, amount);
+      n -= amount;
+    }
+  }
+}
+
+static int cb_eof(void *user) {
+  istream *in = (istream *)user;
+  nassertr(in != NULL, 1);  // fallback value 1: a NULL stream counts as EOF
+
+  return in->eof();  // nonzero while the stream's eofbit is set
+}
+
+static stbi_io_callbacks io_callbacks = {cb_read, cb_skip, cb_eof};
+
+/**
+ * This is defined in the .cxx file so we have access to stbi_context.
+ *
+ * PNMReader implementation that decodes images through stb_image.  It keeps
+ * its own stbi__context and remembers whether the file was detected as a
+ * floating-point (HDR) image, in which case read_pfm() is used instead of
+ * read_data().
+ */
+class StbImageReader : public PNMReader {
+public:
+  StbImageReader(PNMFileType *type, istream *file, bool owns_file, string magic_number);
+
+  virtual bool is_floating_point();
+  virtual bool read_pfm(PfmFile &pfm);
+  virtual int read_data(xel *array, xelval *alpha);
+
+private:
+  bool _is_float;
+  stbi__context _context;
+};
+
+TypeHandle PNMFileTypeStbImage::_type_handle;
+
+/**
+ * Default constructor; there is no state to initialize.
+ */
+PNMFileTypeStbImage::
+PNMFileTypeStbImage() {
+}
+
+/**
+ * Returns a few words describing the file type.  For this type that is
+ * always the literal "stb_image".
+ */
+string PNMFileTypeStbImage::
+get_name() const {
+  return "stb_image";
+}
+
+/**
+ * Returns the number of different possible filename extensions associated
+ * with this particular file type.  The count depends on which HAVE_* loaders
+ * were compiled in, because the stb_extensions table is built conditionally.
+ */
+int PNMFileTypeStbImage::
+get_num_extensions() const {
+  return num_stb_extensions;
+}
+
+/**
+ * Returns the nth possible filename extension associated with this particular
+ * file type, without a leading dot.  An out-of-range n trips the nassertr
+ * check, whose fallback value is an empty string.
+ */
+string PNMFileTypeStbImage::
+get_extension(int n) const {
+  nassertr(n >= 0 && n < num_stb_extensions, string());
+  return stb_extensions[n];
+}
+
+/**
+ * Returns true if this particular file type uses a magic number to identify
+ * it, false otherwise.  This implementation always returns false.
+ */
+bool PNMFileTypeStbImage::
+has_magic_number() const {
+  return false;
+}
+
+/**
+ * Returns true if the indicated "magic number" byte stream (the initial few
+ * bytes read from the file) matches this particular file type, false
+ * otherwise.  This implementation always returns false, consistent with
+ * has_magic_number().
+ */
+bool PNMFileTypeStbImage::
+matches_magic_number(const string &magic_number) const {
+  return false;
+}
+
+/**
+ * Allocates and returns a new PNMReader suitable for reading from this file
+ * type, if possible.  If reading from this file type is not supported,
+ * returns NULL.
+ *
+ * This implementation calls init_pnm() and then always returns a new
+ * StbImageReader; whether the file could be parsed is recorded in the
+ * reader's validity flag by its constructor.
+ */
+PNMReader *PNMFileTypeStbImage::
+make_reader(istream *file, bool owns_file, const string &magic_number) {
+  init_pnm();
+  return new StbImageReader(this, file, owns_file, magic_number);
+}
+
+/**
+ * Puts the already-consumed magic number back onto the stream, then asks
+ * stb_image for the image dimensions and channel count.  A file whose magic
+ * number starts with "#?" and which stbi__hdr_info() accepts is flagged as a
+ * floating-point image.  The reader is marked invalid if the putback fails or
+ * stb_image cannot identify the file.
+ */
+StbImageReader::
+StbImageReader(PNMFileType *type, istream *file, bool owns_file, string magic_number) :
+  PNMReader(type, file, owns_file),
+  _is_float(false)
+{
+  // Hope we can putback() more than one character.
+  string::reverse_iterator mi = magic_number.rbegin();
+  while (mi != magic_number.rend()) {
+    _file->putback(*mi);
+    ++mi;
+  }
+  if (_file->fail()) {
+    pnmimage_cat.error()
+      << "Unable to put back magic number.\n";
+    _is_valid = false;
+    return;
+  }
+
+  stbi__start_callbacks(&_context, &io_callbacks, (void *)file);
+
+  // Radiance HDR files start with "#?"; only those are probed as HDR.
+  const bool has_hdr_magic = (strncmp(magic_number.c_str(), "#?", 2) == 0);
+
+  if (has_hdr_magic &&
+      stbi__hdr_info(&_context, &_x_size, &_y_size, &_num_channels)) {
+    _is_float = true;
+    _is_valid = true;
+  } else if (stbi__info_main(&_context, &_x_size, &_y_size, &_num_channels)) {
+    _is_valid = true;
+  } else {
+    _is_valid = false;
+    pnmimage_cat.error()
+      << "stb_info failure: " << stbi_failure_reason() << "\n";
+  }
+
+  _maxval = 255;
+}
+
+/**
+ * Returns true if this PNMFileType represents a floating-point image type,
+ * false if it is a normal, integer type.  If this returns true, read_pfm() is
+ * implemented instead of read_data().
+ *
+ * The flag is set in the constructor when stbi__hdr_info() accepts a file
+ * whose magic number starts with "#?".
+ */
+bool StbImageReader::
+is_floating_point() {
+  return _is_float;
+}
+
+/**
+ * Reads floating-point data directly into the indicated PfmFile.  Returns
+ * true on success, false on failure.
+ *
+ * The file is rewound and the Radiance header is parsed again here; the pixel
+ * data (flat or run-length encoded RGBE) is then decoded straight into the
+ * PfmFile's table as three floats per pixel.  Malformed run-length data is
+ * rejected rather than being allowed to write past the scanline buffer.
+ */
+bool StbImageReader::
+read_pfm(PfmFile &pfm) {
+  if (!is_valid()) {
+    return false;
+  }
+
+  // Reposition the file at the beginning.
+  _file->seekg(0, ios::beg);
+  if (_file->tellg() != 0) {
+    pnmimage_cat.error()
+      << "Could not reposition file pointer to the beginning.\n";
+    return false;
+  }
+
+  stbi__start_callbacks(&_context, &io_callbacks, (void *)_file);
+
+  nassertr(_num_channels == 3, false);
+
+  // This next bit is copied and pasted from stbi__hdr_load so that we can
+  // avoid making an unnecessary extra copy of the data.
+  char buffer[STBI__HDR_BUFLEN];
+  char *token;
+  int valid = 0;
+  int width, height;
+  stbi_uc *scanline;
+  int len;
+  unsigned char count, value;
+  int i, j, k, c1, c2, z;
+  int nleft;
+
+  // Check identifier
+  if (strcmp(stbi__hdr_gettoken(&_context, buffer), "#?RADIANCE") != 0) {
+    pnmimage_cat.error()
+      << "Missing #?RADIANCE header.\n";
+    return false;
+  }
+
+  // Parse header
+  for(;;) {
+    token = stbi__hdr_gettoken(&_context, buffer);
+    if (token[0] == 0) break;
+    if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+  }
+
+  if (!valid) {
+    pnmimage_cat.error() << "Unsupported HDR format.\n";
+    return false;
+  }
+
+  // Parse width and height
+  // can't use sscanf() if we're not using stdio!
+  token = stbi__hdr_gettoken(&_context, buffer);
+  if (strncmp(token, "-Y ", 3)) {
+    pnmimage_cat.error() << "Unsupported HDR data layout.\n";
+    return false;
+  }
+  token += 3;
+  height = (int) strtol(token, &token, 10);
+  while (*token == ' ') ++token;
+  if (strncmp(token, "+X ", 3)) {
+    pnmimage_cat.error() << "Unsupported HDR data layout.\n";
+    return false;
+  }
+  token += 3;
+  width = (int) strtol(token, NULL, 10);
+
+  // The dimensions come straight from the file; refuse anything that would
+  // leave us with an empty table to index into.
+  if (width <= 0 || height <= 0) {
+    pnmimage_cat.error() << "Invalid HDR image dimensions.\n";
+    return false;
+  }
+
+  // Read data
+  pfm.clear(width, height, 3);
+  vector_float table;
+  pfm.swap_table(table);
+  float *hdr_data = (float *)&table[0];
+
+  // Load image data
+  // image data is stored as some number of scanlines
+  if (width < 8 || width >= 32768) {
+    // Read flat data
+    for (j = 0; j < height; ++j) {
+      for (i = 0; i < width; ++i) {
+        stbi_uc rgbe[4];
+main_decode_loop:
+        stbi__getn(&_context, rgbe, 4);
+        stbi__hdr_convert(hdr_data + j * width * 3 + i * 3, rgbe, 3);
+      }
+    }
+  } else {
+    // Read RLE-encoded data
+    scanline = NULL;
+
+    for (j = 0; j < height; ++j) {
+      c1 = stbi__get8(&_context);
+      c2 = stbi__get8(&_context);
+      len = stbi__get8(&_context);
+      if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+        // not run-length encoded, so we have to actually use THIS data as a decoded
+        // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+        stbi_uc rgbe[4];
+        rgbe[0] = (stbi_uc) c1;
+        rgbe[1] = (stbi_uc) c2;
+        rgbe[2] = (stbi_uc) len;
+        rgbe[3] = (stbi_uc) stbi__get8(&_context);
+        stbi__hdr_convert(hdr_data, rgbe, 3);
+        // Continue in the flat decoder above, starting at the second pixel.
+        i = 1;
+        j = 0;
+        STBI_FREE(scanline);
+        goto main_decode_loop; // yes, this makes no sense
+      }
+      len <<= 8;
+      len |= stbi__get8(&_context);
+      if (len != width) {
+        STBI_FREE(scanline);
+        pnmimage_cat.error() << "Corrupt HDR: invalid decoded scanline length.\n";
+        return false;
+      }
+      if (scanline == NULL) {
+        scanline = (stbi_uc *) stbi__malloc(width * 4);
+      }
+
+      for (k = 0; k < 4; ++k) {
+        i = 0;
+        while ((nleft = width - i) > 0) {
+          count = stbi__get8(&_context);
+          if (count > 128) {
+            // Run
+            value = stbi__get8(&_context);
+            count -= 128;
+            // A run may not extend past the end of the scanline.
+            if (count > nleft) {
+              STBI_FREE(scanline);
+              pnmimage_cat.error() << "Corrupt HDR: bad RLE data.\n";
+              return false;
+            }
+            for (z = 0; z < count; ++z) {
+              scanline[i++ * 4 + k] = value;
+            }
+          } else {
+            // Dump
+            // A zero count would never advance (and spins forever at end of
+            // file); an oversized count would overrun the scanline.
+            if (count == 0 || count > nleft) {
+              STBI_FREE(scanline);
+              pnmimage_cat.error() << "Corrupt HDR: bad RLE data.\n";
+              return false;
+            }
+            for (z = 0; z < count; ++z) {
+              scanline[i++ * 4 + k] = stbi__get8(&_context);
+            }
+          }
+        }
+      }
+      for (i = 0; i < width; ++i) {
+        stbi__hdr_convert(hdr_data+(j*width + i)*3, scanline + i*4, 3);
+      }
+    }
+    STBI_FREE(scanline);
+  }
+
+  // Hand the decoded table back to the PfmFile.
+  pfm.swap_table(table);
+  return true;
+}
+
+/**
+ * Reads in an entire image all at once, storing it in the pre-allocated
+ * _x_size * _y_size array and alpha pointers.  (If the image type has no
+ * alpha channel, alpha is ignored.)  Returns the number of rows correctly
+ * read.
+ *
+ * Derived classes need not override this if they instead provide
+ * supports_read_row() and read_row(), below.
+ *
+ * The file is rewound and decoded in full by stb_image; the decoded buffer is
+ * then copied pixel by pixel into array (and alpha, for 2- and 4-channel
+ * images) and released again.  Returns 0 if the reader is not valid, the
+ * stream cannot be rewound, decoding fails, or the decoded size does not fit
+ * the size reported earlier.
+ */
+int StbImageReader::
+read_data(xel *array, xelval *alpha) {
+  if (!is_valid()) {
+    return 0;
+  }
+
+  // Reposition the file at the beginning.
+  _file->seekg(0, ios::beg);
+  if (_file->tellg() != 0) {
+    pnmimage_cat.error()
+      << "Could not reposition file pointer to the beginning.\n";
+    return 0;
+  }
+
+  stbi__start_callbacks(&_context, &io_callbacks, (void *)_file);
+
+  int cols = 0;
+  int rows = 0;
+  stbi_uc *data = stbi__load_main(&_context, &cols, &rows, NULL, _num_channels);
+
+  if (data == NULL) {
+    pnmimage_cat.error()
+      << "stbi_load failure: " << stbi_failure_reason() << "\n";
+    return 0;
+  }
+
+  // The caller's arrays were sized from _x_size and _y_size; make sure the
+  // decoded image fits before copying, and don't leak the stb buffer if not.
+  if (cols != _x_size || rows > _y_size) {
+    pnmimage_cat.error()
+      << "stbi_load returned an image of unexpected size.\n";
+    stbi_image_free(data);
+    return 0;
+  }
+
+  size_t pixels = (size_t)_x_size * (size_t)rows;
+  stbi_uc *ptr = data;
+  switch (_num_channels) {
+  case 1:
+    // Grayscale: replicate the single channel into all three components.
+    for (size_t i = 0; i < pixels; ++i) {
+      PPM_ASSIGN(array[i], ptr[i], ptr[i], ptr[i]);
+    }
+    break;
+
+  case 2:
+    // Grayscale + alpha.
+    for (size_t i = 0; i < pixels; ++i) {
+      PPM_ASSIGN(array[i], ptr[0], ptr[0], ptr[0]);
+      alpha[i] = ptr[1];
+      ptr += 2;
+    }
+    break;
+
+  case 3:
+    // RGB.
+    for (size_t i = 0; i < pixels; ++i) {
+      PPM_ASSIGN(array[i], ptr[0], ptr[1], ptr[2]);
+      ptr += 3;
+    }
+    break;
+
+  case 4:
+    // RGB + alpha.
+    for (size_t i = 0; i < pixels; ++i) {
+      PPM_ASSIGN(array[i], ptr[0], ptr[1], ptr[2]);
+      alpha[i] = ptr[3];
+      ptr += 4;
+    }
+    break;
+  }
+
+  stbi_image_free(data);
+  return rows;
+}
+
+/**
+ * Registers the current object as something that can be read from a Bam file,
+ * using make_PNMFileTypeStbImage as the factory function.
+ */
+void PNMFileTypeStbImage::
+register_with_read_factory() {
+  BamReader::get_factory()->
+    register_factory(get_class_type(), make_PNMFileTypeStbImage);
+}
+
+/**
+ * This method is called by the BamReader when an object of this type is
+ * encountered in a Bam file; it should allocate and return a new object with
+ * all the data read.
+ *
+ * In the case of the PNMFileType objects, since these objects are all shared,
+ * we just pull the object from the registry.  The params argument is not
+ * used.
+ */
+TypedWritable *PNMFileTypeStbImage::
+make_PNMFileTypeStbImage(const FactoryParams &params) {
+  return PNMFileTypeRegistry::get_global_ptr()->get_type_by_handle(get_class_type());
+}
+
+#endif  // HAVE_STB_IMAGE

+ 73 - 0
panda/src/pnmimagetypes/pnmFileTypeStbImage.h

@@ -0,0 +1,73 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file pnmFileTypeStbImage.h
+ * @author rdb
+ * @date 2016-03-31
+ */
+
+#ifndef PNMFILETYPESTBIMAGE_H
+#define PNMFILETYPESTBIMAGE_H
+
+#include "pandabase.h"
+
+#ifdef HAVE_STB_IMAGE
+
+#include "pnmFileType.h"
+#include "pnmReader.h"
+#include "pnmWriter.h"
+
+#include "stb_image.h"
+
+/**
+ * For reading images via the public domain stb_image.h library.  This is used
+ * when compiling without support for more specific libraries that are more
+ * full-featured, such as libpng or libjpeg.
+ *
+ * The class is only declared when HAVE_STB_IMAGE is defined.
+ */
+class EXPCL_PANDA_PNMIMAGETYPES PNMFileTypeStbImage : public PNMFileType {
+public:
+  PNMFileTypeStbImage();
+
+  // Name of this file type.
+  virtual string get_name() const;
+
+  // Number of filename extensions handled, and the n'th such extension.
+  virtual int get_num_extensions() const;
+  virtual string get_extension(int n) const;
+
+  // Magic-number queries: whether this type has one at all, and whether the
+  // given leading bytes match it.
+  virtual bool has_magic_number() const;
+  virtual bool matches_magic_number(const string &magic_number) const;
+
+  // Creates a PNMReader for the given stream.  magic_number defaults to an
+  // empty string and owns_file defaults to true.
+  virtual PNMReader *make_reader(istream *file, bool owns_file = true,
+                                 const string &magic_number = string());
+
+public:
+  // Bam support: registers make_PNMFileTypeStbImage() with the BamReader's
+  // factory.
+  static void register_with_read_factory();
+
+protected:
+  // Factory function handed to the BamReader by register_with_read_factory().
+  static TypedWritable *make_PNMFileTypeStbImage(const FactoryParams &params);
+
+public:
+  // Returns the TypeHandle stored in the static _type_handle member.
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  // Initializes the base class's type first, then registers this class under
+  // the name "PNMFileTypeStbImage", passing PNMFileType's handle (presumably
+  // as the parent type -- confirm against register_type()).
+  static void init_type() {
+    PNMFileType::init_type();
+    register_type(_type_handle, "PNMFileTypeStbImage",
+                  PNMFileType::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  // Runs init_type() before returning the class type handle.
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  // Type handle shared by all instances; filled in by init_type().
+  static TypeHandle _type_handle;
+};
+
+#endif  // HAVE_STB_IMAGE
+
+#endif

+ 6755 - 0
panda/src/pnmimagetypes/stb_image.h

@@ -0,0 +1,6755 @@
+/* stb_image - v2.12 - public domain image loader - http://nothings.org/stb_image.h
+                                     no warranty implied; use at your own risk
+
+   Do this:
+      #define STB_IMAGE_IMPLEMENTATION
+   before you include this file in *one* C or C++ file to create the implementation.
+
+   // i.e. it should look like this:
+   #include ...
+   #include ...
+   #include ...
+   #define STB_IMAGE_IMPLEMENTATION
+   #include "stb_image.h"
+
+   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
+   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free
+
+
+   QUICK NOTES:
+      Primarily of interest to game developers and other people who can
+          avoid problematic images and only need the trivial interface
+
+      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
+      PNG 1/2/4/8-bit-per-channel (16 bpc not supported)
+
+      TGA (not sure what subset, if a subset)
+      BMP non-1bpp, non-RLE
+      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
+
+      GIF (*comp always reports as 4-channel)
+      HDR (radiance rgbE format)
+      PIC (Softimage PIC)
+      PNM (PPM and PGM binary only)
+
+      Animated GIF still needs a proper API, but here's one way to do it:
+          http://gist.github.com/urraka/685d9a6340b26b830d49
+
+      - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
+      - decode from arbitrary I/O callbacks
+      - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
+
+   Full documentation under "DOCUMENTATION" below.
+
+
+   Revision 2.00 release notes:
+
+      - Progressive JPEG is now supported.
+
+      - PPM and PGM binary formats are now supported, thanks to Ken Miller.
+
+      - x86 platforms now make use of SSE2 SIMD instructions for
+        JPEG decoding, and ARM platforms can use NEON SIMD if requested.
+        This work was done by Fabian "ryg" Giesen. SSE2 is used by
+        default, but NEON must be enabled explicitly; see docs.
+
+        With other JPEG optimizations included in this version, we see
+        2x speedup on a JPEG on an x86 machine, and a 1.5x speedup
+        on a JPEG on an ARM machine, relative to previous versions of this
+        library. The same results will not obtain for all JPGs and for all
+        x86/ARM machines. (Note that progressive JPEGs are significantly
+        slower to decode than regular JPEGs.) This doesn't mean that this
+        is the fastest JPEG decoder in the land; rather, it brings it
+        closer to parity with standard libraries. If you want the fastest
+        decode, look elsewhere. (See "Philosophy" section of docs below.)
+
+        See final bullet items below for more info on SIMD.
+
+      - Added STBI_MALLOC, STBI_REALLOC, and STBI_FREE macros for replacing
+        the memory allocator. Unlike other STBI libraries, these macros don't
+        support a context parameter, so if you need to pass a context in to
+        the allocator, you'll have to store it in a global or a thread-local
+        variable.
+
+      - Split existing STBI_NO_HDR flag into two flags, STBI_NO_HDR and
+        STBI_NO_LINEAR.
+            STBI_NO_HDR:     suppress implementation of .hdr reader format
+            STBI_NO_LINEAR:  suppress high-dynamic-range light-linear float API
+
+      - You can suppress implementation of any of the decoders to reduce
+        your code footprint by #defining one or more of the following
+        symbols before creating the implementation.
+
+            STBI_NO_JPEG
+            STBI_NO_PNG
+            STBI_NO_BMP
+            STBI_NO_PSD
+            STBI_NO_TGA
+            STBI_NO_GIF
+            STBI_NO_HDR
+            STBI_NO_PIC
+            STBI_NO_PNM   (.ppm and .pgm)
+
+      - You can request *only* certain decoders and suppress all other ones
+        (this will be more forward-compatible, as addition of new decoders
+        doesn't require you to disable them explicitly):
+
+            STBI_ONLY_JPEG
+            STBI_ONLY_PNG
+            STBI_ONLY_BMP
+            STBI_ONLY_PSD
+            STBI_ONLY_TGA
+            STBI_ONLY_GIF
+            STBI_ONLY_HDR
+            STBI_ONLY_PIC
+            STBI_ONLY_PNM   (.ppm and .pgm)
+
+         Note that you can define multiples of these, and you will get all
+         of them ("only x" and "only y" is interpreted to mean "only x&y").
+
+       - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
+         want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
+
+      - Compilation of all SIMD code can be suppressed with
+            #define STBI_NO_SIMD
+        It should not be necessary to disable SIMD unless you have issues
+        compiling (e.g. using an x86 compiler which doesn't support SSE
+        intrinsics or that doesn't support the method used to detect
+        SSE2 support at run-time), and even those can be reported as
+        bugs so I can refine the built-in compile-time checking to be
+        smarter.
+
+      - The old STBI_SIMD system which allowed installing a user-defined
+        IDCT etc. has been removed. If you need this, don't upgrade. My
+        assumption is that almost nobody was doing this, and those who
+        were will find the built-in SIMD more satisfactory anyway.
+
+      - RGB values computed for JPEG images are slightly different from
+        previous versions of stb_image. (This is due to using less
+        integer precision in SIMD.) The C code has been adjusted so
+        that the same RGB values will be computed regardless of whether
+        SIMD support is available, so your app should always produce
+        consistent results. But these results are slightly different from
+        previous versions. (Specifically, about 3% of available YCbCr values
+        will compute different RGB results from pre-1.49 versions by +-1;
+        most of the deviating values are one smaller in the G channel.)
+
+      - If you must produce consistent results with previous versions of
+        stb_image, #define STBI_JPEG_OLD and you will get the same results
+        you used to; however, you will not get the SIMD speedups for
+        the YCbCr-to-RGB conversion step (although you should still see
+        significant JPEG speedup from the other changes).
+
+        Please note that STBI_JPEG_OLD is a temporary feature; it will be
+        removed in future versions of the library. It is only intended for
+        near-term back-compatibility use.
+
+
+   Latest revision history:
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
+                         RGB-format JPEG; remove white matting in PSD;
+                         allocate large structures on the stack; 
+                         correct channel count for PNG & BMP
+      2.10  (2016-01-22) avoid warning introduced in 2.09
+      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) partial animated GIF support
+                         limited 16-bit PSD support
+                         minor bugs, code cleanup, and compiler warnings
+      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
+      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
+      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
+      2.03  (2015-04-12) additional corruption checking
+                         stbi_set_flip_vertically_on_load
+                         fix NEON support; fix mingw support
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPEG, including x86 SSE2 & ARM NEON SIMD
+                         progressive JPEG
+                         PGM/PPM support
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         STBI_NO_*, STBI_ONLY_*
+                         GIF bugfix
+
+   See end of file for full revision history.
+
+
+ ============================    Contributors    =========================
+
+ Image formats                          Extensions, features
+    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
+    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
+    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
+    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
+    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
+    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
+    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
+    urraka@github (animated gif)           Junggon Kim (PNM comments)
+                                           Daniel Gibson (16-bit TGA)
+
+ Optimizations & bugfixes
+    Fabian "ryg" Giesen
+    Arseny Kapoulkine
+
+ Bug & warning fixes
+    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
+    Christpher Lloyd        Martin Golini      Jerry Jansson      Joseph Thomson
+    Dave Moore              Roy Eltham         Hayaki Saito       Phil Jordan
+    Won Chun                Luke Graham        Johan Duparc       Nathan Reed
+    the Horde3D community   Thomas Ruf         Ronny Chevalier    Nick Verigakis
+    Janez Zemva             John Bartholomew   Michal Cichon      svdijk@github
+    Jonathan Blow           Ken Hamada         Tero Hanninen      Baldur Karlsson
+    Laurent Gomila          Cort Stratton      Sergio Gonzalez    romigrou@github
+    Aruelien Pocheville     Thibault Reuille   Cass Everitt       Matthew Gregan
+    Ryamond Barbiero        Paul Du Bois       Engin Manap        snagar@github
+    Michaelangel007@github  Oriol Ferrer Mesia socks-the-fox
+    Blazej Dariusz Roszkowski
+
+
+LICENSE
+
+This software is dual-licensed to the public domain and under the following
+license: you are granted a perpetual, irrevocable license to copy, modify,
+publish, and distribute this file as you see fit.
+
+*/
+
+#ifndef STBI_INCLUDE_STB_IMAGE_H
+#define STBI_INCLUDE_STB_IMAGE_H
+
+// DOCUMENTATION
+//
+// Limitations:
+//    - no 16-bit-per-channel PNG
+//    - no 12-bit-per-channel JPEG
+//    - no JPEGs with arithmetic coding
+//    - no 1-bit BMP
+//    - GIF always returns *comp=4
+//
+// Basic usage (see HDR discussion below for HDR usage):
+//    int x,y,n;
+//    unsigned char *data = stbi_load(filename, &x, &y, &n, 0);
+//    // ... process data if not NULL ...
+//    // ... x = width, y = height, n = # 8-bit components per pixel ...
+//    // ... replace '0' with '1'..'4' to force that many components per pixel
+//    // ... but 'n' will always be the number that it would have been if you said 0
+//    stbi_image_free(data)
+//
+// Standard parameters:
+//    int *x       -- outputs image width in pixels
+//    int *y       -- outputs image height in pixels
+//    int *comp    -- outputs # of image components in image file
+//    int req_comp -- if non-zero, # of image components requested in result
+//
+// The return value from an image loader is an 'unsigned char *' which points
+// to the pixel data, or NULL on an allocation failure or if the image is
+// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
+// with each pixel consisting of N interleaved 8-bit components; the first
+// pixel pointed to is top-left-most in the image. There is no padding between
+// image scanlines or between pixels, regardless of format. The number of
+// components N is 'req_comp' if req_comp is non-zero, or *comp otherwise.
+// If req_comp is non-zero, *comp has the number of components that _would_
+// have been output otherwise. E.g. if you set req_comp to 4, you will always
+// get RGBA output, but you can check *comp to see if it's trivially opaque
+// because e.g. there were only 3 channels in the source image.
+//
+// An output image with N components has the following components interleaved
+// in this order in each pixel:
+//
+//     N=#comp     components
+//       1           grey
+//       2           grey, alpha
+//       3           red, green, blue
+//       4           red, green, blue, alpha
+//
+// If image loading fails for any reason, the return value will be NULL,
+// and *x, *y, *comp will be unchanged. The function stbi_failure_reason()
+// can be queried for an extremely brief, end-user unfriendly explanation
+// of why the load failed. Define STBI_NO_FAILURE_STRINGS to avoid
+// compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
+// more user-friendly ones.
+//
+// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
+//
+// ===========================================================================
+//
+// Philosophy
+//
+// stb libraries are designed with the following priorities:
+//
+//    1. easy to use
+//    2. easy to maintain
+//    3. good performance
+//
+// Sometimes I let "good performance" creep up in priority over "easy to maintain",
+// and for best performance I may provide less-easy-to-use APIs that give higher
+// performance, in addition to the easy to use ones. Nevertheless, it's important
+// to keep in mind that from the standpoint of you, a client of this library,
+// all you care about is #1 and #3, and stb libraries do not emphasize #3 above all.
+//
+// Some secondary priorities arise directly from the first two, some of which
+// make more explicit reasons why performance can't be emphasized.
+//
+//    - Portable ("ease of use")
+//    - Small footprint ("easy to maintain")
+//    - No dependencies ("ease of use")
+//
+// ===========================================================================
+//
+// I/O callbacks
+//
+// I/O callbacks allow you to read from arbitrary sources, like packaged
+// files or some other source. Data read from callbacks are processed
+// through a small internal buffer (currently 128 bytes) to try to reduce
+// overhead.
+//
+// The three functions you must define are "read" (reads some bytes of data),
+// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
+//
+// ===========================================================================
+//
+// SIMD support
+//
+// The JPEG decoder will try to automatically use SIMD kernels on x86 when
+// supported by the compiler. For ARM Neon support, you must explicitly
+// request it.
+//
+// (The old do-it-yourself SIMD API is no longer supported in the current
+// code.)
+//
+// On x86, SSE2 will automatically be used when available based on a run-time
+// test; if not, the generic C versions are used as a fall-back. On ARM targets,
+// the typical path is to have separate builds for NEON and non-NEON devices
+// (at least this is true for iOS and Android). Therefore, the NEON support is
+// toggled by a build flag: define STBI_NEON to get NEON loops.
+//
+// The output of the JPEG decoder is slightly different from versions where
+// SIMD support was introduced (that is, for versions before 1.49). The
+// difference is only +-1 in the 8-bit RGB channels, and only on a small
+// fraction of pixels. You can force the pre-1.49 behavior by defining
+// STBI_JPEG_OLD, but this will disable some of the SIMD decoding path
+// and hence cost some performance.
+//
+// If for some reason you do not want to use any of SIMD code, or if
+// you have issues compiling it, you can disable it entirely by
+// defining STBI_NO_SIMD.
+//
+// ===========================================================================
+//
+// HDR image support   (disable by defining STBI_NO_HDR)
+//
+// stb_image now supports loading HDR images in general, and currently
+// the Radiance .HDR file format, although the support is provided
+// generically. You can still load any file through the existing interface;
+// if you attempt to load an HDR file, it will be automatically remapped to
+// LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
+// both of these constants can be reconfigured through this interface:
+//
+//     stbi_hdr_to_ldr_gamma(2.2f);
+//     stbi_hdr_to_ldr_scale(1.0f);
+//
+// (note, do not use _inverse_ constants; stbi_image will invert them
+// appropriately).
+//
+// Additionally, there is a new, parallel interface for loading files as
+// (linear) floats to preserve the full dynamic range:
+//
+//    float *data = stbi_loadf(filename, &x, &y, &n, 0);
+//
+// If you load LDR images through this interface, those images will
+// be promoted to floating point values, run through the inverse of
+// constants corresponding to the above:
+//
+//     stbi_ldr_to_hdr_scale(1.0f);
+//     stbi_ldr_to_hdr_gamma(2.2f);
+//
+// Finally, given a filename (or an open file or memory block--see header
+// file for details) containing image data, you can query for the "most
+// appropriate" interface to use (that is, whether the image is HDR or
+// not), using:
+//
+//     stbi_is_hdr(char *filename);
+//
+// ===========================================================================
+//
+// iPhone PNG support:
+//
+// By default we convert iphone-formatted PNGs back to RGB, even though
+// they are internally encoded differently. You can disable this conversion
+// by calling stbi_convert_iphone_png_to_rgb(0), in which case
+// you will always just get the native iphone "format" through (which
+// is BGR stored in RGB).
+//
+// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
+// pixel to remove any premultiplied alpha *only* if the image file explicitly
+// says there's premultiplied data (currently only happens in iPhone images,
+// and only if iPhone convert-to-rgb processing is on).
+//
+
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif // STBI_NO_STDIO
+
+#define STBI_VERSION 1
+
+// Component counts accepted as 'req_comp' and reported through '*comp'.
+// The numeric value of each enumerator is the number of interleaved 8-bit
+// components per pixel.
+enum
+{
+   STBI_default = 0, // only used for req_comp
+
+   STBI_grey       = 1, // grey
+   STBI_grey_alpha = 2, // grey, alpha
+   STBI_rgb        = 3, // red, green, blue
+   STBI_rgb_alpha  = 4  // red, green, blue, alpha
+};
+
+typedef unsigned char stbi_uc;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef STB_IMAGE_STATIC
+#define STBIDEF static
+#else
+#define STBIDEF extern
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PRIMARY API - works on images of any type
+//
+
+//
+// load image by filename, open file, or memory buffer
+//
+
+// Set of user-supplied I/O functions for decoding from an arbitrary source.
+// Each function receives the opaque 'user' pointer that was passed alongside
+// the callbacks (e.g. to stbi_load_from_callbacks).
+typedef struct
+{
+   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
+} stbi_io_callbacks;
+
+STBIDEF stbi_uc *stbi_load               (char              const *filename,           int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *comp, int req_comp);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f,                  int *x, int *y, int *comp, int req_comp);
+// for stbi_load_from_file, file pointer is left pointing immediately after image
+#endif
+
+#ifndef STBI_NO_LINEAR
+   STBIDEF float *stbi_loadf                 (char const *filename,           int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp);
+
+   #ifndef STBI_NO_STDIO
+   STBIDEF float *stbi_loadf_from_file  (FILE *f,                int *x, int *y, int *comp, int req_comp);
+   #endif
+#endif
+
+#ifndef STBI_NO_HDR
+   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
+   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_LINEAR
+   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
+   STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
+#endif // STBI_NO_LINEAR
+
+// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
+STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
+STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_is_hdr          (char const *filename);
+STBIDEF int      stbi_is_hdr_from_file(FILE *f);
+#endif // STBI_NO_STDIO
+
+
+// get a VERY brief reason for failure
+// NOT THREADSAFE
+STBIDEF const char *stbi_failure_reason  (void);
+
+// free the loaded image -- this is just free()
+STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
+
+// get image dimensions & components without fully decoding
+STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
+
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_info            (char const *filename,     int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_file  (FILE *f,                  int *x, int *y, int *comp);
+
+#endif
+
+
+
+// for image formats that explicitly notate that they have premultiplied alpha,
+// we just return the colors as stored in the file. set this flag to force
+// unpremultiplication. Results are undefined if the unpremultiply overflows.
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
+
+// indicate whether we should process iphone images back to canonical format,
+// or just pass them through "as-is"
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
+
+// flip the image vertically, so the first pixel in the output array is the bottom left
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
+
+// ZLIB client - used by PNG, available for other purposes
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header);
+STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+//
+//
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBI_INCLUDE_STB_IMAGE_H
+
+#ifdef STB_IMAGE_IMPLEMENTATION
+
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
+  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
+  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
+  || defined(STBI_ONLY_ZLIB)
+   #ifndef STBI_ONLY_JPEG
+   #define STBI_NO_JPEG
+   #endif
+   #ifndef STBI_ONLY_PNG
+   #define STBI_NO_PNG
+   #endif
+   #ifndef STBI_ONLY_BMP
+   #define STBI_NO_BMP
+   #endif
+   #ifndef STBI_ONLY_PSD
+   #define STBI_NO_PSD
+   #endif
+   #ifndef STBI_ONLY_TGA
+   #define STBI_NO_TGA
+   #endif
+   #ifndef STBI_ONLY_GIF
+   #define STBI_NO_GIF
+   #endif
+   #ifndef STBI_ONLY_HDR
+   #define STBI_NO_HDR
+   #endif
+   #ifndef STBI_ONLY_PIC
+   #define STBI_NO_PIC
+   #endif
+   #ifndef STBI_ONLY_PNM
+   #define STBI_NO_PNM
+   #endif
+#endif
+
+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
+#define STBI_NO_ZLIB
+#endif
+
+
+#include <stdarg.h>
+#include <stddef.h> // ptrdiff_t on osx
+#include <stdlib.h>
+#include <string.h>
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#include <math.h>  // ldexp
+#endif
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif
+
+#ifndef STBI_ASSERT
+#include <assert.h>
+#define STBI_ASSERT(x) assert(x)
+#endif
+
+
+#ifndef _MSC_VER
+   #ifdef __cplusplus
+   #define stbi_inline inline
+   #else
+   #define stbi_inline
+   #endif
+#else
+   #define stbi_inline __forceinline
+#endif
+
+
+#ifdef _MSC_VER
+typedef unsigned short stbi__uint16;
+typedef   signed short stbi__int16;
+typedef unsigned int   stbi__uint32;
+typedef   signed int   stbi__int32;
+#else
+#include <stdint.h>
+typedef uint16_t stbi__uint16;
+typedef int16_t  stbi__int16;
+typedef uint32_t stbi__uint32;
+typedef int32_t  stbi__int32;
+#endif
+
+// should produce compiler error if size is wrong
+typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
+
+#ifdef _MSC_VER
+#define STBI_NOTUSED(v)  (void)(v)
+#else
+#define STBI_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#ifdef _MSC_VER
+#define STBI_HAS_LROTL
+#endif
+
+#ifdef STBI_HAS_LROTL
+   #define stbi_lrot(x,y)  _lrotl(x,y)
+#else
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
+#endif
+
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
+// ok
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
+#endif
+
+#ifndef STBI_MALLOC
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
+#endif
+
+// x86/x64 detection
+#if defined(__x86_64__) || defined(_M_X64)
+#define STBI__X64_TARGET
+#elif defined(__i386) || defined(_M_IX86)
+#define STBI__X86_TARGET
+#endif
+
+#if defined(__GNUC__) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+// NOTE: it is not clear whether we actually need this for the 64-bit path.
+// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
+// (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
+// this is just broken and gcc are jerks for not fixing it properly
+// http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
+#define STBI_NO_SIMD
+#endif
+
+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
+// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
+//
+// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
+// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
+// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
+// simultaneously enabling "-mstackrealign".
+//
+// See https://github.com/nothings/stb/issues/81 for more information.
+//
+// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
+// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
+#define STBI_NO_SIMD
+#endif
+
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
+#define STBI_SSE2
+#include <emmintrin.h>
+
+#ifdef _MSC_VER
+
+#if _MSC_VER >= 1400  // not VC6
+#include <intrin.h> // __cpuid
+// Runs the __cpuid intrinsic with function id 1 and hands back the fourth
+// of the four values it stores.
+static int stbi__cpuid3(void)
+{
+   int regs[4];
+   __cpuid(regs, 1);
+   return regs[3];
+}
+#else
+// Fallback for compilers older than _MSC_VER 1400 (no <intrin.h>): executes
+// the CPUID instruction with EAX=1 via inline assembly and returns the value
+// left in EDX.
+static int stbi__cpuid3(void)
+{
+   int res;
+   __asm {
+      mov  eax,1
+      cpuid
+      mov  res,edx
+   }
+   return res;
+}
+#endif
+
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+
+// Reports SSE2 availability at run time: returns 1 when bit 26 of the value
+// produced by stbi__cpuid3() is set, and 0 otherwise.
+static int stbi__sse2_available()
+{
+   return ((stbi__cpuid3() >> 26) & 1) != 0;
+}
+#else // assume GCC-style if not VC++
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+
+// Run-time SSE2 check for non-MSVC compilers.  When the compiler identifies
+// itself as GCC 4.8 or newer this defers to __builtin_cpu_supports("sse2");
+// for anything else it returns 0, i.e. SSE2 is reported as unavailable.
+static int stbi__sse2_available()
+{
+#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 // GCC 4.8 or later
+   // GCC 4.8+ has a nice way to do this
+   return __builtin_cpu_supports("sse2");
+#else
+   // portable way to do this, preferably without using GCC inline ASM?
+   // just bail for now.
+   return 0;
+#endif
+}
+#endif
+#endif
+
+// ARM NEON
+#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
+#undef STBI_NEON
+#endif
+
+#ifdef STBI_NEON
+#include <arm_neon.h>
+// assume GCC or Clang on ARM targets
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+
+#ifndef STBI_SIMD_ALIGN
+#define STBI_SIMD_ALIGN(type, name) type name
+#endif
+
+///////////////////////////////////////////////
+//
+//  stbi__context struct and start_xxx functions
+
+// stbi__context structure is our basic context used by all images, so it
+// contains all the IO context, plus some basic image information
+typedef struct
+{
+   // basic image information (presumably width/height and channel counts;
+   // these are filled in by the decoders, not by the start_xxx functions)
+   stbi__uint32 img_x, img_y;
+   int img_n, img_out_n;
+
+   // user I/O callbacks and the opaque pointer handed to them; both are
+   // stored by stbi__start_callbacks
+   stbi_io_callbacks io;
+   void *io_user_data;
+
+   // 1 when started with stbi__start_callbacks, 0 when started with
+   // stbi__start_mem
+   int read_from_callbacks;
+   // set to sizeof(buffer_start) by stbi__start_callbacks
+   int buflen;
+   // internal buffer used when reading through callbacks
+   stbi_uc buffer_start[128];
+
+   // current read window, plus the window recorded at start-up so that
+   // stbi__rewind can restore it
+   stbi_uc *img_buffer, *img_buffer_end;
+   stbi_uc *img_buffer_original, *img_buffer_original_end;
+} stbi__context;
+
+
+static void stbi__refill_buffer(stbi__context *s);
+
+// Set up a context that decodes straight out of a caller-supplied memory
+// block of 'len' bytes: no read callback is installed, and both the current
+// and the "original" read windows span [buffer, buffer+len).
+static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
+{
+   stbi_uc *first = (stbi_uc *) buffer;
+   stbi_uc *last  = first + len;
+
+   s->io.read = NULL;
+   s->read_from_callbacks = 0;
+
+   s->img_buffer_original = first;
+   s->img_buffer = first;
+
+   s->img_buffer_original_end = last;
+   s->img_buffer_end = last;
+}
+
+// initialize a callback-based context
+// Set up a context that pulls its bytes through user callbacks. The internal
+// buffer is filled once immediately so the format tests have data to look at,
+// and that first window is remembered for stbi__rewind.
+static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
+{
+   s->read_from_callbacks = 1;
+   s->buflen = sizeof(s->buffer_start);
+   s->io_user_data = user;
+   s->io = *c;
+
+   s->img_buffer_original = s->buffer_start;
+   stbi__refill_buffer(s);   // may clear read_from_callbacks if the stream is empty
+   s->img_buffer_original_end = s->img_buffer_end;
+}
+
+#ifndef STBI_NO_STDIO
+
+// stdio 'read' callback: asks fread for up to 'size' bytes from the FILE*
+// carried in 'user' and returns how many bytes were delivered.
+static int stbi__stdio_read(void *user, char *data, int size)
+{
+   FILE *stream = (FILE*) user;
+   size_t got = fread(data, 1, size, stream);
+   return (int) got;
+}
+
+// stdio 'skip' callback: moves the FILE* carried in 'user' forward by 'n'
+// bytes relative to its current position (the fseek result is not checked).
+static void stbi__stdio_skip(void *user, int n)
+{
+   FILE *stream = (FILE*) user;
+   fseek(stream, n, SEEK_CUR);
+}
+
+// stdio 'eof' callback: forwards feof() for the FILE* carried in 'user'.
+static int stbi__stdio_eof(void *user)
+{
+   FILE *stream = (FILE*) user;
+   return feof(stream);
+}
+
+// Callback table that adapts a FILE* to the stbi_io_callbacks interface;
+// entries are given in read, skip, eof order.
+static stbi_io_callbacks stbi__stdio_callbacks =
+{
+   stbi__stdio_read,
+   stbi__stdio_skip,
+   stbi__stdio_eof,
+};
+
+// Initialize a context that reads from an already-open FILE* by routing it
+// through the stdio callback table.
+static void stbi__start_file(stbi__context *s, FILE *f)
+{
+   void *user = (void *) f;
+   stbi__start_callbacks(s, &stbi__stdio_callbacks, user);
+}
+
+//static void stop_file(stbi__context *s) { }
+
+#endif // !STBI_NO_STDIO
+
+// Restore the read window to the one captured when the context was started.
+// This is not a true stream rewind: it only works because the format 'test'
+// functions, the sole users, never look at more than 92 bytes, which always
+// fit in that initial window.
+static void stbi__rewind(stbi__context *s)
+{
+   s->img_buffer_end = s->img_buffer_original_end;
+   s->img_buffer     = s->img_buffer_original;
+}
+
+// Forward declarations of the per-format entry points. Each format exposes a
+// test/load/info triple and can be compiled out with its STBI_NO_<format> macro.
+#ifndef STBI_NO_JPEG
+static int      stbi__jpeg_test(stbi__context *s);
+static stbi_uc *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNG
+static int      stbi__png_test(stbi__context *s);
+static stbi_uc *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_BMP
+static int      stbi__bmp_test(stbi__context *s);
+static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_TGA
+static int      stbi__tga_test(stbi__context *s);
+static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PSD
+static int      stbi__psd_test(stbi__context *s);
+static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+// HDR is the only format whose loader returns float data
+#ifndef STBI_NO_HDR
+static int      stbi__hdr_test(stbi__context *s);
+static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PIC
+static int      stbi__pic_test(stbi__context *s);
+static stbi_uc *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_GIF
+static int      stbi__gif_test(stbi__context *s);
+static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNM
+static int      stbi__pnm_test(stbi__context *s);
+static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+// Most recent failure message recorded by stbi__err.
+// this is not threadsafe: it is a single global shared by all callers
+static const char *stbi__g_failure_reason;
+
+// Returns the message stored by the most recent failure (whatever the global
+// currently holds; it is not reset by this function).
+STBIDEF const char *stbi_failure_reason(void)
+{
+   return stbi__g_failure_reason;
+}
+
+// Records 'str' as the current failure reason and returns 0, so callers can
+// write `return stbi__err(...)` to fail in one statement. A two-argument macro
+// of the same name is defined further down and wraps this function.
+static int stbi__err(const char *str)
+{
+   stbi__g_failure_reason = str;
+   return 0;
+}
+
+// Single allocation entry point for the library; forwards to the
+// STBI_MALLOC macro so the allocator can be replaced in one place.
+static void *stbi__malloc(size_t size)
+{
+    return STBI_MALLOC(size);
+}
+
+// stbi__err - error
+// stbi__errpf - error returning pointer to float
+// stbi__errpuc - error returning pointer to unsigned char
+//
+// Every call site supplies two strings: x is a terse message and y a
+// user-friendly one. The build configuration picks which one is recorded.
+
+#ifdef STBI_NO_FAILURE_STRINGS
+   // no message is stored at all; the expression is just 0
+   #define stbi__err(x,y)  0
+#elif defined(STBI_FAILURE_USERMSG)
+   #define stbi__err(x,y)  stbi__err(y)
+#else
+   #define stbi__err(x,y)  stbi__err(x)
+#endif
+
+// The ?NULL:NULL form evaluates stbi__err for its side effect while always
+// yielding a null pointer of the wanted type.
+#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
+
+// Releases a pixel buffer previously returned by one of the load functions
+// by passing it to STBI_FREE.
+STBIDEF void stbi_image_free(void *retval_from_stbi_load)
+{
+   STBI_FREE(retval_from_stbi_load);
+}
+
+#ifndef STBI_NO_LINEAR
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
+#endif
+
+#ifndef STBI_NO_HDR
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
+#endif
+
+// Global switch read by stbi__load_flip and stbi__float_postprocess; when
+// nonzero, decoded images are flipped top-to-bottom before being returned.
+static int stbi__vertically_flip_on_load = 0;
+
+// Sets the global vertical-flip switch; affects all subsequent loads.
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
+{
+    stbi__vertically_flip_on_load = flag_true_if_should_flip;
+}
+
+// Detects the image format by running each enabled format's test function in
+// turn and dispatches to the matching loader. Returns the decoded 8-bit pixel
+// buffer, or NULL (with the failure reason set) when decoding fails or no
+// format matches. x/y/comp receive whatever the chosen loader reports;
+// req_comp is forwarded to the loader unchanged.
+static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PNG
+   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp);
+   #endif
+
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp);
+      // a failed HDR decode must not reach the converter, which would
+      // dereference the NULL buffer and size its output from *x / *y
+      if (hdr == NULL) return NULL;
+      return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
+   }
+   #endif
+
+   #ifndef STBI_NO_TGA
+   // test tga last because it's a crappy test!
+   if (stbi__tga_test(s))
+      return stbi__tga_load(s,x,y,comp,req_comp);
+   #endif
+
+   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
+}
+
+// Loads an image through stbi__load_main and, when the global flip switch is
+// set, mirrors it vertically in place by swapping row 'top' with row
+// 'h-1-top', one byte at a time. Returns the loader's result unchanged
+// otherwise (including NULL on failure).
+static unsigned char *stbi__load_flip(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *pixels = stbi__load_main(s, x, y, comp, req_comp);
+   int width, height, channels;
+   int top, bottom, px, c;
+
+   if (!stbi__vertically_flip_on_load || pixels == NULL)
+      return pixels;
+
+   width    = *x;
+   height   = *y;
+   channels = req_comp ? req_comp : *comp;
+
+   // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
+   for (top = 0, bottom = height - 1; top < (height>>1); ++top, --bottom) {
+      for (px = 0; px < width; ++px) {
+         unsigned char *upper = pixels + (top    * width + px) * channels;
+         unsigned char *lower = pixels + (bottom * width + px) * channels;
+         for (c = 0; c < channels; ++c) {
+            stbi_uc saved = upper[c];
+            upper[c] = lower[c];
+            lower[c] = saved;
+         }
+      }
+   }
+
+   return pixels;
+}
+
+#ifndef STBI_NO_HDR
+// Float counterpart of the vertical flip: when the global flip switch is set
+// and 'result' is non-NULL, swaps row 'top' with row 'h-1-top' in place.
+static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
+{
+   int width, height, channels;
+   int top, bottom, px, c;
+
+   if (!stbi__vertically_flip_on_load || result == NULL)
+      return;
+
+   width    = *x;
+   height   = *y;
+   channels = req_comp ? req_comp : *comp;
+
+   // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
+   for (top = 0, bottom = height - 1; top < (height>>1); ++top, --bottom) {
+      for (px = 0; px < width; ++px) {
+         float *upper = result + (top    * width + px) * channels;
+         float *lower = result + (bottom * width + px) * channels;
+         for (c = 0; c < channels; ++c) {
+            float saved = upper[c];
+            upper[c] = lower[c];
+            lower[c] = saved;
+         }
+      }
+   }
+}
+#endif
+
+#ifndef STBI_NO_STDIO
+
+// Opens 'filename' with the given mode and returns the FILE*, or a null
+// pointer on failure. MSVC 2005 and later go through fopen_s; everything
+// else uses plain fopen.
+static FILE *stbi__fopen(char const *filename, char const *mode)
+{
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+   FILE *handle;
+   if (fopen_s(&handle, filename, mode) != 0)
+      handle = 0;
+   return handle;
+#else
+   return fopen(filename, mode);
+#endif
+}
+
+
+// Opens 'filename' in binary mode, decodes it with stbi_load_from_file and
+// closes the file again. Returns the pixel buffer, or NULL if the file could
+// not be opened or decoded.
+STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (fp == NULL)
+      return stbi__errpuc("can't fopen", "Unable to open file");
+
+   pixels = stbi_load_from_file(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
+// Decodes an image from an already-open FILE*. On success the file position
+// is moved back by the number of bytes still sitting unread in the context's
+// buffer, so the stream ends up just past the data that was consumed.
+STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   unsigned char *pixels;
+
+   stbi__start_file(&s,f);
+   pixels = stbi__load_flip(&s,x,y,comp,req_comp);
+   if (pixels != NULL) {
+      // 'unget' everything that was buffered but never handed to a decoder
+      int unread = (int) (s.img_buffer_end - s.img_buffer);
+      fseek(f, -unread, SEEK_CUR);
+   }
+   return pixels;
+}
+#endif //!STBI_NO_STDIO
+
+// Decodes an image held entirely in memory ('len' bytes at 'buffer').
+// Returns the pixel buffer from stbi__load_flip, or NULL on failure.
+STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__load_flip(&s,x,y,comp,req_comp);
+}
+
+// Decodes an image whose bytes are supplied by the callbacks in 'clbk';
+// 'user' is passed back to each callback. The callback table is copied into
+// the context, so the const is cast away only to match stbi__start_callbacks.
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__load_flip(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_LINEAR
+// Float loader shared by the stbi_loadf_* entry points. HDR input is decoded
+// straight to float (and flipped if requested); anything else is decoded as
+// 8-bit via stbi__load_flip and then converted with stbi__ldr_to_hdr.
+static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *ldr_pixels;
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      float *hdr_pixels = stbi__hdr_load(s,x,y,comp,req_comp);
+      if (hdr_pixels != NULL)
+         stbi__float_postprocess(hdr_pixels,x,y,comp,req_comp);
+      return hdr_pixels;
+   }
+   #endif
+   ldr_pixels = stbi__load_flip(s, x, y, comp, req_comp);
+   if (ldr_pixels == NULL)
+      return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
+   return stbi__ldr_to_hdr(ldr_pixels, *x, *y, req_comp ? req_comp : *comp);
+}
+
+// Float-output variant of stbi_load_from_memory: decodes 'len' bytes at
+// 'buffer' through stbi__loadf_main.
+STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+
+// Float-output variant of stbi_load_from_callbacks: reads through the
+// callbacks in 'clbk' and decodes via stbi__loadf_main.
+STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_STDIO
+// Opens 'filename' in binary mode, decodes it to float pixels with
+// stbi_loadf_from_file and closes the file. Returns NULL if the file could
+// not be opened or decoded.
+STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   float *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (fp == NULL)
+      return stbi__errpf("can't fopen", "Unable to open file");
+
+   pixels = stbi_loadf_from_file(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
+// Float-output decode from an already-open FILE*. Unlike
+// stbi_load_from_file, this does not seek the file back over unread
+// buffered bytes afterwards.
+STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_file(&s,f);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+#endif // !STBI_NO_STDIO
+
+#endif // !STBI_NO_LINEAR
+
+// these is-hdr-or-not functions are always defined, for API simplicity,
+// regardless of which formats are compiled in; if STBI_NO_HDR is defined,
+// they always report false!
+
+// Returns the result of the HDR signature test on an in-memory image, or 0
+// when HDR support is compiled out.
+STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__hdr_test(&s);
+   #else
+   // parameters are unused in this configuration; silence compiler warnings
+   STBI_NOTUSED(buffer);
+   STBI_NOTUSED(len);
+   return 0;
+   #endif
+}
+
+#ifndef STBI_NO_STDIO
+// Opens 'filename' and reports whether stbi_is_hdr_from_file recognizes it;
+// a file that cannot be opened counts as "not HDR" (0).
+STBIDEF int      stbi_is_hdr          (char const *filename)
+{
+   int is_hdr;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (!fp)
+      return 0;
+
+   is_hdr = stbi_is_hdr_from_file(fp);
+   fclose(fp);
+   return is_hdr;
+}
+
+// Returns the result of the HDR signature test on an open FILE*, or 0 when
+// HDR support is compiled out. Starting the context reads from the file and
+// this function does not seek back afterwards.
+STBIDEF int      stbi_is_hdr_from_file(FILE *f)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_file(&s,f);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(f);
+   return 0;
+   #endif
+}
+#endif // !STBI_NO_STDIO
+
+// Returns the result of the HDR signature test on callback-supplied input,
+// or 0 when HDR support is compiled out.
+STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(clbk);
+   STBI_NOTUSED(user);
+   return 0;
+   #endif
+}
+
+// Global conversion parameters. The LDR->HDR pair is applied by
+// stbi__ldr_to_hdr as pow(v/255, gamma) * scale.
+#ifndef STBI_NO_LINEAR
+static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
+
+STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
+STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
+#endif
+
+// The HDR->LDR pair is stored already inverted (hence the _i suffix) and
+// applied by stbi__hdr_to_ldr as pow(v * scale_i, gamma_i).
+static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+
+// NOTE(review): both setters divide by their argument, so passing 0 stores an
+// infinity; callers are presumably expected to pass nonzero values.
+STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
+STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Common code used by all image loaders
+//
+
+// Scan modes; presumably passed to the format parsers to select how much of
+// the file to process (full load, type check only, or header only) --
+// the users are outside this part of the file.
+enum
+{
+   STBI__SCAN_load=0,
+   STBI__SCAN_type,
+   STBI__SCAN_header
+};
+
+// Refills the context's internal buffer through the read callback and resets
+// the read window to cover what was delivered. When the callback delivers
+// nothing, the context stops using callbacks and is given a one-byte window
+// holding 0, so img_buffer always points at readable memory (this matters
+// for e.g. a 0-byte file).
+static void stbi__refill_buffer(stbi__context *s)
+{
+   int got = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
+
+   s->img_buffer = s->buffer_start;
+   if (got != 0) {
+      s->img_buffer_end = s->buffer_start + got;
+      return;
+   }
+
+   // end of file: from here on behave exactly like a memory source
+   s->read_from_callbacks = 0;
+   s->img_buffer_end = s->buffer_start+1;
+   *s->img_buffer = 0;
+}
+
+// Returns the next input byte, refilling the buffer through the callbacks
+// when it runs dry. Once the input is exhausted it keeps returning 0.
+stbi_inline static stbi_uc stbi__get8(stbi__context *s)
+{
+   if (s->img_buffer >= s->img_buffer_end) {
+      if (!s->read_from_callbacks)
+         return 0;
+      stbi__refill_buffer(s);
+   }
+   return *s->img_buffer++;
+}
+
+// Returns nonzero when no more input is available. For callback input the
+// eof callback is consulted first; for memory input (io.read == NULL) only
+// the buffer cursor is compared against the buffer end.
+stbi_inline static int stbi__at_eof(stbi__context *s)
+{
+   if (s->io.read) {
+      if (!(s->io.eof)(s->io_user_data)) return 0;
+      // if feof() is true, check if buffer = end
+      // special case: we've only got the special 0 character at the end
+      // (stbi__refill_buffer clears read_from_callbacks when a read returns 0)
+      if (s->read_from_callbacks == 0) return 1;
+   }
+
+   return s->img_buffer >= s->img_buffer_end;
+}
+
+// Advances the read position by n bytes. A negative n jumps to the end of
+// the buffered data instead of moving backwards.
+static void stbi__skip(stbi__context *s, int n)
+{
+   if (n < 0) {
+      s->img_buffer = s->img_buffer_end;
+      return;
+   }
+   if (s->io.read) {
+      int blen = (int) (s->img_buffer_end - s->img_buffer);
+      if (blen < n) {
+         // consume what is buffered, then let the skip callback cover the rest
+         s->img_buffer = s->img_buffer_end;
+         (s->io.skip)(s->io_user_data, n - blen);
+         return;
+      }
+   }
+   // memory input, or the skip fits in the buffered data; for memory input
+   // this is not clamped, so the cursor may move past img_buffer_end
+   s->img_buffer += n;
+}
+
+// Copies exactly n bytes of input into 'buffer'. Returns 1 on success and 0
+// if fewer than n bytes were available.
+static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
+{
+   if (s->io.read) {
+      int blen = (int) (s->img_buffer_end - s->img_buffer);
+      if (blen < n) {
+         int res, count;
+
+         // drain what is already buffered...
+         memcpy(buffer, s->img_buffer, blen);
+
+         // ...and read the remainder straight into the destination,
+         // bypassing the internal buffer
+         count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen);
+         res = (count == (n-blen));
+         s->img_buffer = s->img_buffer_end;
+         return res;
+      }
+   }
+
+   // the request can only be served from the current buffer
+   if (s->img_buffer+n <= s->img_buffer_end) {
+      memcpy(buffer, s->img_buffer, n);
+      s->img_buffer += n;
+      return 1;
+   } else
+      return 0;
+}
+
+// Reads a 16-bit big-endian value: the first byte is the high half.
+static int stbi__get16be(stbi__context *s)
+{
+   int hi = stbi__get8(s);
+   int lo = stbi__get8(s);
+   return (hi << 8) + lo;
+}
+
+// Reads a 32-bit big-endian value as two big-endian 16-bit halves, high
+// half first.
+static stbi__uint32 stbi__get32be(stbi__context *s)
+{
+   stbi__uint32 hi = stbi__get16be(s);
+   int lo = stbi__get16be(s);
+   return (hi << 16) + lo;
+}
+
+#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
+// nothing
+#else
+// Reads a 16-bit little-endian value: the first byte is the low half.
+static int stbi__get16le(stbi__context *s)
+{
+   int lo = stbi__get8(s);
+   int hi = stbi__get8(s);
+   return lo + (hi << 8);
+}
+#endif
+
+#ifndef STBI_NO_BMP
+// Reads a 32-bit little-endian value as two little-endian 16-bit halves,
+// low half first.
+static stbi__uint32 stbi__get32le(stbi__context *s)
+{
+   stbi__uint32 z = stbi__get16le(s);
+   // widen before shifting: the high half is an int of up to 0xFFFF, and
+   // shifting that left by 16 as a signed int overflows for values >= 0x8000
+   z += (stbi__uint32) stbi__get16le(s) << 16;
+   return z;
+}
+#endif
+
+#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  generic converter from built-in img_n to req_comp
+//    individual types do this automatically as much as possible (e.g. jpeg
+//    does all cases internally since it needs to colorspace convert anyway,
+//    and it never has alpha, so very few cases ). png can automatically
+//    interleave an alpha=255 channel, but falls back to this for other cases
+//
+//  assume data buffer is malloced, so malloc a new one and free that one
+//  only failure mode is malloc failing
+
+// Converts an RGB triple to a single luminance byte using the integer
+// weights 77/150/29, which sum to 256 (hence the final shift by 8).
+static stbi_uc stbi__compute_y(int r, int g, int b)
+{
+   int const weighted = (r*77) + (g*150) + (29*b);
+   return (stbi_uc) (weighted >> 8);
+}
+
+// Converts an x-by-y image with img_n interleaved 8-bit channels into one
+// with req_comp channels. When the counts already match, 'data' is returned
+// untouched. Otherwise a new buffer is allocated, filled, and returned, and
+// 'data' is freed -- also on allocation failure, in which case NULL is
+// returned with the failure reason set.
+static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   unsigned char *good;
+
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+   good = (unsigned char *) stbi__malloc(req_comp * x * y);
+   if (good == NULL) {
+      STBI_FREE(data);
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      unsigned char *src  = data + j * x * img_n   ;
+      unsigned char *dest = good + j * x * req_comp;
+
+      // COMBO packs the (source, destination) channel counts into one switch
+      // key; CASE expands to the case label plus a per-pixel loop over the row
+      #define COMBO(a,b)  ((a)*8+(b))
+      #define CASE(a,b)   case COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (COMBO(img_n, req_comp)) {
+         CASE(1,2) dest[0]=src[0], dest[1]=255; break;
+         CASE(1,3) dest[0]=dest[1]=dest[2]=src[0]; break;
+         CASE(1,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=255; break;
+         CASE(2,1) dest[0]=src[0]; break;
+         CASE(2,3) dest[0]=dest[1]=dest[2]=src[0]; break;
+         CASE(2,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1]; break;
+         CASE(3,4) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255; break;
+         CASE(3,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
+         CASE(3,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = 255; break;
+         CASE(4,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
+         CASE(4,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; break;
+         CASE(4,3) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2]; break;
+         default: STBI_ASSERT(0);
+      }
+      // NOTE(review): only CASE is undefined here; COMBO stays defined for
+      // the rest of the translation unit
+      #undef CASE
+   }
+
+   STBI_FREE(data);
+   return good;
+}
+
+#ifndef STBI_NO_LINEAR
+// Converts an 8-bit image to float. Colour channels go through
+// pow(v/255, gamma) * scale; when the channel count is even, the last
+// channel is treated as alpha and only divided by 255. 'data' is freed in
+// every case, including allocation failure (which returns NULL).
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
+{
+   int i,k,n;
+   float *output = (float *) stbi__malloc(x * y * comp * sizeof(float));
+   if (output == NULL) {
+      STBI_FREE(data);
+      return stbi__errpf("outofmem", "Out of memory");
+   }
+
+   // an even channel count means the last channel is alpha
+   n = (comp & 1) ? comp : comp-1;
+
+   for (i=0; i < x*y; ++i) {
+      stbi_uc *src = data + i*comp;
+      float   *dst = output + i*comp;
+      for (k=0; k < n; ++k)
+         dst[k] = (float) (pow(src[k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
+      if (k < comp)
+         dst[k] = src[k]/255.0f;
+   }
+
+   STBI_FREE(data);
+   return output;
+}
+#endif
+
+#ifndef STBI_NO_HDR
+#define stbi__float2int(x)   ((int) (x))
+// Converts a float image to 8-bit. Colour channels go through
+// pow(v * scale_i, gamma_i) * 255, rounded and clamped to [0,255]; when the
+// channel count is even, the last channel is treated as alpha and only
+// scaled by 255. 'data' is freed in every case. Returns NULL when 'data' is
+// NULL (a failed HDR decode) or when allocation fails.
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
+{
+   int i,k,n;
+   stbi_uc *output;
+   // nothing to convert if the decode that produced 'data' failed; x, y and
+   // comp cannot be trusted in that case either
+   if (data == NULL) return NULL;
+   output = (stbi_uc *) stbi__malloc(x * y * comp);
+   if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
+   // compute number of non-alpha components
+   if (comp & 1) n = comp; else n = comp-1;
+   for (i=0; i < x*y; ++i) {
+      for (k=0; k < n; ++k) {
+         float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
+         if (z < 0) z = 0;
+         if (z > 255) z = 255;
+         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
+      }
+      if (k < comp) {
+         // alpha: linear, no gamma or scale
+         float z = data[i*comp+k] * 255 + 0.5f;
+         if (z < 0) z = 0;
+         if (z > 255) z = 255;
+         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
+      }
+   }
+   STBI_FREE(data);
+   return output;
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  "baseline" JPEG/JFIF decoder
+//
+//    simple implementation
+//      - doesn't support delayed output of y-dimension
+//      - simple interface (only one output format: 8-bit interleaved RGB)
+//      - doesn't try to recover corrupt jpegs
+//      - doesn't allow partial loading, loading multiple at once
+//      - still fast on x86 (copying globals into locals doesn't help x86)
+//      - allocates lots of intermediate memory (full size of all components)
+//        - non-interleaved case requires this anyway
+//        - allows good upsampling (see next)
+//    high-quality
+//      - upsampled channels are bilinearly interpolated, even across blocks
+//      - quality integer IDCT derived from IJG's 'slow'
+//    performance
+//      - fast huffman; reasonable integer IDCT
+//      - some SIMD kernels for common paths on targets with SSE2/NEON
+//      - uses a lot of intermediate memory, could cache poorly
+
+#ifndef STBI_NO_JPEG
+
+// huffman decoding acceleration
+#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
+
+// One JPEG huffman table in decode-ready form, filled in by
+// stbi__build_huffman (the 'values' array is filled elsewhere).
+typedef struct
+{
+   stbi_uc  fast[1 << FAST_BITS];   // top FAST_BITS of the stream -> symbol index, 255 = not accelerated
+   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
+   stbi__uint16 code[256];          // huffman code of each symbol index
+   stbi_uc  values[256];            // decoded value for each symbol index
+   stbi_uc  size[257];              // code length of each symbol index, 0-terminated
+   unsigned int maxcode[18];        // per length: largest code + 1, preshifted to 16 bits; [17] is a sentinel
+   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
+} stbi__huffman;
+
+// Complete state of one JPEG decode: input context, huffman and
+// dequantization tables, per-component data, the entropy bit buffer,
+// progressive-scan parameters and the function pointers for the kernels.
+typedef struct
+{
+   stbi__context *s;
+   stbi__huffman huff_dc[4];
+   stbi__huffman huff_ac[4];
+   stbi_uc dequant[4][64];
+   stbi__int16 fast_ac[4][1 << FAST_BITS];   // tables built by stbi__build_fast_ac
+
+// sizes for components, interleaved MCUs
+   int img_h_max, img_v_max;
+   int img_mcu_x, img_mcu_y;
+   int img_mcu_w, img_mcu_h;
+
+// definition of jpeg image component
+   struct
+   {
+      int id;
+      int h,v;
+      int tq;
+      int hd,ha;
+      int dc_pred;   // running DC predictor, updated by the block decoders
+
+      int x,y,w2,h2;
+      stbi_uc *data;
+      void *raw_data, *raw_coeff;
+      stbi_uc *linebuf;
+      short   *coeff;   // progressive only
+      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
+   } img_comp[4];
+
+   stbi__uint32   code_buffer; // jpeg entropy-coded buffer
+   int            code_bits;   // number of valid bits
+   unsigned char  marker;      // marker seen while filling entropy buffer
+   int            nomore;      // flag if we saw a marker so must stop
+
+   int            progressive;
+   int            spec_start;
+   int            spec_end;
+   int            succ_high;
+   int            succ_low;
+   int            eob_run;
+   int            rgb;
+
+   int scan_n, order[4];
+   int restart_interval, todo;
+
+// kernels
+   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
+   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
+   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
+} stbi__jpeg;
+
+// Builds the size/code/maxcode/delta/fast tables of 'h' from 'count', where
+// count[i] is the number of codes of length i+1. Returns 1 on success, or 0
+// (with the failure reason set) when the lengths cannot form a valid code.
+// NOTE(review): nothing here bounds the sum of count[] against the 257-entry
+// size array; presumably the caller validates it -- confirm.
+static int stbi__build_huffman(stbi__huffman *h, int *count)
+{
+   int i,j,k=0,code;
+   // build size list for each symbol (from JPEG spec)
+   for (i=0; i < 16; ++i)
+      for (j=0; j < count[i]; ++j)
+         h->size[k++] = (stbi_uc) (i+1);
+   h->size[k] = 0;   // terminator, stops the while loop below
+
+   // compute actual symbols (from jpeg spec)
+   code = 0;
+   k = 0;
+   for(j=1; j <= 16; ++j) {
+      // compute delta to add to code to compute symbol id
+      h->delta[j] = k - code;
+      if (h->size[k] == j) {
+         while (h->size[k] == j)
+            h->code[k++] = (stbi__uint16) (code++);
+         // the last code assigned must still fit in j bits
+         if (code-1 >= (1 << j)) return stbi__err("bad code lengths","Corrupt JPEG");
+      }
+      // compute largest code + 1 for this size, preshifted as needed later
+      h->maxcode[j] = code << (16-j);
+      code <<= 1;
+   }
+   // j == 17 here: sentinel that ends the length search in stbi__jpeg_huff_decode
+   h->maxcode[j] = 0xffffffff;
+
+   // build non-spec acceleration table; 255 is flag for not-accelerated
+   memset(h->fast, 255, 1 << FAST_BITS);
+   for (i=0; i < k; ++i) {
+      int s = h->size[i];
+      if (s <= FAST_BITS) {
+         // every FAST_BITS-wide pattern that starts with this code maps to symbol i
+         int c = h->code[i] << (FAST_BITS-s);
+         int m = 1 << (FAST_BITS-s);
+         for (j=0; j < m; ++j) {
+            h->fast[c+j] = (stbi_uc) i;
+         }
+      }
+   }
+   return 1;
+}
+
+// build a table that decodes both magnitude and value of small ACs in
+// one go.
+// Fills 'fast_ac' (1 << FAST_BITS entries) from huffman table 'h'. For each
+// FAST_BITS-wide bit pattern whose huffman code AND magnitude bits both fit
+// in FAST_BITS, the entry packs (value << 8) + (run << 4) + total bit length;
+// all other entries are 0, meaning "use the slow path".
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
+{
+   int i;
+   for (i=0; i < (1 << FAST_BITS); ++i) {
+      stbi_uc fast = h->fast[i];
+      fast_ac[i] = 0;
+      if (fast < 255) {
+         int rs = h->values[fast];
+         int run = (rs >> 4) & 15;
+         int magbits = rs & 15;
+         int len = h->size[fast];
+
+         if (magbits && len + magbits <= FAST_BITS) {
+            // magnitude code followed by receive_extend code
+            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+            int m = 1 << (magbits - 1);
+            // values below the midpoint are negative: add (-1 << magbits) + 1,
+            // written as a subtraction so no negative value is ever shifted
+            if (k < m) k -= (1 << magbits) - 1;
+            // if the result is small enough, we can fit it in fast_ac table
+            if (k >= -128 && k <= 127)
+               fast_ac[i] = (stbi__int16) ((k << 8) + (run << 4) + (len + magbits));
+         }
+      }
+   }
+}
+
+// Tops up the entropy bit buffer one byte at a time until it holds more than
+// 24 valid bits. A 0xff byte followed by a nonzero byte is a marker: it is
+// recorded in j->marker, j->nomore is set, and from then on zero bits are
+// fed in instead of reading further input.
+static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
+{
+   do {
+      // unsigned so that placing a byte >= 0x80 in the top bits of the
+      // buffer does not overflow a signed int
+      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
+      if (b == 0xff) {
+         int c = stbi__get8(j->s);
+         if (c != 0) {
+            j->marker = (unsigned char) c;
+            j->nomore = 1;
+            return;
+         }
+         // 0xff 0x00 is a stuffed literal 0xff byte: fall through and use b
+      }
+      j->code_buffer |= b << (24 - j->code_bits);
+      j->code_bits += 8;
+   } while (j->code_bits <= 24);
+}
+
+// stbi__bmask[n] = (1 << n) - 1, i.e. a mask of the low n bits, for n in 0..16
+static stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+
+// decode a jpeg huffman value from the bitstream
+// decode a jpeg huffman value from the bitstream
+// Returns the decoded value (h->values entry) and consumes its code bits, or
+// returns -1 when no code matches or the buffer holds too few valid bits.
+stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
+{
+   unsigned int temp;
+   int c,k;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   // look at the top FAST_BITS and determine what symbol ID it is,
+   // if the code is <= FAST_BITS
+   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+   k = h->fast[c];
+   if (k < 255) {
+      int s = h->size[k];
+      if (s > j->code_bits)
+         return -1;
+      j->code_buffer <<= s;
+      j->code_bits -= s;
+      return h->values[k];
+   }
+
+   // naive test is to shift the code_buffer down so k bits are
+   // valid, then test against maxcode. To speed this up, we've
+   // preshifted maxcode left so that it has (16-k) 0s at the
+   // end; in other words, regardless of the number of bits, it
+   // wants to be compared against something shifted to have 16;
+   // that way we don't need to shift inside the loop.
+   temp = j->code_buffer >> 16;
+   // always terminates: maxcode[17] is 0xffffffff (set by stbi__build_huffman)
+   for (k=FAST_BITS+1 ; ; ++k)
+      if (temp < h->maxcode[k])
+         break;
+   if (k == 17) {
+      // error! code not found
+      j->code_bits -= 16;
+      return -1;
+   }
+
+   if (k > j->code_bits)
+      return -1;
+
+   // convert the huffman code to the symbol id
+   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
+
+   // convert the id to a symbol
+   j->code_bits -= k;
+   j->code_buffer <<= k;
+   return h->values[c];
+}
+
+// bias[n] = (-1<<n) + 1, for n in 0..15; stbi__extend_receive adds this to
+// an n-bit value whose leading bit is 0 to produce the negative result
+static int const stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+
+// combined JPEG 'receive' and JPEG 'extend', since baseline
+// always extends everything it receives.
+// Reads n bits from the entropy buffer and sign-extends them JPEG-style: a
+// leading 1 bit yields the value as-is, a leading 0 bit yields the value
+// plus stbi__jbias[n].
+// NOTE(review): n indexes stbi__bmask (17 entries) and stbi__jbias (16
+// entries) and is only checked by an assert, so callers must pass 0..15.
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   int sgn;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+
+   sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
+   // rotate the top n bits down into the low n bits of k
+   k = stbi_lrot(j->code_buffer, n);
+   STBI_ASSERT(n >= 0 && n < (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask)));
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   // sgn is all-ones when the leading bit was 1, which masks the bias away
+   return k + (stbi__jbias[n] & ~sgn);
+}
+
+// get some unsigned bits
+// get some unsigned bits
+// Consumes the next n bits of the entropy buffer and returns them as an
+// unsigned value; n indexes stbi__bmask, so it must be in 0..16.
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   // rotate the top n bits down into the low n bits of k
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k;
+}
+
+// Consumes one bit of the entropy buffer. The result is nonzero when the bit
+// was 1 -- it is the masked top bit (0x80000000), not 1, so callers should
+// only test it for truth.
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{
+   unsigned int k;
+   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   k = j->code_buffer;
+   j->code_buffer <<= 1;
+   --j->code_bits;
+   return k & 0x80000000;
+}
+
+// given a value that's at position X in the zigzag stream,
+// where does it appear in the 8x8 matrix coded as row-major?
+// The table has 15 extra entries, all 63, so that a corrupt stream which
+// pushes the coefficient index past 63 still lands inside the block.
+static stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
+
+// decode one 64-entry block--
+// Decodes the DC coefficient (as a difference against component b's running
+// predictor) and then the AC coefficients of one baseline block into 'data',
+// dequantized and stored in row-major order. Returns 1 on success, or 0
+// (with the failure reason set) on a bad huffman code.
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi_uc *dequant)
+{
+   int diff,dc,k;
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   // t is a bit count for stbi__extend_receive, whose lookup tables only
+   // cover 0..15; a corrupt DC table can yield any byte value
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
+
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
+
+   diff = t ? stbi__extend_receive(j, t) : 0;
+   dc = j->img_comp[b].dc_pred + diff;
+   j->img_comp[b].dc_pred = dc;
+   data[0] = (short) (dc * dequant[0]);
+
+   // decode AC components, see JPEG spec
+   k = 1;
+   do {
+      unsigned int zig;
+      int c,r,s;
+      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+      r = fac[c];
+      if (r) { // fast-AC path
+         k += (r >> 4) & 15; // run
+         s = r & 15; // combined length
+         j->code_buffer <<= s;
+         j->code_bits -= s;
+         // decode into unzigzag'd location
+         zig = stbi__jpeg_dezigzag[k++];
+         data[zig] = (short) ((r >> 8) * dequant[zig]);
+      } else {
+         int rs = stbi__jpeg_huff_decode(j, hac);
+         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+         s = rs & 15;
+         r = rs >> 4;
+         if (s == 0) {
+            if (rs != 0xf0) break; // end block
+            k += 16;   // 0xf0: a run of 16 zero coefficients
+         } else {
+            k += r;
+            // decode into unzigzag'd location
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
+         }
+      }
+   } while (k < 64);
+   return 1;
+}
+
+// Decodes the DC coefficient of one block in a progressive scan. In the
+// first DC scan (succ_high == 0) the block is cleared and data[0] receives
+// the predicted DC value scaled by 2^succ_low; in a refinement scan one more
+// bit is read and, if set, 2^succ_low is added to data[0]. Returns 1 on
+// success, or 0 (with the failure reason set) on corrupt input.
+static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
+{
+   int diff,dc;
+   int t;
+   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   if (j->succ_high == 0) {
+      // first scan for DC coefficient, must be first
+      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
+      t = stbi__jpeg_huff_decode(j, hdc);
+      // reject a failed decode (-1) and any bit count outside the 0..15
+      // range covered by stbi__extend_receive's lookup tables
+      if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
+      diff = t ? stbi__extend_receive(j, t) : 0;
+
+      dc = j->img_comp[b].dc_pred + diff;
+      j->img_comp[b].dc_pred = dc;
+      // multiply rather than shift: dc can be negative, and left-shifting a
+      // negative value is undefined
+      data[0] = (short) (dc * (1 << j->succ_low));
+   } else {
+      // refinement scan for DC coefficient
+      if (stbi__jpeg_get_bit(j))
+         data[0] += (short) (1 << j->succ_low);
+   }
+   return 1;
+}
+
+// Decode the AC coefficients of one block during a progressive-JPEG AC scan
+// covering zigzag positions j->spec_start..j->spec_end.
+//   j    - decoder state (bit buffer, scan parameters, eob_run)
+//   data - the block's 64 stored coefficients, updated in place
+//   hac  - AC huffman table for the component
+//   fac  - fast-AC lookup table paired with hac
+// When j->succ_high is 0 this is the first pass over the band and new
+// coefficients are stored scaled by (1 << j->succ_low); otherwise it is a
+// refinement pass that adds one more bit to already-nonzero coefficients and
+// places newly nonzero ones. j->eob_run counts the blocks still covered by a
+// previously decoded end-of-band run.
+// Returns 1 on success, 0 (via stbi__err) on corrupt data.
+//
+// @OPTIMIZE: store non-zigzagged during the decode passes,
+// and only de-zigzag when dequantizing
+static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
+{
+   int k;
+   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->succ_high == 0) {
+      // coefficients are scaled by multiplication rather than "<< succ_low":
+      // the decoded values can be negative, and left-shifting a negative int
+      // is undefined behavior
+      int scale = 1 << j->succ_low;
+
+      if (j->eob_run) {
+         // this block lies inside an end-of-band run: nothing to decode
+         --j->eob_run;
+         return 1;
+      }
+
+      k = j->spec_start;
+      do {
+         unsigned int zig;
+         int c,r,s;
+         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+         r = fac[c];
+         if (r) { // fast-AC path
+            k += (r >> 4) & 15; // run
+            s = r & 15; // combined length
+            j->code_buffer <<= s;
+            j->code_bits -= s;
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) ((r >> 8) * scale);
+         } else {
+            int rs = stbi__jpeg_huff_decode(j, hac);
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  // end-of-band run of (1 << r) + extra bits blocks,
+                  // the current block being the first of them
+                  j->eob_run = (1 << r);
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  --j->eob_run;
+                  break;
+               }
+               k += 16;
+            } else {
+               k += r;
+               zig = stbi__jpeg_dezigzag[k++];
+               data[zig] = (short) (stbi__extend_receive(j,s) * scale);
+            }
+         }
+      } while (k <= j->spec_end);
+   } else {
+      // refinement scan for these AC coefficients
+
+      short bit = (short) (1 << j->succ_low);
+
+      if (j->eob_run) {
+         // inside an end-of-band run: only already-nonzero coefficients
+         // receive a correction bit
+         --j->eob_run;
+         for (k = j->spec_start; k <= j->spec_end; ++k) {
+            short *p = &data[stbi__jpeg_dezigzag[k]];
+            if (*p != 0)
+               if (stbi__jpeg_get_bit(j))
+                  if ((*p & bit)==0) {
+                     if (*p > 0)
+                        *p += bit;
+                     else
+                        *p -= bit;
+                  }
+         }
+      } else {
+         k = j->spec_start;
+         do {
+            int r,s;
+            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r) - 1;
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  r = 64; // force end of block
+               } else {
+                  // r=15 s=0 should write 16 0s, so we just do
+                  // a run of 15 0s and then write s (which is 0),
+                  // so we don't have to do anything special here
+               }
+            } else {
+               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
+               // sign bit
+               if (stbi__jpeg_get_bit(j))
+                  s = bit;
+               else
+                  s = -bit;
+            }
+
+            // advance by r: skip r zero-valued coefficients, refining every
+            // nonzero coefficient passed on the way, then store s
+            while (k <= j->spec_end) {
+               short *p = &data[stbi__jpeg_dezigzag[k++]];
+               if (*p != 0) {
+                  if (stbi__jpeg_get_bit(j))
+                     if ((*p & bit)==0) {
+                        if (*p > 0)
+                           *p += bit;
+                        else
+                           *p -= bit;
+                     }
+               } else {
+                  if (r == 0) {
+                     *p = (short) s;
+                     break;
+                  }
+                  --r;
+               }
+            }
+         } while (k <= j->spec_end);
+      }
+   }
+   return 1;
+}
+
+// saturate an int to the 0..255 range of an 8-bit sample:
+// negative values become 0, values above 255 become 255
+stbi_inline static stbi_uc stbi__clamp(int x)
+{
+   // a single unsigned compare accepts exactly 0..255, since negative
+   // values wrap around to large unsigned numbers
+   if ((unsigned int) x <= 255)
+      return (stbi_uc) x;
+   return (stbi_uc) (x < 0 ? 0 : 255);
+}
+
+// fixed-point helpers for the IDCT (12 fractional bits):
+// stbi__f2f converts a floating-point constant, stbi__fsh scales an integer.
+// stbi__fsh multiplies instead of shifting because its operand can be
+// negative, and left-shifting a negative int is undefined behavior.
+#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
+#define stbi__fsh(x)  ((x) * 4096)
+
+/* One 1-D pass of the 8-point inverse DCT over inputs s0..s7
+ * (derived from jidctint -- DCT_ISLOW).
+ * The macro declares the locals t0..t3, p1..p5 and x0..x3 in the scope where
+ * it expands, so it can appear at most once per block scope. On exit x0..x3
+ * are built from the even inputs (s0,s2,s4,s6) and t0..t3 from the odd inputs
+ * (s1,s3,s5,s7), all carrying the 1<<12 scale of stbi__f2f/stbi__fsh; the
+ * user of the macro forms the eight outputs as x0+-t3, x1+-t2, x2+-t1 and
+ * x3+-t0, then rounds and shifts the scale back out. */
+#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
+   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
+   p2 = s2;                                    \
+   p3 = s6;                                    \
+   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
+   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
+   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
+   p2 = s0;                                    \
+   p3 = s4;                                    \
+   t0 = stbi__fsh(p2+p3);                      \
+   t1 = stbi__fsh(p2-p3);                      \
+   x0 = t0+t3;                                 \
+   x3 = t0-t3;                                 \
+   x1 = t1+t2;                                 \
+   x2 = t1-t2;                                 \
+   t0 = s7;                                    \
+   t1 = s5;                                    \
+   t2 = s3;                                    \
+   t3 = s1;                                    \
+   p3 = t0+t2;                                 \
+   p4 = t1+t3;                                 \
+   p1 = t0+t3;                                 \
+   p2 = t1+t2;                                 \
+   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
+   t0 = t0*stbi__f2f( 0.298631336f);           \
+   t1 = t1*stbi__f2f( 2.053119869f);           \
+   t2 = t2*stbi__f2f( 3.072711026f);           \
+   t3 = t3*stbi__f2f( 1.501321110f);           \
+   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
+   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
+   p3 = p3*stbi__f2f(-1.961570560f);           \
+   p4 = p4*stbi__f2f(-0.390180644f);           \
+   t3 += p1+p4;                                \
+   t2 += p2+p3;                                \
+   t1 += p2+p4;                                \
+   t0 += p1+p3;
+
+// Generic C inverse DCT of one 8x8 block.
+//   out        - receives the 8x8 block of 8-bit samples
+//   out_stride - distance in bytes between consecutive output rows
+//   data       - the 64 coefficients of the block, 8 per row
+// Runs a column pass into a temporary int array, then a row pass that
+// rounds, re-centers by +128 and clamps each sample to 0..255.
+static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
+{
+   int i,val[64],*v=val;
+   stbi_uc *o;
+   short *d = data;
+
+   // columns
+   for (i=0; i < 8; ++i,++d, ++v) {
+      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
+           && d[40]==0 && d[48]==0 && d[56]==0) {
+         //    no shortcut                 0     seconds
+         //    (1|2|3|4|5|6|7)==0          0     seconds
+         //    all separate               -0.047 seconds
+         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+         // keep the same 2 extra bits of precision as the full path below;
+         // multiply rather than shift because d[0] can be negative and
+         // left-shifting a negative value is undefined behavior
+         int dcterm = d[0] * 4;
+         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+      } else {
+         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
+         // constants scaled things up by 1<<12; let's bring them back
+         // down, but keep 2 extra bits of precision
+         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
+         v[ 0] = (x0+t3) >> 10;
+         v[56] = (x0-t3) >> 10;
+         v[ 8] = (x1+t2) >> 10;
+         v[48] = (x1-t2) >> 10;
+         v[16] = (x2+t1) >> 10;
+         v[40] = (x2-t1) >> 10;
+         v[24] = (x3+t0) >> 10;
+         v[32] = (x3-t0) >> 10;
+      }
+   }
+
+   // rows
+   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {
+      // no fast case since the first 1D IDCT spread components out
+      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
+      // constants scaled things up by 1<<12, plus we had 1<<2 from first
+      // loop, plus horizontal and vertical each scale by sqrt(8) so together
+      // we've got an extra 1<<3, so 1<<17 total we need to remove.
+      // so we want to round that, which means adding 0.5 * 1<<17,
+      // aka 65536. Also, we'll end up with -128 to 127 that we want
+      // to encode as 0..255 by adding 128, so we'll add that before the shift
+      x0 += 65536 + (128<<17);
+      x1 += 65536 + (128<<17);
+      x2 += 65536 + (128<<17);
+      x3 += 65536 + (128<<17);
+      // tried computing the shifts into temps, or'ing the temps to see
+      // if any were out of range, but that was slower
+      o[0] = stbi__clamp((x0+t3) >> 17);
+      o[7] = stbi__clamp((x0-t3) >> 17);
+      o[1] = stbi__clamp((x1+t2) >> 17);
+      o[6] = stbi__clamp((x1-t2) >> 17);
+      o[2] = stbi__clamp((x2+t1) >> 17);
+      o[5] = stbi__clamp((x2-t1) >> 17);
+      o[3] = stbi__clamp((x3+t0) >> 17);
+      o[4] = stbi__clamp((x3-t0) >> 17);
+   }
+}
+
+#ifdef STBI_SSE2
+// SSE2 integer IDCT of one 8x8 block. not the fastest possible implementation
+// but it produces bit-identical results to the generic C version
+// (stbi__idct_block) so it's fully "transparent".
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   /* out        - receives the 8x8 block of 8-bit samples; 8 bytes are
+    *              stored per row
+    * out_stride - distance in bytes between consecutive output rows
+    * data       - the 64 coefficients, read with _mm_load_si128 (an aligned
+    *              load, so data is expected to be 16-byte aligned)
+    * This is constructed to match our regular (generic) integer IDCT exactly. */
+   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+   __m128i tmp;
+
+   // dot product constant: even elems=x, odd elems=y
+   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
+
+   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+   // out(1) = c1[even]*x + c1[odd]*y
+   #define dct_rot(out0,out1, x,y,c0,c1) \
+      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
+      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
+      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
+      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
+      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
+      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+
+   // out = in << 12  (in 16-bit, out 32-bit): unpacking against zero puts "in" in the top half (in << 16), the arithmetic >> 4 leaves in << 12
+   #define dct_widen(out, in) \
+      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
+      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
+
+   // wide add (operates on the _l/_h pair of 32-bit vectors behind each name)
+   #define dct_wadd(out, a, b) \
+      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
+
+   // wide sub (operates on the _l/_h pair of 32-bit vectors behind each name)
+   #define dct_wsub(out, a, b) \
+      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+
+   // butterfly a/b, add bias, then shift by "s" and pack
+   #define dct_bfly32o(out0, out1, a,b,bias,s) \
+      { \
+         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
+         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
+         dct_wadd(sum, abiased, b); \
+         dct_wsub(dif, abiased, b); \
+         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
+         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
+      }
+
+   // 8-bit interleave step (for transposes)
+   #define dct_interleave8(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi8(a, b); \
+      b = _mm_unpackhi_epi8(tmp, b)
+
+   // 16-bit interleave step (for transposes)
+   #define dct_interleave16(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi16(a, b); \
+      b = _mm_unpackhi_epi16(tmp, b)
+
+   #define dct_pass(bias,shift) \
+      { \
+         /* even part */ \
+         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
+         __m128i sum04 = _mm_add_epi16(row0, row4); \
+         __m128i dif04 = _mm_sub_epi16(row0, row4); \
+         dct_widen(t0e, sum04); \
+         dct_widen(t1e, dif04); \
+         dct_wadd(x0, t0e, t3e); \
+         dct_wsub(x3, t0e, t3e); \
+         dct_wadd(x1, t1e, t2e); \
+         dct_wsub(x2, t1e, t2e); \
+         /* odd part */ \
+         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
+         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
+         __m128i sum17 = _mm_add_epi16(row1, row7); \
+         __m128i sum35 = _mm_add_epi16(row3, row5); \
+         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
+         dct_wadd(x4, y0o, y4o); \
+         dct_wadd(x5, y1o, y5o); \
+         dct_wadd(x6, y2o, y5o); \
+         dct_wadd(x7, y3o, y4o); \
+         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
+         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
+         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
+         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
+      }
+
+   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
+   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
+   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
+   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
+   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
+   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
+   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
+   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
+
+   // rounding biases in column/row passes, see stbi__idct_block for explanation.
+   __m128i bias_0 = _mm_set1_epi32(512);
+   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
+
+   // load (aligned 128-bit loads, one register per row of 8 coefficients)
+   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
+   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
+   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
+   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
+   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
+   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
+   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
+   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
+
+   // column pass
+   dct_pass(bias_0, 10);
+
+   {
+      // 16bit 8x8 transpose pass 1
+      dct_interleave16(row0, row4);
+      dct_interleave16(row1, row5);
+      dct_interleave16(row2, row6);
+      dct_interleave16(row3, row7);
+
+      // transpose pass 2
+      dct_interleave16(row0, row2);
+      dct_interleave16(row1, row3);
+      dct_interleave16(row4, row6);
+      dct_interleave16(row5, row7);
+
+      // transpose pass 3
+      dct_interleave16(row0, row1);
+      dct_interleave16(row2, row3);
+      dct_interleave16(row4, row5);
+      dct_interleave16(row6, row7);
+   }
+
+   // row pass
+   dct_pass(bias_1, 17);
+
+   {
+      // pack to 8 bits with unsigned saturation
+      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+      __m128i p1 = _mm_packus_epi16(row2, row3);
+      __m128i p2 = _mm_packus_epi16(row4, row5);
+      __m128i p3 = _mm_packus_epi16(row6, row7);
+
+      // 8bit 8x8 transpose pass 1
+      dct_interleave8(p0, p2); // a0e0a1e1...
+      dct_interleave8(p1, p3); // c0g0c1g1...
+
+      // transpose pass 2
+      dct_interleave8(p0, p1); // a0c0e0g0...
+      dct_interleave8(p2, p3); // b0d0f0h0...
+
+      // transpose pass 3
+      dct_interleave8(p0, p2); // a0b0c0d0...
+      dct_interleave8(p1, p3); // a4b4c4d4...
+
+      // store: each register holds two output rows; the shuffle (0x4e) swaps its 64-bit halves to reach the second one
+      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
+   }
+
+#undef dct_const
+#undef dct_rot
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_interleave8
+#undef dct_interleave16
+#undef dct_pass
+}
+
+#endif // STBI_SSE2
+
+#ifdef STBI_NEON
+
+// NEON integer IDCT of one 8x8 block. should produce bit-identical
+// results to the generic C version (stbi__idct_block).
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{  /* out        - receives the 8x8 block of 8-bit samples; 8 bytes are
+    *              stored per row
+    * out_stride - distance in bytes between consecutive output rows
+    * data       - the 64 coefficients, loaded as eight rows of eight */
+   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
+
+   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
+   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
+   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
+   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
+   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
+   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
+   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
+   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
+   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
+   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
+   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
+   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
+
+#define dct_long_mul(out, inq, coeff) \
+   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
+
+#define dct_long_mac(out, acc, inq, coeff) \
+   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
+
+#define dct_widen(out, inq) \
+   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
+   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
+
+// wide add (operates on the _l/_h pair of 32-bit vectors behind each name)
+#define dct_wadd(out, a, b) \
+   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
+
+// wide sub (operates on the _l/_h pair of 32-bit vectors behind each name)
+#define dct_wsub(out, a, b) \
+   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
+
+// butterfly a/b, then shift using "shiftop" by "s" and pack
+#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
+   { \
+      dct_wadd(sum, a, b); \
+      dct_wsub(dif, a, b); \
+      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
+      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
+   }
+
+#define dct_pass(shiftop, shift) \
+   { \
+      /* even part */ \
+      int16x8_t sum26 = vaddq_s16(row2, row6); \
+      dct_long_mul(p1e, sum26, rot0_0); \
+      dct_long_mac(t2e, p1e, row6, rot0_1); \
+      dct_long_mac(t3e, p1e, row2, rot0_2); \
+      int16x8_t sum04 = vaddq_s16(row0, row4); \
+      int16x8_t dif04 = vsubq_s16(row0, row4); \
+      dct_widen(t0e, sum04); \
+      dct_widen(t1e, dif04); \
+      dct_wadd(x0, t0e, t3e); \
+      dct_wsub(x3, t0e, t3e); \
+      dct_wadd(x1, t1e, t2e); \
+      dct_wsub(x2, t1e, t2e); \
+      /* odd part */ \
+      int16x8_t sum15 = vaddq_s16(row1, row5); \
+      int16x8_t sum17 = vaddq_s16(row1, row7); \
+      int16x8_t sum35 = vaddq_s16(row3, row5); \
+      int16x8_t sum37 = vaddq_s16(row3, row7); \
+      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
+      dct_long_mul(p5o, sumodd, rot1_0); \
+      dct_long_mac(p1o, p5o, sum17, rot1_1); \
+      dct_long_mac(p2o, p5o, sum35, rot1_2); \
+      dct_long_mul(p3o, sum37, rot2_0); \
+      dct_long_mul(p4o, sum15, rot2_1); \
+      dct_wadd(sump13o, p1o, p3o); \
+      dct_wadd(sump24o, p2o, p4o); \
+      dct_wadd(sump23o, p2o, p3o); \
+      dct_wadd(sump14o, p1o, p4o); \
+      dct_long_mac(x4, sump13o, row7, rot3_0); \
+      dct_long_mac(x5, sump24o, row5, rot3_1); \
+      dct_long_mac(x6, sump23o, row3, rot3_2); \
+      dct_long_mac(x7, sump14o, row1, rot3_3); \
+      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
+      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
+      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
+      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
+   }
+
+   // load
+   row0 = vld1q_s16(data + 0*8);
+   row1 = vld1q_s16(data + 1*8);
+   row2 = vld1q_s16(data + 2*8);
+   row3 = vld1q_s16(data + 3*8);
+   row4 = vld1q_s16(data + 4*8);
+   row5 = vld1q_s16(data + 5*8);
+   row6 = vld1q_s16(data + 6*8);
+   row7 = vld1q_s16(data + 7*8);
+
+   // add DC bias: 1024 goes into lane 0 of row0 only, i.e. onto data[0]
+   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
+
+   // column pass
+   dct_pass(vrshrn_n_s32, 10);
+
+   // 16bit 8x8 transpose
+   {
+// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
+// whether compilers actually get this is another story, sadly.
+#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
+#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
+
+      // pass 1
+      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
+      dct_trn16(row2, row3);
+      dct_trn16(row4, row5);
+      dct_trn16(row6, row7);
+
+      // pass 2
+      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
+      dct_trn32(row1, row3);
+      dct_trn32(row4, row6);
+      dct_trn32(row5, row7);
+
+      // pass 3
+      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
+      dct_trn64(row1, row5);
+      dct_trn64(row2, row6);
+      dct_trn64(row3, row7);
+
+#undef dct_trn16
+#undef dct_trn32
+#undef dct_trn64
+   }
+
+   // row pass
+   // vrshrn_n_s32 only supports shifts up to 16, we need
+   // 17. so do a non-rounding shift of 16 first then follow
+   // up with a rounding shift by 1.
+   dct_pass(vshrn_n_s32, 16);
+
+   {
+      // pack and round: the final rounding shift by 1, narrowed to 8 bits with unsigned saturation
+      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
+      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
+      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
+      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
+      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
+      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
+      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
+      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
+
+      // again, these can translate into one instruction, but often don't.
+#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
+#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
+
+      // sadly can't use interleaved stores here since we only write
+      // 8 bytes to each scan line!
+
+      // 8x8 8-bit transpose pass 1
+      dct_trn8_8(p0, p1);
+      dct_trn8_8(p2, p3);
+      dct_trn8_8(p4, p5);
+      dct_trn8_8(p6, p7);
+
+      // pass 2
+      dct_trn8_16(p0, p2);
+      dct_trn8_16(p1, p3);
+      dct_trn8_16(p4, p6);
+      dct_trn8_16(p5, p7);
+
+      // pass 3
+      dct_trn8_32(p0, p4);
+      dct_trn8_32(p1, p5);
+      dct_trn8_32(p2, p6);
+      dct_trn8_32(p3, p7);
+
+      // store one 8-byte row at a time
+      vst1_u8(out, p0); out += out_stride;
+      vst1_u8(out, p1); out += out_stride;
+      vst1_u8(out, p2); out += out_stride;
+      vst1_u8(out, p3); out += out_stride;
+      vst1_u8(out, p4); out += out_stride;
+      vst1_u8(out, p5); out += out_stride;
+      vst1_u8(out, p6); out += out_stride;
+      vst1_u8(out, p7);
+
+#undef dct_trn8_8
+#undef dct_trn8_16
+#undef dct_trn8_32
+   }
+
+#undef dct_long_mul
+#undef dct_long_mac
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_pass
+}
+
+#endif // STBI_NEON
+
+#define STBI__MARKER_none  0xff
+// return the next marker code. a marker left pending by the entropy
+// decoder (j->marker) is handed out first and cleared. otherwise one is
+// read from the stream: the next byte must be 0xff, any further 0xff
+// bytes are skipped, and the first other byte is the marker. if the
+// stream is not positioned at a 0xff byte, STBI__MARKER_none (0xff,
+// never a valid marker value) is returned.
+static stbi_uc stbi__get_marker(stbi__jpeg *j)
+{
+   stbi_uc code = j->marker;
+   if (code != STBI__MARKER_none) {
+      // consume the pending marker
+      j->marker = STBI__MARKER_none;
+      return code;
+   }
+   if (stbi__get8(j->s) != 0xff)
+      return STBI__MARKER_none;
+   // swallow any repeated 0xff bytes in front of the marker code
+   do {
+      code = stbi__get8(j->s);
+   } while (code == 0xff);
+   return code;
+}
+
+// in each scan, we'll have scan_n components, and the order
+// of the components is specified by order[]
+// STBI__RESTART(x) is nonzero when marker code x lies in 0xd0..0xd7
+#define STBI__RESTART(x)     (!((x) < 0xd0 || (x) > 0xd7))
+
+// put the entropy decoder back into its start-of-interval state: empty
+// bit buffer, no pending marker, no end-of-band run, DC predictions of
+// the first three components zeroed, and the MCU countdown reloaded
+static void stbi__jpeg_reset(stbi__jpeg *j)
+{
+   int c;
+
+   // bit buffer
+   j->code_buffer = 0;
+   j->code_bits = 0;
+   j->nomore = 0;
+
+   // per-component dc prediction
+   for (c=0; c < 3; ++c)
+      j->img_comp[c].dc_pred = 0;
+
+   j->marker = STBI__MARKER_none;
+   j->eob_run = 0;
+
+   // no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
+   // since we don't even allow 1<<30 pixels
+   if (j->restart_interval)
+      j->todo = j->restart_interval;
+   else
+      j->todo = 0x7fffffff;
+}
+
+static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
+{  /* Decodes the entropy-coded data of one scan, driven by the scan state
+    * held in z (progressive flag, scan_n, order[], spectral range).
+    * Non-progressive scans decode each block and run the IDCT kernel on it
+    * straight away, writing pixels into img_comp[n].data. Progressive scans
+    * only update the stored coefficients in img_comp[n].coeff.
+    * Returns 0 when a block decoder fails; otherwise 1 -- including the case
+    * where the restart countdown expires and no restart marker is pending,
+    * which simply ends the scan early. */
+   stbi__jpeg_reset(z);
+   if (!z->progressive) {
+      if (z->scan_n == 1) {
+         int i,j;
+         STBI_SIMD_ALIGN(short, data[64]);
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               int ha = z->img_comp[n].ha;
+               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  // if it's NOT a restart, then just bail, so we get corrupt data
+                  // rather than no data
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         STBI_SIMD_ALIGN(short, data[64]);
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x)*8; // pixel column of this block
+                        int y2 = (j*z->img_comp[n].v + y)*8; // pixel row of this block
+                        int ha = z->img_comp[n].ha;
+                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1; // no restart marker pending: stop, keep what was decoded
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      }
+   } else {
+      if (z->scan_n == 1) {
+         int i,j;
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); // this block's 64 stored coefficients
+               if (z->spec_start == 0) {
+                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                     return 0;
+               } else {
+                  int ha = z->img_comp[n].ha;
+                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                     return 0;
+               }
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1; // no restart marker pending: stop, keep what was decoded
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x); // block column (in blocks, not pixels)
+                        int y2 = (j*z->img_comp[n].v + y); // block row (in blocks, not pixels)
+                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
+                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) // only the DC decoder is used on this path
+                           return 0;
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1; // no restart marker pending: stop, keep what was decoded
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      }
+   }
+}
+
+// multiply each of the 64 coefficients in data by the matching entry
+// of the quantization table dequant, in place
+static void stbi__jpeg_dequantize(short *data, stbi_uc *dequant)
+{
+   short *end = data + 64;
+   while (data != end) {
+      *data = (short) (*data * *dequant);
+      ++data;
+      ++dequant;
+   }
+}
+
+// for progressive images, turn the stored coefficients into pixels:
+// every block of every component is dequantized in place and then run
+// through the IDCT kernel. does nothing for non-progressive images.
+static void stbi__jpeg_finish(stbi__jpeg *z)
+{
+   int comp, bx, by;
+
+   if (!z->progressive)
+      return;
+
+   for (comp=0; comp < z->s->img_n; ++comp) {
+      // number of 8x8 blocks needed to cover the component
+      int blocks_w = (z->img_comp[comp].x+7) >> 3;
+      int blocks_h = (z->img_comp[comp].y+7) >> 3;
+      for (by=0; by < blocks_h; ++by) {
+         for (bx=0; bx < blocks_w; ++bx) {
+            short *coeffs = z->img_comp[comp].coeff + 64 * (bx + by * z->img_comp[comp].coeff_w);
+            stbi__jpeg_dequantize(coeffs, z->dequant[z->img_comp[comp].tq]);
+            z->idct_block_kernel(z->img_comp[comp].data+z->img_comp[comp].w2*by*8+bx*8, z->img_comp[comp].w2, coeffs);
+         }
+      }
+   }
+}
+
+// Process one marker segment found outside of entropy-coded data.
+//   z - decoder state; parsed tables and the restart interval are stored in it
+//   m - marker code, or STBI__MARKER_none if no marker was found
+// Handles DRI (0xDD), DQT (0xDB) and DHT (0xC4), and skips APPn (0xE0-0xEF)
+// and COM (0xFE) segments.
+// Returns 1 when the segment was consumed, 0 when the marker is not handled
+// here or the segment is malformed.
+static int stbi__process_marker(stbi__jpeg *z, int m)
+{
+   int L;
+   switch (m) {
+      case STBI__MARKER_none: // no marker found
+         return stbi__err("expected marker","Corrupt JPEG");
+
+      case 0xDD: // DRI - specify restart interval
+         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
+         z->restart_interval = stbi__get16be(z->s);
+         return 1;
+
+      case 0xDB: // DQT - define quantization table
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
+            int q = stbi__get8(z->s);
+            int p = q >> 4;   // precision nibble; only 0 is accepted
+            int t = q & 15,i; // table index
+            if (p != 0) return stbi__err("bad DQT type","Corrupt JPEG");
+            if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
+            // entries arrive in zigzag order; store them de-zigzagged
+            for (i=0; i < 64; ++i)
+               z->dequant[t][stbi__jpeg_dezigzag[i]] = stbi__get8(z->s);
+            L -= 65;
+         }
+         return L==0;
+
+      case 0xC4: // DHT - define huffman table
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
+            stbi_uc *v;
+            int sizes[16],i,n=0;
+            int q = stbi__get8(z->s);
+            int tc = q >> 4; // table class: 0 = DC, 1 = AC
+            int th = q & 15; // table index
+            if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
+            for (i=0; i < 16; ++i) {
+               sizes[i] = stbi__get8(z->s);
+               n += sizes[i];
+            }
+            // the sixteen counts can add up to 16*255, but a table codes
+            // 8-bit symbols, so more than 256 of them is corrupt. reject
+            // it before building the table or filling values[], which
+            // would otherwise be written past its end.
+            // NOTE(review): assumes values[] has 256 entries -- confirm
+            // against the stbi__huffman declaration.
+            if (n > 256) return stbi__err("bad DHT header","Corrupt JPEG");
+            L -= 17;
+            if (tc == 0) {
+               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
+               v = z->huff_dc[th].values;
+            } else {
+               if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
+               v = z->huff_ac[th].values;
+            }
+            for (i=0; i < n; ++i)
+               v[i] = stbi__get8(z->s);
+            if (tc != 0)
+               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+            L -= n;
+         }
+         return L==0;
+   }
+   // check for comment block or APP blocks
+   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
+      stbi__skip(z->s, stbi__get16be(z->s)-2);
+      return 1;
+   }
+   return 0;
+}
+
+// Called right after an SOS marker has been seen. Parses the scan header:
+// the list of components in the scan with their Huffman table selectors,
+// followed by the spectral-selection and successive-approximation fields.
+// Returns 1 on success, 0 on failure.
+static int stbi__process_scan_header(stbi__jpeg *z)
+{
+   int k, approx;
+   int seg_len = stbi__get16be(z->s);
+   z->scan_n = stbi__get8(z->s);
+   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n)
+      return stbi__err("bad SOS component count","Corrupt JPEG");
+   if (seg_len != 6+2*z->scan_n)
+      return stbi__err("bad SOS len","Corrupt JPEG");
+
+   for (k = 0; k < z->scan_n; ++k) {
+      int comp_id = stbi__get8(z->s);
+      int tables  = stbi__get8(z->s);
+      int idx = 0;
+      // find the frame component that carries this id
+      while (idx < z->s->img_n && z->img_comp[idx].id != comp_id)
+         ++idx;
+      if (idx == z->s->img_n) return 0; // no match
+      z->img_comp[idx].hd = tables >> 4;
+      if (z->img_comp[idx].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
+      z->img_comp[idx].ha = tables & 15;
+      if (z->img_comp[idx].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
+      z->order[k] = idx;
+   }
+
+   z->spec_start = stbi__get8(z->s);
+   z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
+   approx = stbi__get8(z->s);
+   z->succ_high = (approx >> 4);
+   z->succ_low  = (approx & 15);
+
+   if (!z->progressive) {
+      // sequential scans must cover the whole block with no approximation
+      if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
+      if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
+      z->spec_end = 63;
+      return 1;
+   }
+
+   if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
+      return stbi__err("bad SOS", "Corrupt JPEG");
+   return 1;
+}
+
+// Parse the SOF (start of frame) header: sample precision, image size and,
+// for each component, its id, sampling factors and quantization table
+// selector. When `scan` is STBI__SCAN_load the function additionally derives
+// the MCU geometry and allocates every component's pixel plane (plus, for
+// progressive images, its coefficient store).
+// Returns 1 on success and 0 on failure. On an allocation failure all
+// buffers allocated here are released and their pointers reset.
+static int stbi__process_frame_header(stbi__jpeg *z, int scan)
+{
+   stbi__context *s = z->s;
+   int Lf,p,i,q, h_max=1,v_max=1,c;
+   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
+   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
+   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
+   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
+   c = stbi__get8(s);
+   if (c != 3 && c != 1) return stbi__err("bad component count","Corrupt JPEG");    // JFIF requires
+   s->img_n = c;
+   for (i=0; i < c; ++i) {
+      z->img_comp[i].data = NULL;
+      z->img_comp[i].linebuf = NULL;
+   }
+
+   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
+
+   z->rgb = 0;
+   for (i=0; i < s->img_n; ++i) {
+      static unsigned char rgb[3] = { 'R', 'G', 'B' };
+      z->img_comp[i].id = stbi__get8(s);
+      if (z->img_comp[i].id != i+1)   // JFIF requires
+         if (z->img_comp[i].id != i) {  // some version of jpegtran outputs non-JFIF-compliant files!
+            // some encoders output this (see http://fileformats.archiveteam.org/wiki/JPEG#Color_format)
+            if (z->img_comp[i].id != rgb[i])
+               return stbi__err("bad component ID","Corrupt JPEG");
+            ++z->rgb; // counts components whose id is the letter 'R', 'G' or 'B'
+         }
+      q = stbi__get8(s);
+      z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
+      z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
+      z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
+   }
+
+   // header-only scans stop here; nothing has been allocated yet
+   if (scan != STBI__SCAN_load) return 1;
+
+   if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+
+   for (i=0; i < s->img_n; ++i) {
+      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
+      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
+   }
+
+   // compute interleaved mcu info
+   z->img_h_max = h_max;
+   z->img_v_max = v_max;
+   z->img_mcu_w = h_max * 8;
+   z->img_mcu_h = v_max * 8;
+   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
+   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
+
+   for (i=0; i < s->img_n; ++i) {
+      // number of effective pixels (e.g. for non-interleaved MCU)
+      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
+      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
+      // to simplify generation, we'll allocate enough memory to decode
+      // the bogus oversized data from using interleaved MCUs and their
+      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
+      // discard the extra data until colorspace conversion
+      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
+      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
+
+      // the extra 15 bytes leave room to round the pointer up to 16
+      z->img_comp[i].raw_data = stbi__malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
+      z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].coeff = 0;
+      if (z->img_comp[i].raw_data != NULL && z->progressive) {
+         z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
+         z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
+         z->img_comp[i].raw_coeff = STBI_MALLOC(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
+      }
+
+      if (z->img_comp[i].raw_data == NULL || (z->progressive && z->img_comp[i].raw_coeff == NULL)) {
+         // out of memory: release this component and every earlier one so no
+         // pointer is left referring to freed or never-allocated storage
+         for (; i >= 0; --i) {
+            if (z->img_comp[i].raw_data)  STBI_FREE(z->img_comp[i].raw_data);
+            if (z->img_comp[i].raw_coeff) STBI_FREE(z->img_comp[i].raw_coeff);
+            z->img_comp[i].raw_data = NULL;
+            z->img_comp[i].data = NULL;
+            z->img_comp[i].raw_coeff = 0;
+            z->img_comp[i].coeff = 0;
+         }
+         return stbi__err("outofmem", "Out of memory");
+      }
+
+      // align blocks for idct using mmx/sse
+      z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
+      z->img_comp[i].linebuf = NULL;
+      if (z->progressive)
+         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
+   }
+
+   return 1;
+}
+
+// use comparisons since in some cases we handle more than one case (e.g. SOF)
+#define stbi__DNL(x)         ((x) == 0xdc) // true when x is the DNL marker code
+#define stbi__SOI(x)         ((x) == 0xd8) // true when x is the SOI (start of image) marker code
+#define stbi__EOI(x)         ((x) == 0xd9) // true when x is the EOI (end of image) marker code
+#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) // true for any of the three accepted SOF (start of frame) codes
+#define stbi__SOS(x)         ((x) == 0xda) // true when x is the SOS (start of scan) marker code
+
+#define stbi__SOF_progressive(x)   ((x) == 0xc2) // the one SOF code that marks the image as progressive
+
+// Read the stream up to and including the SOF frame header.
+// With STBI__SCAN_type only the leading SOI marker is verified. Otherwise
+// every marker segment found before SOF is handed to stbi__process_marker
+// and the frame header is then parsed with the given scan mode.
+// Returns 1 on success, 0 on failure.
+static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
+{
+   int marker;
+   z->marker = STBI__MARKER_none; // initialize cached marker to empty
+   marker = stbi__get_marker(z);
+   if (!stbi__SOI(marker)) return stbi__err("no SOI","Corrupt JPEG");
+   if (scan == STBI__SCAN_type) return 1;
+
+   for (marker = stbi__get_marker(z); !stbi__SOF(marker); ) {
+      if (!stbi__process_marker(z, marker)) return 0;
+      // some files have extra padding after their blocks, so keep scanning
+      // until a marker shows up or the input runs out
+      for (marker = stbi__get_marker(z); marker == STBI__MARKER_none; marker = stbi__get_marker(z)) {
+         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
+      }
+   }
+
+   z->progressive = stbi__SOF_progressive(marker);
+   return stbi__process_frame_header(z, scan) ? 1 : 0;
+}
+
+// Decode the whole image into per-component planes (still in the file's own
+// colour format, e.g. YCbCr). Processes every segment up to the EOI marker
+// and, for progressive files, runs the final dequantize/IDCT pass.
+// Returns 1 on success, 0 on failure.
+static int stbi__decode_jpeg_image(stbi__jpeg *j)
+{
+   int m = 0;
+   // start with no buffers so a later cleanup never frees garbage
+   while (m < 4) {
+      j->img_comp[m].raw_data = NULL;
+      j->img_comp[m].raw_coeff = NULL;
+      m++;
+   }
+   j->restart_interval = 0;
+   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
+
+   for (m = stbi__get_marker(j); !stbi__EOI(m); m = stbi__get_marker(j)) {
+      if (!stbi__SOS(m)) {
+         if (!stbi__process_marker(j, m)) return 0;
+         continue;
+      }
+
+      if (!stbi__process_scan_header(j)) return 0;
+      if (!stbi__parse_entropy_coded_data(j)) return 0;
+      if (j->marker != STBI__MARKER_none)
+         continue;
+
+      // handle 0s at the end of image data from IP Kamera 9060
+      while (!stbi__at_eof(j->s)) {
+         int b = stbi__get8(j->s);
+         if (b == 0)
+            continue;
+         if (b != 255)
+            return stbi__err("junk before marker", "Corrupt JPEG");
+         j->marker = stbi__get8(j->s);
+         break;
+      }
+      // if we reach eof without hitting a marker, the stbi__get_marker() call
+      // in the loop header will fail and we'll eventually return 0
+   }
+
+   if (j->progressive)
+      stbi__jpeg_finish(j);
+   return 1;
+}
+
+// static jfif-centered resampling (across block boundaries)
+
+typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1, // row resampler: returns the row to read results from (out, or an input row)
+                                    int w, int hs); // w = number of input samples, hs = horizontal expansion factor
+
+#define stbi__div4(x) ((stbi_uc) ((x) >> 2)) // x/4 via shift, narrowed to stbi_uc
+
+// Pass-through resampler for components that need no expansion: the near
+// input row is handed back unchanged and `out` is never written.
+static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   STBI_NOTUSED(hs);
+   STBI_NOTUSED(w);
+   STBI_NOTUSED(in_far);
+   STBI_NOTUSED(out);
+   return in_near;
+}
+
+// Vertical 2x expansion: each output sample is a 3:1 blend of the nearer
+// and the farther input row, rounded. Writes `w` samples and returns `out`.
+static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   int col;
+   STBI_NOTUSED(hs);
+   for (col = 0; col < w; ++col) {
+      int blend = 3*in_near[col] + in_far[col] + 2;
+      out[col] = stbi__div4(blend);
+   }
+   return out;
+}
+
+// Horizontal 2x expansion: every input sample yields two outputs, each a
+// 3:1 blend of the sample with its left or right neighbour. The two outer
+// outputs copy the edge samples. Writes 2*w samples and returns `out`.
+static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   int k;
+   stbi_uc *src = in_near;
+
+   STBI_NOTUSED(in_far);
+   STBI_NOTUSED(hs);
+
+   if (w == 1) {
+      // a single sample has no neighbour to interpolate with
+      out[1] = src[0];
+      out[0] = out[1];
+      return out;
+   }
+
+   // left edge
+   out[0] = src[0];
+   out[1] = stbi__div4(src[0]*3 + src[1] + 2);
+
+   // interior samples
+   k = 1;
+   while (k < w-1) {
+      int centre = 3*src[k]+2;
+      out[k*2+0] = stbi__div4(centre+src[k-1]);
+      out[k*2+1] = stbi__div4(centre+src[k+1]);
+      ++k;
+   }
+
+   // right edge
+   out[k*2+0] = stbi__div4(src[w-2]*3 + src[w-1] + 2);
+   out[k*2+1] = src[w-1];
+
+   return out;
+}
+
+#define stbi__div16(x) ((stbi_uc) ((x) >> 4)) // x/16 via shift, narrowed to stbi_uc
+
+// 2x expansion in both directions. Each column is first blended vertically
+// (3*near + far); neighbouring column values are then blended 3:1
+// horizontally. Writes 2*w samples and returns `out`.
+static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   int col, prev, cur;
+
+   STBI_NOTUSED(hs);
+
+   if (w == 1) {
+      out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
+      out[0] = out[1];
+      return out;
+   }
+
+   cur = 3*in_near[0] + in_far[0];
+   out[0] = stbi__div4(cur+2);
+   col = 1;
+   while (col < w) {
+      prev = cur;
+      cur = 3*in_near[col]+in_far[col];
+      out[col*2-1] = stbi__div16(3*prev + cur + 8);
+      out[col*2  ] = stbi__div16(3*cur + prev + 8);
+      ++col;
+   }
+   out[w*2-1] = stbi__div4(cur+2);
+
+   return out;
+}
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   /* SIMD variant of the 2x2 expansion: generates 2x2 samples for every one
+      in the input. Groups of 8 input samples are handled with SSE2 or NEON
+      instructions; whatever is left over, including the final sample of the
+      row, goes through the scalar code that follows the vector loop.
+      Writes 2*w samples to `out` and returns `out`. */
+   int i=0,t0,t1;
+
+   if (w == 1) {
+      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
+      return out;
+   }
+
+   t1 = 3*in_near[0] + in_far[0]; // vertically filtered value of column 0
+   // process groups of 8 pixels for as long as we can.
+   // note we can't handle the last pixel in a row in this loop
+   // because we need to handle the filter boundary conditions.
+   for (; i < ((w-1) & ~7); i += 8) { // bound guarantees index i+8 (read below) is at most w-1
+#if defined(STBI_SSE2)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      __m128i zero  = _mm_setzero_si128();
+      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
+      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
+      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
+      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
+      __m128i diff  = _mm_sub_epi16(farw, nearw);
+      __m128i nears = _mm_slli_epi16(nearw, 2);
+      __m128i curr  = _mm_add_epi16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      __m128i prv0 = _mm_slli_si128(curr, 2);
+      __m128i nxt0 = _mm_srli_si128(curr, 2);
+      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
+      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      __m128i bias  = _mm_set1_epi16(8);
+      __m128i curs = _mm_slli_epi16(curr, 2);
+      __m128i prvd = _mm_sub_epi16(prev, curr);
+      __m128i nxtd = _mm_sub_epi16(next, curr);
+      __m128i curb = _mm_add_epi16(curs, bias);
+      __m128i even = _mm_add_epi16(prvd, curb);
+      __m128i odd  = _mm_add_epi16(nxtd, curb);
+
+      // interleave even and odd pixels, then undo scaling.
+      __m128i int0 = _mm_unpacklo_epi16(even, odd);
+      __m128i int1 = _mm_unpackhi_epi16(even, odd);
+      __m128i de0  = _mm_srli_epi16(int0, 4);
+      __m128i de1  = _mm_srli_epi16(int1, 4);
+
+      // pack and write output
+      __m128i outv = _mm_packus_epi16(de0, de1);
+      _mm_storeu_si128((__m128i *) (out + i*2), outv);
+#elif defined(STBI_NEON)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      uint8x8_t farb  = vld1_u8(in_far + i);
+      uint8x8_t nearb = vld1_u8(in_near + i);
+      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
+      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
+      int16x8_t curr  = vaddq_s16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      int16x8_t prv0 = vextq_s16(curr, curr, 7);
+      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
+      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
+      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      int16x8_t curs = vshlq_n_s16(curr, 2);
+      int16x8_t prvd = vsubq_s16(prev, curr);
+      int16x8_t nxtd = vsubq_s16(next, curr);
+      int16x8_t even = vaddq_s16(curs, prvd);
+      int16x8_t odd  = vaddq_s16(curs, nxtd);
+
+      // undo scaling and round, then store with even/odd phases interleaved
+      uint8x8x2_t o;
+      o.val[0] = vqrshrun_n_s16(even, 4);
+      o.val[1] = vqrshrun_n_s16(odd,  4);
+      vst2_u8(out + i*2, o);
+#endif
+
+      // "previous" value for next iter
+      t1 = 3*in_near[i+7] + in_far[i+7];
+   }
+
+   // scalar tail: first write the even output of column i, whose left
+   // neighbour value is still held in t1 from the code above
+   t0 = t1;
+   t1 = 3*in_near[i] + in_far[i];
+   out[i*2] = stbi__div16(3*t1 + t0 + 8);
+
+   for (++i; i < w; ++i) { // remaining columns, same arithmetic as the scalar resampler
+      t0 = t1;
+      t1 = 3*in_near[i]+in_far[i];
+      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
+      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
+   }
+   out[w*2-1] = stbi__div4(t1+2); // right edge: last column only, no right neighbour
+
+   STBI_NOTUSED(hs);
+
+   return out;
+}
+#endif
+
+// Fallback resampler for any expansion factor: nearest-neighbour, i.e. each
+// of the `w` input samples is repeated `hs` times. Returns `out`.
+static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   int src, rep;
+   STBI_NOTUSED(in_far);
+   src = 0;
+   while (src < w) {
+      rep = 0;
+      while (rep < hs) {
+         out[src*hs+rep] = in_near[src];
+         ++rep;
+      }
+      ++src;
+   }
+   return out;
+}
+
+#ifdef STBI_JPEG_OLD
+// this is the same YCbCr-to-RGB calculation that stb_image has used
+// historically before the algorithm changes in 1.49
+#define float2fixed(x)  ((int) ((x) * 65536 + 0.5)) // 16.16 fixed point
+static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
+{
+   int i;
+   for (i=0; i < count; ++i) { // one pixel per iteration; out advances by step bytes
+      int y_fixed = (y[i] << 16) + 32768; // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128; // chroma samples are stored with a +128 bias
+      int cb = pcb[i] - 128;
+      r = y_fixed + cr*float2fixed(1.40200f);
+      g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f);
+      b = y_fixed                            + cb*float2fixed(1.77200f);
+      r >>= 16;
+      g >>= 16;
+      b >>= 16;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } // clamp to 0..255; the unsigned compare catches negatives too
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255; // always written: with step 3 this byte belongs to the next pixel
+      out += step;
+   }
+}
+#else
+// this is a reduced-precision calculation of YCbCr-to-RGB introduced
+// to make sure the code produces the same results in both SIMD and scalar
+#define float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8) // 12 fractional bits of precision, stored at a 20-bit scale
+static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
+{
+   int i;
+   for (i=0; i < count; ++i) { // one pixel per iteration; out advances by step bytes
+      int y_fixed = (y[i] << 20) + (1<<19); // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128; // chroma samples are stored with a +128 bias
+      int cb = pcb[i] - 128;
+      r = y_fixed +  cr* float2fixed(1.40200f);
+      g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000); // mask drops the low 16 bits of the cb term; presumably to mirror the SIMD rounding - confirm
+      b = y_fixed                               +   cb* float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } // clamp to 0..255; the unsigned compare catches negatives too
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255; // always written: with step 3 this byte belongs to the next pixel
+      out += step;
+   }
+}
+#endif
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
+{ /* Convert `count` YCbCr pixels to RGB, writing `step` bytes apart in out.
+     Only step == 4 is vectorized, eight pixels per iteration. Any pixels
+     left over, and every pixel when step is not 4, are converted by the
+     scalar loop at the bottom of the function. */
+   int i = 0;
+
+#ifdef STBI_SSE2
+   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
+   // it's useful in practice (you wouldn't use it for textures, for example).
+   // so just accelerate step == 4 case.
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      __m128i signflip  = _mm_set1_epi8(-0x80);
+      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
+      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
+      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
+      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
+      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
+      __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+      for (; i+7 < count; i += 8) { // runs only while a full group of 8 pixels remains
+         // load
+         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
+         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
+         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
+         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
+
+         // unpack to short (and left-shift cr, cb by 8)
+         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
+         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
+
+         // color transform
+         __m128i yws = _mm_srli_epi16(yw, 4);
+         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
+         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
+         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
+         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
+         __m128i rws = _mm_add_epi16(cr0, yws);
+         __m128i gwt = _mm_add_epi16(cb0, yws);
+         __m128i bws = _mm_add_epi16(yws, cb1);
+         __m128i gws = _mm_add_epi16(gwt, cr1);
+
+         // descale
+         __m128i rw = _mm_srai_epi16(rws, 4);
+         __m128i bw = _mm_srai_epi16(bws, 4);
+         __m128i gw = _mm_srai_epi16(gws, 4);
+
+         // back to byte, set up for transpose
+         __m128i brb = _mm_packus_epi16(rw, bw);
+         __m128i gxb = _mm_packus_epi16(gw, xw);
+
+         // transpose to interleave channels
+         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
+         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
+         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
+         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
+
+         // store
+         _mm_storeu_si128((__m128i *) (out + 0), o0);
+         _mm_storeu_si128((__m128i *) (out + 16), o1);
+         out += 32; // 8 pixels * 4 bytes
+      }
+   }
+#endif
+
+#ifdef STBI_NEON
+   // in this version, step=3 support would be easy to add. but is there demand?
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      uint8x8_t signflip = vdup_n_u8(0x80);
+      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
+      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
+      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
+      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
+
+      for (; i+7 < count; i += 8) { // runs only while a full group of 8 pixels remains
+         // load
+         uint8x8_t y_bytes  = vld1_u8(y + i);
+         uint8x8_t cr_bytes = vld1_u8(pcr + i);
+         uint8x8_t cb_bytes = vld1_u8(pcb + i);
+         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+         // expand to s16
+         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+         int16x8_t crw = vshll_n_s8(cr_biased, 7);
+         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+         // color transform
+         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+         int16x8_t rws = vaddq_s16(yws, cr0);
+         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+         int16x8_t bws = vaddq_s16(yws, cb1);
+
+         // undo scaling, round, convert to byte
+         uint8x8x4_t o;
+         o.val[0] = vqrshrun_n_s16(rws, 4);
+         o.val[1] = vqrshrun_n_s16(gws, 4);
+         o.val[2] = vqrshrun_n_s16(bws, 4);
+         o.val[3] = vdup_n_u8(255);
+
+         // store, interleaving r/g/b/a
+         vst4_u8(out, o);
+         out += 8*4;
+      }
+   }
+#endif
+
+   for (; i < count; ++i) { // scalar path for whatever the vector loops did not cover
+      int y_fixed = (y[i] << 20) + (1<<19); // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128;
+      int cb = pcb[i] - 128;
+      r = y_fixed + cr* float2fixed(1.40200f);
+      g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                             +   cb* float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } // clamp to 0..255
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255; // always written: with step 3 this byte belongs to the next pixel
+      out += step;
+   }
+}
+#endif
+
+// Choose the routines used for IDCT, YCbCr-to-RGB conversion and 2x2
+// resampling. The portable C versions are installed first; they are then
+// replaced by the SIMD versions when those are compiled in (and, for SSE2,
+// reported as available at run time).
+static void stbi__setup_jpeg(stbi__jpeg *j)
+{
+   // portable defaults
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
+   j->idct_block_kernel = stbi__idct_block;
+
+#ifdef STBI_SSE2
+   if (stbi__sse2_available()) {
+      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+      #ifndef STBI_JPEG_OLD
+      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+      #endif
+      j->idct_block_kernel = stbi__idct_simd;
+   }
+#endif
+
+#ifdef STBI_NEON
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+   #ifndef STBI_JPEG_OLD
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+   #endif
+   j->idct_block_kernel = stbi__idct_simd;
+#endif
+}
+
+// Release the temporary buffers of every component counted by img_n (pixel
+// plane, coefficient store, line buffer) and reset the pointers, so calling
+// this again afterwards frees nothing twice.
+static void stbi__cleanup_jpeg(stbi__jpeg *j)
+{
+   int c = 0;
+   while (c < j->s->img_n) {
+      if (j->img_comp[c].raw_data != NULL) {
+         STBI_FREE(j->img_comp[c].raw_data);
+         j->img_comp[c].data = NULL;
+         j->img_comp[c].raw_data = NULL;
+      }
+      if (j->img_comp[c].raw_coeff != NULL) {
+         STBI_FREE(j->img_comp[c].raw_coeff);
+         j->img_comp[c].coeff = 0;
+         j->img_comp[c].raw_coeff = 0;
+      }
+      if (j->img_comp[c].linebuf != NULL) {
+         STBI_FREE(j->img_comp[c].linebuf);
+         j->img_comp[c].linebuf = NULL;
+      }
+      ++c;
+   }
+}
+
+// Per-component state kept while a subsampled plane is expanded row by row.
+typedef struct
+{
+   resample_row_func resample; // row routine chosen for this component
+   stbi_uc *line0;             // the two source rows handed to the row routine
+   stbi_uc *line1;
+   int hs;                     // horizontal expansion factor
+   int vs;                     // vertical expansion factor
+   int w_lores;                // horizontal pixels pre-expansion
+   int ystep;                  // how far through vertical expansion we are
+   int ypos;                   // which pre-expansion row we're on
+} stbi__resample;
+
+static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
+{ /* Decode the JPEG attached to z and return a newly allocated pixel buffer
+     holding n bytes per pixel, where n is req_comp, or the image's own
+     component count when req_comp is 0.
+       out_x, out_y  receive the image size (written unconditionally)
+       comp          optional; receives the file's component count
+       req_comp      0..4; anything else is rejected
+     Returns NULL when decoding fails; the other failure paths return the
+     result of stbi__errpuc. The temporary component buffers are released
+     on every path that runs after the req_comp check. */
+   int n, decode_n;
+   z->s->img_n = 0; // make stbi__cleanup_jpeg safe
+
+   // validate req_comp
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+
+   // load a jpeg image from whichever source, but leave in YCbCr format
+   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
+
+   // determine actual number of components to generate
+   n = req_comp ? req_comp : z->s->img_n;
+
+   if (z->s->img_n == 3 && n < 3)
+      decode_n = 1; // grey output from a 3-component image only needs the first component
+   else
+      decode_n = z->s->img_n;
+
+   // resample and color-convert
+   {
+      int k;
+      unsigned int i,j;
+      stbi_uc *output;
+      stbi_uc *coutput[4]; // per component: the resampled row for the current output line
+
+      stbi__resample res_comp[4];
+
+      for (k=0; k < decode_n; ++k) {
+         stbi__resample *r = &res_comp[k];
+
+         // allocate line buffer big enough for upsampling off the edges
+         // with upsample factor of 4
+         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
+         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+         r->hs      = z->img_h_max / z->img_comp[k].h;
+         r->vs      = z->img_v_max / z->img_comp[k].v;
+         r->ystep   = r->vs >> 1;
+         r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
+         r->ypos    = 0;
+         r->line0   = r->line1 = z->img_comp[k].data; // both rows start on the first row of the plane
+
+         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
+         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
+         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
+         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
+         else                               r->resample = stbi__resample_row_generic;
+      }
+
+      // can't error after this so, this is safe
+      output = (stbi_uc *) stbi__malloc(n * z->s->img_x * z->s->img_y + 1); // one spare byte: the loops below may store out[3] even when n is 3
+      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+      // now go ahead and resample
+      for (j=0; j < z->s->img_y; ++j) {
+         stbi_uc *out = output + n * z->s->img_x * j; // start of output row j
+         for (k=0; k < decode_n; ++k) {
+            stbi__resample *r = &res_comp[k];
+            int y_bot = r->ystep >= (r->vs >> 1); // picks which of line0/line1 is passed as the near row
+            coutput[k] = r->resample(z->img_comp[k].linebuf,
+                                     y_bot ? r->line1 : r->line0,
+                                     y_bot ? r->line0 : r->line1,
+                                     r->w_lores, r->hs);
+            if (++r->ystep >= r->vs) { // finished all output rows for this source row
+               r->ystep = 0;
+               r->line0 = r->line1;
+               if (++r->ypos < z->img_comp[k].y) // stay on the last row once the bottom of the plane is reached
+                  r->line1 += z->img_comp[k].w2;
+            }
+         }
+         if (n >= 3) {
+            stbi_uc *y = coutput[0];
+            if (z->s->img_n == 3) {
+               if (z->rgb == 3) { // all three component ids were 'R','G','B': copy without colour conversion
+                  for (i=0; i < z->s->img_x; ++i) {
+                     out[0] = y[i];
+                     out[1] = coutput[1][i];
+                     out[2] = coutput[2][i];
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else {
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else
+               for (i=0; i < z->s->img_x; ++i) { // single-component source: replicate grey into r, g and b
+                  out[0] = out[1] = out[2] = y[i];
+                  out[3] = 255; // not used if n==3
+                  out += n;
+               }
+         } else {
+            stbi_uc *y = coutput[0];
+            if (n == 1)
+               for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+            else
+               for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255; // grey + opaque alpha
+         }
+      }
+      stbi__cleanup_jpeg(z);
+      *out_x = z->s->img_x;
+      *out_y = z->s->img_y;
+      if (comp) *comp  = z->s->img_n; // report original components, not output
+      return output;
+   }
+}
+
+// Entry point for loading a JPEG from context `s`. Allocates the decoder
+// state, selects the kernels, decodes the image through load_jpeg_image and
+// frees the decoder state again.
+// Returns the pixel buffer produced by load_jpeg_image, or the result of
+// stbi__errpuc when the decoder state cannot be allocated.
+static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char* result;
+   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   // the allocation can fail; report it instead of writing through NULL
+   if (j == NULL) return stbi__errpuc("outofmem", "Out of memory");
+   j->s = s;
+   stbi__setup_jpeg(j);
+   result = load_jpeg_image(j, x,y,comp,req_comp);
+   STBI_FREE(j);
+   return result;
+}
+
+// Report whether the stream in `s` starts like a JPEG. Only the type-level
+// header scan is run, and the stream is rewound afterwards.
+static int stbi__jpeg_test(stbi__context *s)
+{
+   stbi__jpeg probe;
+   int looks_like_jpeg;
+   probe.s = s;
+   stbi__setup_jpeg(&probe);
+   looks_like_jpeg = stbi__decode_jpeg_header(&probe, STBI__SCAN_type);
+   stbi__rewind(s);
+   return looks_like_jpeg;
+}
+
+// Read just the JPEG header and report width, height and component count
+// through whichever of x, y and comp are non-NULL. Returns 1 on success.
+// On failure the stream is rewound and 0 is returned.
+static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
+{
+   if (stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
+      if (x != NULL)    *x = j->s->img_x;
+      if (y != NULL)    *y = j->s->img_y;
+      if (comp != NULL) *comp = j->s->img_n;
+      return 1;
+   }
+   stbi__rewind( j->s );
+   return 0;
+}
+
+// Report JPEG dimensions/components for stream s via stbi__jpeg_info_raw,
+// using a heap-allocated temporary decoder state.
+// Returns the result of stbi__jpeg_info_raw, or 0 (with the "outofmem"
+// error recorded) when the temporary state cannot be allocated.
+static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int result;
+   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   if (j == NULL) return stbi__err("outofmem", "Out of memory"); // avoid the NULL dereference below
+   j->s = s;
+   result = stbi__jpeg_info_raw(j, x, y, comp);
+   STBI_FREE(j);
+   return result;
+}
+#endif
+
+// public domain zlib decode    v0.2  Sean Barrett 2006-11-18
+//    simple implementation
+//      - all input must be provided in an upfront buffer
+//      - all output is written to a single output buffer (can malloc/realloc)
+//    performance
+//      - fast huffman
+
+#ifndef STBI_NO_ZLIB
+
+// the fast path is cheaper to check than the jpeg huffman one, but the slow path is slower
+#define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
+#define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1) // selects the low STBI__ZFAST_BITS bits, used to index the fast table
+
+// zlib-style huffman decoding tables
+// (JPEG packs bits from the left, zlib from the right, so the code can't be shared)
+typedef struct
+{
+   stbi__uint16 fast[1 << STBI__ZFAST_BITS]; // indexed by the next STBI__ZFAST_BITS input bits: (code length << 9) | symbol, or 0 when there is no short code
+   stbi__uint16 firstcode[16]; // firstcode[n]: first code value assigned to length n
+   int maxcode[17]; // maxcode[n]: one past the last code of length n, pre-shifted to 16 bits; [16] is a sentinel
+   stbi__uint16 firstsymbol[16]; // firstsymbol[n]: slot in size[]/value[] of the first symbol with length n
+   stbi_uc  size[288]; // size[slot]: code length of the symbol stored in that slot
+   stbi__uint16 value[288]; // value[slot]: the symbol stored in that slot
+} stbi__zhuffman;
+
+// Reverse the order of the low 16 bits of n by swapping ever larger groups:
+// neighbouring bits, then bit pairs, then nibbles, then the two bytes.
+stbi_inline static int stbi__bitreverse16(int n)
+{
+  int swapped_bits    = ((n & 0xAAAA) >>  1) | ((n & 0x5555) << 1);
+  int swapped_pairs   = ((swapped_bits & 0xCCCC) >>  2) | ((swapped_bits & 0x3333) << 2);
+  int swapped_nibbles = ((swapped_pairs & 0xF0F0) >>  4) | ((swapped_pairs & 0x0F0F) << 4);
+  return ((swapped_nibbles & 0xFF00) >>  8) | ((swapped_nibbles & 0x00FF) << 8);
+}
+
+// Reverse the low `bits` bits of v (bits must not exceed 16): reverse a full
+// 16-bit field, then shift away the (16-bits) positions that are not wanted,
+// e.g. for 11 bits the reversed value is shifted right by 5.
+stbi_inline static int stbi__bit_reverse(int v, int bits)
+{
+   int reversed16;
+   STBI_ASSERT(bits <= 16);
+   reversed16 = stbi__bitreverse16(v);
+   return reversed16 >> (16-bits);
+}
+
+static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num) // fills *z from num per-symbol code lengths; returns 1, or 0 via stbi__err on an impossible length set
+{
+   int i,k=0; // k: number of symbols placed so far, i.e. the first slot for the current length
+   int code, next_code[16], sizes[17]; // sizes[n]: how many symbols have code length n
+
+   // DEFLATE spec for generating codes
+   memset(sizes, 0, sizeof(sizes));
+   memset(z->fast, 0, sizeof(z->fast)); // a zero fast[] entry sends the decoder to the slow path
+   for (i=0; i < num; ++i)
+      ++sizes[sizelist[i]]; // NOTE(review): indexes sizes[17] unchecked; relies on callers passing lengths <= 16
+   sizes[0] = 0; // length 0 means "symbol unused", so it takes part in no code
+   for (i=1; i < 16; ++i)
+      if (sizes[i] > (1 << i)) // more symbols than i bits can tell apart
+         return stbi__err("bad sizes", "Corrupt PNG");
+   code = 0;
+   for (i=1; i < 16; ++i) {
+      next_code[i] = code; // next code value to hand out for length i
+      z->firstcode[i] = (stbi__uint16) code;
+      z->firstsymbol[i] = (stbi__uint16) k;
+      code = (code + sizes[i]);
+      if (sizes[i])
+         if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG"); // last code of this length does not fit in i bits
+      z->maxcode[i] = code << (16-i); // preshift for inner loop
+      code <<= 1;
+      k += sizes[i];
+   }
+   z->maxcode[16] = 0x10000; // sentinel
+   for (i=0; i < num; ++i) {
+      int s = sizelist[i];
+      if (s) {
+         int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; // slot of symbol i in size[]/value[]
+         stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i); // fast-table entry: length above bit 9, symbol below
+         z->size [c] = (stbi_uc     ) s;
+         z->value[c] = (stbi__uint16) i;
+         if (s <= STBI__ZFAST_BITS) {
+            int j = stbi__bit_reverse(next_code[s],s); // zlib reads codes LSB-first, so index by the reversed code
+            while (j < (1 << STBI__ZFAST_BITS)) {
+               z->fast[j] = fastv; // replicate for every value of the bits above the code
+               j += (1 << s);
+            }
+         }
+         ++next_code[s];
+      }
+   }
+   return 1;
+}
+
+// zlib-from-memory implementation for PNG reading
+//    because PNG allows splitting the zlib stream arbitrarily,
+//    and it's annoying structurally to have PNG call ZLIB call PNG,
+//    we require PNG read all the IDATs and combine them into a single
+//    memory buffer
+
+typedef struct // state of one zlib decode running over in-memory input and output buffers
+{
+   stbi_uc *zbuffer, *zbuffer_end; // next compressed input byte, and one past the last input byte
+   int num_bits; // number of bits currently held in code_buffer
+   stbi__uint32 code_buffer; // pending input bits; new bytes are OR-ed in above the bits already present
+
+   char *zout; // next output position to write
+   char *zout_start; // start of the output buffer
+   char *zout_end; // one past the end of the output buffer
+   int   z_expandable; // nonzero lets stbi__zexpand grow the output buffer with realloc
+
+   stbi__zhuffman z_length, z_distance; // tables for literal/length symbols and for distance symbols
+} stbi__zbuf;
+
+// Fetch the next compressed input byte and advance the input cursor.
+// Once the input is exhausted this yields 0 and leaves the cursor alone.
+stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
+{
+   stbi_uc next = 0;
+   if (z->zbuffer < z->zbuffer_end) {
+      next = *z->zbuffer;
+      ++z->zbuffer;
+   }
+   return next;
+}
+
+// Top up the bit buffer one input byte at a time; at least one byte is
+// always added, and loading stops once more than 24 bits are held.
+static void stbi__fill_bits(stbi__zbuf *z)
+{
+   for (;;) {
+      unsigned int incoming;
+      STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
+      incoming = (unsigned int) stbi__zget8(z);
+      z->code_buffer |= incoming << z->num_bits; // new byte lands above the bits already buffered
+      z->num_bits += 8;
+      if (z->num_bits > 24) break;
+   }
+}
+
+// Consume n bits from the input (refilling the bit buffer first when it
+// holds fewer than n) and return them, first-read bit in the lowest position.
+stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
+{
+   unsigned int value;
+   if (z->num_bits < n)
+      stbi__fill_bits(z);
+   value = z->code_buffer & ((1 << n) - 1);
+   z->num_bits -= n;
+   z->code_buffer >>= n;
+   return value;
+}
+
+// Decode one symbol whose code is longer than STBI__ZFAST_BITS (the fast
+// table had no entry for the buffered bits). Finds the code length by
+// comparing the bit-reversed buffer against the pre-shifted maxcode[]
+// limits, then consumes that many bits from a.
+// Returns the symbol, or -1 when the bits do not map to a valid table slot
+// (corrupt stream); in that case no bits are consumed.
+static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int b,s,k;
+   // not resolved by fast table, so compute it the slow way
+   // use jpeg approach, which requires MSbits at top
+   k = stbi__bit_reverse(a->code_buffer, 16);
+   for (s=STBI__ZFAST_BITS+1; ; ++s)
+      if (k < z->maxcode[s]) // maxcode[16] is a sentinel, so this always stops by s == 16
+         break;
+   if (s >= 16) return -1; // invalid code!
+   // code size is s, so:
+   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
+   if (b < 0 || b >= (int) sizeof(z->size)) return -1; // corrupt data: slot lies outside size[]/value[]
+   if (z->size[b] != s) return -1; // used to be an assert only; fail cleanly when asserts are compiled out
+   a->code_buffer >>= s;
+   a->num_bits -= s;
+   return z->value[b];
+}
+
+// Decode the next Huffman symbol from a using table z. Codes short enough
+// for the fast table are resolved with one lookup; everything else is
+// delegated to stbi__zhuffman_decode_slowpath (whose result is returned).
+stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int entry, length;
+   if (a->num_bits < 16) stbi__fill_bits(a);
+   entry = z->fast[a->code_buffer & STBI__ZFAST_MASK];
+   if (!entry)
+      return stbi__zhuffman_decode_slowpath(a, z);
+   length = entry >> 9;        // code length is stored above bit 9
+   a->code_buffer >>= length;
+   a->num_bits -= length;
+   return entry & 511;         // symbol is stored in the low 9 bits
+}
+
+// Grow the output buffer of z so that n more bytes fit after zout.
+//   zout: the caller's current write position (stored back into z first).
+// The capacity is doubled until it is large enough. On success the three
+// output pointers of z are updated to the reallocated buffer and 1 is
+// returned. Returns 0 (via stbi__err) when the buffer is not expandable,
+// when the required size does not fit in an int, or when realloc fails;
+// in those cases the existing buffer is left in place.
+static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
+{
+   char *q;
+   int cur, limit, old_limit;
+   const int int_max = (int) (~0u >> 1); // largest int, spelled without needing <limits.h>
+   z->zout = zout;
+   if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
+   cur   = (int) (z->zout     - z->zout_start);
+   limit = old_limit = (int) (z->zout_end - z->zout_start);
+   if (n < 0 || cur > int_max - n) return stbi__err("outofmem", "Out of memory"); // cur + n would overflow
+   if (limit < 1) limit = 1; // doubling a zero capacity would never terminate
+   while (cur + n > limit) {
+      if (limit > int_max / 2) return stbi__err("outofmem", "Out of memory"); // doubling would overflow
+      limit *= 2;
+   }
+   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+   STBI_NOTUSED(old_limit);
+   if (q == NULL) return stbi__err("outofmem", "Out of memory");
+   z->zout_start = q;
+   z->zout       = q + cur;
+   z->zout_end   = q + limit;
+   return 1;
+}
+
+static int stbi__zlength_base[31] = { // base match length for each length symbol, indexed by (symbol - 257)
+   3,4,5,6,7,8,9,10,11,13,
+   15,17,19,23,27,31,35,43,51,59,
+   67,83,99,115,131,163,195,227,258,0,0 }; // the two trailing zeros pad the table for symbols 286 and 287
+
+static int stbi__zlength_extra[31]= // number of extra bits read and added to the base length, same indexing
+{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+
+static int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, // base back-reference distance per distance symbol
+257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; // symbols 30 and 31 get a base of 0
+
+static int stbi__zdist_extra[32] = // number of extra bits read and added to the base distance; unlisted entries 30 and 31 default to 0
+{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+
+// Decode one Huffman-compressed DEFLATE block using the z_length and
+// z_distance tables already stored in a. Literals are appended to the
+// output; length/distance pairs copy earlier output bytes forward. The
+// output buffer is grown through stbi__zexpand when it fills up.
+// Returns 1 when the end-of-block symbol (256) is reached, 0 (via stbi__err)
+// on an undecodable code, a reserved length/distance symbol, a distance
+// reaching before the start of the output, or a failed buffer expansion.
+static int stbi__parse_huffman_block(stbi__zbuf *a)
+{
+   char *zout = a->zout; // local write cursor; stored back into a on exit and before expanding
+   for(;;) {
+      int z = stbi__zhuffman_decode(a, &a->z_length);
+      if (z < 256) {
+         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
+         if (zout >= a->zout_end) {
+            if (!stbi__zexpand(a, zout, 1)) return 0;
+            zout = a->zout;
+         }
+         *zout++ = (char) z;
+      } else {
+         stbi_uc *p;
+         int len,dist;
+         if (z == 256) {
+            a->zout = zout;
+            return 1;
+         }
+         // length symbols 286 and 287 are reserved and must not appear in compressed data
+         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG");
+         z -= 257;
+         len = stbi__zlength_base[z];
+         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
+         z = stbi__zhuffman_decode(a, &a->z_distance);
+         // distance symbols 30 and 31 are reserved; their table base of 0 would
+         // otherwise make the copy below read bytes that were never written
+         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG");
+         dist = stbi__zdist_base[z];
+         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
+         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
+         if (zout + len > a->zout_end) {
+            if (!stbi__zexpand(a, zout, len)) return 0;
+            zout = a->zout;
+         }
+         p = (stbi_uc *) (zout - dist);
+         if (dist == 1) { // run of one byte; common in images.
+            stbi_uc v = *p;
+            if (len) { do *zout++ = v; while (--len); }
+         } else {
+            // byte-by-byte on purpose: source and destination may overlap
+            if (len) { do *zout++ = *p++; while (--len); }
+         }
+      }
+   }
+}
+
+// Read the header of a dynamic-Huffman DEFLATE block from a and build the
+// literal/length and distance tables (a->z_length, a->z_distance).
+// The header carries the code lengths themselves Huffman-coded: symbols
+// 0..15 are literal lengths, 16 repeats the previous length, 17 and 18
+// emit runs of zeros.
+// Returns 1 on success, 0 (via stbi__err or a failed table build) when the
+// encoded lengths are invalid.
+static int stbi__compute_huffman_codes(stbi__zbuf *a)
+{
+   static stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+   stbi__zhuffman z_codelength;
+   stbi_uc lencodes[286+32+137];//padding for maximum single op
+   stbi_uc codelength_sizes[19];
+   int i,n;
+
+   int hlit  = stbi__zreceive(a,5) + 257;
+   int hdist = stbi__zreceive(a,5) + 1;
+   int hclen = stbi__zreceive(a,4) + 4;
+
+   memset(codelength_sizes, 0, sizeof(codelength_sizes));
+   for (i=0; i < hclen; ++i) {
+      int s = stbi__zreceive(a,3);
+      codelength_sizes[length_dezigzag[i]] = (stbi_uc) s; // lengths arrive in the permuted order given by length_dezigzag
+   }
+   if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
+
+   n = 0;
+   while (n < hlit + hdist) {
+      int c = stbi__zhuffman_decode(a, &z_codelength);
+      if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
+      if (c < 16)
+         lencodes[n++] = (stbi_uc) c;
+      else {
+         stbi_uc fill = 0; // value written by the run; stays 0 for symbols 17 and 18
+         if (c == 16) {
+            c = stbi__zreceive(a,2)+3;
+            // "repeat previous length" has nothing to repeat as the very first
+            // symbol; reject it rather than reading lencodes[-1]
+            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
+            fill = lencodes[n-1];
+         } else if (c == 17) {
+            c = stbi__zreceive(a,3)+3;
+         } else {
+            STBI_ASSERT(c == 18);
+            c = stbi__zreceive(a,7)+11;
+         }
+         memset(lencodes+n, fill, c); // may run past hlit+hdist; the array padding absorbs it and the check below rejects it
+         n += c;
+      }
+   }
+   if (n != hlit+hdist) return stbi__err("bad codelengths","Corrupt PNG");
+   if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
+   if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
+   return 1;
+}
+
+static int stbi__parse_uncompressed_block(stbi__zbuf *a) // copies one stored (uncompressed) block from input to output; returns 1, or 0 via stbi__err / failed expansion
+{
+   stbi_uc header[4]; // LEN (2 bytes, low byte first) followed by NLEN (2 bytes, low byte first)
+   int len,nlen,k;
+   if (a->num_bits & 7)
+      stbi__zreceive(a, a->num_bits & 7); // discard
+   // drain the bit-packed data into header
+   k = 0;
+   while (a->num_bits > 0) {
+      header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
+      a->code_buffer >>= 8;
+      a->num_bits -= 8;
+   }
+   STBI_ASSERT(a->num_bits == 0);
+   // now fill header the normal way
+   while (k < 4)
+      header[k++] = stbi__zget8(a);
+   len  = header[1] * 256 + header[0];
+   nlen = header[3] * 256 + header[2];
+   if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG"); // NLEN must be the 16-bit complement of LEN
+   if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG");
+   if (a->zout + len > a->zout_end)
+      if (!stbi__zexpand(a, a->zout, len)) return 0;
+   memcpy(a->zout, a->zbuffer, len);
+   a->zbuffer += len;
+   a->zout += len;
+   return 1;
+}
+
+// Consume and validate the two-byte zlib stream header (CMF, FLG).
+// Returns 1 when it is acceptable, otherwise 0 via stbi__err.
+static int stbi__parse_zlib_header(stbi__zbuf *a)
+{
+   int cmf, flg, method, check;
+   cmf    = stbi__zget8(a);
+   flg    = stbi__zget8(a);
+   method = cmf & 15;       // low nibble of CMF; the high nibble (window size) is ignored
+   check  = cmf*256+flg;
+   if (check % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
+   if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
+   if (method != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
+   // the window size does not matter here because output is fully buffered
+   return 1;
+}
+
+// @TODO: should statically initialize these for optimal thread safety
+static stbi_uc stbi__zdefault_length[288], stbi__zdefault_distance[32];
+// Fill the code-length tables used for fixed-Huffman blocks: literal/length
+// symbols get 8, 9, 7 or 8 bits depending on their range, and every
+// distance symbol gets 5 bits.
+static void stbi__init_zdefaults(void)
+{
+   int sym;
+   for (sym=0; sym < 288; ++sym) {
+      stbi_uc bits;
+      if      (sym <= 143) bits = 8;
+      else if (sym <= 255) bits = 9;
+      else if (sym <= 279) bits = 7;
+      else                 bits = 8;
+      stbi__zdefault_length[sym] = bits;
+   }
+
+   // ascending order, so entry 31 is written last
+   for (sym=0; sym < 32; ++sym)
+      stbi__zdefault_distance[sym] = 5;
+}
+
+static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) // decodes every DEFLATE block of the stream in a; parse_header nonzero validates the zlib header first; returns 1 on success, 0 on failure
+{
+   int final, type; // per-block header fields: last-block flag and block type
+   if (parse_header)
+      if (!stbi__parse_zlib_header(a)) return 0;
+   a->num_bits = 0; // start with an empty bit buffer
+   a->code_buffer = 0;
+   do {
+      final = stbi__zreceive(a,1);
+      type = stbi__zreceive(a,2);
+      if (type == 0) {
+         if (!stbi__parse_uncompressed_block(a)) return 0;
+      } else if (type == 3) {
+         return 0; // block type 3 is rejected (no error string is recorded on this path)
+      } else {
+         if (type == 1) {
+            // use fixed code lengths
+            if (!stbi__zdefault_distance[31]) stbi__init_zdefaults(); // lazy one-time fill; entry 31 doubles as the "initialized" flag
+            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
+            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
+         } else {
+            if (!stbi__compute_huffman_codes(a)) return 0; // type 2: tables are described in the block itself
+         }
+         if (!stbi__parse_huffman_block(a)) return 0;
+      }
+   } while (!final);
+   return 1;
+}
+
+// Point a at the output buffer obuf (olen bytes), record whether that buffer
+// may be grown (exp), then run the decoder. parse_header is passed on to
+// stbi__parse_zlib, whose result is returned.
+static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
+{
+   a->z_expandable = exp;
+   a->zout_start   = obuf;
+   a->zout_end     = obuf + olen;
+   a->zout         = a->zout_start;
+
+   return stbi__parse_zlib(a, parse_header);
+}
+
+// Inflate a zlib stream (header included) of len bytes at buffer into a
+// freshly allocated, growable output buffer that starts at initial_size bytes.
+// Returns the output buffer and, if outlen is non-NULL, stores the number of
+// bytes produced; returns NULL on allocation or decode failure.
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
+{
+   stbi__zbuf state;
+   char *out = (char *) stbi__malloc(initial_size);
+   if (out == NULL) return NULL;
+   state.zbuffer = (stbi_uc *) buffer;
+   state.zbuffer_end = (stbi_uc *) buffer + len;
+   if (!stbi__do_zlib(&state, out, initial_size, 1, 1)) {
+      // the buffer may have been reallocated, so free the decoder's copy of the pointer
+      STBI_FREE(state.zout_start);
+      return NULL;
+   }
+   if (outlen) *outlen = (int) (state.zout - state.zout_start);
+   return state.zout_start;
+}
+
+// Same as stbi_zlib_decode_malloc_guesssize with a 16384-byte initial
+// output buffer.
+STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
+{
+   char *decoded = stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
+   return decoded;
+}
+
+// Inflate len bytes at buffer into a freshly allocated, growable output
+// buffer that starts at initial_size bytes. parse_header selects whether a
+// zlib header is expected in front of the DEFLATE data.
+// Returns the output buffer and, if outlen is non-NULL, stores the number of
+// bytes produced; returns NULL on allocation or decode failure.
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
+{
+   stbi__zbuf state;
+   char *out = (char *) stbi__malloc(initial_size);
+   if (out == NULL) return NULL;
+   state.zbuffer = (stbi_uc *) buffer;
+   state.zbuffer_end = (stbi_uc *) buffer + len;
+   if (!stbi__do_zlib(&state, out, initial_size, 1, parse_header)) {
+      STBI_FREE(state.zout_start);
+      return NULL;
+   }
+   if (outlen) *outlen = (int) (state.zout - state.zout_start);
+   return state.zout_start;
+}
+
+// Inflate a zlib stream (header included) of ilen bytes at ibuffer into the
+// caller's fixed buffer obuffer of olen bytes; the buffer is never grown.
+// Returns the number of bytes written, or -1 on failure.
+STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
+{
+   stbi__zbuf state;
+   state.zbuffer = (stbi_uc *) ibuffer;
+   state.zbuffer_end = (stbi_uc *) ibuffer + ilen;
+   if (!stbi__do_zlib(&state, obuffer, olen, 0, 1))
+      return -1;
+   return (int) (state.zout - state.zout_start);
+}
+
+// Inflate headerless DEFLATE data (len bytes at buffer) into a freshly
+// allocated, growable output buffer that starts at 16384 bytes.
+// Returns the output buffer and, if outlen is non-NULL, stores the number of
+// bytes produced; returns NULL on allocation or decode failure.
+STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
+{
+   stbi__zbuf state;
+   char *out = (char *) stbi__malloc(16384);
+   if (out == NULL) return NULL;
+   state.zbuffer = (stbi_uc *) buffer;
+   state.zbuffer_end = (stbi_uc *) buffer+len;
+   if (!stbi__do_zlib(&state, out, 16384, 1, 0)) {
+      STBI_FREE(state.zout_start);
+      return NULL;
+   }
+   if (outlen) *outlen = (int) (state.zout - state.zout_start);
+   return state.zout_start;
+}
+
+// Inflate headerless DEFLATE data (ilen bytes at ibuffer) into the caller's
+// fixed buffer obuffer of olen bytes; the buffer is never grown.
+// Returns the number of bytes written, or -1 on failure.
+STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
+{
+   stbi__zbuf state;
+   state.zbuffer = (stbi_uc *) ibuffer;
+   state.zbuffer_end = (stbi_uc *) ibuffer + ilen;
+   if (!stbi__do_zlib(&state, obuffer, olen, 0, 0))
+      return -1;
+   return (int) (state.zout - state.zout_start);
+}
+#endif
+
+// public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
+//    simple implementation
+//      - accepts 1/2/4/8/16-bit samples; 16-bit data can be reduced to 8 bits after decoding
+//      - no CRC checking
+//      - allocates lots of intermediate memory
+//        - avoids problem of streaming data between subsystems
+//        - avoids explicit window management
+//    performance
+//      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
+
+#ifndef STBI_NO_PNG
+typedef struct // header of one PNG chunk, as read by stbi__get_chunk_header
+{
+   stbi__uint32 length; // first big-endian 32-bit field of the chunk header
+   stbi__uint32 type; // second big-endian 32-bit field; compared against STBI__PNG_TYPE(...) tags
+} stbi__pngchunk;
+
+// Read the next chunk header from s: two big-endian 32-bit values, the
+// length first and the type second.
+static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
+{
+   stbi__pngchunk header;
+   stbi__uint32 chunk_length = stbi__get32be(s);
+   stbi__uint32 chunk_type   = stbi__get32be(s);
+   header.length = chunk_length;
+   header.type   = chunk_type;
+   return header;
+}
+
+// Compare the next 8 bytes of s with the PNG signature, stopping at the
+// first mismatch. Returns 1 on a match, otherwise 0 via stbi__err.
+static int stbi__check_png_header(stbi__context *s)
+{
+   static stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+   int pos = 0;
+   while (pos < 8) {
+      if (stbi__get8(s) != png_sig[pos])
+         return stbi__err("bad png sig","Not a PNG");
+      ++pos;
+   }
+   return 1;
+}
+
+typedef struct // working state for decoding one PNG
+{
+   stbi__context *s; // input stream; also carries the image dimensions and component counts
+   stbi_uc *idata, *expanded, *out; // out: decoded pixel buffer built by the functions below; idata/expanded: presumably the raw and inflated IDAT data (set outside this section)
+   int depth; // bits per sample as read from the header
+} stbi__png;
+
+
+enum { // scanline filter kinds; 0..4 are the values stored in the file's per-row filter byte
+   STBI__F_none=0, // bytes stored as-is
+   STBI__F_sub=1, // predicted from the byte one pixel to the left
+   STBI__F_up=2, // predicted from the byte directly above
+   STBI__F_avg=3, // predicted from the average of left and above
+   STBI__F_paeth=4, // predicted by stbi__paeth(left, above, upper-left)
+   // synthetic filters used for first scanline to avoid needing a dummy row of 0s
+   STBI__F_avg_first, // avg with the row above treated as zero
+   STBI__F_paeth_first // paeth with the row above treated as zero
+};
+
+static stbi_uc first_row_filter[5] = // maps a file filter byte (0..4) to the filter actually applied on the first scanline
+{
+   STBI__F_none,
+   STBI__F_sub,
+   STBI__F_none, // "up" on the first row has nothing above it, so it degenerates to "none"
+   STBI__F_avg_first,
+   STBI__F_paeth_first
+};
+
+// Paeth predictor: form the estimate a + b - c and return whichever of
+// a, b, c is closest to it. Ties prefer a, then b, then c.
+static int stbi__paeth(int a, int b, int c)
+{
+   int estimate = a + b - c;
+   int dist_a = abs(estimate - a);
+   int dist_b = abs(estimate - b);
+   int dist_c = abs(estimate - c);
+   if (dist_a > dist_b || dist_a > dist_c)
+      return (dist_b <= dist_c) ? b : c;
+   return a;
+}
+
+static stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; // indexed by bit depth: multiplier that stretches a 1/2/4/8-bit gray sample to the 0..255 range
+
+// create the png data from post-deflated data: un-filters the x-by-y scanlines in raw into a newly allocated a->out with out_n channels per pixel; returns 1 on success, 0 via stbi__err
+static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
+{
+   int bytes = (depth == 16? 2 : 1); // bytes per sample in the output
+   stbi__context *s = a->s;
+   stbi__uint32 i,j,stride = x*out_n*bytes; // stride: bytes per output row
+   stbi__uint32 img_len, img_width_bytes;
+   int k;
+   int img_n = s->img_n; // copy it into a local for later
+
+   int output_bytes = out_n*bytes; // bytes per output pixel
+   int filter_bytes = img_n*bytes; // bytes per input pixel, i.e. the distance filters look back
+   int width = x;
+
+   STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1); // output either matches the file or adds one (alpha) channel
+   a->out = (stbi_uc *) stbi__malloc(x * y * output_bytes); // extra bytes to write off the end into
+   if (!a->out) return stbi__err("outofmem", "Out of memory");
+
+   img_width_bytes = (((img_n * x * depth) + 7) >> 3); // packed bytes per input row, rounded up to a whole byte
+   img_len = (img_width_bytes + 1) * y; // +1 per row for the filter-type byte
+   if (s->img_x == x && s->img_y == y) {
+      if (raw_len != img_len) return stbi__err("not enough pixels","Corrupt PNG");
+   } else { // interlaced:
+      if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
+   }
+
+   for (j=0; j < y; ++j) {
+      stbi_uc *cur = a->out + stride*j;
+      stbi_uc *prior = cur - stride; // previous output row; not read on row 0 because first_row_filter removes every use of it
+      int filter = *raw++;
+
+      if (filter > 4)
+         return stbi__err("invalid filter","Corrupt PNG");
+
+      if (depth < 8) {
+         STBI_ASSERT(img_width_bytes <= x);
+         cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
+         filter_bytes = 1;
+         width = img_width_bytes;
+      }
+
+      // if first row, use special filter that doesn't sample previous row
+      if (j == 0) filter = first_row_filter[filter];
+
+      // handle first byte explicitly
+      for (k=0; k < filter_bytes; ++k) {
+         switch (filter) {
+            case STBI__F_none       : cur[k] = raw[k]; break;
+            case STBI__F_sub        : cur[k] = raw[k]; break;
+            case STBI__F_up         : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
+            case STBI__F_avg        : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break;
+            case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break;
+            case STBI__F_avg_first  : cur[k] = raw[k]; break;
+            case STBI__F_paeth_first: cur[k] = raw[k]; break;
+         }
+      }
+
+      if (depth == 8) {
+         if (img_n != out_n)
+            cur[img_n] = 255; // first pixel
+         raw += img_n;
+         cur += out_n;
+         prior += out_n;
+      } else if (depth == 16) {
+         if (img_n != out_n) {
+            cur[filter_bytes]   = 255; // first pixel top byte
+            cur[filter_bytes+1] = 255; // first pixel bottom byte
+         }
+         raw += filter_bytes;
+         cur += output_bytes;
+         prior += output_bytes;
+      } else {
+         raw += 1;
+         cur += 1;
+         prior += 1;
+      }
+
+      // this is a little gross, so that we don't switch per-pixel or per-component
+      if (depth < 8 || img_n == out_n) {
+         int nk = (width - 1)*filter_bytes; // remaining bytes in the row after the first pixel/byte
+         #define CASE(f) \
+             case f:     \
+                for (k=0; k < nk; ++k)
+         switch (filter) {
+            // "none" filter turns into a memcpy here; make that explicit.
+            case STBI__F_none:         memcpy(cur, raw, nk); break;
+            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); break;
+            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
+            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); break;
+            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); break;
+            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); break;
+            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); break;
+         }
+         #undef CASE
+         raw += nk;
+      } else {
+         STBI_ASSERT(img_n+1 == out_n); // an alpha channel is being inserted, so input and output pixels differ in size
+         #define CASE(f) \
+             case f:     \
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
+         switch (filter) {
+            CASE(STBI__F_none)         cur[k] = raw[k]; break;
+            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); break;
+            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
+            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); break;
+            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); break;
+            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); break;
+            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); break;
+         }
+         #undef CASE
+
+         // the loop above sets the high byte of the pixels' alpha, but for
+         // 16 bit png files we also need the low byte set. we'll do that here.
+         if (depth == 16) {
+            cur = a->out + stride*j; // start at the beginning of the row again
+            for (i=0; i < x; ++i,cur+=output_bytes) {
+               cur[filter_bytes+1] = 255;
+            }
+         }
+      }
+   }
+
+   // we make a separate pass to expand bits to pixels; for performance,
+   // this could run two scanlines behind the above code, so it won't
+   // interfere with filtering but will still be in the cache.
+   if (depth < 8) {
+      for (j=0; j < y; ++j) {
+         stbi_uc *cur = a->out + stride*j;
+         stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes; // packed samples were parked at the right end of the row above
+         // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+         // png guarantees byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+         // note that the final byte might overshoot and write more data than desired.
+         // we can allocate enough data that this never writes out of memory, but it
+         // could also overwrite the next scanline. can it overwrite non-empty data
+         // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+         // so we need to explicitly clamp the final ones
+
+         if (depth == 4) {
+            for (k=x*img_n; k >= 2; k-=2, ++in) {
+               *cur++ = scale * ((*in >> 4)       );
+               *cur++ = scale * ((*in     ) & 0x0f);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 4)       );
+         } else if (depth == 2) {
+            for (k=x*img_n; k >= 4; k-=4, ++in) {
+               *cur++ = scale * ((*in >> 6)       );
+               *cur++ = scale * ((*in >> 4) & 0x03);
+               *cur++ = scale * ((*in >> 2) & 0x03);
+               *cur++ = scale * ((*in     ) & 0x03);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 6)       );
+            if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
+            if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
+         } else if (depth == 1) {
+            for (k=x*img_n; k >= 8; k-=8, ++in) {
+               *cur++ = scale * ((*in >> 7)       );
+               *cur++ = scale * ((*in >> 6) & 0x01);
+               *cur++ = scale * ((*in >> 5) & 0x01);
+               *cur++ = scale * ((*in >> 4) & 0x01);
+               *cur++ = scale * ((*in >> 3) & 0x01);
+               *cur++ = scale * ((*in >> 2) & 0x01);
+               *cur++ = scale * ((*in >> 1) & 0x01);
+               *cur++ = scale * ((*in     ) & 0x01);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 7)       );
+            if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
+            if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
+            if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
+            if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
+            if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
+            if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
+         }
+         if (img_n != out_n) {
+            int q;
+            // insert alpha = 255
+            cur = a->out + stride*j;
+            if (img_n == 1) {
+               for (q=x-1; q >= 0; --q) { // right to left, so spreading the pixels out never overwrites unread samples
+                  cur[q*2+1] = 255;
+                  cur[q*2+0] = cur[q];
+               }
+            } else {
+               STBI_ASSERT(img_n == 3);
+               for (q=x-1; q >= 0; --q) { // right to left, for the same reason as above
+                  cur[q*4+3] = 255;
+                  cur[q*4+2] = cur[q*3+2];
+                  cur[q*4+1] = cur[q*3+1];
+                  cur[q*4+0] = cur[q*3+0];
+               }
+            }
+         }
+      }
+   } else if (depth == 16) {
+      // force the image data from big-endian to platform-native.
+      // this is done in a separate pass due to the decoding relying
+      // on the data being untouched, but could probably be done
+      // per-line during decode if care is taken.
+      stbi_uc *cur = a->out;
+      stbi__uint16 *cur16 = (stbi__uint16*)cur;
+
+      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
+         *cur16 = (cur[0] << 8) | cur[1]; // high byte comes first in the decoded data
+      }
+   }
+
+   return 1;
+}
+
+// Build the final pixel buffer a->out from the inflated image data.
+//   image_data / image_data_len: inflated scanline data
+//   out_n: channels per output pixel;  depth: bits per sample
+//   color: passed through to stbi__create_png_image_raw
+//   interlaced: nonzero when the data holds seven interlace passes
+// Non-interlaced data is decoded in one call. Interlaced data is decoded
+// pass by pass into a temporary image whose pixels are then scattered to
+// their final positions. Pixels occupy out_n bytes, or 2*out_n bytes when
+// depth is 16.
+// Returns 1 on success, 0 on failure.
+static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
+{
+   int bytes = (depth == 16 ? 2 : 1);
+   int out_bytes = out_n * bytes; // bytes per output pixel; 16-bit samples take two bytes each
+   stbi_uc *final;
+   int p;
+   if (!interlaced)
+      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
+
+   // de-interlacing
+   final = (stbi_uc *) stbi__malloc(a->s->img_x * a->s->img_y * out_bytes);
+   if (final == NULL) return stbi__err("outofmem", "Out of memory");
+   for (p=0; p < 7; ++p) {
+      int xorig[] = { 0,4,0,2,0,1,0 };
+      int yorig[] = { 0,0,4,0,2,0,1 };
+      int xspc[]  = { 8,8,4,4,2,2,1 };
+      int yspc[]  = { 8,8,8,4,4,2,2 };
+      int i,j,x,y;
+      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
+      x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p]; // width of this pass's sub-image
+      y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p]; // height of this pass's sub-image
+      if (x && y) {
+         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; // input bytes consumed by this pass
+         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
+            STBI_FREE(final);
+            return 0;
+         }
+         for (j=0; j < y; ++j) {
+            for (i=0; i < x; ++i) {
+               int out_y = j*yspc[p]+yorig[p];
+               int out_x = i*xspc[p]+xorig[p];
+               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
+                      a->out + (j*x+i)*out_bytes, out_bytes);
+            }
+         }
+         STBI_FREE(a->out);
+         image_data += img_len;
+         image_data_len -= img_len;
+      }
+   }
+   a->out = final;
+
+   return 1;
+}
+
+// Apply colour-key transparency to the 8-bit image in z->out, which must
+// already carry an alpha channel (out_n is 2 or 4).
+//   out_n == 2: alpha becomes 0 where the gray value equals tc[0], else 255.
+//   out_n == 4: alpha becomes 0 where RGB equals tc[0..2]; other pixels keep
+//               the alpha they already have.
+// Always returns 1.
+static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 remaining = s->img_x * s->img_y;
+   stbi_uc *px = z->out;
+
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n == 2) {
+      for (; remaining > 0; --remaining, px += 2) {
+         if (px[0] == tc[0]) px[1] = 0;
+         else                px[1] = 255;
+      }
+      return 1;
+   }
+
+   for (; remaining > 0; --remaining, px += 4) {
+      if (px[0] != tc[0]) continue;
+      if (px[1] != tc[1]) continue;
+      if (px[2] != tc[2]) continue;
+      px[3] = 0;
+   }
+   return 1;
+}
+
+// 16-bit counterpart of stbi__compute_transparency: z->out is treated as an
+// array of 16-bit samples that already carries an alpha channel.
+//   out_n == 2: alpha becomes 0 where the gray value equals tc[0], else 65535.
+//   out_n == 4: alpha becomes 0 where RGB equals tc[0..2]; other pixels keep
+//               the alpha they already have.
+// Always returns 1.
+static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 remaining = s->img_x * s->img_y;
+   stbi__uint16 *px = (stbi__uint16*) z->out;
+
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n == 2) {
+      for (; remaining > 0; --remaining, px += 2) {
+         if (px[0] == tc[0]) px[1] = 0;
+         else                px[1] = 65535;
+      }
+      return 1;
+   }
+
+   for (; remaining > 0; --remaining, px += 4) {
+      if (px[0] != tc[0]) continue;
+      if (px[1] != tc[1]) continue;
+      if (px[2] != tc[2]) continue;
+      px[3] = 0;
+   }
+   return 1;
+}
+
+// Replace the palette-index image in a->out with a true-colour image.
+//   palette: 4 bytes per entry; pal_img_n == 3 copies the first three bytes
+//            of an entry per pixel, any other value copies all four.
+//   len: unused.
+// The old buffer is freed and a->out points at the new one on success.
+// Returns 1 on success, 0 via stbi__err when the new buffer cannot be
+// allocated (a->out is then left untouched).
+static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
+{
+   stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
+   stbi_uc *indices = a->out;
+   stbi_uc *expanded, *dst;
+   int step = (pal_img_n == 3) ? 3 : 4;
+
+   expanded = (stbi_uc *) stbi__malloc(pixel_count * pal_img_n);
+   if (expanded == NULL) return stbi__err("outofmem", "Out of memory");
+
+   // between here and the free of the old buffer below, exiting would leak
+   dst = expanded;
+   for (i=0; i < pixel_count; ++i) {
+      stbi_uc *entry = palette + indices[i]*4;
+      dst[0] = entry[0];
+      dst[1] = entry[1];
+      dst[2] = entry[2];
+      if (step == 4) dst[3] = entry[3];
+      dst += step;
+   }
+
+   STBI_FREE(a->out);
+   a->out = expanded;
+
+   STBI_NOTUSED(len);
+
+   return 1;
+}
+
+// Convert a 16-bit-per-sample image in p->out to 8 bits per sample by
+// keeping the high byte of every sample. Does nothing when p->depth is
+// not 16. On success the 16-bit buffer is freed and p->out points at the
+// new 8-bit buffer.
+// Returns 1 on success (or when no conversion is needed), 0 via stbi__err
+// when the 8-bit buffer cannot be allocated; p->out is then left untouched.
+static int stbi__reduce_png(stbi__png *p)
+{
+   int i;
+   int img_len = p->s->img_x * p->s->img_y * p->s->img_out_n; // number of samples in the image
+   stbi_uc *reduced;
+   stbi__uint16 *orig = (stbi__uint16*)p->out;
+
+   if (p->depth != 16) return 1; // don't need to do anything if not 16-bit data
+
+   reduced = (stbi_uc *)stbi__malloc(img_len);
+   if (reduced == NULL) return stbi__err("outofmem", "Out of memory"); // test the new buffer, not p
+
+   for (i = 0; i < img_len; ++i) reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each 16-bit sample is a decent approx of 16->8 bit scaling
+
+   p->out = reduced;
+   STBI_FREE(orig);
+
+   return 1;
+}
+
+static int stbi__unpremultiply_on_load = 0; // nonzero: stbi__de_iphone also divides colour by alpha on 4-channel images; set via stbi_set_unpremultiply_on_load
+static int stbi__de_iphone_flag = 0; // set via stbi_convert_iphone_png_to_rgb; read outside this section
+
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) // stores the flag in the file-wide stbi__unpremultiply_on_load setting
+{
+   stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
+}
+
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) // stores the flag in the file-wide stbi__de_iphone_flag setting
+{
+   stbi__de_iphone_flag = flag_true_if_should_convert;
+}
+
+// Convert the decoded image in z->out from BGR(A) channel order to RGB(A)
+// by swapping the first and third byte of every pixel. For 4-channel images
+// with stbi__unpremultiply_on_load set, the colour bytes are additionally
+// scaled by 255/alpha for every pixel whose alpha is non-zero.
+static void stbi__de_iphone(stbi__png *z)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 remaining = s->img_x * s->img_y;
+   stbi_uc *px = z->out;
+   int step;
+
+   if (s->img_out_n == 3) {
+      step = 3;
+   } else {
+      STBI_ASSERT(s->img_out_n == 4);
+      step = 4;
+   }
+
+   if (step == 4 && stbi__unpremultiply_on_load) {
+      // convert bgr to rgb and unpremultiply
+      for (; remaining > 0; --remaining, px += 4) {
+         stbi_uc alpha = px[3];
+         stbi_uc first = px[0];
+         if (alpha) {
+            px[0] = px[2] * 255 / alpha;
+            px[1] = px[1] * 255 / alpha;
+            px[2] = first * 255 / alpha;
+         } else {
+            px[0] = px[2];
+            px[2] = first;
+         }
+      }
+      return;
+   }
+
+   // convert bgr to rgb
+   for (; remaining > 0; --remaining, px += step) {
+      stbi_uc first = px[0];
+      px[0] = px[2];
+      px[2] = first;
+   }
+}
+
+#define STBI__PNG_TYPE(a,b,c,d)  (((a) << 24) + ((b) << 16) + ((c) << 8) + (d))
+
+// Walk the chunk stream of the PNG in z->s.
+//
+// scan selects how much work is done:
+//   STBI__SCAN_type   - return 1 right after the signature check
+//   STBI__SCAN_header - return 1 as soon as the dimensions and component count
+//                       (s->img_n) are known
+//   STBI__SCAN_load   - collect all IDAT data and, at IEND, inflate it and run
+//                       the image/transparency/palette post-processing
+// req_comp is the component count requested by the caller; it only influences
+// the choice of s->img_out_n at IEND.
+//
+// Returns 1 on success and 0 on failure. z->idata, z->expanded and z->out may
+// still hold allocations when 0 is returned; this function does not free them
+// on its error paths.
+static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
+{
+   // palette holds up to 256 entries of 4 bytes (r,g,b,a); pal_img_n is 0 for
+   // non-paletted images, else the component count the palette expands to (3 or 4)
+   stbi_uc palette[1024], pal_img_n=0;
+   // has_trans/tc/tc16: colour-key transparency from a tRNS chunk (non-paletted)
+   stbi_uc has_trans=0, tc[3];
+   stbi__uint16 tc16[3];
+   // ioff = bytes of IDAT data gathered so far, idata_limit = capacity of z->idata
+   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
+   int first=1,k,interlace=0, color=0, is_iphone=0;
+   stbi__context *s = z->s;
+
+   z->expanded = NULL;
+   z->idata = NULL;
+   z->out = NULL;
+
+   if (!stbi__check_png_header(s)) return 0;
+
+   if (scan == STBI__SCAN_type) return 1;
+
+   for (;;) {
+      stbi__pngchunk c = stbi__get_chunk_header(s);
+      switch (c.type) {
+         // CgBI marks the file as an "iPhone" PNG; the payload itself is skipped
+         case STBI__PNG_TYPE('C','g','B','I'):
+            is_iphone = 1;
+            stbi__skip(s, c.length);
+            break;
+         case STBI__PNG_TYPE('I','H','D','R'): {
+            int comp,filter;
+            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
+            first = 0;
+            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
+            s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
+            s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
+            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
+            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
+			if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
+            // colour type 3 = paletted; any other odd colour type is rejected
+            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
+            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
+            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
+            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
+            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
+            if (!pal_img_n) {
+               // bit 1 of the colour type selects rgb vs grey, bit 2 adds alpha
+               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
+               // reject images whose x*y*img_n would exceed 1<<30
+               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+               if (scan == STBI__SCAN_header) return 1;
+            } else {
+               // if paletted, then pal_n is our final components, and
+               // img_n is # components to decompress/filter.
+               s->img_n = 1;
+               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
+               // if SCAN_header, have to scan to see if we have a tRNS
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('P','L','T','E'):  {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
+            pal_len = c.length / 3;
+            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
+            // store each 3-byte entry as 4 bytes with alpha preset to opaque
+            for (i=0; i < pal_len; ++i) {
+               palette[i*4+0] = stbi__get8(s);
+               palette[i*4+1] = stbi__get8(s);
+               palette[i*4+2] = stbi__get8(s);
+               palette[i*4+3] = 255;
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('t','R','N','S'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
+            if (pal_img_n) {
+               // paletted: tRNS supplies per-entry alpha, so the image has 4 components
+               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
+               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
+               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
+               pal_img_n = 4;
+               for (i=0; i < c.length; ++i)
+                  palette[i*4+3] = stbi__get8(s);
+            } else {
+               // non-paletted: tRNS holds one 16-bit key value per colour component
+               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
+               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
+               has_trans = 1;
+               if (z->depth == 16) {
+                  for (k = 0; k < s->img_n; ++k) tc16[k] = stbi__get16be(s); // copy the values as-is
+               } else {
+                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+               }
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('I','D','A','T'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
+            if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
+            // refuse to continue if the running total would wrap as a signed int
+            if ((int)(ioff + c.length) < (int)ioff) return 0;
+            // grow z->idata by doubling (starting at max(c.length, 4096)) until the chunk fits
+            if (ioff + c.length > idata_limit) {
+               stbi__uint32 idata_limit_old = idata_limit;
+               stbi_uc *p;
+               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
+               while (ioff + c.length > idata_limit)
+                  idata_limit *= 2;
+               STBI_NOTUSED(idata_limit_old);
+               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
+               z->idata = p;
+            }
+            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
+            ioff += c.length;
+            break;
+         }
+
+         case STBI__PNG_TYPE('I','E','N','D'): {
+            stbi__uint32 raw_len, bpl;
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (scan != STBI__SCAN_load) return 1;
+            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
+            // initial guess for decoded data size to avoid unnecessary reallocs
+            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
+            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
+            // the last argument asks for zlib header parsing unless this is a CgBI file;
+            // raw_len is overwritten with the actual inflated length
+            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
+            if (z->expanded == NULL) return 0; // zlib should set error
+            STBI_FREE(z->idata); z->idata = NULL;
+            // output one extra channel when the caller asked for exactly img_n+1
+            // components (not 3, not paletted) or when a colour key must become alpha
+            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
+               s->img_out_n = s->img_n+1;
+            else
+               s->img_out_n = s->img_n;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
+            if (has_trans) {
+               if (z->depth == 16) {
+                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+               } else {
+                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+               }
+            }
+            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
+               stbi__de_iphone(z);
+            if (pal_img_n) {
+               // pal_img_n == 3 or 4
+               s->img_n = pal_img_n; // record the actual colors we had
+               s->img_out_n = pal_img_n;
+               if (req_comp >= 3) s->img_out_n = req_comp;
+               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                  return 0;
+            }
+            STBI_FREE(z->expanded); z->expanded = NULL;
+            return 1;
+         }
+
+         default:
+            // if critical, fail
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            // bit 29 of the packed type is bit 5 of the first type byte; when it is
+            // clear the chunk is treated as critical and unknown, so decoding stops
+            if ((c.type & (1 << 29)) == 0) {
+               #ifndef STBI_NO_FAILURE_STRINGS
+               // not threadsafe
+               static char invalid_chunk[] = "XXXX PNG chunk not known";
+               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
+               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
+               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
+               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
+               #endif
+               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
+            }
+            stbi__skip(s, c.length);
+            break;
+      }
+      // end of PNG chunk, read and skip CRC
+      stbi__get32be(s);
+   }
+}
+
+// Decode the PNG attached to p and hand the pixel buffer to the caller.
+//
+//   x, y     - receive the image dimensions on success
+//   n        - if non-NULL, receives the component count found in the file
+//   req_comp - 0 to keep the decoded component count, or 1..4 to convert to it
+//
+// Returns the pixel buffer (ownership passes to the caller) or NULL on
+// failure. 16-bit images are reduced to 8 bits per channel. Whatever buffers
+// are still attached to p are released before returning.
+static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp)
+{
+   unsigned char *result=NULL;
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
+      // If the 16->8 bit reduction fails, p->out still owns the decoded data.
+      // Skip the hand-over and fall through to the shared cleanup below so
+      // that buffer is freed instead of being leaked by an early return.
+      if (p->depth != 16 || stbi__reduce_png(p)) {
+         result = p->out;
+         p->out = NULL; // ownership moved to result
+         if (req_comp && req_comp != p->s->img_out_n) {
+            result = stbi__convert_format(result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+            p->s->img_out_n = req_comp;
+            if (result == NULL) return result;
+         }
+         *x = p->s->img_x;
+         *y = p->s->img_y;
+         if (n) *n = p->s->img_n;
+      }
+   }
+   STBI_FREE(p->out);      p->out      = NULL;
+   STBI_FREE(p->expanded); p->expanded = NULL;
+   STBI_FREE(p->idata);    p->idata    = NULL;
+
+   return result;
+}
+
+// PNG load entry point: wraps the context s in a stack-allocated stbi__png
+// and forwards all arguments to stbi__do_png, returning its result.
+static unsigned char *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__png p;
+   p.s = s; // the remaining fields are reset by stbi__parse_png_file
+   return stbi__do_png(&p, x,y,comp,req_comp);
+}
+
+// Returns the result of stbi__check_png_header for s (non-zero when the PNG
+// signature check passes). The stream is rewound afterwards in either case.
+static int stbi__png_test(stbi__context *s)
+{
+   int r;
+   r = stbi__check_png_header(s);
+   stbi__rewind(s);
+   return r;
+}
+
+// Header-only scan of the PNG attached to p. On success stores width, height
+// and component count through whichever of x, y, comp are non-NULL and
+// returns 1. On failure rewinds the stream and returns 0.
+static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
+{
+   if (stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
+      stbi__context *ctx = p->s;
+      if (x != NULL)    *x    = ctx->img_x;
+      if (y != NULL)    *y    = ctx->img_y;
+      if (comp != NULL) *comp = ctx->img_n;
+      return 1;
+   }
+
+   // not a usable PNG header: put the stream back where it started
+   stbi__rewind( p->s );
+   return 0;
+}
+
+// PNG info entry point: wraps the context s in a stack-allocated stbi__png
+// and forwards to stbi__png_info_raw, returning its result.
+static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__png p;
+   p.s = s;
+   return stbi__png_info_raw(&p, x, y, comp);
+}
+#endif
+
+// Microsoft/Windows BMP image
+
+#ifndef STBI_NO_BMP
+// Checks whether the stream looks like a BMP: it must start with "BM" and the
+// 32-bit header size read after the 12 following bytes must be one of the
+// supported values. Returns 1 if so, 0 otherwise. Does not rewind the stream.
+static int stbi__bmp_test_raw(stbi__context *s)
+{
+   int header_size;
+
+   if (stbi__get8(s) != 'B') return 0;
+   if (stbi__get8(s) != 'M') return 0;
+
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   stbi__get32le(s); // discard data offset
+
+   header_size = stbi__get32le(s);
+   switch (header_size) {
+      case 12:
+      case 40:
+      case 56:
+      case 108:
+      case 124:
+         return 1;
+      default:
+         return 0;
+   }
+}
+
+// Runs stbi__bmp_test_raw on s, then rewinds the stream regardless of the
+// outcome. Returns the raw test's result.
+static int stbi__bmp_test(stbi__context *s)
+{
+   int r = stbi__bmp_test_raw(s);
+   stbi__rewind(s);
+   return r;
+}
+
+
+// Index (0..31) of the most significant set bit of z, or -1 when z is zero.
+static int stbi__high_bit(unsigned int z)
+{
+   int bit = 0;
+   int step;
+
+   if (z == 0) return -1;
+
+   // binary search: test the upper 16, 8, 4, 2, 1 bits of what is left
+   for (step = 16; step >= 1; step >>= 1) {
+      if ((z >> step) != 0) {
+         bit += step;
+         z >>= step;
+      }
+   }
+   return bit;
+}
+
+// Number of set bits in a (0..32 for a 32-bit unsigned int).
+static int stbi__bitcount(unsigned int a)
+{
+   int count = 0;
+
+   // each iteration clears the lowest set bit, so the loop runs once per set bit
+   while (a != 0) {
+      a &= a - 1;
+      ++count;
+   }
+   return count;
+}
+
+// Move a masked channel value so that its top bit lands at bit 7, then
+// replicate the 'bits'-wide pattern downwards to fill all 8 bits.
+//
+//   v     - the masked pixel value (callers pass 'pixel & mask')
+//   shift - right-shift amount; a negative value means shift left by -shift
+//   bits  - number of bits set in the mask
+//
+// All shifting is done on an unsigned copy of v. Callers pass masks that may
+// include bit 31; shifting such a value right as a signed int would drag
+// copies of the sign bit in and corrupt the replicated result.
+// Returns the expanded value; callers truncate it to a byte.
+static int stbi__shiftsigned(int v, int shift, int bits)
+{
+   unsigned int u = (unsigned int) v;
+   unsigned int result;
+   int z;
+
+   if (shift < 0) u <<= -shift;
+   else u >>= shift;
+   result = u;
+
+   // a non-positive width can never advance z below; return the shifted value
+   // rather than spinning forever
+   if (bits <= 0) return (int) result;
+
+   z = bits;
+   while (z < 8) {
+      result += u >> z;
+      z += bits;
+   }
+   return (int) result;
+}
+
+// Header fields gathered by stbi__bmp_parse_header and consumed by stbi__bmp_load.
+typedef struct
+{
+   // bpp = bits per pixel, offset = data offset field read from the file header,
+   // hsz = size of the info header (12, 40, 56, 108 or 124)
+   int bpp, offset, hsz;
+   // mr/mg/mb/ma = red/green/blue/alpha channel bit masks;
+   // all_a = accumulator stbi__bmp_load ORs every alpha value into
+   unsigned int mr,mg,mb,ma, all_a;
+} stbi__bmp_data;
+
+// Read the BMP file header and info header from s.
+//
+// Fills info->offset, info->hsz, info->bpp and the channel masks
+// info->mr/mg/mb/ma, and stores the raw width/height in s->img_x / s->img_y.
+// info->all_a is only written (set to 0) for uncompressed 32-bit images with
+// a 40- or 56-byte header; otherwise it keeps whatever the caller put there.
+//
+// Returns a non-NULL dummy pointer on success and NULL (through stbi__errpuc)
+// for unsupported or malformed headers. The stream is not rewound.
+static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
+{
+   int hsz;
+   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   info->offset = stbi__get32le(s);
+   info->hsz = hsz = stbi__get32le(s);
+   info->mr = info->mg = info->mb = info->ma = 0;
+   
+   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
+   // the 12-byte header stores 16-bit dimensions, all others 32-bit
+   if (hsz == 12) {
+      s->img_x = stbi__get16le(s);
+      s->img_y = stbi__get16le(s);
+   } else {
+      s->img_x = stbi__get32le(s);
+      s->img_y = stbi__get32le(s);
+   }
+   // next 16-bit field must be 1 (presumably the plane count - TODO confirm against the BMP spec)
+   if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
+   info->bpp = stbi__get16le(s);
+   if (info->bpp == 1) return stbi__errpuc("monochrome", "BMP type not supported: 1-bit");
+   if (hsz != 12) {
+      int compress = stbi__get32le(s);
+      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+      stbi__get32le(s); // discard sizeof
+      stbi__get32le(s); // discard hres
+      stbi__get32le(s); // discard vres
+      stbi__get32le(s); // discard colorsused
+      stbi__get32le(s); // discard max important
+      if (hsz == 40 || hsz == 56) {
+         // the 56-byte header carries four extra 32-bit fields that are skipped here
+         if (hsz == 56) {
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+         }
+         if (info->bpp == 16 || info->bpp == 32) {
+            if (compress == 0) {
+               // no masks in the file: use the default 8-8-8-8 or 5-5-5 layouts
+               if (info->bpp == 32) {
+                  info->mr = 0xffu << 16;
+                  info->mg = 0xffu <<  8;
+                  info->mb = 0xffu <<  0;
+                  info->ma = 0xffu << 24;
+                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+               } else {
+                  info->mr = 31u << 10;
+                  info->mg = 31u <<  5;
+                  info->mb = 31u <<  0;
+               }
+            } else if (compress == 3) {
+               // compression 3: the r/g/b masks follow the header in the file
+               info->mr = stbi__get32le(s);
+               info->mg = stbi__get32le(s);
+               info->mb = stbi__get32le(s);
+               // not documented, but generated by photoshop and handled by mspaint
+               if (info->mr == info->mg && info->mg == info->mb) {
+                  // ?!?!?
+                  return stbi__errpuc("bad BMP", "bad BMP");
+               }
+            } else
+               return stbi__errpuc("bad BMP", "bad BMP");
+         }
+      } else {
+         int i;
+         if (hsz != 108 && hsz != 124)
+            return stbi__errpuc("bad BMP", "bad BMP");
+         // the 108/124-byte headers always carry explicit r/g/b/a masks
+         info->mr = stbi__get32le(s);
+         info->mg = stbi__get32le(s);
+         info->mb = stbi__get32le(s);
+         info->ma = stbi__get32le(s);
+         stbi__get32le(s); // discard color space
+         for (i=0; i < 12; ++i)
+            stbi__get32le(s); // discard color space parameters
+         if (hsz == 124) {
+            stbi__get32le(s); // discard rendering intent
+            stbi__get32le(s); // discard offset of profile data
+            stbi__get32le(s); // discard size of profile data
+            stbi__get32le(s); // discard reserved
+         }
+      }
+   }
+   // any non-NULL value signals success to the caller
+   return (void *) 1;
+}
+
+
+// Decode a BMP image from s.
+//
+//   x, y     - receive the image width and height
+//   comp     - if non-NULL, receives the component count found in the file
+//              (4 when an alpha mask is present, otherwise 3)
+//   req_comp - 0 to keep that count, or the component count to convert to
+//
+// Returns a newly allocated pixel buffer, or NULL on error (the message is
+// set through stbi__errpuc or by the helper that failed).
+static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi_uc *out;
+   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
+   stbi_uc pal[256][4];
+   int psize=0,i,j,width;
+   int flip_vertically, pad, target;
+   stbi__bmp_data info;
+
+   // preset so the "alpha was all zero" fix-up below only triggers when the
+   // header parser resets this to 0
+   info.all_a = 255;   
+   if (stbi__bmp_parse_header(s, &info) == NULL)
+      return NULL; // error code already set
+
+   // a positive stored height means the rows must be flipped after decoding
+   flip_vertically = ((int) s->img_y) > 0;
+   s->img_y = abs((int) s->img_y);
+
+   mr = info.mr;
+   mg = info.mg;
+   mb = info.mb;
+   ma = info.ma;
+   all_a = info.all_a;
+
+   // number of palette entries, derived from the gap between headers and pixel data
+   if (info.hsz == 12) {
+      if (info.bpp < 24)
+         psize = (info.offset - 14 - 24) / 3;
+   } else {
+      if (info.bpp < 16)
+         psize = (info.offset - 14 - info.hsz) >> 2;
+   }
+
+   s->img_n = ma ? 4 : 3;
+   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
+      target = req_comp;
+   else
+      target = s->img_n; // if they want monochrome, we'll post-convert
+
+   out = (stbi_uc *) stbi__malloc(target * s->img_x * s->img_y);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   if (info.bpp < 16) {
+      // paletted image: read the palette, then expand 4- or 8-bit indices
+      int z=0;
+      if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
+      // palette entries are read into slots 2,1,0 in that order; every header
+      // except the 12-byte one has a fourth byte per entry that is discarded
+      for (i=0; i < psize; ++i) {
+         pal[i][2] = stbi__get8(s);
+         pal[i][1] = stbi__get8(s);
+         pal[i][0] = stbi__get8(s);
+         if (info.hsz != 12) stbi__get8(s);
+         pal[i][3] = 255;
+      }
+      // skip whatever lies between the end of the palette and the pixel data
+      stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+      if (info.bpp == 4) width = (s->img_x + 1) >> 1;
+      else if (info.bpp == 8) width = s->img_x;
+      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
+      // rows are padded to a multiple of 4 bytes
+      pad = (-width)&3;
+      for (j=0; j < (int) s->img_y; ++j) {
+         // two pixels per iteration: for 4 bpp both come from one byte
+         for (i=0; i < (int) s->img_x; i += 2) {
+            int v=stbi__get8(s),v2=0;
+            if (info.bpp == 4) {
+               v2 = v & 15;
+               v >>= 4;
+            }
+            out[z++] = pal[v][0];
+            out[z++] = pal[v][1];
+            out[z++] = pal[v][2];
+            if (target == 4) out[z++] = 255;
+            // odd width: the row ends after the first pixel of the pair
+            if (i+1 == (int) s->img_x) break;
+            v = (info.bpp == 8) ? stbi__get8(s) : v2;
+            out[z++] = pal[v][0];
+            out[z++] = pal[v][1];
+            out[z++] = pal[v][2];
+            if (target == 4) out[z++] = 255;
+         }
+         stbi__skip(s, pad);
+      }
+   } else {
+      // direct-colour image (16, 24 or 32 bits per pixel)
+      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
+      int z = 0;
+      // easy: 0 = decode through the bit masks, 1 = 24-bit byte triples,
+      // 2 = 32-bit with the standard 8-8-8-8 masks
+      int easy=0;
+      stbi__skip(s, info.offset - 14 - info.hsz);
+      if (info.bpp == 24) width = 3 * s->img_x;
+      else if (info.bpp == 16) width = 2*s->img_x;
+      else /* bpp = 32 and pad = 0 */ width=0;
+      pad = (-width) & 3;
+      if (info.bpp == 24) {
+         easy = 1;
+      } else if (info.bpp == 32) {
+         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
+            easy = 2;
+      }
+      if (!easy) {
+         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
+         // right shift amt to put high bit in position #7
+         rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
+         gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
+         bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
+         ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
+      }
+      for (j=0; j < (int) s->img_y; ++j) {
+         if (easy) {
+            for (i=0; i < (int) s->img_x; ++i) {
+               unsigned char a;
+               // the three bytes read from the file are stored in reverse order
+               out[z+2] = stbi__get8(s);
+               out[z+1] = stbi__get8(s);
+               out[z+0] = stbi__get8(s);
+               z += 3;
+               a = (easy == 2 ? stbi__get8(s) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = a;
+            }
+         } else {
+            int bpp = info.bpp;
+            for (i=0; i < (int) s->img_x; ++i) {
+               stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
+               int a;
+               // isolate each channel with its mask and expand it to 8 bits
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
+               a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = STBI__BYTECAST(a);
+            }
+         }
+         stbi__skip(s, pad);
+      }
+   }
+   
+   // if alpha channel is all 0s, replace with all 255s
+   if (target == 4 && all_a == 0)
+      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
+         out[i] = 255;
+
+   if (flip_vertically) {
+      // swap row j with row (height-1-j), byte by byte
+      stbi_uc t;
+      for (j=0; j < (int) s->img_y>>1; ++j) {
+         stbi_uc *p1 = out +      j     *s->img_x*target;
+         stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
+         for (i=0; i < (int) s->img_x*target; ++i) {
+            t = p1[i], p1[i] = p2[i], p2[i] = t;
+         }
+      }
+   }
+
+   if (req_comp && req_comp != target) {
+      out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp) *comp = s->img_n;
+   return out;
+}
+#endif
+
+// Targa Truevision - TGA
+// by Jonathan Dummer
+#ifndef STBI_NO_TGA
+// Map a TGA bit depth to a component count (STBI_grey, STBI_grey_alpha,
+// STBI_rgb, or bits/8 for 24/32-bit data); returns 0 for unsupported depths.
+// If is_rgb16 is non-NULL it is set to 1 when the data is 15/16-bit RGB and
+// to 0 otherwise.
+static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
+{
+   // only RGB or RGBA (incl. 16bit) or grey allowed
+   if (is_rgb16) *is_rgb16 = 0;
+
+   if (bits_per_pixel == 8)
+      return STBI_grey;
+
+   // 16 bits is grey+alpha for greyscale images, otherwise packed RGB like 15 bits
+   if (bits_per_pixel == 16 && is_grey)
+      return STBI_grey_alpha;
+
+   if (bits_per_pixel == 15 || bits_per_pixel == 16) {
+      if (is_rgb16) *is_rgb16 = 1;
+      return STBI_rgb;
+   }
+
+   if (bits_per_pixel == 24 || bits_per_pixel == 32)
+      return bits_per_pixel/8;
+
+   return 0;
+}
+
+// Parse just enough of a TGA header to report the image size and component
+// count. On success stores them through whichever of x, y, comp are non-NULL
+// and returns 1 (the stream is left after the header bytes that were read).
+// On any rejected field the stream is rewound and 0 is returned.
+static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
+{
+    int width, height, components, image_type, pixel_bits, palette_entry_bits;
+    int colormap_type;
+
+    stbi__get8(s);                      // discard Offset
+    colormap_type = stbi__get8(s);      // colormap type
+    if (colormap_type > 1) goto reject; // only RGB or indexed allowed
+
+    image_type = stbi__get8(s);         // image type
+    if (colormap_type == 1) {
+        // colormapped (paletted) image: only types 1 and 9 are accepted
+        if (image_type != 1 && image_type != 9) goto reject;
+        stbi__skip(s, 4);               // skip index of first colormap entry and number of entries
+        palette_entry_bits = stbi__get8(s); // bits per palette color entry
+        if (palette_entry_bits != 8  && palette_entry_bits != 15 &&
+            palette_entry_bits != 16 && palette_entry_bits != 24 &&
+            palette_entry_bits != 32)
+            goto reject;
+        stbi__skip(s, 4);               // skip image x and y origin
+    } else {
+        // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+        if (image_type != 2 && image_type != 3 && image_type != 10 && image_type != 11)
+            goto reject;
+        stbi__skip(s, 9);               // skip colormap specification and image x/y origin
+        palette_entry_bits = 0;
+    }
+
+    width = stbi__get16le(s);
+    if (width < 1) goto reject;         // test width
+    height = stbi__get16le(s);
+    if (height < 1) goto reject;        // test height
+
+    pixel_bits = stbi__get8(s);         // bits per pixel
+    stbi__get8(s);                      // ignore alpha bits
+
+    if (palette_entry_bits != 0) {
+        // when using a colormap, pixel_bits is the size of the indexes
+        // I don't think anything but 8 or 16bit indexes makes sense
+        if (pixel_bits != 8 && pixel_bits != 16) goto reject;
+        components = stbi__tga_get_comp(palette_entry_bits, 0, NULL);
+    } else {
+        components = stbi__tga_get_comp(pixel_bits, (image_type == 3) || (image_type == 11), NULL);
+    }
+    if (!components) goto reject;
+
+    if (x) *x = width;
+    if (y) *y = height;
+    if (comp) *comp = components;
+    return 1;                           // seems to have passed everything
+
+reject:
+    stbi__rewind(s);
+    return 0;
+}
+
+// Heuristic check that the stream holds a TGA image: validates the colormap
+// type, image type, palette entry size, dimensions and bits per pixel.
+// Returns 1 if every check passes and 0 otherwise; the stream is rewound in
+// both cases.
+static int stbi__tga_test(stbi__context *s)
+{
+   int ok = 0;
+   int field, colormap_type;
+
+   do {
+      stbi__get8(s);                    //   discard Offset
+      colormap_type = stbi__get8(s);    //   color type
+      if (colormap_type > 1) break;     //   only RGB or indexed allowed
+
+      field = stbi__get8(s);            //   image type
+      if (colormap_type == 1) {
+         // colormapped (paletted) image: colortype 1 demands image type 1 or 9
+         if (field != 1 && field != 9) break;
+         stbi__skip(s,4);               // skip index of first colormap entry and number of entries
+         field = stbi__get8(s);         //   check bits per palette color entry
+         if (field != 8 && field != 15 && field != 16 && field != 24 && field != 32) break;
+         stbi__skip(s,4);               // skip image x and y origin
+      } else {
+         // "normal" image w/o colormap: only RGB or grey allowed, +/- RLE
+         if (field != 2 && field != 3 && field != 10 && field != 11) break;
+         stbi__skip(s,9);               // skip colormap specification and image x/y origin
+      }
+
+      if (stbi__get16le(s) < 1) break;  //   test width
+      if (stbi__get16le(s) < 1) break;  //   test height
+
+      field = stbi__get8(s);            //   bits per pixel
+      // for colormapped images, bpp is size of an index
+      if (colormap_type == 1 && field != 8 && field != 16) break;
+      if (field != 8 && field != 15 && field != 16 && field != 24 && field != 32) break;
+
+      ok = 1; // if we got this far, everything's good and we can return 1 instead of 0
+   } while (0);
+
+   stbi__rewind(s);
+   return ok;
+}
+
+// Read one little-endian 16-bit pixel from s and expand its three 5-bit
+// channels to 8 bits each, writing them to out[0], out[1], out[2] in
+// R, G, B order. Exactly three bytes of out are written.
+// Declared static like the other stbi__ helpers so this internal function
+// does not export a symbol from the translation unit that includes it.
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
+{
+   stbi__uint16 px = stbi__get16le(s);
+   stbi__uint16 fiveBitMask = 31;
+   // we have 3 channels with 5bits each
+   int r = (px >> 10) & fiveBitMask;
+   int g = (px >> 5) & fiveBitMask;
+   int b = px & fiveBitMask;
+   // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
+   // (each channel is at most 31, so the scaled value always fits in a byte)
+   out[0] = (stbi_uc)((r * 255)/31);
+   out[1] = (stbi_uc)((g * 255)/31);
+   out[2] = (stbi_uc)((b * 255)/31);
+
+   // some people claim that the most significant bit might be used for alpha
+   // (possibly if an alpha-bit is set in the "image descriptor byte")
+   // but that only made 16bit test images completely translucent..
+   // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
+}
+
+static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   //   read in the TGA header stuff
+   int tga_offset = stbi__get8(s);
+   int tga_indexed = stbi__get8(s);
+   int tga_image_type = stbi__get8(s);
+   int tga_is_RLE = 0;
+   int tga_palette_start = stbi__get16le(s);
+   int tga_palette_len = stbi__get16le(s);
+   int tga_palette_bits = stbi__get8(s);
+   int tga_x_origin = stbi__get16le(s);
+   int tga_y_origin = stbi__get16le(s);
+   int tga_width = stbi__get16le(s);
+   int tga_height = stbi__get16le(s);
+   int tga_bits_per_pixel = stbi__get8(s);
+   int tga_comp, tga_rgb16=0;
+   int tga_inverted = stbi__get8(s);
+   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
+   //   image data
+   unsigned char *tga_data;
+   unsigned char *tga_palette = NULL;
+   int i, j;
+   unsigned char raw_data[4];
+   int RLE_count = 0;
+   int RLE_repeating = 0;
+   int read_next_pixel = 1;
+
+   //   do a tiny bit of precessing
+   if ( tga_image_type >= 8 )
+   {
+      tga_image_type -= 8;
+      tga_is_RLE = 1;
+   }
+   tga_inverted = 1 - ((tga_inverted >> 5) & 1);
+
+   //   If I'm paletted, then I'll use the number of bits from the palette
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
+
+   //   tga info
+   *x = tga_width;
+   *y = tga_height;
+   if (comp) *comp = tga_comp;
+
+   tga_data = (unsigned char*)stbi__malloc( (size_t)tga_width * tga_height * tga_comp );
+   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
+
+   // skip to the data's starting position (offset usually = 0)
+   stbi__skip(s, tga_offset );
+
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
+      for (i=0; i < tga_height; ++i) {
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
+         stbi__getn(s, tga_row, tga_width * tga_comp);
+      }
+   } else  {
+      //   do I need to load a palette?
+      if ( tga_indexed)
+      {
+         //   any data to skip? (offset usually = 0)
+         stbi__skip(s, tga_palette_start );
+         //   load the palette
+         tga_palette = (unsigned char*)stbi__malloc( tga_palette_len * tga_comp );
+         if (!tga_palette) {
+            STBI_FREE(tga_data);
+            return stbi__errpuc("outofmem", "Out of memory");
+         }
+         if (tga_rgb16) {
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
+            }
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+               STBI_FREE(tga_data);
+               STBI_FREE(tga_palette);
+               return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+      }
+      //   load the data
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
+         if ( tga_is_RLE )
+         {
+            if ( RLE_count == 0 )
+            {
+               //   yep, get the next byte as a RLE command
+               int RLE_cmd = stbi__get8(s);
+               RLE_count = 1 + (RLE_cmd & 127);
+               RLE_repeating = RLE_cmd >> 7;
+               read_next_pixel = 1;
+            } else if ( !RLE_repeating )
+            {
+               read_next_pixel = 1;
+            }
+         } else
+         {
+            read_next_pixel = 1;
+         }
+         //   OK, if I need to read a pixel, do it now
+         if ( read_next_pixel )
+         {
+            //   load however much data we did have
+            if ( tga_indexed )
+            {
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
+               if ( pal_idx >= tga_palette_len ) {
+                  // invalid index
+                  pal_idx = 0;
+               }
+               pal_idx *= tga_comp;
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = tga_palette[pal_idx+j];
+               }
+            } else if(tga_rgb16) {
+               STBI_ASSERT(tga_comp == STBI_rgb);
+               stbi__tga_read_rgb16(s, raw_data);
+            } else {
+               //   read in the data raw
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = stbi__get8(s);
+               }
+            }
+            //   clear the reading flag for the next pixel
+            read_next_pixel = 0;
+         } // end of reading a pixel
+
+         // copy data
+         for (j = 0; j < tga_comp; ++j)
+           tga_data[i*tga_comp+j] = raw_data[j];
+
+         //   in case we're in RLE mode, keep counting down
+         --RLE_count;
+      }
+      //   do I need to invert the image?
+      if ( tga_inverted )
+      {
+         for (j = 0; j*2 < tga_height; ++j)
+         {
+            int index1 = j * tga_width * tga_comp;
+            int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
+            for (i = tga_width * tga_comp; i > 0; --i)
+            {
+               unsigned char temp = tga_data[index1];
+               tga_data[index1] = tga_data[index2];
+               tga_data[index2] = temp;
+               ++index1;
+               ++index2;
+            }
+         }
+      }
+      //   clear my palette, if I had one
+      if ( tga_palette != NULL )
+      {
+         STBI_FREE( tga_palette );
+      }
+   }
+
+   // swap RGB - if the source data was RGB16, it already is in the right order
+   if (tga_comp >= 3 && !tga_rgb16)
+   {
+      unsigned char* tga_pixel = tga_data;
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         unsigned char temp = tga_pixel[0];
+         tga_pixel[0] = tga_pixel[2];
+         tga_pixel[2] = temp;
+         tga_pixel += tga_comp;
+      }
+   }
+
+   // convert to target component count
+   if (req_comp && req_comp != tga_comp)
+      tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
+
+   //   the things I do to get rid of an error message, and yet keep
+   //   Microsoft's C compilers happy... [8^(
+   tga_palette_start = tga_palette_len = tga_palette_bits =
+         tga_x_origin = tga_y_origin = 0;
+   //   OK, done
+   return tga_data;
+}
+#endif
+
+// *************************************************************************************************
+// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
+
+#ifndef STBI_NO_PSD
+// Reports whether the stream starts with the big-endian 32-bit PSD signature
+// 0x38425053 ("8BPS"). The stream is rewound before returning.
+static int stbi__psd_test(stbi__context *s)
+{
+   int is_psd;
+
+   if (stbi__get32be(s) == 0x38425053)
+      is_psd = 1;
+   else
+      is_psd = 0;
+
+   stbi__rewind(s);
+   return is_psd;
+}
+
+// Decodes a version-1, RGB-mode Photoshop PSD (8 or 16 bits per channel, raw or
+// RLE-compressed) into a newly allocated buffer of 4 bytes per pixel, then
+// converts it to req_comp components when req_comp is nonzero and not 4.
+//   x, y     - receive the image width and height on success
+//   comp     - if non-NULL, receives 4
+//   req_comp - requested component count, or 0 to keep all 4
+// Returns the pixel buffer. Returns the result of stbi__errpuc() when the
+// header is unsupported, the dimensions are unusable, the allocation fails, or
+// the RLE stream describes a run that does not fit in the image.
+static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   int   pixelCount;
+   int channelCount, compression;
+   int channel, i, count, len;
+   int bitdepth;
+   int w,h;
+   stbi_uc *out;
+
+   // Check identifier
+   if (stbi__get32be(s) != 0x38425053)   // "8BPS"
+      return stbi__errpuc("not PSD", "Corrupt PSD image");
+
+   // Check file type version.
+   if (stbi__get16be(s) != 1)
+      return stbi__errpuc("wrong version", "Unsupported version of PSD image");
+
+   // Skip 6 reserved bytes.
+   stbi__skip(s, 6 );
+
+   // Read the number of channels (R, G, B, A, etc).
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16)
+      return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
+
+   // Read the rows and columns of the image.
+   h = stbi__get32be(s);
+   w = stbi__get32be(s);
+
+   // Make sure the depth is 8 or 16 bits.
+   bitdepth = stbi__get16be(s);
+   if (bitdepth != 8 && bitdepth != 16)
+      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
+
+   // Make sure the color mode is RGB.
+   // Valid options are:
+   //   0: Bitmap
+   //   1: Grayscale
+   //   2: Indexed color
+   //   3: RGB color
+   //   4: CMYK color
+   //   7: Multichannel
+   //   8: Duotone
+   //   9: Lab color
+   if (stbi__get16be(s) != 3)
+      return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
+
+   // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
+   stbi__skip(s,stbi__get32be(s) );
+
+   // Skip the image resources.  (resolution, pen tool paths, etc)
+   stbi__skip(s, stbi__get32be(s) );
+
+   // Skip the reserved data.
+   stbi__skip(s, stbi__get32be(s) );
+
+   // Find out if the data is compressed.
+   // Known values:
+   //   0: no compression
+   //   1: RLE compressed
+   compression = stbi__get16be(s);
+   if (compression > 1)
+      return stbi__errpuc("bad compression", "PSD has an unknown compression format");
+
+   // The 32-bit header fields are stored into ints, so they can come out
+   // negative, and 4*w*h must fit in an int for the allocation below.
+   if (w < 0 || h < 0 || (h != 0 && w > (0x7fffffff / 4) / h))
+      return stbi__errpuc("too large", "Image too large to decode");
+
+   // Create the destination image.
+   pixelCount = w*h;
+   out = (stbi_uc *) stbi__malloc(4 * pixelCount);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+
+   // Initialize the data to zero.
+   //memset( out, 0, pixelCount * 4 );
+
+   // Finally, the image data.
+   if (compression) {
+      // RLE as used by .PSD and .TIFF
+      // Loop until you get the number of unpacked bytes you are expecting:
+      //     Read the next source byte into n.
+      //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
+      //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
+      //     Else if n is 128, noop.
+      // Endloop
+      // NOTE: this path does not consult bitdepth; every decoded byte is
+      // stored as one channel value.
+
+      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
+      // which we're going to just skip.
+      stbi__skip(s, h * channelCount * 2 );
+
+      // Read the RLE data by channel.
+      for (channel = 0; channel < 4; channel++) {
+         stbi_uc *p;
+
+         p = out+channel;
+         if (channel >= channelCount) {
+            // Fill this channel with default data.
+            for (i = 0; i < pixelCount; i++, p += 4)
+               *p = (channel == 3 ? 255 : 0);
+         } else {
+            // Read the RLE data.
+            count = 0;
+            while (count < pixelCount) {
+               int nleft = pixelCount - count;   // values this channel can still hold
+               len = stbi__get8(s);
+               if (len == 128) {
+                  // No-op.
+               } else if (len < 128) {
+                  // Copy next len+1 bytes literally.
+                  len++;
+                  if (len > nleft) {
+                     // run would write past the end of the output buffer
+                     STBI_FREE(out);
+                     return stbi__errpuc("corrupt", "bad RLE data");
+                  }
+                  count += len;
+                  while (len) {
+                     *p = stbi__get8(s);
+                     p += 4;
+                     len--;
+                  }
+               } else if (len > 128) {
+                  stbi_uc   val;
+                  // Next -len+1 bytes in the dest are replicated from next source byte.
+                  // (Interpret len as a negative 8-bit int.)
+                  len ^= 0x0FF;
+                  len += 2;
+                  if (len > nleft) {
+                     // run would write past the end of the output buffer
+                     STBI_FREE(out);
+                     return stbi__errpuc("corrupt", "bad RLE data");
+                  }
+                  val = stbi__get8(s);
+                  count += len;
+                  while (len) {
+                     *p = val;
+                     p += 4;
+                     len--;
+                  }
+               }
+            }
+         }
+      }
+
+   } else {
+      // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
+      // where each channel consists of an 8-bit or 16-bit value for each pixel in the image.
+
+      // Read the data by channel.
+      for (channel = 0; channel < 4; channel++) {
+         stbi_uc *p;
+
+         p = out + channel;
+         if (channel >= channelCount) {
+            // Fill this channel with default data.
+            stbi_uc val = channel == 3 ? 255 : 0;
+            for (i = 0; i < pixelCount; i++, p += 4)
+               *p = val;
+         } else {
+            // Read the data.
+            if (bitdepth == 16) {
+               // keep only the high byte of each 16-bit sample
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = (stbi_uc) (stbi__get16be(s) >> 8);
+            } else {
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = stbi__get8(s);
+            }
+         }
+      }
+   }
+
+   if (channelCount >= 4) {
+      for (i=0; i < pixelCount; ++i) {
+         unsigned char *pixel = out + 4*i;
+         if (pixel[3] != 0 && pixel[3] != 255) {
+            // remove weird white matte from PSD
+            float a = pixel[3] / 255.0f;
+            float ra = 1.0f / a;
+            float inv_a = 255.0f * (1 - ra);
+            pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
+            pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
+            pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
+         }
+      }
+   }
+
+   if (req_comp && req_comp != 4) {
+      out = stbi__convert_format(out, 4, req_comp, w, h);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   if (comp) *comp = 4;
+   *y = h;
+   *x = w;
+
+   return out;
+}
+#endif
+
+// *************************************************************************************************
+// Softimage PIC loader
+// by Tom Seddon
+//
+// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format
+// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
+
+#ifndef STBI_NO_PIC
+// Reads bytes from the stream and compares them with str[0..3] in order.
+// Returns 0 at the first mismatch (no further bytes are read), or 1 when all
+// four bytes match.
+static int stbi__pic_is4(stbi__context *s,const char *str)
+{
+   const char *cursor = str;
+   const char *end = str + 4;
+
+   while (cursor != end) {
+      if (stbi__get8(s) != (stbi_uc)*cursor)
+         return 0;
+      ++cursor;
+   }
+
+   return 1;
+}
+
+// Checks for the PIC layout: the 4-byte magic 53 80 F6 34, then 84 bytes that
+// are read and ignored, then the tag "PICT". Returns 1 on a match, else 0.
+// Does not rewind the stream.
+static int stbi__pic_test_core(stbi__context *s)
+{
+   int remaining;
+
+   if (stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      for (remaining = 84; remaining > 0; --remaining)
+         stbi__get8(s);
+
+      if (stbi__pic_is4(s,"PICT"))
+         return 1;
+   }
+
+   return 0;
+}
+
+typedef struct   // one PIC channel-packet descriptor, filled from three consecutive header bytes
+{
+   stbi_uc size,type,channel;   // size: loaders reject anything but 8; type: 0 uncompressed, 1 pure RLE, 2 mixed RLE; channel: bit mask (0x80>>i selects byte i of a pixel, 0x10 is tested for alpha)
+} stbi__pic_packet;
+
+// Reads one byte from the stream into dest[i] for every i in 0..3 whose mask
+// bit (0x80 >> i) is set in channel. Unselected bytes of dest are untouched.
+// Returns dest, or the result of stbi__errpuc() if the stream is at EOF before
+// a selected byte is read.
+static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
+{
+   int idx = 0;
+   int bit;
+
+   for (bit = 0x80; idx < 4; bit >>= 1, ++idx) {
+      if ((channel & bit) == 0)
+         continue;
+      if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
+      dest[idx] = stbi__get8(s);
+   }
+
+   return dest;
+}
+
+// Copies src[i] to dest[i] for every i in 0..3 whose mask bit (0x80 >> i) is
+// set in channel; other bytes of dest are left as they were.
+static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
+{
+   int idx = 0;
+   int bit = 0x80;
+
+   while (idx < 4) {
+      if (channel & bit)
+         dest[idx] = src[idx];
+      bit >>= 1;
+      ++idx;
+   }
+}
+
+static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
+{  /* Reads the chained packet descriptors from the stream, then decodes
+    * `height` scanlines of `width` pixels into `result`, which is addressed
+    * as 4 bytes per pixel (row y starts at result + y*width*4).
+    * *comp is set to 4 when any packet's channel mask has bit 0x10 set,
+    * otherwise to 3.
+    * Returns `result` on success. On failure returns the value of
+    * stbi__errpuc(...), or 0 when stbi__readval fails. */
+   int act_comp=0,num_packets=0,y,chained;
+   stbi__pic_packet packets[10];   // descriptor list; an 11th packet is rejected below
+
+   // this will (should...) cater for even some bizarre stuff like having data
+   // for the same channel in multiple packets.
+   do {
+      stbi__pic_packet *packet;
+
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
+
+      packet = &packets[num_packets++];
+
+      chained = stbi__get8(s);   // nonzero means another descriptor follows
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+
+      act_comp |= packet->channel;   // union of every channel mask seen
+
+      if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+
+   for(y=0; y<height; ++y) {
+      int packet_idx;
+
+      // every packet contributes its channels to the same row
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         stbi_uc *dest = result+y*width*4;   // first pixel of row y
+
+         switch (packet->type) {
+            default:
+               return stbi__errpuc("bad format","packet has bad compression type");
+
+            case 0: {//uncompressed
+               int x;
+
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
+            }
+
+            case 1://Pure RLE
+               {
+                  int left=width, i;   // left: pixels still to fill in this row
+
+                  while (left>0) {
+                     stbi_uc count,value[4];
+
+                     count=stbi__get8(s);
+                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
+
+                     if (count > left)   // clamp the run to the pixels remaining in the row
+                        count = (stbi_uc) left;
+
+                     if (!stbi__readval(s,packet->channel,value))  return 0;
+
+                     for(i=0; i<count; ++i,dest+=4)
+                        stbi__copyval(packet->channel,dest,value);
+                     left -= count;
+                  }
+               }
+               break;
+
+            case 2: {//Mixed RLE
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     if (count==128)   // 128: the run length follows as a 16-bit big-endian value
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;   // 129..255 encode run lengths 2..128
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
+                        return 0;
+
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     ++count;   // 0..127 encode 1..128 literal pixels
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
+
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+// Loads a Softimage PIC image.
+//   px, py   - receive the width and height on success
+//   comp     - if non-NULL, receives the component count reported by
+//              stbi__pic_load_core (3, or 4 when an alpha channel is present)
+//   req_comp - requested component count, or 0 to use the file's own count
+// Returns the converted pixel buffer. Returns the result of stbi__errpuc(), or
+// 0, when the header is truncated, the image is too large, the allocation
+// fails, or decoding fails.
+static stbi_uc *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp)
+{
+   stbi_uc *result;
+   int i, x,y;
+   int internal_comp;
+
+   // The PSD and HDR loaders treat comp as optional; stbi__pic_load_core
+   // always writes through it, so give it somewhere to write.
+   if (!comp) comp = &internal_comp;
+
+   // skip the first 92 bytes of the header
+   for (i=0; i<92; ++i)
+      stbi__get8(s);
+
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
+   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
+   // x == 0 must not reach the division (stbi__pic_info guards it the same way)
+   if (x != 0 && (1 << 28) / x < y) return stbi__errpuc("too large", "Image too large to decode");
+
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
+
+   // intermediate buffer is RGBA
+   result = (stbi_uc *) stbi__malloc(x*y*4);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   memset(result, 0xff, x*y*4);
+
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      // do not hand a NULL buffer (or an unset *comp) to the converter
+      STBI_FREE(result);
+      return 0;
+   }
+   *px = x;
+   *py = y;
+   if (req_comp == 0) req_comp = *comp;
+   result=stbi__convert_format(result,4,req_comp,x,y);
+
+   return result;
+}
+
+// Runs stbi__pic_test_core on the stream, rewinds it, and returns the
+// result of that check.
+static int stbi__pic_test(stbi__context *s)
+{
+   int matched;
+
+   matched = stbi__pic_test_core(s);
+   stbi__rewind(s);
+
+   return matched;
+}
+#endif
+
+// *************************************************************************************************
+// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
+
+#ifndef STBI_NO_GIF
+typedef struct   // one entry of the GIF LZW code table
+{
+   stbi__int16 prefix;   // code emitted before this entry's suffix; negative (-1) for the initial root entries
+   stbi_uc first;        // copied from the prefix entry's `first`; a root entry holds its own code value
+   stbi_uc suffix;       // palette index written out for this entry
+} stbi__gif_lzw;
+
+typedef struct   // GIF decoder state shared by the header, raster and frame functions
+{
+   int w,h;   // dimensions read by stbi__gif_header
+   stbi_uc *out, *old_out;             // output buffer (always 4 components)
+   int flags, bgindex, ratio, transparent, eflags, delay;   // flags/bgindex/ratio come from the header; eflags/delay/transparent from a Graphic Control Extension (transparent starts at -1)
+   stbi_uc  pal[256][4];   // color table parsed when (flags & 0x80) is set
+   stbi_uc lpal[256][4];   // color table parsed when (lflags & 0x80) is set
+   stbi__gif_lzw codes[4096];   // LZW code table
+   stbi_uc *color_table;   // points at pal or lpal while a raster is decoded
+   int parse, step;   // interlace pass counter and row advance, in bytes
+   int lflags;   // flags byte of the current Image Descriptor
+   int start_x, start_y;   // byte offsets of the frame rectangle: x*4 and y*line_size
+   int max_x, max_y;   // byte offsets one past the frame rectangle
+   int cur_x, cur_y;   // byte offsets of the next pixel to write
+   int line_size;   // w * 4
+} stbi__gif;
+
+// Checks for a GIF signature: "GIF8", then '7' or '9', then 'a'.
+// Reading stops at the first byte that does not match. Returns 1 on a match,
+// else 0. Does not rewind the stream.
+static int stbi__gif_test_raw(stbi__context *s)
+{
+   const char *magic = "GIF8";
+   int k, ver;
+
+   for (k = 0; k < 4; ++k) {
+      if (stbi__get8(s) != magic[k])
+         return 0;
+   }
+
+   ver = stbi__get8(s);
+   if (!(ver == '9' || ver == '7'))
+      return 0;
+
+   if (stbi__get8(s) == 'a')
+      return 1;
+   return 0;
+}
+
+// Runs stbi__gif_test_raw on the stream, rewinds it, and returns the
+// result of that check.
+static int stbi__gif_test(stbi__context *s)
+{
+   int matched;
+
+   matched = stbi__gif_test_raw(s);
+   stbi__rewind(s);
+
+   return matched;
+}
+
+// Reads num_entries 3-byte color entries from the stream into pal. The three
+// bytes of each entry are stored in reverse order (first byte read goes to
+// [2], last to [0]). Byte [3] is 0 for the entry whose index equals transp
+// and 255 for every other entry.
+static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
+{
+   int idx;
+
+   for (idx = 0; idx < num_entries; ++idx) {
+      stbi_uc *entry = pal[idx];
+
+      entry[2] = stbi__get8(s);
+      entry[1] = stbi__get8(s);
+      entry[0] = stbi__get8(s);
+
+      if (idx == transp)
+         entry[3] = 0;
+      else
+         entry[3] = 255;
+   }
+}
+
+// Validates the GIF signature and reads width, height, flags, background
+// index and ratio into g; g->transparent is reset to -1.
+//   comp    - if non-NULL, receives 4
+//   is_info - when nonzero, return right after the fixed fields without
+//             reading the color table
+// Returns 1 on success, or the result of stbi__err() on a bad signature.
+static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
+{
+   const char *magic = "GIF8";
+   stbi_uc ver;
+   int k;
+
+   for (k = 0; k < 4; ++k) {
+      if (stbi__get8(s) != magic[k])
+         return stbi__err("not GIF", "Corrupt GIF");
+   }
+
+   ver = stbi__get8(s);
+   if (!(ver == '7' || ver == '9'))         return stbi__err("not GIF", "Corrupt GIF");
+   if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
+
+   stbi__g_failure_reason = "";
+
+   g->w = stbi__get16le(s);
+   g->h = stbi__get16le(s);
+   g->flags = stbi__get8(s);
+   g->bgindex = stbi__get8(s);
+   g->ratio = stbi__get8(s);
+   g->transparent = -1;
+
+   // can't actually tell whether it's 3 or 4 until we parse the comments
+   if (comp) *comp = 4;
+
+   // bit 0x80 of flags means a color table follows; info queries skip it
+   if (!is_info && (g->flags & 0x80))
+      stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
+
+   return 1;
+}
+
+// Reads only the GIF header and reports the image dimensions.
+//   x, y - if non-NULL, receive the width and height
+//   comp - passed through to stbi__gif_header
+// Returns 1 on success. Returns 0 (after rewinding the stream) if the header
+// is not a valid GIF header, or the result of stbi__err() if the temporary
+// decoder state cannot be allocated.
+static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   // stbi__gif_header writes through g, so a failed allocation must stop here
+   if (!g) return stbi__err("outofmem", "Out of memory");
+   if (!stbi__gif_header(s, g, comp, 1)) {
+      STBI_FREE(g);
+      stbi__rewind( s );
+      return 0;
+   }
+   if (x) *x = g->w;
+   if (y) *y = g->h;
+   STBI_FREE(g);
+   return 1;
+}
+
+// Writes the pixels for one LZW code into g->out at the current position and
+// advances the position, moving to the next row (and the next interlace pass)
+// as needed. Nothing is written once cur_y has reached max_y.
+static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
+{
+   stbi__gif_lzw *entry = &g->codes[code];
+   stbi_uc *dst, *src;
+
+   // recurse to decode the prefixes, since the linked-list is backwards,
+   // and working backwards through an interleaved image would be nasty
+   if (entry->prefix >= 0)
+      stbi__out_gif_code(g, entry->prefix);
+
+   if (g->cur_y >= g->max_y) return;
+
+   dst = &g->out[g->cur_x + g->cur_y];
+   src = &g->color_table[entry->suffix * 4];
+
+   // entries whose 4th byte is below 128 leave the destination pixel untouched
+   if (src[3] >= 128) {
+      dst[0] = src[2];
+      dst[1] = src[1];
+      dst[2] = src[0];
+      dst[3] = src[3];
+   }
+   g->cur_x += 4;
+
+   if (g->cur_x < g->max_x)
+      return;
+
+   // end of row: wrap to the left edge and step down
+   g->cur_x = g->start_x;
+   g->cur_y += g->step;
+
+   // past the bottom with passes remaining: start the next interlace pass
+   while (g->cur_y >= g->max_y && g->parse > 0) {
+      g->step = (1 << g->parse) * g->line_size;
+      g->cur_y = g->start_y + (g->step >> 1);
+      --g->parse;
+   }
+}
+
+static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
+{  /* Decodes the LZW-compressed raster data of one GIF image from the stream,
+    * emitting pixels through stbi__out_gif_code.
+    * Returns g->out when a zero-length data block or the end-of-stream code
+    * is reached. Returns NULL when the initial code size byte exceeds 12, or
+    * the result of stbi__errpuc() on a malformed code stream. */
+   stbi_uc lzw_cs;
+   stbi__int32 len, init_code;
+   stbi__uint32 first;
+   stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
+   stbi__gif_lzw *p;
+
+   lzw_cs = stbi__get8(s);   // initial code size byte
+   if (lzw_cs > 12) return NULL;
+   clear = 1 << lzw_cs;      // clear code; clear+1 is the end-of-stream code
+   first = 1;                // stays set until a clear code has been seen
+   codesize = lzw_cs + 1;
+   codemask = (1 << codesize) - 1;
+   bits = 0;
+   valid_bits = 0;
+   // root entries: each code below `clear` stands for itself
+   for (init_code = 0; init_code < clear; init_code++) {
+      g->codes[init_code].prefix = -1;
+      g->codes[init_code].first = (stbi_uc) init_code;
+      g->codes[init_code].suffix = (stbi_uc) init_code;
+   }
+
+   // support no starting clear code
+   avail = clear+2;   // next free table slot
+   oldcode = -1;      // no previous code yet
+
+   len = 0;           // bytes left in the current data block
+   for(;;) {
+      if (valid_bits < codesize) {
+         // not enough buffered bits for a code: pull in one more byte
+         if (len == 0) {
+            len = stbi__get8(s); // start new block
+            if (len == 0)
+               return g->out;
+         }
+         --len;
+         bits |= (stbi__int32) stbi__get8(s) << valid_bits;   // new byte goes above the bits already held
+         valid_bits += 8;
+      } else {
+         stbi__int32 code = bits & codemask;
+         bits >>= codesize;
+         valid_bits -= codesize;
+         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
+         if (code == clear) {  // clear code
+            codesize = lzw_cs + 1;
+            codemask = (1 << codesize) - 1;
+            avail = clear + 2;
+            oldcode = -1;
+            first = 0;
+         } else if (code == clear + 1) { // end of stream code
+            stbi__skip(s, len);   // rest of the current block
+            while ((len = stbi__get8(s)) > 0)   // then every following block up to the zero-length one
+               stbi__skip(s,len);
+            return g->out;
+         } else if (code <= avail) {
+            if (first) return stbi__errpuc("no clear code", "Corrupt GIF");
+
+            if (oldcode >= 0) {
+               // add a table entry: previous code's string plus one more byte
+               p = &g->codes[avail++];
+               if (avail > 4096)        return stbi__errpuc("too many codes", "Corrupt GIF");
+               p->prefix = (stbi__int16) oldcode;
+               p->first = g->codes[oldcode].first;
+               p->suffix = (code == avail) ? p->first : g->codes[code].first;   // note: avail was already incremented above
+            } else if (code == avail)
+               return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+
+            stbi__out_gif_code(g, (stbi__uint16) code);
+
+            // widen the code size once avail reaches a multiple of the current code range
+            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
+               codesize++;
+               codemask = (1 << codesize) - 1;
+            }
+
+            oldcode = code;
+         } else {
+            return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+         }
+      }
+   }
+}
+
+// Fills part of g->out with the color g->pal[g->bgindex] and a 4th byte of 0.
+// All four bounds are byte offsets: x0..x1 advance 4 bytes per pixel within a
+// row, y0..y1 advance 4 * g->w bytes per row.
+static void stbi__fill_gif_background(stbi__gif *g, int x0, int y0, int x1, int y1)
+{
+   stbi_uc *bg = g->pal[g->bgindex];
+   int row_off, col_off;
+
+   for (row_off = y0; row_off < y1; row_off += 4 * g->w) {
+      for (col_off = x0; col_off < x1; col_off += 4) {
+         stbi_uc *px = &g->out[row_off + col_off];
+
+         // the palette bytes are copied in reverse order
+         px[0] = bg[2];
+         px[1] = bg[1];
+         px[2] = bg[0];
+         px[3] = 0;
+      }
+   }
+}
+
+// Decodes the next image of a GIF stream into a newly allocated buffer of
+// 4 bytes per pixel, stored in g->out. When g->out is 0 on entry the GIF
+// header is parsed first.
+//   comp     - passed through to stbi__gif_header
+//   req_comp - unused here
+// Returns the decoded buffer on success. Returns the context pointer `s` cast
+// to stbi_uc* when the stream termination code (0x3B) is read, and NULL or
+// the result of stbi__errpuc() on error.
+// this function is designed to support animated gifs, although stb_image doesn't support it
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp)
+{
+   int i;
+   stbi_uc *prev_out = 0;
+
+   if (g->out == 0 && !stbi__gif_header(s, g, comp,0))
+      return 0; // stbi__g_failure_reason set by stbi__gif_header
+
+   // 4 * w * h is computed in int below; with two 16-bit dimensions it can
+   // overflow, so reject such sizes before allocating.
+   if (g->w > 0 && g->h > (0x7fffffff / 4) / g->w)
+      return stbi__errpuc("too large", "Image too large to decode");
+
+   prev_out = g->out;
+   g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
+   if (g->out == 0) return stbi__errpuc("outofmem", "Out of memory");
+
+   // bits 2..4 of eflags select how the new buffer is initialised
+   switch ((g->eflags & 0x1C) >> 2) {
+      case 0: // unspecified (also always used on 1st frame)
+         stbi__fill_gif_background(g, 0, 0, 4 * g->w, 4 * g->w * g->h);
+         break;
+      case 1: // do not dispose
+         if (prev_out) memcpy(g->out, prev_out, 4 * g->w * g->h);
+         g->old_out = prev_out;
+         break;
+      case 2: // dispose to background
+         if (prev_out) memcpy(g->out, prev_out, 4 * g->w * g->h);
+         stbi__fill_gif_background(g, g->start_x, g->start_y, g->max_x, g->max_y);
+         break;
+      case 3: // dispose to previous
+         if (g->old_out) {
+            for (i = g->start_y; i < g->max_y; i += 4 * g->w)
+               memcpy(&g->out[i + g->start_x], &g->old_out[i + g->start_x], g->max_x - g->start_x);
+         }
+         break;
+   }
+
+   for (;;) {
+      switch (stbi__get8(s)) {
+         case 0x2C: /* Image Descriptor */
+         {
+            int prev_trans = -1;
+            stbi__int32 x, y, w, h;
+            stbi_uc *o;
+
+            x = stbi__get16le(s);
+            y = stbi__get16le(s);
+            w = stbi__get16le(s);
+            h = stbi__get16le(s);
+            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
+               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+
+            // frame rectangle, expressed as byte offsets into g->out
+            g->line_size = g->w * 4;
+            g->start_x = x * 4;
+            g->start_y = y * g->line_size;
+            g->max_x   = g->start_x + w * 4;
+            g->max_y   = g->start_y + h * g->line_size;
+            g->cur_x   = g->start_x;
+            g->cur_y   = g->start_y;
+
+            g->lflags = stbi__get8(s);
+
+            if (g->lflags & 0x40) {
+               g->step = 8 * g->line_size; // first interlaced spacing
+               g->parse = 3;
+            } else {
+               g->step = g->line_size;
+               g->parse = 0;
+            }
+
+            if (g->lflags & 0x80) {
+               // this image carries its own color table
+               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
+               g->color_table = (stbi_uc *) g->lpal;
+            } else if (g->flags & 0x80) {
+               // use the header's table; zero the transparent entry's 4th
+               // byte for this image and restore it afterwards
+               if (g->transparent >= 0 && (g->eflags & 0x01)) {
+                  prev_trans = g->pal[g->transparent][3];
+                  g->pal[g->transparent][3] = 0;
+               }
+               g->color_table = (stbi_uc *) g->pal;
+            } else
+               return stbi__errpuc("missing color table", "Corrupt GIF");
+
+            o = stbi__process_gif_raster(s, g);
+            if (o == NULL) return NULL;
+
+            if (prev_trans != -1)
+               g->pal[g->transparent][3] = (stbi_uc) prev_trans;
+
+            return o;
+         }
+
+         case 0x21: // Comment Extension.
+         {
+            int len;
+            if (stbi__get8(s) == 0xF9) { // Graphic Control Extension.
+               len = stbi__get8(s);
+               if (len == 4) {
+                  g->eflags = stbi__get8(s);
+                  g->delay = stbi__get16le(s);
+                  g->transparent = stbi__get8(s);
+               } else {
+                  stbi__skip(s, len);
+                  break;
+               }
+            }
+            // skip the remaining data blocks up to the zero-length terminator
+            while ((len = stbi__get8(s)) != 0)
+               stbi__skip(s, len);
+            break;
+         }
+
+         case 0x3B: // gif stream termination code
+            return (stbi_uc *) s; // using '1' causes warning on some compilers
+
+         default:
+            return stbi__errpuc("unknown code", "Corrupt GIF");
+      }
+   }
+
+   STBI_NOTUSED(req_comp);
+}
+
+// Loads the first image of a GIF.
+//   x, y     - receive the width and height on success
+//   comp     - passed through to stbi__gif_load_next
+//   req_comp - requested component count; 0 or 4 keeps the 4-component buffer
+// Returns the pixel buffer, or NULL on error, on allocation failure, or when
+// the stream ends before any image is decoded.
+static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi_uc *u = 0;
+   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   // the state is cleared and dereferenced below, so bail out if it is missing
+   if (g == 0) return stbi__errpuc("outofmem", "Out of memory");
+   memset(g, 0, sizeof(*g));
+
+   u = stbi__gif_load_next(s, g, comp, req_comp);
+   if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
+   if (u) {
+      *x = g->w;
+      *y = g->h;
+      if (req_comp && req_comp != 4)
+         u = stbi__convert_format(u, 4, req_comp, g->w, g->h);
+   }
+   else if (g->out)
+      STBI_FREE(g->out);   // nothing is returned, so release the frame buffer
+   STBI_FREE(g);
+   return u;
+}
+
+// Forwards to stbi__gif_info_raw with the same arguments and returns its
+// result unchanged.
+static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int status = stbi__gif_info_raw(s,x,y,comp);
+   return status;
+}
+#endif
+
+// *************************************************************************************************
+// Radiance RGBE HDR loader
+// originally by Nicolas Schulz
+#ifndef STBI_NO_HDR
+// Compares the next bytes of the stream with "#?RADIANCE\n". Returns 0 at
+// the first byte that differs and 1 when every byte matches. Does not rewind
+// the stream.
+static int stbi__hdr_test_core(stbi__context *s)
+{
+   const char *expected = "#?RADIANCE\n";
+
+   while (*expected) {
+      if (stbi__get8(s) != *expected)
+         return 0;
+      ++expected;
+   }
+   return 1;
+}
+
+// Runs stbi__hdr_test_core on the stream, rewinds it, and returns the
+// result of that check.
+static int stbi__hdr_test(stbi__context* s)
+{
+   int matched;
+
+   matched = stbi__hdr_test_core(s);
+   stbi__rewind(s);
+
+   return matched;
+}
+
+// Size of the line buffers handed to stbi__hdr_gettoken.
+#define STBI__HDR_BUFLEN  1024
+// Reads one line from the stream into buffer, without the '\n', and
+// NUL-terminates it. Reading stops at '\n' or at end of stream. When
+// STBI__HDR_BUFLEN-1 characters have been stored, the rest of the line is
+// read and discarded. Returns buffer.
+static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
+{
+   int used = 0;
+   char ch = (char) stbi__get8(z);
+
+   for (;;) {
+      if (stbi__at_eof(z) || ch == '\n')
+         break;
+
+      buffer[used++] = ch;
+      if (used == STBI__HDR_BUFLEN-1) {
+         // flush to end of line
+         while (!stbi__at_eof(z) && stbi__get8(z) != '\n') {
+         }
+         break;
+      }
+
+      ch = (char) stbi__get8(z);
+   }
+
+   buffer[used] = 0;
+   return buffer;
+}
+
+// Converts one 4-byte RGBE pixel (input[0..2] mantissas, input[3] exponent)
+// into req_comp floats at output.
+//   req_comp 1: average of the three channels
+//   req_comp 2: average, then 1
+//   req_comp 3: the three channels
+//   req_comp 4: the three channels, then 1
+// An exponent byte of 0 yields zero for the color values.
+static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
+{
+   float scale;
+
+   if (input[3] == 0) {
+      // zero exponent: color values are 0, the extra component (if any) is 1
+      if (req_comp == 4) {
+         output[3] = 1;
+         output[0] = output[1] = output[2] = 0;
+      } else if (req_comp == 3) {
+         output[0] = output[1] = output[2] = 0;
+      } else if (req_comp == 2) {
+         output[1] = 1;
+         output[0] = 0;
+      } else if (req_comp == 1) {
+         output[0] = 0;
+      }
+      return;
+   }
+
+   // Exponent
+   scale = (float) ldexp(1.0f, input[3] - (int)(128 + 8));
+
+   if (req_comp > 2) {
+      output[0] = input[0] * scale;
+      output[1] = input[1] * scale;
+      output[2] = input[2] * scale;
+   } else {
+      output[0] = (input[0] + input[1] + input[2]) * scale / 3;
+   }
+
+   if (req_comp == 2) output[1] = 1;
+   if (req_comp == 4) output[3] = 1;
+}
+
+// Loads a Radiance RGBE (.hdr) image as floats.
+//   x, y     - receive the width and height parsed from the "-Y h +X w" line
+//   comp     - if non-NULL, receives 3
+//   req_comp - requested component count, or 0 for 3
+// Returns a newly allocated array of height*width*req_comp floats. Returns
+// the result of stbi__errpf() when the header is unsupported, the dimensions
+// are negative or too large, an allocation fails, or the RLE data is corrupt.
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int width, height;
+   stbi_uc *scanline;
+   float *hdr_data;
+   int len;
+   unsigned char count, value;
+   int i, j, k, c1,c2, z;
+
+
+   // Check identifier
+   if (strcmp(stbi__hdr_gettoken(s,buffer), "#?RADIANCE") != 0)
+      return stbi__errpf("not HDR", "Corrupt HDR image");
+
+   // Parse header
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break;
+      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+   }
+
+   if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
+
+   // Parse width and height
+   // can't use sscanf() if we're not using stdio!
+   token = stbi__hdr_gettoken(s,buffer);
+   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   height = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   width = (int) strtol(token, NULL, 10);
+
+   *x = width;
+   *y = height;
+
+   if (comp) *comp = 3;
+   if (req_comp == 0) req_comp = 3;
+
+   // strtol can yield negative values, and the byte count below is computed
+   // in int before it reaches stbi__malloc, so it must fit in an int.
+   if (width < 0 || height < 0 ||
+       (width != 0 && height > (0x7fffffff / (int) sizeof(float) / req_comp) / width))
+      return stbi__errpf("too large", "HDR image is too large");
+
+   // Read data
+   hdr_data = (float *) stbi__malloc(height * width * req_comp * sizeof(float));
+   if (!hdr_data) return stbi__errpf("outofmem", "Out of memory");
+
+   // Load image data
+   // image data is stored as some number of scanlines
+   if ( width < 8 || width >= 32768) {
+      // Read flat data
+      for (j=0; j < height; ++j) {
+         for (i=0; i < width; ++i) {
+            stbi_uc rgbe[4];
+           main_decode_loop:
+            stbi__getn(s, rgbe, 4);
+            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
+         }
+      }
+   } else {
+      // Read RLE-encoded data
+      scanline = NULL;
+
+      for (j = 0; j < height; ++j) {
+         c1 = stbi__get8(s);
+         c2 = stbi__get8(s);
+         len = stbi__get8(s);
+         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+            // not run-length encoded, so we have to actually use THIS data as a decoded
+            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+            stbi_uc rgbe[4];
+            rgbe[0] = (stbi_uc) c1;
+            rgbe[1] = (stbi_uc) c2;
+            rgbe[2] = (stbi_uc) len;
+            rgbe[3] = (stbi_uc) stbi__get8(s);
+            stbi__hdr_convert(hdr_data, rgbe, req_comp);
+            // continue in the flat reader from pixel 1 of row 0
+            i = 1;
+            j = 0;
+            STBI_FREE(scanline);
+            goto main_decode_loop; // yes, this makes no sense
+         }
+         len <<= 8;
+         len |= stbi__get8(s);
+         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc(width * 4);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
+
+         // each of the 4 components is stored as its own run-length stream
+         for (k = 0; k < 4; ++k) {
+            int nleft;   // entries of this component still to be filled
+            i = 0;
+            while ((nleft = width - i) > 0) {
+               count = stbi__get8(s);
+               if (count > 128) {
+                  // Run
+                  value = stbi__get8(s);
+                  count -= 128;
+                  // a run longer than the rest of the line would write past scanline
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = value;
+               } else {
+                  // Dump
+                  // a zero count makes no progress (and never ends at EOF);
+                  // a count beyond the rest of the line would overflow scanline
+                  if (count == 0 || count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = stbi__get8(s);
+               }
+            }
+         }
+         for (i=0; i < width; ++i)
+            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
+      }
+      STBI_FREE(scanline);
+   }
+
+   return hdr_data;
+}
+
+// Reads the header of a Radiance HDR file and reports its dimensions.
+//   x, y - receive the width and height from the "-Y h +X w" line
+//   comp - receives 3
+// Returns 1 on success. Returns 0, after rewinding the stream, if the
+// signature, the FORMAT line or the resolution line is not as expected.
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int format_ok = 0;
+
+   if (stbi__hdr_test(s) == 0) {
+       stbi__rewind( s );
+       return 0;
+   }
+
+   // header lines run until the first empty one
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break;
+      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) format_ok = 1;
+   }
+
+   if (format_ok) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (strncmp(token, "-Y ", 3) == 0) {
+         token += 3;
+         *y = (int) strtol(token, &token, 10);
+         while (*token == ' ') ++token;
+         if (strncmp(token, "+X ", 3) == 0) {
+            token += 3;
+            *x = (int) strtol(token, NULL, 10);
+            *comp = 3;
+            return 1;
+         }
+      }
+   }
+
+   // every failure past the signature check ends up here
+   stbi__rewind( s );
+   return 0;
+}
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_BMP
+// Reports width, height and channel count of a BMP stream by running the
+// shared header parser. The stream is rewound whether or not parsing
+// succeeds. Returns 1 on success, 0 when the header parser rejects the data.
+static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__bmp_data header;
+   void *parsed;
+
+   header.all_a = 255;
+   parsed = stbi__bmp_parse_header(s, &header);
+   stbi__rewind( s );
+   if (!parsed)
+      return 0;
+
+   *x = s->img_x;
+   *y = s->img_y;
+   // NOTE(review): header.ma looks like the alpha mask; a non-zero value
+   // is reported as 4 components -- confirm against stbi__bmp_parse_header.
+   if (header.ma)
+      *comp = 4;
+   else
+      *comp = 3;
+   return 1;
+}
+#endif
+
+#ifndef STBI_NO_PSD
+// Reports the dimensions of a PSD stream from its fixed-size header.
+//
+// Returns 1 with *x, *y set and *comp = 4 when every header check passes.
+// On any failed check the stream is rewound and 0 is returned; *y and *x may
+// already have been written by then.
+static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int channels;
+
+   // 0x38425053 is the ASCII signature "8BPS"; the following 16-bit field
+   // must be 1 (presumably the format version -- confirm against the loader).
+   if (stbi__get32be(s) == 0x38425053 && stbi__get16be(s) == 1) {
+      stbi__skip(s, 6);
+      channels = stbi__get16be(s);
+      if (channels >= 0 && channels <= 16) {
+         *y = stbi__get32be(s);
+         *x = stbi__get32be(s);
+         // Two more 16-bit fields must read 8 and then 3 (presumably bit
+         // depth and colour mode -- confirm against the loader). The &&
+         // keeps the second read from happening when the first check fails.
+         if (stbi__get16be(s) == 8 && stbi__get16be(s) == 3) {
+            *comp = 4;
+            return 1;
+         }
+      }
+   }
+
+   stbi__rewind( s );
+   return 0;
+}
+#endif
+
+#ifndef STBI_NO_PIC
+// Reports the dimensions and component count of a Softimage PIC stream.
+//
+// Returns 1 and fills *x, *y and *comp (3 or 4) on success. Every failure
+// path rewinds the stream and returns 0, so that the next format probe in
+// stbi__info_main starts from the beginning of the data.
+static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int act_comp=0,num_packets=0,chained;
+   stbi__pic_packet packets[10];
+
+   // Magic number check.
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      stbi__rewind(s);
+      return 0;
+   }
+
+   stbi__skip(s, 88);
+
+   *x = stbi__get16be(s);
+   *y = stbi__get16be(s);
+   if (stbi__at_eof(s)) {
+      stbi__rewind( s);
+      return 0;
+   }
+   // Reject images with more than 2^28 pixels.
+   if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
+      stbi__rewind( s );
+      return 0;
+   }
+
+   stbi__skip(s, 8);
+
+   // Read the chain of channel packets; each one says whether another follows.
+   do {
+      stbi__pic_packet *packet;
+
+      // Too many packets for the local table. This path used to return
+      // without rewinding, unlike every other failure in this function.
+      if (num_packets==sizeof(packets)/sizeof(packets[0])) {
+         stbi__rewind( s );
+         return 0;
+      }
+
+      packet = &packets[num_packets++];
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s)) {
+          stbi__rewind( s );
+          return 0;
+      }
+      if (packet->size != 8) {
+          stbi__rewind( s );
+          return 0;
+      }
+   } while (chained);
+
+   // NOTE(review): bit 0x10 of the accumulated channel mask is treated as
+   // "has alpha" -- confirm against the PIC loader.
+   *comp = (act_comp & 0x10 ? 4 : 3);
+
+   return 1;
+}
+#endif
+
+// *************************************************************************************************
+// Portable Gray Map and Portable Pixel Map loader
+// by Ken Miller
+//
+// PGM: http://netpbm.sourceforge.net/doc/pgm.html
+// PPM: http://netpbm.sourceforge.net/doc/ppm.html
+//
+// Known limitations:
+//    Does not support ASCII image data (formats P2 and P3)
+//    Does not support 16-bit-per-channel
+// ('#' comments in the header section are skipped; see stbi__pnm_skip_whitespace)
+
+#ifndef STBI_NO_PNM
+
+// Returns 1 when the next two bytes are "P5" or "P6" (binary PGM / PPM).
+// On a match the two bytes stay consumed; otherwise the stream is rewound
+// and 0 is returned.
+static int      stbi__pnm_test(stbi__context *s)
+{
+   char magic   = (char) stbi__get8(s);
+   char variant = (char) stbi__get8(s);
+
+   if (magic == 'P' && (variant == '5' || variant == '6'))
+      return 1;
+
+   stbi__rewind( s );
+   return 0;
+}
+
+// Loads a binary PGM/PPM image.
+//
+// Parses the header with stbi__pnm_info (which fills s->img_x, s->img_y and
+// s->img_n), reads the raw pixel bytes, and converts to req_comp channels
+// when req_comp is non-zero and differs from the file's channel count.
+// Returns the pixel buffer, or NULL/0 on a bad header, an oversized image,
+// allocation failure or conversion failure.
+static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi_uc *out;
+   if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
+      return 0;
+   *x = s->img_x;
+   *y = s->img_y;
+   *comp = s->img_n;
+
+   // The byte count img_n * img_x * img_y is passed to both the allocator
+   // and the reader; if the product wraps, the buffer is far smaller than
+   // the dimensions reported to the caller. Reject anything whose size does
+   // not fit in a positive 32-bit int. img_n is 1 or 3 after a successful
+   // stbi__pnm_info, so only img_x needs the divide-by-zero guard.
+   if (s->img_x != 0 &&
+       (unsigned int) s->img_y > 0x7fffffffu / (unsigned int) s->img_n / (unsigned int) s->img_x)
+      return stbi__errpuc("too large", "PNM too large");
+
+   out = (stbi_uc *) stbi__malloc(s->img_n * s->img_x * s->img_y);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   // The result of stbi__getn is not checked here; a truncated file is not
+   // reported as an error by this function.
+   stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
+
+   if (req_comp && req_comp != s->img_n) {
+      out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+   return out;
+}
+
+// Returns 1 for the six ASCII whitespace characters (space, tab, newline,
+// vertical tab, form feed, carriage return) and 0 for anything else.
+static int      stbi__pnm_isspace(char c)
+{
+   switch (c) {
+      case ' ':
+      case '\t':
+      case '\n':
+      case '\v':
+      case '\f':
+      case '\r':
+         return 1;
+      default:
+         return 0;
+   }
+}
+
+// Advances past whitespace and '#' comments in a PNM header.
+//
+// *c is the current look-ahead character: it is examined first and is
+// overwritten with each newly read byte. On return *c holds the first
+// character that is neither whitespace nor part of a comment, unless the
+// end of the stream was reached first.
+static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
+{
+   for (;;) {
+      // Consume a run of whitespace.
+      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+         *c = (char) stbi__get8(s);
+
+      // Anything other than the start of a comment ends the skip.
+      if (stbi__at_eof(s) || *c != '#')
+         break;
+
+      // Consume the comment up to (not including) the line terminator; the
+      // terminator itself is whitespace and is eaten on the next iteration.
+      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
+         *c = (char) stbi__get8(s);
+   }
+}
+
+// Returns 1 when c is an ASCII decimal digit ('0'..'9'), 0 otherwise.
+static int      stbi__pnm_isdigit(char c)
+{
+   if (c < '0')
+      return 0;
+   if (c > '9')
+      return 0;
+   return 1;
+}
+
+// Parses an unsigned decimal integer from a PNM header.
+//
+// *c is the current look-ahead character; digits are consumed while the
+// stream is not at EOF, and *c is left holding the first non-digit read.
+// Returns the parsed value, or 0 when *c is not a digit on entry.
+//
+// A digit run too long for a 32-bit int is clamped to 0x7fffffff rather
+// than computed with `value*10 + digit`, which would be signed overflow
+// (undefined behaviour). All remaining digits are still consumed so the
+// header parse stays in sync.
+static int      stbi__pnm_getinteger(stbi__context *s, char *c)
+{
+   const int max_value = 0x7fffffff;   // assumes int is at least 32 bits wide
+   int value = 0;
+
+   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
+      int digit = *c - '0';
+      if (value > (max_value - digit) / 10)
+         value = max_value;            // would overflow: saturate
+      else
+         value = value*10 + digit;
+      *c = (char) stbi__get8(s);
+   }
+
+   return value;
+}
+
+// Parses a binary PGM/PPM header from the start of the stream.
+//
+// The stream is rewound first, so this can be called from any position.
+// On success returns 1 with *x = width, *y = height and *comp = 1 (P5) or
+// 3 (P6). Returns 0 after rewinding when the signature is not P5/P6 or when
+// the max value exceeds 255; the latter also records an error via stbi__err.
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int maxv;
+   char c, p, t;
+
+   stbi__rewind( s );
+
+   // Get identifier
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind( s );
+       return 0;
+   }
+
+   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
+
+   // c is the one-character look-ahead shared by the helpers below.
+   c = (char) stbi__get8(s);
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *x = stbi__pnm_getinteger(s, &c); // read width
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *y = stbi__pnm_getinteger(s, &c); // read height
+   stbi__pnm_skip_whitespace(s, &c);
+
+   maxv = stbi__pnm_getinteger(s, &c);  // read max value
+
+   if (maxv > 255) {
+      // Rewind like the signature-mismatch path does, so the format probes
+      // that stbi__info_main runs after this one start from the beginning
+      // of the stream rather than partway through this header.
+      stbi__rewind( s );
+      return stbi__err("max value > 255", "PPM image not 8-bit");
+   }
+   return 1;
+}
+#endif
+
+// Tries each compiled-in format's info routine in a fixed order and returns
+// 1 as soon as one of them recognises the stream (having filled *x, *y and
+// *comp). Returns 0 via stbi__err when no format matches.
+static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
+{
+#ifndef STBI_NO_JPEG
+   if (stbi__jpeg_info(s, x, y, comp))
+      return 1;
+#endif
+
+#ifndef STBI_NO_PNG
+   if (stbi__png_info(s, x, y, comp))
+      return 1;
+#endif
+
+#ifndef STBI_NO_GIF
+   if (stbi__gif_info(s, x, y, comp))
+      return 1;
+#endif
+
+#ifndef STBI_NO_BMP
+   if (stbi__bmp_info(s, x, y, comp))
+      return 1;
+#endif
+
+#ifndef STBI_NO_PSD
+   if (stbi__psd_info(s, x, y, comp))
+      return 1;
+#endif
+
+#ifndef STBI_NO_PIC
+   if (stbi__pic_info(s, x, y, comp))
+      return 1;
+#endif
+
+#ifndef STBI_NO_PNM
+   if (stbi__pnm_info(s, x, y, comp))
+      return 1;
+#endif
+
+#ifndef STBI_NO_HDR
+   if (stbi__hdr_info(s, x, y, comp))
+      return 1;
+#endif
+
+   // TGA is probed last because its detection test is unreliable.
+#ifndef STBI_NO_TGA
+   if (stbi__tga_info(s, x, y, comp))
+      return 1;
+#endif
+
+   return stbi__err("unknown image type", "Image not of any known type, or corrupt");
+}
+
+#ifndef STBI_NO_STDIO
+// Opens `filename` in binary mode and reports the image's width, height and
+// component count through x, y and comp without decoding it.
+// Returns the result of stbi_info_from_file, or the stbi__err result when
+// the file cannot be opened. The file is closed before returning.
+STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
+{
+    int status;
+    FILE *fp = stbi__fopen(filename, "rb");
+
+    if (fp) {
+        status = stbi_info_from_file(fp, x, y, comp);
+        fclose(fp);
+        return status;
+    }
+    return stbi__err("can't fopen", "Unable to open file");
+}
+
+// Reports image information for an already-open FILE. The file position at
+// entry is recorded with ftell and restored with fseek before returning, so
+// the caller can go on to load the image from the same handle.
+STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
+{
+   stbi__context ctx;
+   int found;
+   long start = ftell(f);
+
+   stbi__start_file(&ctx, f);
+   found = stbi__info_main(&ctx, x, y, comp);
+   fseek(f, start, SEEK_SET);
+
+   return found;
+}
+#endif // !STBI_NO_STDIO
+
+// Reports image information for an encoded image held in memory
+// (`len` bytes starting at `buffer`). Returns what stbi__info_main returns.
+STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
+{
+   stbi__context ctx;
+   int found;
+
+   stbi__start_mem(&ctx, buffer, len);
+   found = stbi__info_main(&ctx, x, y, comp);
+   return found;
+}
+
+// Reports image information for a stream read through user-supplied I/O
+// callbacks; `user` is handed to stbi__start_callbacks together with them.
+// Returns what stbi__info_main returns.
+STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
+{
+   stbi__context ctx;
+   int found;
+
+   // The const is cast away only to match stbi__start_callbacks' parameter.
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) c, user);
+   found = stbi__info_main(&ctx, x, y, comp);
+   return found;
+}
+
+#endif // STB_IMAGE_IMPLEMENTATION
+
+/*
+   revision history:
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
+                         support RGB-formatted JPEG
+                         read 16-bit PNGs (only as 8-bit)
+      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
+      2.09  (2016-01-16) allow comments in PNM files
+                         16-bit-per-pixel TGA (not bit-per-component)
+                         info() for TGA could break due to .hdr handling
+                         info() for BMP now shares code instead of sloppy parse
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
+                         limited 16-bpc PSD support
+                         #ifdef unused functions
+                         bug with < 92 byte PIC,PNM,HDR,TGA
+      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
+      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
+      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
+      2.03  (2015-04-12) extra corruption checking (mmozeiko)
+                         stbi_set_flip_vertically_on_load (nguillemot)
+                         fix NEON support; fix mingw support
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
+                         progressive JPEG (stb)
+                         PGM/PPM support (Ken Miller)
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         GIF bugfix -- seemingly never worked
+                         STBI_NO_*, STBI_ONLY_*
+      1.48  (2014-12-14) fix incorrectly-named assert()
+      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
+                         optimize PNG (ryg)
+                         fix bug in interlaced PNG with user-specified channel count (stb)
+      1.46  (2014-08-26)
+              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
+      1.45  (2014-08-16)
+              fix MSVC-ARM internal compiler error by wrapping malloc
+      1.44  (2014-08-07)
+              various warning fixes from Ronny Chevalier
+      1.43  (2014-07-15)
+              fix MSVC-only compiler problem in code changed in 1.42
+      1.42  (2014-07-09)
+              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
+              fixes to stbi__cleanup_jpeg path
+              added STBI_ASSERT to avoid requiring assert.h
+      1.41  (2014-06-25)
+              fix search&replace from 1.36 that messed up comments/error messages
+      1.40  (2014-06-22)
+              fix gcc struct-initialization warning
+      1.39  (2014-06-15)
+              fix to TGA optimization when req_comp != number of components in TGA;
+              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
+              add support for BMP version 5 (more ignored fields)
+      1.38  (2014-06-06)
+              suppress MSVC warnings on integer casts truncating values
+              fix accidental rename of 'skip' field of I/O
+      1.37  (2014-06-04)
+              remove duplicate typedef
+      1.36  (2014-06-03)
+              convert to header file single-file library
+              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
+      1.35  (2014-05-27)
+              various warnings
+              fix broken STBI_SIMD path
+              fix bug where stbi_load_from_file no longer left file pointer in correct place
+              fix broken non-easy path for 32-bit BMP (possibly never used)
+              TGA optimization by Arseny Kapoulkine
+      1.34  (unknown)
+              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
+      1.33  (2011-07-14)
+              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
+      1.32  (2011-07-13)
+              support for "info" function for all supported filetypes (SpartanJ)
+      1.31  (2011-06-20)
+              a few more leak fixes, bug in PNG handling (SpartanJ)
+      1.30  (2011-06-11)
+              added ability to load files via callbacks to accommodate custom input streams (Ben Wenger)
+              removed deprecated format-specific test/load functions
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
+              fix bug in GIF palette transparency (SpartanJ)
+      1.27  (2010-08-01)
+              cast-to-stbi_uc to fix warnings
+      1.26  (2010-07-24)
+              fix bug in file buffering for PNG reported by SpartanJ
+      1.25  (2010-07-17)
+              refix trans_data warning (Won Chun)
+      1.24  (2010-07-12)
+              perf improvements reading from files on platforms with lock-heavy fgetc()
+              minor perf improvements for jpeg
+              deprecated type-specific functions so we'll get feedback if they're needed
+              attempt to fix trans_data warning (Won Chun)
+      1.23    fixed bug in iPhone support
+      1.22  (2010-07-10)
+              removed image *writing* support
+              stbi_info support from Jetro Lauha
+              GIF support from Jean-Marc Lienher
+              iPhone PNG-extensions from James Brown
+              warning-fixes from Nicolas Schulz and Janez Zemva (i.e. Janez (U+017D)emva)
+      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
+      1.20    added support for Softimage PIC, by Tom Seddon
+      1.19    bug in interlaced PNG corruption check (found by ryg)
+      1.18  (2008-08-02)
+              fix a threading bug (local mutable static)
+      1.17    support interlaced PNG
+      1.16    major bugfix - stbi__convert_format converted one too many pixels
+      1.15    initialize some fields for thread safety
+      1.14    fix threadsafe conversion bug
+              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
+      1.13    threadsafe
+      1.12    const qualifiers in the API
+      1.11    Support installable IDCT, colorspace conversion routines
+      1.10    Fixes for 64-bit (don't use "unsigned long")
+              optimized upsampling by Fabian "ryg" Giesen
+      1.09    Fix format-conversion for PSD code (bad global variables!)
+      1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
+      1.07    attempt to fix C++ warning/errors again
+      1.06    attempt to fix C++ warning/errors again
+      1.05    fix TGA loading to return correct *comp and use good luminance calc
+      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
+      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
+      1.02    support for (subset of) HDR files, float interface for preferred access to them
+      1.01    fix bug: possible bug in handling right-side up bmps... not sure
+              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
+      1.00    interface to zlib that skips zlib header
+      0.99    correct handling of alpha in palette
+      0.98    TGA loader by lonesock; dynamically add loaders (untested)
+      0.97    jpeg errors on too large a file; also catch another malloc failure
+      0.96    fix detection of invalid v value - particleman@mollyrocket forum
+      0.95    during header scan, seek to markers in case of padding
+      0.94    STBI_NO_STDIO to disable stdio usage; rename all #defines the same
+      0.93    handle jpegtran output; verbose errors
+      0.92    read 4,8,16,24,32-bit BMP files of several formats
+      0.91    output 24-bit Windows 3.0 BMP files
+      0.90    fix a few more warnings; bump version number to approach 1.0
+      0.61    bugfixes due to Marc LeBlanc, Christopher Lloyd
+      0.60    fix compiling as c++
+      0.59    fix warnings: merge Dave Moore's -Wall fixes
+      0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
+      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
+      0.56    fix bug: zlib uncompressed mode len vs. nlen
+      0.55    fix bug: restart_interval not initialized to 0
+      0.54    allow NULL for 'int *comp'
+      0.53    fix bug in png 3->4; speedup png decoding
+      0.52    png handles req_comp=3,4 directly; minor cleanup; jpeg comments
+      0.51    obey req_comp requests, 1-component jpegs return as 1-component,
+              on 'test' only check type, not whether we support this variant
+      0.50  (2006-11-19)
+              first released version
+*/

+ 2 - 1
panda/src/putil/bam.h

@@ -32,7 +32,7 @@ static const unsigned short _bam_major_ver = 6;
 // Bumped to major version 6 on 2006-02-11 to factor out PandaNode::CData.
 // Bumped to major version 6 on 2006-02-11 to factor out PandaNode::CData.
 
 
 static const unsigned short _bam_first_minor_ver = 14;
 static const unsigned short _bam_first_minor_ver = 14;
-static const unsigned short _bam_minor_ver = 41;
+static const unsigned short _bam_minor_ver = 42;
 // Bumped to minor version 14 on 2007-12-19 to change default ColorAttrib.
 // Bumped to minor version 14 on 2007-12-19 to change default ColorAttrib.
 // Bumped to minor version 15 on 2008-04-09 to add TextureAttrib::_implicit_sort.
 // Bumped to minor version 15 on 2008-04-09 to add TextureAttrib::_implicit_sort.
 // Bumped to minor version 16 on 2008-05-13 to add Texture::_quality_level.
 // Bumped to minor version 16 on 2008-05-13 to add Texture::_quality_level.
@@ -61,5 +61,6 @@ static const unsigned short _bam_minor_ver = 41;
 // Bumped to minor version 39 on 2016-01-09 to change lights and materials.
 // Bumped to minor version 39 on 2016-01-09 to change lights and materials.
 // Bumped to minor version 40 on 2016-01-11 to make NodePaths writable.
 // Bumped to minor version 40 on 2016-01-11 to make NodePaths writable.
 // Bumped to minor version 41 on 2016-03-02 to change LensNode, Lens, and Camera.
 // Bumped to minor version 41 on 2016-03-02 to change LensNode, Lens, and Camera.
+// Bumped to minor version 42 on 2016-04-08 to expand ColorBlendAttrib.
 
 
 #endif
 #endif

+ 11 - 0
panda/src/putil/bamReader.I

@@ -159,6 +159,17 @@ get_file_pos() {
   return _source->get_file_pos();
   return _source->get_file_pos();
 }
 }
 
 
+/**
+ * Registers a factory function that is called when an object of the given
+ * type is encountered within the .bam stream.
+ *
+ * This is a convenience wrapper: it forwards all three arguments unchanged to
+ * register_factory() on the factory object returned by get_factory().
+ *
+ * @param handle the type whose objects the function should create.
+ * @param func the creation function to register for that type.
+ * @param user_data an optional pointer to be passed along to the function.
+ */
+void BamReader::
+register_factory(TypeHandle handle, WritableFactory::CreateFunc *func, void *user_data) {
+  get_factory()->register_factory(handle, func, user_data);
+}
+
 /**
 /**
  * Returns the global WritableFactory for generating TypedWritable objects
  * Returns the global WritableFactory for generating TypedWritable objects
  */
  */

Some files were not shown because too many files changed in this diff