Browse Source

Merge branch 'master' into webgl-port

rdb 9 years ago
parent
commit
40cc045a52
100 changed files with 13923 additions and 1828 deletions
  1. 1 1
      direct/src/fsm/FSM.py
  2. 24 25
      direct/src/gui/DirectScrolledList.py
  3. 1 2
      direct/src/gui/OnscreenText.py
  4. 16 16
      direct/src/interval/MetaInterval.py
  5. 1 0
      direct/src/showbase/ShowBaseGlobal.py
  6. 0 374
      direct/src/showbase/pandaSqueezeTool.py
  7. 0 57
      direct/src/showbase/pandaSqueezer.py
  8. 17 3
      dtool/src/dtoolbase/pvector.h
  9. 3 2
      dtool/src/dtoolbase/typeRegistry.h
  10. 31 4
      dtool/src/interrogate/interfaceMakerPythonNative.cxx
  11. 4 1
      dtool/src/interrogate/interfaceMakerPythonNative.h
  12. 1 0
      dtool/src/parser-inc/sys/time.h
  13. 0 6
      makepanda/installer.nsi
  14. 23 12
      makepanda/makepanda.py
  15. 49 19
      makepanda/makepandacore.py
  16. 1 1
      panda/src/bullet/bulletContactResult.I
  17. 1 1
      panda/src/bullet/bulletContactResult.h
  18. 1 1
      panda/src/bullet/bulletHeightfieldShape.I
  19. 4 0
      panda/src/bullet/bulletTriangleMesh.cxx
  20. 10 4
      panda/src/chan/animChannelMatrixXfmTable.cxx
  21. 10 4
      panda/src/chan/animChannelScalarTable.cxx
  22. 11 0
      panda/src/chan/config_chan.cxx
  23. 9 9
      panda/src/display/drawableRegion.I
  24. 10 13
      panda/src/display/drawableRegion.cxx
  25. 1 1
      panda/src/display/drawableRegion.h
  26. 7 0
      panda/src/display/frameBufferProperties.cxx
  27. 3 0
      panda/src/display/graphicsEngine.cxx
  28. 9 0
      panda/src/display/graphicsStateGuardian.I
  29. 5 1
      panda/src/display/graphicsStateGuardian.cxx
  30. 3 0
      panda/src/display/graphicsStateGuardian.h
  31. 12 0
      panda/src/downloader/socketStream.h
  32. 5 0
      panda/src/dxgsg9/config_dxgsg9.cxx
  33. 55 31
      panda/src/dxgsg9/dxGraphicsStateGuardian9.cxx
  34. 1 0
      panda/src/dxgsg9/dxGraphicsStateGuardian9.h
  35. 53 0
      panda/src/express/zStreamBuf.cxx
  36. 3 0
      panda/src/express/zStreamBuf.h
  37. 15 5
      panda/src/gles2gsg/gles2gsg.h
  38. 760 1039
      panda/src/gles2gsg/panda_esgl2ext.h
  39. 4 5
      panda/src/glstuff/glGraphicsBuffer_src.cxx
  40. 251 65
      panda/src/glstuff/glGraphicsStateGuardian_src.cxx
  41. 6 0
      panda/src/glstuff/glGraphicsStateGuardian_src.h
  42. 9 8
      panda/src/glstuff/glShaderContext_src.cxx
  43. 2 1
      panda/src/gobj/geomPrimitive.cxx
  44. 9 0
      panda/src/gobj/geomVertexFormat.I
  45. 2 4
      panda/src/gobj/geomVertexFormat.cxx
  46. 4 0
      panda/src/gobj/geomVertexFormat.h
  47. 5 3
      panda/src/gobj/shader.cxx
  48. 174 78
      panda/src/gobj/texture.cxx
  49. 2 0
      panda/src/grutil/config_grutil.cxx
  50. 1 0
      panda/src/grutil/p3grutil_composite1.cxx
  51. 191 0
      panda/src/grutil/shaderTerrainMesh.I
  52. 715 0
      panda/src/grutil/shaderTerrainMesh.cxx
  53. 205 0
      panda/src/grutil/shaderTerrainMesh.h
  54. 5 0
      panda/src/movies/config_movies.cxx
  55. 2976 0
      panda/src/movies/dr_flac.h
  56. 12 0
      panda/src/movies/flacAudio.I
  57. 64 0
      panda/src/movies/flacAudio.cxx
  58. 54 0
      panda/src/movies/flacAudio.h
  59. 12 0
      panda/src/movies/flacAudioCursor.I
  60. 120 0
      panda/src/movies/flacAudioCursor.cxx
  61. 65 0
      panda/src/movies/flacAudioCursor.h
  62. 2 0
      panda/src/movies/p3movies_composite1.cxx
  63. 4 0
      panda/src/pgraph/alphaTestAttrib.h
  64. 5 0
      panda/src/pgraph/antialiasAttrib.h
  65. 3 0
      panda/src/pgraph/audioVolumeAttrib.h
  66. 3 0
      panda/src/pgraph/auxBitplaneAttrib.h
  67. 8 3
      panda/src/pgraph/camera.cxx
  68. 4 0
      panda/src/pgraph/colorAttrib.h
  69. 44 15
      panda/src/pgraph/colorBlendAttrib.I
  70. 67 5
      panda/src/pgraph/colorBlendAttrib.cxx
  71. 36 5
      panda/src/pgraph/colorBlendAttrib.h
  72. 3 0
      panda/src/pgraph/colorScaleAttrib.h
  73. 3 0
      panda/src/pgraph/colorWriteAttrib.h
  74. 4 0
      panda/src/pgraph/cullBinAttrib.h
  75. 5 0
      panda/src/pgraph/cullFaceAttrib.h
  76. 5 0
      panda/src/pgraph/depthOffsetAttrib.h
  77. 3 0
      panda/src/pgraph/depthTestAttrib.h
  78. 3 0
      panda/src/pgraph/depthWriteAttrib.h
  79. 3 0
      panda/src/pgraph/fogAttrib.h
  80. 3 0
      panda/src/pgraph/lightRampAttrib.h
  81. 3 0
      panda/src/pgraph/materialAttrib.h
  82. 6 1
      panda/src/pgraph/renderModeAttrib.h
  83. 1 0
      panda/src/pgraph/rescaleNormalAttrib.h
  84. 3 0
      panda/src/pgraph/scissorAttrib.h
  85. 1 0
      panda/src/pgraph/shadeModelAttrib.h
  86. 4 0
      panda/src/pgraph/shaderAttrib.h
  87. 1 0
      panda/src/pgraph/transparencyAttrib.h
  88. 3 0
      panda/src/pgraphnodes/config_pgraphnodes.cxx
  89. 2 0
      panda/src/pgraphnodes/p3pgraphnodes_composite1.cxx
  90. 1 2
      panda/src/pgraphnodes/p3pgraphnodes_composite2.cxx
  91. 48 0
      panda/src/pgraphnodes/sphereLight.I
  92. 146 0
      panda/src/pgraphnodes/sphereLight.cxx
  93. 90 0
      panda/src/pgraphnodes/sphereLight.h
  94. 7 0
      panda/src/pnmimagetypes/config_pnmimagetypes.cxx
  95. 1 0
      panda/src/pnmimagetypes/p3pnmimagetypes_composite2.cxx
  96. 509 0
      panda/src/pnmimagetypes/pnmFileTypeStbImage.cxx
  97. 73 0
      panda/src/pnmimagetypes/pnmFileTypeStbImage.h
  98. 6755 0
      panda/src/pnmimagetypes/stb_image.h
  99. 2 1
      panda/src/putil/bam.h
  100. 11 0
      panda/src/putil/bamReader.I

+ 1 - 1
direct/src/fsm/FSM.py

@@ -310,7 +310,7 @@ class FSM(DirectObject):
                 self.name, request, str(args)[1:]))
 
             filter = self.getCurrentFilter()
-            result = list(filter(request, args))
+            result = filter(request, args)
             if result:
                 if isinstance(result, str):
                     # If the return value is a string, it's just the name

+ 24 - 25
direct/src/gui/DirectScrolledList.py

@@ -8,7 +8,6 @@ from direct.directnotify import DirectNotifyGlobal
 from direct.task.Task import Task
 from .DirectFrame import *
 from .DirectButton import *
-import types
 
 
 class DirectScrolledListItem(DirectButton):
@@ -49,7 +48,7 @@ class DirectScrolledList(DirectFrame):
     def __init__(self, parent = None, **kw):
         assert self.notify.debugStateCall(self)
         self.index = 0
-        self.forceHeight = None
+        self.__forceHeight = None
 
         """ If one were to want a scrolledList that makes and adds its items
            as needed, simply pass in an items list of strings (type 'str')
@@ -115,12 +114,12 @@ class DirectScrolledList(DirectFrame):
 
     def setForceHeight(self):
         assert self.notify.debugStateCall(self)
-        self.forceHeight = self["forceHeight"]
+        self.__forceHeight = self["forceHeight"]
 
     def recordMaxHeight(self):
         assert self.notify.debugStateCall(self)
-        if self.forceHeight is not None:
-            self.maxHeight = self.forceHeight
+        if self.__forceHeight is not None:
+            self.maxHeight = self.__forceHeight
         else:
             self.maxHeight = 0.0
             for item in self["items"]:
@@ -130,24 +129,24 @@ class DirectScrolledList(DirectFrame):
     def setScrollSpeed(self):
         assert self.notify.debugStateCall(self)
         # Items per second to move
-        self.scrollSpeed = self["scrollSpeed"]
-        if self.scrollSpeed <= 0:
-            self.scrollSpeed = 1
+        self.__scrollSpeed = self["scrollSpeed"]
+        if self.__scrollSpeed <= 0:
+            self.__scrollSpeed = 1
 
     def setNumItemsVisible(self):
         assert self.notify.debugStateCall(self)
         # Items per second to move
-        self.numItemsVisible = self["numItemsVisible"]
+        self.__numItemsVisible = self["numItemsVisible"]
 
     def destroy(self):
         assert self.notify.debugStateCall(self)
         taskMgr.remove(self.taskName("scroll"))
         if hasattr(self, "currentSelected"):
             del self.currentSelected
-        if self.incButtonCallback:
-            self.incButtonCallback = None
-        if self.decButtonCallback:
-            self.decButtonCallback = None
+        if self.__incButtonCallback:
+            self.__incButtonCallback = None
+        if self.__decButtonCallback:
+            self.__decButtonCallback = None
         self.incButton.destroy()
         self.decButton.destroy()
         DirectFrame.destroy(self)
@@ -169,10 +168,10 @@ class DirectScrolledList(DirectFrame):
         #for i in range(len(self["items"])):
         #    print "buttontext[", i,"]", self["items"][i]["text"]
 
-        if(len(self["items"])==0):
+        if len(self["items"]) == 0:
             return 0
 
-        if(type(self["items"][0])!=types.InstanceType):
+        if type(self["items"][0]) == type(''):
             self.notify.warning("getItemIndexForItemID: cant find itemID for non-class list items!")
             return 0
 
@@ -309,7 +308,7 @@ class DirectScrolledList(DirectFrame):
     def __incButtonDown(self, event):
         assert self.notify.debugStateCall(self)
         task = Task(self.__scrollByTask)
-        task.setDelay(1.0 / self.scrollSpeed)
+        task.setDelay(1.0 / self.__scrollSpeed)
         task.prevTime = 0.0
         task.delta = 1
         taskName = self.taskName("scroll")
@@ -317,13 +316,13 @@ class DirectScrolledList(DirectFrame):
         taskMgr.add(task, taskName)
         self.scrollBy(task.delta)
         messenger.send('wakeup')
-        if self.incButtonCallback:
-            self.incButtonCallback()
+        if self.__incButtonCallback:
+            self.__incButtonCallback()
 
     def __decButtonDown(self, event):
         assert self.notify.debugStateCall(self)
         task = Task(self.__scrollByTask)
-        task.setDelay(1.0 / self.scrollSpeed)
+        task.setDelay(1.0 / self.__scrollSpeed)
         task.prevTime = 0.0
         task.delta = -1
         taskName = self.taskName("scroll")
@@ -331,8 +330,8 @@ class DirectScrolledList(DirectFrame):
         taskMgr.add(task, taskName)
         self.scrollBy(task.delta)
         messenger.send('wakeup')
-        if self.decButtonCallback:
-            self.decButtonCallback()
+        if self.__decButtonCallback:
+            self.__decButtonCallback()
 
     def __buttonUp(self, event):
         assert self.notify.debugStateCall(self)
@@ -345,7 +344,7 @@ class DirectScrolledList(DirectFrame):
         Add this string and extraArg to the list
         """
         assert self.notify.debugStateCall(self)
-        if(type(item) == types.InstanceType):
+        if type(item) != type(''):
             # cant add attribs to non-classes (like strings & ints)
             item.itemID = self.nextItemID
             self.nextItemID += 1
@@ -354,7 +353,7 @@ class DirectScrolledList(DirectFrame):
             item.reparentTo(self.itemFrame)
         if refresh:
             self.refresh()
-        if(type(item) == types.InstanceType):
+        if type(item) != type(''):
             return item.itemID  # to pass to scrollToItemID
 
     def removeItem(self, item, refresh=1):
@@ -466,11 +465,11 @@ class DirectScrolledList(DirectFrame):
 
     def setIncButtonCallback(self):
         assert self.notify.debugStateCall(self)
-        self.incButtonCallback = self["incButtonCallback"]
+        self.__incButtonCallback = self["incButtonCallback"]
 
     def setDecButtonCallback(self):
         assert self.notify.debugStateCall(self)
-        self.decButtonCallback = self["decButtonCallback"]
+        self.__decButtonCallback = self["decButtonCallback"]
 
 
 """

+ 1 - 2
direct/src/gui/OnscreenText.py

@@ -4,7 +4,6 @@ __all__ = ['OnscreenText', 'Plain', 'ScreenTitle', 'ScreenPrompt', 'NameConfirm'
 
 from panda3d.core import *
 from . import DirectGuiGlobals as DGG
-from direct.showbase.DirectObject import DirectObject
 import sys
 
 ## These are the styles of text we might commonly see.  They set the
@@ -17,7 +16,7 @@ ScreenPrompt = 3
 NameConfirm = 4
 BlackOnWhite = 5
 
-class OnscreenText(DirectObject, NodePath):
+class OnscreenText(NodePath):
 
     def __init__(self, text = '',
                  style = Plain,

+ 16 - 16
direct/src/interval/MetaInterval.py

@@ -268,7 +268,7 @@ class MetaInterval(CMetaInterval):
             self.addInterval(ival, maxDuration - ival.getDuration(), TRACK_START)
         self.popLevel(duration)
 
-    def addTrack(self, list, name, relTime, relTo, duration):
+    def addTrack(self, trackList, name, relTime, relTo, duration):
         # Adds a "track list".  This is a list of tuples of the form:
         #
         #   (<delay>, <Interval>,
@@ -281,19 +281,19 @@ class MetaInterval(CMetaInterval):
         # (TRACK_START).  If the relative code is omitted, the default
         # is TRACK_START.
         self.pushLevel(name, relTime, relTo)
-        for tuple in list:
-            if isinstance(tuple, tuple) or \
-               isinstance(tuple, list):
-                relTime = tuple[0]
-                ival = tuple[1]
-                if len(tuple) >= 3:
-                    relTo = tuple[2]
+        for tupleObj in trackList:
+            if isinstance(tupleObj, tuple) or \
+               isinstance(tupleObj, list):
+                relTime = tupleObj[0]
+                ival = tupleObj[1]
+                if len(tupleObj) >= 3:
+                    relTo = tupleObj[2]
                 else:
                     relTo = TRACK_START
                 self.addInterval(ival, relTime, relTo)
 
             else:
-                self.notify.error("Not a tuple in Track: %s" % (tuple,))
+                self.notify.error("Not a tuple in Track: %s" % (tupleObj,))
         self.popLevel(duration)
 
     def addInterval(self, ival, relTime, relTo):
@@ -593,22 +593,22 @@ class Track(MetaInterval):
         meta.addTrack(self.ivals, self.getName(),
                       relTime, relTo, self.phonyDuration)
 
-    def validateComponent(self, tuple):
+    def validateComponent(self, tupleObj):
         # This is called only in debug mode to verify that the
         # indicated component added to the MetaInterval is appropriate
         # to this type of MetaInterval.  In most cases except Track,
         # this is the same as asking that the component is itself an
         # Interval.
 
-        if not (isinstance(tuple, tuple) or \
-                isinstance(tuple, list)):
+        if not (isinstance(tupleObj, tuple) or \
+                isinstance(tupleObj, list)):
             # It's not a tuple.
             return 0
 
-        relTime = tuple[0]
-        ival = tuple[1]
-        if len(tuple) >= 3:
-            relTo = tuple[2]
+        relTime = tupleObj[0]
+        ival = tupleObj[1]
+        if len(tupleObj) >= 3:
+            relTo = tupleObj[2]
         else:
             relTo = TRACK_START
 

+ 1 - 0
direct/src/showbase/ShowBaseGlobal.py

@@ -20,6 +20,7 @@ def inspect(anObject):
     Inspector = importlib.import_module('direct.tkpanels.Inspector')
     return Inspector.inspect(anObject)
 
+import sys
 if sys.version_info >= (3, 0):
     import builtins
 else:

+ 0 - 374
direct/src/showbase/pandaSqueezeTool.py

@@ -1,374 +0,0 @@
-"""Undocumented Module"""
-
-__all__ = ['usage', 'Squeezer', 'Loader', 'boot', 'open', 'explode', 'getloader', 'squeeze', 'searchPath']
-
-#!/usr/bin/env python
-#
-# SQUEEZE
-#
-# squeeze a python program
-#
-# installation:
-# - use this script as is, or squeeze it using the following command:
-#
-# python squeezeTool.py -1su -o squeeze -b squeezeTool squeezeTool.py
-#
-# notes:
-# - this is pretty messy.  make sure to test everything carefully
-#   if you change anything
-#
-# - the name "squeeze" is taken from an ABC800 utility which did
-#   about the same thing with Basic II bytecodes.
-#
-# history:
-# 1.0   97-04-22 fl     Created
-# 1.1   97-05-25 fl     Added base64 embedding option (-1)
-#       97-05-25 fl     Check for broken package file
-# 1.2   97-05-26 fl     Support uncompressed packages (-u)
-# 1.3   97-05-27 fl     Check byte code magic, eliminated StringIO, etc.
-# 1.4   97-06-04 fl     Removed last bits of white space, removed try/except
-# 1.5   97-06-17 fl     Added squeeze archive capabilities (-x)
-# 1.6   98-05-04 fl     Minor fixes in preparation for public source release
-#
-# reviews:
-#       "Fredrik Lundh is a friggin genius"
-#       -- Aaron Watters, author of 'Internet Programming with Python'
-#
-#       "I agree ... this is a friggin Good Thing"
-#       -- Paul Everitt, Digital Creations
-#
-# Copyright (c) 1997 by Fredrik Lundh.
-# Copyright (c) 1997-1998 by Secret Labs AB
-#
-# info@pythonware.com
-# http://www.pythonware.com
-#
-# --------------------------------------------------------------------
-# Permission to use, copy, modify, and distribute this software and
-# its associated documentation for any purpose and without fee is
-# hereby granted.  This software is provided as is.
-# --------------------------------------------------------------------
-
-VERSION = "1.6/98-05-04"
-MAGIC   = "[PANDASQUEEZE]"
-
-import base64, imp, marshal, os, sys
-
-# --------------------------------------------------------------------
-# usage
-
-def usage():
-        print("\nSQUEEZE", VERSION, "(c) 1997-1998 by Secret Labs AB")
-        print("""\
-Convert a Python application to a compressed module package.
-
-Usage: squeeze [-1ux] -o app [-b start] modules... [-d files...]
-
-This utility creates a compressed package file named "app.pyz", which
-contains the given module files.  It also creates a bootstrap script
-named "app.py", which loads the package and imports the given "start"
-module to get things going.  Example:
-
-        squeeze -o app -b appMain app*.py
-
-The -1 option tells squeeze to put the package file inside the boot-
-strap script using base64 encoding.  The result is a single text file
-containing the full application.
-
-The -u option disables compression.  Otherwise, the package will be
-compressed using zlib, and the user needs zlib to run the resulting
-application.
-
-The -d option can be used to put additional files in the package file.
-You can access these files via "__main__.open(filename)" (returns a
-StringIO file object).
-
-The -x option can be used with -d to create a self-extracting archive,
-instead of a package.  When the resulting script is executed, the
-data files are extracted.  Omit the -b option in this case.
-""")
-        sys.exit(1)
-
-
-# --------------------------------------------------------------------
-# squeezer -- collect squeezed modules
-
-class Squeezer:
-
-        def __init__(self):
-
-                self.rawbytes = self.bytes = 0
-                self.modules = {}
-
-        def addmodule(self, file):
-
-                if file[-1] == "c":
-                        file = file[:-1]
-
-                m = os.path.splitext(os.path.split(file)[1])[0]
-
-                # read sourcefile
-                f = open(file)
-                codestring = f.read()
-                f.close()
-
-                # dump to file
-                self.modules[m] = compile(codestring, m, "exec")
-
-        def adddata(self, file):
-
-                self.modules["+"+file] = open(file, "rb").read()
-
-        def getarchive(self):
-
-                # marshal our module dictionary
-                data = marshal.dumps(self.modules)
-                self.rawbytes = len(data)
-
-                # return (compressed) dictionary
-                data = zlib.compress(data, 9)
-                self.bytes = len(data)
-
-                return data
-
-        def getstatus(self):
-                return self.bytes, self.rawbytes
-
-
-# --------------------------------------------------------------------
-# loader (used in bootstrap code)
-
-loader = """
-import ihooks
-
-PYZ_MODULE = 64
-
-class Loader(ihooks.ModuleLoader):
-
-    def __init__(self, modules):
-        self.__modules = modules
-        return ihooks.ModuleLoader.__init__(self)
-
-    def find_module(self, name, path = None):
-        try:
-            self.__modules[name]
-            return None, None, (None, None, PYZ_MODULE)
-        except KeyError:
-            return ihooks.ModuleLoader.find_module(self, name, path)
-
-    def load_module(self, name, stuff):
-        file, filename, (suff, mode, type) = stuff
-        if type != PYZ_MODULE:
-            return ihooks.ModuleLoader.load_module(self, name, stuff)
-        #print "PYZ:", "import", name
-        basename = name.split('.')[-1]
-        code = self.__modules[basename]
-        del self.__modules[basename] # no need to keep this one around
-        m = self.hooks.add_module(name)
-        m.__file__ = filename
-        exec code in m.__dict__
-        return m
-
-def boot(name, fp, size, offset = 0):
-
-    global data
-
-    try:
-        import %(modules)s
-    except ImportError:
-        #print "PYZ:", "failed to load marshal and zlib libraries"
-        return # cannot boot from PYZ file
-    #print "PYZ:", "boot from", name+".PYZ"
-
-    # load archive and install import hook
-    if offset:
-        data = fp[offset:]
-    else:
-        data = fp.read(size)
-        fp.close()
-
-    if len(data) != size:
-        raise IOError, "package is truncated"
-
-    data = marshal.loads(%(data)s)
-
-    ihooks.install(ihooks.ModuleImporter(Loader(data)))
-"""
-
-loaderopen = """
-
-def open(name):
-    from io import StringIO
-    try:
-        return StringIO(data["+"+name])
-    except KeyError:
-        raise IOError, (0, "no such file")
-"""
-
-loaderexplode = """
-
-def explode():
-    for k, v in data.items():
-        if k[0] == "+":
-            try:
-                open(k[1:], "wb").write(v)
-                print k[1:], "extracted ok"
-            except IOError, v:
-                print k[1:], "failed:", "IOError", v
-
-"""
-
-def getloader(data, package):
-
-        s = loader
-
-        if data:
-                if explode:
-                        s = s + loaderexplode
-                else:
-                        s = s + loaderopen
-
-        dict = {
-                "modules": "marshal, zlib",
-                "data":    "zlib.decompress(data)",
-                }
-
-        s = s % dict
-
-        return marshal.dumps(compile(s, "<package>", "exec"))
-
-
-# --------------------------------------------------------------------
-# Main
-# --------------------------------------------------------------------
-
-#
-# parse options
-
-import sys
-import zlib
-
-embed = 0
-explode = 0
-
-def squeeze(app, start, filelist, outputDir):
-        localMagic = MAGIC
-        data = None
-
-        bootstrap = os.path.join(outputDir, app + ".py")
-        archiveBase = app + ".pyz"
-        archive   = os.path.join(outputDir, archiveBase)
-
-        archiveid = app
-
-        #
-        # avoid overwriting files not generated by squeeze
-
-        try:
-                fp = open(bootstrap)
-                s = fp.readline()
-                s.index(MAGIC)
-        except IOError:
-                pass
-        except ValueError:
-                print("%s was not created by squeeze.  You have to manually" % (bootstrap))
-                print("remove the file to proceed.")
-                sys.exit(1)
-
-        #
-        # collect modules
-
-        sq = Squeezer()
-        for file in filelist:
-                # print 'addmodule:', file
-                sq.addmodule(file)
-
-        package = sq.getarchive()
-        size = len(package)
-
-        #
-        # get loader
-
-        loader = getloader(data, package)
-
-        zbegin, zend = "zlib.decompress(", ")"
-        loader = zlib.compress(loader, 9)
-
-        loaderlen = len(loader)
-
-        magic = repr(imp.get_magic())
-        version = sys.version.split()[0]
-
-        #
-        # generate script and package files
-
-        if embed:
-
-                # embedded archive
-                data = base64.encodestring(loader + package)
-
-                fp = open(bootstrap, "w")
-                fp.write('''\
-#%(localMagic)s %(archiveid)s
-import ihooks, zlib, base64, marshal
-s=base64.decodestring("""
-%(data)s""")
-exec marshal.loads(%(zbegin)ss[:%(loaderlen)d]%(zend)s)
-boot("%(app)s", s, %(size)d, %(loaderlen)d)
-exec "import %(start)s"
-''' % locals())
-                bytes = fp.tell()
-
-        else:
-
-                # separate archive file
-
-                fp = open(archive, "wb")
-
-                fp.write(loader)
-                fp.write(package)
-
-                bytes = fp.tell()
-                fp.close()
-                #
-                # create bootstrap code
-
-                fp = open(bootstrap, "w")
-                # Note: David Rose adjusted the following to be panda-specific.
-                fp.write("""\
-#%(localMagic)s %(archiveid)s
-import ihooks, zlib, marshal, os, sys
-
-def searchPath(filename):
-  # Look along panda3d.__path__ for the indicated filename.  Returns
-  # the located pathname, or None if the filename is not found.
-  import panda3d
-
-  for dir in panda3d.__path__:
-    pathname = os.path.join(dir, filename)
-    if os.path.exists(pathname):
-      return pathname
-
-  return None
-
-# Look for %(archiveBase)s along panda3d.__path__.
-archiveName = "%(archiveBase)s"
-archivePath = searchPath(archiveName)
-if archivePath == None:
-  raise ImportError, "Could not locate panda3d.%%s." %% (archiveName)
-
-f=open(archivePath,"rb")
-exec marshal.loads(%(zbegin)sf.read(%(loaderlen)d)%(zend)s)
-boot("%(app)s", f, %(size)d)
-exec "from %(start)s import *"
-#exec "run()"
-""" % locals())
-                bytes = bytes + fp.tell()
-                fp.close()
-
-        #
-        # show statistics
-
-        dummy, rawbytes = sq.getstatus()
-
-        print("squeezed %s to %s (%d%%)" % (rawbytes, bytes, bytes * 100 / rawbytes))

+ 0 - 57
direct/src/showbase/pandaSqueezer.py

@@ -1,57 +0,0 @@
-"""Undocumented Module"""
-
-__all__ = []
-
-import os
-import sys
-import getopt
-from . import pandaSqueezeTool
-
-# Assumption: We will be squeezing the files from the current directory or the -d directory.
-
-if __name__ == "__main__":
-    try:
-        opts, pargs = getopt.getopt(sys.argv[1:], 'Od:')
-    except Exception as e:
-        # User passed in a bad option, print the error and the help, then exit
-        print(e)
-        print('Usage: pass in -O for optimized')
-        print('       pass in -d directory')
-        sys.exit()
-
-    fOptimized = 0
-    # Store the option values into our variables
-    for opt in opts:
-        flag, value = opt
-        if (flag == '-O'):
-            fOptimized = 1
-            print('Squeezing pyo files')
-        elif (flag == '-d'):
-            os.chdir(value)
-
-    def getSqueezeableFiles():
-        fileList = os.listdir(".")
-        newFileList = []
-        if fOptimized:
-            targetFileExtension = ".pyo"
-        else:
-            targetFileExtension = ".pyc"
-        for i in fileList:
-            base, ext = os.path.splitext(i)
-            if (ext == ".py"):
-                newFileList.append(i)
-        return newFileList
-
-    def squeezePandaFiles():
-        l = getSqueezeableFiles()
-        pandaSqueezeTool.squeeze("PandaModules", "PandaModulesUnsqueezed", l)
-
-        # Clean up the source files now that they've been squeezed.  If
-        # you don't like this behavior (e.g. if you want to inspect the
-        # generated files), use genPyCode -n to avoid squeezing
-        # altogether.
-        for i in l:
-            os.unlink(i)
-
-
-    squeezePandaFiles()

+ 17 - 3
dtool/src/dtoolbase/pvector.h

@@ -39,11 +39,25 @@ public:
   typedef vector<Type, allocator> base_class;
   typedef TYPENAME base_class::size_type size_type;
 
-  pvector(TypeHandle type_handle = pvector_type_handle) : base_class(allocator(type_handle)) { }
+  explicit pvector(TypeHandle type_handle = pvector_type_handle) : base_class(allocator(type_handle)) { }
   pvector(const pvector<Type> &copy) : base_class(copy) { }
-  pvector(size_type n, TypeHandle type_handle = pvector_type_handle) : base_class(n, Type(), allocator(type_handle)) { }
-  pvector(size_type n, const Type &value, TypeHandle type_handle = pvector_type_handle) : base_class(n, value, allocator(type_handle)) { }
+  explicit pvector(size_type n, TypeHandle type_handle = pvector_type_handle) : base_class(n, Type(), allocator(type_handle)) { }
+  explicit pvector(size_type n, const Type &value, TypeHandle type_handle = pvector_type_handle) : base_class(n, value, allocator(type_handle)) { }
   pvector(const Type *begin, const Type *end, TypeHandle type_handle = pvector_type_handle) : base_class(begin, end, allocator(type_handle)) { }
+
+#ifdef USE_MOVE_SEMANTICS
+  pvector(pvector<Type> &&from) NOEXCEPT : base_class(move(from)) {};
+
+  pvector<Type> &operator =(pvector<Type> &&from) NOEXCEPT {
+    base_class::operator =(move(from));
+    return *this;
+  }
+#endif
+
+  pvector<Type> &operator =(const pvector<Type> &copy) {
+    base_class::operator =(copy);
+    return *this;
+  }
 };
 
 #endif  // USE_STL_ALLOCATOR

+ 3 - 2
dtool/src/dtoolbase/typeRegistry.h

@@ -37,14 +37,15 @@ class EXPCL_DTOOL TypeRegistry : public MemoryBase {
 public:
   // User code shouldn't generally need to call TypeRegistry::register_type()
   // or record_derivation() directly; instead, use the register_type
-  // convenience function, defined below.
+  // convenience function, defined in register_type.h.
   bool register_type(TypeHandle &type_handle, const string &name);
+
+PUBLISHED:
   TypeHandle register_dynamic_type(const string &name);
 
   void record_derivation(TypeHandle child, TypeHandle parent);
   void record_alternate_name(TypeHandle type, const string &name);
 
-PUBLISHED:
   TypeHandle find_type(const string &name) const;
   TypeHandle find_type_by_id(int id) const;
 

+ 31 - 4
dtool/src/interrogate/interfaceMakerPythonNative.cxx

@@ -1212,6 +1212,10 @@ write_sub_module(ostream &out, Object *obj) {
       out << "  assert(" << class_ptr << " != NULL);\n";
     } else {
       class_ptr = "&Dtool_" + class_name;
+
+      // If this is a typedef to a class defined in the same module, make sure
+      // that the class is initialized before we try to define the typedef.
+      out << "  Dtool_PyModuleClassInit_" << class_name << "(module);\n";
     }
   }
 
@@ -1734,7 +1738,7 @@ write_module_class(ostream &out, Object *obj) {
 
       switch (rfi->second._wrapper_type) {
       case WT_no_params:
-      case WT_iter_next: // TODO: fix iter_next to return NULL instead of None
+      case WT_iter_next:
         // PyObject *func(PyObject *self)
         {
           out << "//////////////////\n";
@@ -1747,9 +1751,15 @@ write_module_class(ostream &out, Object *obj) {
           out << "    return NULL;\n";
           out << "  }\n\n";
 
+          int return_flags = RF_pyobject | RF_err_null;
+          if (rfi->second._wrapper_type == WT_iter_next) {
+            // If the function returns NULL, we should return NULL to indicate
+            // a StopIteration, rather than returning None.
+            return_flags |= RF_preserve_null;
+          }
           string expected_params;
           write_function_forset(out, def._remaps, 0, 0, expected_params, 2, true, true,
-                                AT_no_args, RF_pyobject | RF_err_null, false);
+                                AT_no_args, return_flags, false);
 
           out << "  if (!_PyErr_OCCURRED()) {\n";
           out << "    return Dtool_Raise_BadArgumentsError(\n";
@@ -2692,6 +2702,12 @@ write_module_class(ostream &out, Object *obj) {
   out << "#if PY_VERSION_HEX >= 0x02050000\n";
   write_function_slot(out, 2, slots, "nb_index");
   out << "#endif\n";
+
+  out << "#if PY_VERSION_HEX >= 0x03050000\n";
+  write_function_slot(out, 2, slots, "nb_matrix_multiply");
+  write_function_slot(out, 2, slots, "nb_inplace_matrix_multiply");
+  out << "#endif\n";
+
   out << "};\n\n";
 
   // NB: it's tempting not to write this table when a class doesn't have them.
@@ -2938,6 +2954,10 @@ write_module_class(ostream &out, Object *obj) {
   out << "#if PY_VERSION_HEX >= 0x02060000\n";
   out << "    0, // tp_version_tag\n";
   out << "#endif\n";
+  // destructor tp_finalize
+  out << "#if PY_VERSION_HEX >= 0x03040000\n";
+  out << "    0, // tp_finalize\n";
+  out << "#endif\n";
   out << "  },\n";
 
   // It's tempting to initialize the type handle here, but this causes static
@@ -5842,8 +5862,15 @@ write_function_instance(ostream &out, FunctionRemap *remap,
       indent(out, indent_level) << "Py_INCREF(Py_None);\n";
       indent(out, indent_level) << "return Py_None;\n";
 
+    } else if (return_flags & RF_preserve_null) {
+      indent(out, indent_level) << "if (" << return_expr << " == NULL) {\n";
+      indent(out, indent_level) << "  return NULL;\n";
+      indent(out, indent_level) << "} else {\n";
+      pack_return_value(out, indent_level + 2, remap, return_expr, return_flags);
+      indent(out, indent_level) << "}\n";
+
     } else {
-      pack_return_value(out, indent_level, remap, return_expr);
+      pack_return_value(out, indent_level, remap, return_expr, return_flags);
     }
 
   } else if (return_flags & RF_coerced) {
@@ -6000,7 +6027,7 @@ error_raise_return(ostream &out, int indent_level, int return_flags,
  */
 void InterfaceMakerPythonNative::
 pack_return_value(ostream &out, int indent_level, FunctionRemap *remap,
-                  string return_expr) {
+                  string return_expr, int return_flags) {
 
   ParameterRemap *return_type = remap->_return_type;
   CPPType *orig_type = return_type->get_orig_type();

+ 4 - 1
dtool/src/interrogate/interfaceMakerPythonNative.h

@@ -101,6 +101,9 @@ private:
     // Assign to the coerced argument, in the case of a coercion constructor.
     RF_coerced = 0x040,
 
+    // Don't automatically map NULL to None
+    RF_preserve_null = 0x080,
+
     // These indicate what should be returned on error.
     RF_err_notimplemented = 0x002,
     RF_err_null = 0x004,
@@ -164,7 +167,7 @@ private:
                           const string &exc_type, const string &message,
                           const string &format_args = "");
   void pack_return_value(ostream &out, int indent_level, FunctionRemap *remap,
-                         std::string return_expr);
+                         std::string return_expr, int return_flags);
 
   void write_make_seq(ostream &out, Object *obj, const std::string &ClassName,
                       const std::string &cClassName, MakeSeq *make_seq);

+ 1 - 0
dtool/src/parser-inc/sys/time.h

@@ -2,3 +2,4 @@
 
 struct timeval;
 struct fd_set;
+struct timezone;

+ 0 - 6
makepanda/installer.nsi

@@ -15,7 +15,6 @@
 ;   BUILT         - location of panda install tree.
 ;   SOURCE        - location of the panda source-tree if available, OR location of panda install tree.
 ;   PYVER         - version of Python that Panda was built with (e.g., "2.7")
-;   PYEXTRAS      - directory containing python extras, if any.
 ;   REGVIEW       - either 32 or 64, depending on the build architecture.
 ;
 
@@ -372,11 +371,6 @@ SectionGroup "Python support"
         SetOutPath "$INSTDIR\python"
         File /r "${BUILT}\python\*"
 
-        !ifdef PYEXTRAS
-        SetOutPath "$INSTDIR\python\lib"
-        File /nonfatal /r "${PYEXTRAS}\*"
-        !endif
-
         SetDetailsPrint both
         DetailPrint "Adding registry keys for Python..."
         SetDetailsPrint listonly

+ 23 - 12
makepanda/makepanda.py

@@ -561,6 +561,10 @@ if (COMPILER == "MSVC"):
                 LibName(pkg, 'dxerrVNUM.lib'.replace("VNUM", vnum))
             #LibName(pkg, 'ddraw.lib')
             LibName(pkg, 'dxguid.lib')
+
+    if not PkgSkip("FREETYPE") and os.path.isdir(GetThirdpartyDir() + "freetype/include/freetype2"):
+        IncDirectory("FREETYPE", GetThirdpartyDir() + "freetype/include/freetype2")
+
     IncDirectory("ALWAYS", GetThirdpartyDir() + "extras/include")
     LibName("WINSOCK", "wsock32.lib")
     LibName("WINSOCK2", "wsock32.lib")
@@ -587,17 +591,26 @@ if (COMPILER == "MSVC"):
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "quartz.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "odbc32.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "odbccp32.lib")
-    if (PkgSkip("PNG")==0):      LibName("PNG",      GetThirdpartyDir() + "png/lib/libpng_static.lib")
+    if (PkgSkip("OPENSSL")==0):
+        LibName("OPENSSL", GetThirdpartyDir() + "openssl/lib/libpandassl.lib")
+        LibName("OPENSSL", GetThirdpartyDir() + "openssl/lib/libpandaeay.lib")
+    if (PkgSkip("PNG")==0):
+        if os.path.isfile(GetThirdpartyDir() + "png/lib/libpng16_static.lib"):
+            LibName("PNG", GetThirdpartyDir() + "png/lib/libpng16_static.lib")
+        else:
+            LibName("PNG", GetThirdpartyDir() + "png/lib/libpng_static.lib")
+    if (PkgSkip("TIFF")==0):
+        if os.path.isfile(GetThirdpartyDir() + "tiff/lib/libtiff.lib"):
+            LibName("TIFF", GetThirdpartyDir() + "tiff/lib/libtiff.lib")
+        else:
+            LibName("TIFF", GetThirdpartyDir() + "tiff/lib/tiff.lib")
     if (PkgSkip("JPEG")==0):     LibName("JPEG",     GetThirdpartyDir() + "jpeg/lib/jpeg-static.lib")
-    if (PkgSkip("TIFF")==0):     LibName("TIFF",     GetThirdpartyDir() + "tiff/lib/libtiff.lib")
     if (PkgSkip("ZLIB")==0):     LibName("ZLIB",     GetThirdpartyDir() + "zlib/lib/zlibstatic.lib")
     if (PkgSkip("VRPN")==0):     LibName("VRPN",     GetThirdpartyDir() + "vrpn/lib/vrpn.lib")
     if (PkgSkip("VRPN")==0):     LibName("VRPN",     GetThirdpartyDir() + "vrpn/lib/quat.lib")
     if (PkgSkip("NVIDIACG")==0): LibName("CGGL",     GetThirdpartyDir() + "nvidiacg/lib/cgGL.lib")
     if (PkgSkip("NVIDIACG")==0): LibName("CGDX9",    GetThirdpartyDir() + "nvidiacg/lib/cgD3D9.lib")
     if (PkgSkip("NVIDIACG")==0): LibName("NVIDIACG", GetThirdpartyDir() + "nvidiacg/lib/cg.lib")
-    if (PkgSkip("OPENSSL")==0):  LibName("OPENSSL",  GetThirdpartyDir() + "openssl/lib/libpandassl.lib")
-    if (PkgSkip("OPENSSL")==0):  LibName("OPENSSL",  GetThirdpartyDir() + "openssl/lib/libpandaeay.lib")
     if (PkgSkip("FREETYPE")==0): LibName("FREETYPE", GetThirdpartyDir() + "freetype/lib/freetype.lib")
     if (PkgSkip("FFTW")==0):     LibName("FFTW",     GetThirdpartyDir() + "fftw/lib/rfftw.lib")
     if (PkgSkip("FFTW")==0):     LibName("FFTW",     GetThirdpartyDir() + "fftw/lib/fftw.lib")
@@ -703,7 +716,7 @@ if (COMPILER == "MSVC"):
         IncDirectory("SPEEDTREE", SDK["SPEEDTREE"] + "/Include")
     if (PkgSkip("BULLET")==0):
         suffix = '.lib'
-        if GetTargetArch() == 'x64':
+        if GetTargetArch() == 'x64' and os.path.isfile(GetThirdpartyDir() + "bullet/lib/BulletCollision_x64.lib"):
             suffix = '_x64.lib'
         LibName("BULLET", GetThirdpartyDir() + "bullet/lib/LinearMath" + suffix)
         LibName("BULLET", GetThirdpartyDir() + "bullet/lib/BulletCollision" + suffix)
@@ -2237,6 +2250,7 @@ DTOOL_CONFIG=[
     ("HAVE_SOFTIMAGE_PIC",             '1',                      '1'),
     ("HAVE_BMP",                       '1',                      '1'),
     ("HAVE_PNM",                       '1',                      '1'),
+    ("HAVE_STB_IMAGE",                 '1',                      '1'),
     ("HAVE_VORBIS",                    'UNDEF',                  'UNDEF'),
     ("HAVE_NVIDIACG",                  'UNDEF',                  'UNDEF'),
     ("HAVE_FREETYPE",                  'UNDEF',                  'UNDEF'),
@@ -3467,8 +3481,7 @@ if (not RUNTIME):
   TargetAdd('libp3putil.in', opts=OPTS, input=IGATEFILES)
   TargetAdd('libp3putil.in', opts=['IMOD:panda3d.core', 'ILIB:libp3putil', 'SRCDIR:panda/src/putil'])
   TargetAdd('libp3putil_igate.obj', input='libp3putil.in', opts=["DEPENDENCYONLY"])
-  TargetAdd('p3putil_typedWritable_ext.obj', opts=OPTS, input='typedWritable_ext.cxx')
-  TargetAdd('p3putil_pythonCallbackObject.obj', opts=OPTS, input='pythonCallbackObject.cxx')
+  TargetAdd('p3putil_ext_composite.obj', opts=OPTS, input='p3putil_ext_composite.cxx')
 
 #
 # DIRECTORY: panda/src/audio/
@@ -4077,8 +4090,7 @@ if (not RUNTIME):
   if PkgSkip("FREETYPE")==0:
     TargetAdd('core.pyd', input="libp3pnmtext_igate.obj")
 
-  TargetAdd('core.pyd', input='p3putil_typedWritable_ext.obj')
-  TargetAdd('core.pyd', input='p3putil_pythonCallbackObject.obj')
+  TargetAdd('core.pyd', input='p3putil_ext_composite.obj')
   TargetAdd('core.pyd', input='p3pnmimage_pfmFile_ext.obj')
   TargetAdd('core.pyd', input='p3event_pythonTask.obj')
   TargetAdd('core.pyd', input='p3gobj_ext_composite.obj')
@@ -6600,7 +6612,6 @@ def MakeInstallerNSIS(file, title, installdir):
         'BUILT'       : panda,
         'SOURCE'      : psource,
         'PYVER'       : SDK["PYTHONVERSION"][6:9],
-        'PYEXTRAS'    : os.path.join(os.path.abspath(GetThirdpartyBase()), 'win-extras'),
         'REGVIEW'     : regview,
     }
 
@@ -6977,8 +6988,8 @@ def MakeInstallerOSX():
         oscmd("cp -R %s/pandac                dstroot/pythoncode/Developer/Panda3D/pandac" % GetOutputDir())
         oscmd("cp -R %s/direct                dstroot/pythoncode/Developer/Panda3D/direct" % GetOutputDir())
         oscmd("ln -s %s                       dstroot/pythoncode/usr/local/bin/ppython" % SDK["PYTHONEXEC"])
-        oscmd("cp -R %s/*.so                  dstroot/pythoncode/Developer/Panda3D/" % GetOutputDir())
-        oscmd("cp -R %s/*.py                  dstroot/pythoncode/Developer/Panda3D/" % GetOutputDir())
+        oscmd("cp -R %s/*.so                  dstroot/pythoncode/Developer/Panda3D/" % GetOutputDir(), True)
+        oscmd("cp -R %s/*.py                  dstroot/pythoncode/Developer/Panda3D/" % GetOutputDir(), True)
         if os.path.isdir(GetOutputDir()+"/Pmw"):
             oscmd("cp -R %s/Pmw               dstroot/pythoncode/Developer/Panda3D/Pmw" % GetOutputDir())
             compileall.compile_dir("dstroot/pythoncode/Developer/Panda3D/Pmw")

+ 49 - 19
makepanda/makepandacore.py

@@ -1133,12 +1133,12 @@ def GetThirdpartyDir():
     target_arch = GetTargetArch()
 
     if (target == 'windows'):
+        vc = SDK["VISUALSTUDIO_VERSION"].split('.')[0]
+
         if target_arch == 'x64':
-            THIRDPARTYDIR = base + "/win-libs-vc10-x64/"
-            if not os.path.isdir(THIRDPARTYDIR):
-                THIRDPARTYDIR = base + "/win-libs-vc10/"
+            THIRDPARTYDIR = base + "/win-libs-vc" + vc + "-x64/"
         else:
-            THIRDPARTYDIR = base + "/win-libs-vc10/"
+            THIRDPARTYDIR = base + "/win-libs-vc" + vc + "/"
 
     elif (target == 'darwin'):
         # OSX thirdparty binaries are universal, where possible.
@@ -1423,10 +1423,16 @@ def PkgConfigEnable(opt, pkgname, tool = "pkg-config"):
     for i, j in PkgConfigGetDefSymbols(pkgname, tool).items():
         DefSymbol(opt, i, j)
 
-def LocateLibrary(lib, lpath=[]):
-    """ Returns True if this library was found in the given search path, False otherwise. """
+def LocateLibrary(lib, lpath=[], prefer_static=False):
+    """Searches for the library in the search path, returning its path if found,
+    or None if it was not found."""
     target = GetTarget()
 
+    if prefer_static and target != 'windows':
+        for dir in lpath:
+            if os.path.isfile(os.path.join(dir, 'lib%s.a' % lib)):
+                return os.path.join(dir, 'lib%s.a' % lib)
+
     for dir in lpath:
         if target == 'darwin' and os.path.isfile(os.path.join(dir, 'lib%s.dylib' % lib)):
             return os.path.join(dir, 'lib%s.dylib' % lib)
@@ -1498,6 +1504,7 @@ def SmartPkgEnable(pkg, pkgconfig = None, libs = None, incs = None, defs = None,
         LibName(target_pkg, "-lswresample")
         return
 
+    # First check if the package is in the thirdparty directory.
     pkg_dir = os.path.join(GetThirdpartyDir(), pkg.lower())
     if not custom_loc and os.path.isdir(pkg_dir):
         if framework and os.path.isdir(os.path.join(pkg_dir, framework + ".framework")):
@@ -1508,32 +1515,55 @@ def SmartPkgEnable(pkg, pkgconfig = None, libs = None, incs = None, defs = None,
         if os.path.isdir(os.path.join(pkg_dir, "include")):
             IncDirectory(target_pkg, os.path.join(pkg_dir, "include"))
 
-        if os.path.isdir(os.path.join(pkg_dir, "lib")):
-            LibDirectory(target_pkg, os.path.join(pkg_dir, "lib"))
+            # Handle cases like freetype2 where the include dir is a subdir under "include"
+            for i in incs:
+                if os.path.isdir(os.path.join(pkg_dir, "include", i)):
+                    IncDirectory(target_pkg, os.path.join(pkg_dir, "include", i))
+
+        lpath = [os.path.join(pkg_dir, "lib")]
 
-        if (PkgSkip("PYTHON") == 0):
+        if not PkgSkip("PYTHON"):
             py_lib_dir = os.path.join(pkg_dir, "lib", SDK["PYTHONVERSION"])
             if os.path.isdir(py_lib_dir):
-                LibDirectory(target_pkg, py_lib_dir)
+                lpath.append(py_lib_dir)
 
-        # TODO: check for a .pc file in the lib/pkg-config/ dir
+        # TODO: check for a .pc file in the lib/pkgconfig/ dir
         if (tool != None and os.path.isfile(os.path.join(pkg_dir, "bin", tool))):
             tool = os.path.join(pkg_dir, "bin", tool)
             for i in PkgConfigGetLibs(None, tool):
-                LibName(target_pkg, i)
+                if i.startswith('-l'):
+                    # To make sure we don't pick up the system copy, write out
+                    # the full path instead.
+                    libname = i[2:]
+                    location = LocateLibrary(libname, lpath, prefer_static=True)
+                    if location is not None:
+                        LibName(target_pkg, location)
+                    else:
+                        print(GetColor("cyan") + "Couldn't find library lib" + libname + " in thirdparty directory " + pkg.lower() + GetColor())
+                        LibName(target_pkg, i)
+                else:
+                    LibName(target_pkg, i)
             for i, j in PkgConfigGetDefSymbols(None, tool).items():
                 DefSymbol(target_pkg, i, j)
             return
 
+        # Now search for the libraries in the package's lib directories.
         for l in libs:
             libname = l
             if l.startswith("lib"):
                 libname = l[3:]
-            # This is for backward compatibility - in the thirdparty dir, we kept some libs with "panda" prefix, like libpandatiff.
-            if len(glob.glob(os.path.join(pkg_dir, "lib", "libpanda%s.*" % (libname)))) > 0 \
-               and len(glob.glob(os.path.join(pkg_dir, "lib", "lib%s.*" % (libname)))) == 0:
-                libname = "panda" + libname
-            LibName(target_pkg, "-l" + libname)
+
+            location = LocateLibrary(libname, lpath, prefer_static=True)
+            if location is not None:
+                LibName(target_pkg, location)
+            else:
+                # This is for backward compatibility - in the thirdparty dir,
+                # we kept some libs with "panda" prefix, like libpandatiff.
+                location = LocateLibrary("panda" + libname, lpath, prefer_static=True)
+                if location is not None:
+                    LibName(target_pkg, location)
+                else:
+                    print(GetColor("cyan") + "Couldn't find library lib" + libname + " in thirdparty directory " + pkg.lower() + GetColor())
 
         for d, v in defs.values():
             DefSymbol(target_pkg, d, v)
@@ -2301,8 +2331,8 @@ def SetupVisualStudioEnviron():
         AddToPathEnv("PATH",    SDK["MSPLATFORM"] + "bin\\" + arch)
 
         # Windows Kit 10 introduces the "universal CRT".
-        inc_dir = SDK["MSPLATFORM"] + "Include\\10.0.10240.0\\"
-        lib_dir = SDK["MSPLATFORM"] + "Lib\\10.0.10240.0\\"
+        inc_dir = SDK["MSPLATFORM"] + "Include\\10.0.10586.0\\"
+        lib_dir = SDK["MSPLATFORM"] + "Lib\\10.0.10586.0\\"
         AddToPathEnv("INCLUDE", inc_dir + "shared")
         AddToPathEnv("INCLUDE", inc_dir + "ucrt")
         AddToPathEnv("INCLUDE", inc_dir + "um")

+ 1 - 1
panda/src/bullet/bulletContactResult.I

@@ -86,7 +86,7 @@ get_num_contacts() const {
 /**
  *
  */
-INLINE BulletContact &BulletContactResult::
+INLINE BulletContact BulletContactResult::
 get_contact(int idx) {
 
   nassertr(idx >= 0 && idx < (int)_contacts.size(), _empty);

+ 1 - 1
panda/src/bullet/bulletContactResult.h

@@ -62,7 +62,7 @@ struct EXPCL_PANDABULLET BulletContactResult : public btCollisionWorld::ContactR
 
 PUBLISHED:
   INLINE int get_num_contacts() const;
-  INLINE BulletContact &get_contact(int idx);
+  INLINE BulletContact get_contact(int idx);
   MAKE_SEQ(get_contacts, get_num_contacts, get_contact);
 
 public:

+ 1 - 1
panda/src/bullet/bulletHeightfieldShape.I

@@ -18,7 +18,7 @@ INLINE BulletHeightfieldShape::
 ~BulletHeightfieldShape() {
 
   delete _shape;
-  delete _data;
+  delete [] _data;
 }
 
 /**

+ 4 - 0
panda/src/bullet/bulletTriangleMesh.cxx

@@ -134,6 +134,8 @@ add_geom(const Geom *geom, bool remove_duplicate_vertices, const TransformState
       _mesh->addTriangle(v0, v1, v2, remove_duplicate_vertices);
     }
   }
+
+  delete [] vertices;
 }
 
 /**
@@ -163,6 +165,8 @@ add_array(const PTA_LVecBase3 &points, const PTA_int &indices, bool remove_dupli
 
     _mesh->addTriangle(v0, v1, v2, remove_duplicate_vertices);
   }
+
+  delete [] vertices;
 }
 
 /**

+ 10 - 4
panda/src/chan/animChannelMatrixXfmTable.cxx

@@ -327,10 +327,16 @@ void AnimChannelMatrixXfmTable::
 write_datagram(BamWriter *manager, Datagram &me) {
   AnimChannelMatrix::write_datagram(manager, me);
 
-  if (compress_channels && !FFTCompressor::is_compression_available()) {
-    chan_cat.error()
-      << "Compression is not available; writing uncompressed channels.\n";
-    compress_channels = false;
+  if (compress_channels) {
+    chan_cat.warning()
+      << "FFT compression of animations is deprecated.  For compatibility "
+         "with future versions of Panda3D, set compress-channels to false.\n";
+
+    if (!FFTCompressor::is_compression_available()) {
+      chan_cat.error()
+        << "Compression is not available; writing uncompressed channels.\n";
+      compress_channels = false;
+    }
   }
 
   me.add_bool(compress_channels);

+ 10 - 4
panda/src/chan/animChannelScalarTable.cxx

@@ -146,10 +146,16 @@ void AnimChannelScalarTable::
 write_datagram(BamWriter *manager, Datagram &me) {
   AnimChannelScalar::write_datagram(manager, me);
 
-  if (compress_channels && !FFTCompressor::is_compression_available()) {
-    chan_cat.error()
-      << "Compression is not available; writing uncompressed channels.\n";
-    compress_channels = false;
+  if (compress_channels) {
+    chan_cat.warning()
+      << "FFT compression of animations is deprecated.  For compatibility "
+         "with future versions of Panda3D, set compress-channels to false.\n";
+
+    if (!FFTCompressor::is_compression_available()) {
+      chan_cat.error()
+        << "Compression is not available; writing uncompressed channels.\n";
+      compress_channels = false;
+    }
   }
 
   me.add_bool(compress_channels);

+ 11 - 0
panda/src/chan/config_chan.cxx

@@ -138,4 +138,15 @@ ConfigureFn(config_chan) {
   AnimChannelScalarTable::register_with_read_factory();
   AnimChannelScalarDynamic::register_with_read_factory();
   AnimPreloadTable::register_with_read_factory();
+
+  // For compatibility with old .bam files.
+#ifndef STDFLOAT_DOUBLE
+  TypeRegistry *reg = TypeRegistry::ptr();
+  reg->record_alternate_name(AnimChannelFixed<ACMatrixSwitchType>::get_class_type(),
+                             "AnimChannelFixed<LMatrix4f>");
+  reg->record_alternate_name(MovingPart<ACMatrixSwitchType>::get_class_type(),
+                             "MovingPart<LMatrix4f>");
+  reg->record_alternate_name(MovingPart<ACScalarSwitchType>::get_class_type(),
+                             "MovingPart<float>");
+#endif
 }

+ 9 - 9
panda/src/display/drawableRegion.I

@@ -17,10 +17,10 @@
 INLINE DrawableRegion::
 DrawableRegion() :
   _screenshot_buffer_type(RenderBuffer::T_front),
-  _draw_buffer_type(RenderBuffer::T_back)
+  _draw_buffer_type(RenderBuffer::T_back),
+  _clear_mask(0)
 {
-  for (int i=0; i<RTP_COUNT; i++) {
-    _clear_active[i] = false;
+  for (int i = 0; i < RTP_COUNT; ++i) {
     _clear_value[i] = LColor(0.0f, 0.0f, 0.0f, 0.0f);
   }
   _clear_value[RTP_depth] = LColor(1.0f,1.0f,1.0f,1.0f);
@@ -35,11 +35,11 @@ INLINE DrawableRegion::
 DrawableRegion(const DrawableRegion &copy) :
   _screenshot_buffer_type(copy._screenshot_buffer_type),
   _draw_buffer_type(copy._draw_buffer_type),
+  _clear_mask(copy._clear_mask),
   _pixel_zoom(copy._pixel_zoom),
   _pixel_factor(copy._pixel_factor)
 {
-  for (int i=0; i<RTP_COUNT; i++) {
-    _clear_active[i] = copy._clear_active[i];
+  for (int i = 0; i < RTP_COUNT; ++i) {
     _clear_value[i] = copy._clear_value[i];
   }
 }
@@ -51,8 +51,8 @@ INLINE void DrawableRegion::
 operator = (const DrawableRegion &copy) {
   _screenshot_buffer_type = copy._screenshot_buffer_type;
   _draw_buffer_type = copy._draw_buffer_type;
-  for (int i=0; i<RTP_COUNT; i++) {
-    _clear_active[i] = copy._clear_active[i];
+  _clear_mask = copy._clear_mask;
+  for (int i = 0; i < RTP_COUNT; ++i) {
     _clear_value[i] = copy._clear_value[i];
   }
   _pixel_zoom = copy._pixel_zoom;
@@ -64,8 +64,8 @@ operator = (const DrawableRegion &copy) {
  */
 INLINE void DrawableRegion::
 copy_clear_settings(const DrawableRegion &copy) {
-  for (int i=0; i<RTP_COUNT; i++) {
-    _clear_active[i] = copy._clear_active[i];
+  _clear_mask = copy._clear_mask;
+  for (int i = 0; i < RTP_COUNT; ++i) {
     _clear_value[i] = copy._clear_value[i];
   }
   update_pixel_factor();

+ 10 - 13
panda/src/display/drawableRegion.cxx

@@ -27,8 +27,12 @@ DrawableRegion::
  */
 void DrawableRegion::
 set_clear_active(int n, bool clear_active) {
-  nassertv((n >= 0)&&(n < RTP_COUNT));
-  _clear_active[n] = clear_active;
+  nassertv(n >= 0 && n < RTP_COUNT);
+  if (clear_active) {
+    _clear_mask |= 1 << n;
+  } else {
+    _clear_mask &= ~(1 << n);
+  }
   update_pixel_factor();
 }
 
@@ -37,8 +41,8 @@ set_clear_active(int n, bool clear_active) {
  */
 bool DrawableRegion::
 get_clear_active(int n) const {
-  nassertr((n >= 0)&&(n < RTP_COUNT), false);
-  return _clear_active[n];
+  nassertr(n >= 0 && n < RTP_COUNT, false);
+  return (_clear_mask & (1 << n)) != 0;
 }
 
 /**
@@ -66,9 +70,7 @@ get_clear_value(int n) const {
  */
 void DrawableRegion::
 disable_clears() {
-  for (int i = 0; i < RTP_COUNT; ++i) {
-    _clear_active[i] = false;
-  }
+  _clear_mask = 0;
   update_pixel_factor();
 }
 
@@ -79,12 +81,7 @@ disable_clears() {
  */
 bool DrawableRegion::
 is_any_clear_active() const {
-  for (int i = 0; i < RTP_COUNT; ++i) {
-    if (get_clear_active(i)) {
-      return true;
-    }
-  }
-  return false;
+  return (_clear_mask != 0);
 }
 
 /**

+ 1 - 1
panda/src/display/drawableRegion.h

@@ -109,9 +109,9 @@ protected:
 protected:
   int _screenshot_buffer_type;
   int _draw_buffer_type;
+  int _clear_mask;
 
 private:
-  bool    _clear_active[RTP_COUNT];
   LColor  _clear_value[RTP_COUNT];
 
   PN_stdfloat _pixel_zoom;

+ 7 - 0
panda/src/display/frameBufferProperties.cxx

@@ -480,6 +480,13 @@ get_quality(const FrameBufferProperties &reqs) const {
     quality -= 10000000;
   }
 
+  // Deduct for software-only renderers in absence of a special request.
+  // Cost: 2,000,000
+
+  if (get_force_software() && !reqs.get_force_software()) {
+    quality -= 2000000;
+  }
+
   // Deduct for missing depth, color, alpha, stencil, or accum.  Cost:
   // 1,000,000
 

+ 3 - 0
panda/src/display/graphicsEngine.cxx

@@ -415,6 +415,9 @@ make_output(GraphicsPipe *pipe,
           if (flags & GraphicsPipe::BF_fb_props_optional) {
             display_cat.warning()
               << "FrameBufferProperties available less than requested.\n";
+            display_cat.warning(false)
+              << "  requested: " << fb_prop << "\n"
+              << "  got: " << window->get_fb_properties() << "\n";
             return window;
           }
           display_cat.error()

+ 9 - 0
panda/src/display/graphicsStateGuardian.I

@@ -684,6 +684,15 @@ get_max_color_targets() const {
   return _max_color_targets;
 }
 
+/**
+ * Returns true if dual source (incoming1_color and incoming1_alpha) blend
+ * operands are supported by this GSG.
+ *
+ * This is a plain accessor: it reports the current value of the
+ * _supports_dual_source_blending member without any further checks.
+ */
+INLINE bool GraphicsStateGuardian::
+get_supports_dual_source_blending() const {
+  return _supports_dual_source_blending;
+}
+
 /**
  * Deprecated.  Use get_max_color_targets() instead, which returns the exact
  * same value.

+ 5 - 1
panda/src/display/graphicsStateGuardian.cxx

@@ -246,6 +246,7 @@ GraphicsStateGuardian(CoordinateSystem internal_coordinate_system,
 
   // Assume a maximum of 1 render target in absence of MRT.
   _max_color_targets = 1;
+  _supports_dual_source_blending = false;
 
   _supported_geom_rendering = 0;
 
@@ -2195,7 +2196,10 @@ begin_draw_primitives(const GeomPipelineReader *geom_reader,
                       bool force) {
   _munger = munger;
   _data_reader = data_reader;
-  return _data_reader->has_vertex();
+
+  // Always draw if we have a shader, since the shader might use a different
+  // mechanism for fetching vertex data.
+  return _data_reader->has_vertex() || (_target_shader && _target_shader->has_shader());
 }
 
 /**

+ 3 - 0
panda/src/display/graphicsStateGuardian.h

@@ -172,6 +172,7 @@ PUBLISHED:
 
   INLINE int get_max_color_targets() const;
   INLINE int get_maximum_simultaneous_render_targets() const;
+  INLINE bool get_supports_dual_source_blending() const;
 
   MAKE_PROPERTY(max_vertices_per_array, get_max_vertices_per_array);
   MAKE_PROPERTY(max_vertices_per_primitive, get_max_vertices_per_primitive);
@@ -217,6 +218,7 @@ PUBLISHED:
   MAKE_PROPERTY(supports_timer_query, get_supports_timer_query);
   MAKE_PROPERTY(timer_queries_active, get_timer_queries_active);
   MAKE_PROPERTY(max_color_targets, get_max_color_targets);
+  MAKE_PROPERTY(supports_dual_source_blending, get_supports_dual_source_blending);
 
   INLINE ShaderModel get_shader_model() const;
   INLINE void set_shader_model(ShaderModel shader_model);
@@ -609,6 +611,7 @@ protected:
   bool _supports_indirect_draw;
 
   int _max_color_targets;
+  bool _supports_dual_source_blending;
 
   int  _supported_geom_rendering;
   bool _color_scale_via_lighting;

+ 12 - 0
panda/src/downloader/socketStream.h

@@ -126,6 +126,10 @@ public:
   INLINE ISocketStream(streambuf *buf);
   virtual ~ISocketStream();
 
+#if _MSC_VER >= 1800
+  INLINE ISocketStream(const ISocketStream &copy) = delete;
+#endif
+
 PUBLISHED:
   enum ReadState {
     RS_initial,
@@ -155,6 +159,10 @@ class EXPCL_PANDAEXPRESS OSocketStream : public ostream, public SSWriter {
 public:
   INLINE OSocketStream(streambuf *buf);
 
+#if _MSC_VER >= 1800
+  INLINE OSocketStream(const OSocketStream &copy) = delete;
+#endif
+
 PUBLISHED:
   virtual bool is_closed() = 0;
   virtual void close() = 0;
@@ -170,6 +178,10 @@ class EXPCL_PANDAEXPRESS SocketStream : public iostream, public SSReader, public
 public:
   INLINE SocketStream(streambuf *buf);
 
+#if _MSC_VER >= 1800
+  INLINE SocketStream(const SocketStream &copy) = delete;
+#endif
+
 PUBLISHED:
   virtual bool is_closed() = 0;
   virtual void close() = 0;

+ 5 - 0
panda/src/dxgsg9/config_dxgsg9.cxx

@@ -265,3 +265,8 @@ init_libdxgsg9() {
   PandaSystem *ps = PandaSystem::get_global_ptr();
   ps->add_system("DirectX9");
 }
+
+// Necessary to allow use of dxerr from MSVC 2015
+#if _MSC_VER >= 1900
+int (WINAPIV * __vsnprintf)(char *, size_t, const char*, va_list) = _vsnprintf;
+#endif

+ 55 - 31
panda/src/dxgsg9/dxGraphicsStateGuardian9.cxx

@@ -3766,43 +3766,24 @@ do_issue_blending() {
     }
   }
 
-  const ColorBlendAttrib *target_color_blend = DCAST(ColorBlendAttrib, _target_rs->get_attrib_def(ColorBlendAttrib::get_class_slot()));
-  CPT(ColorBlendAttrib) color_blend = target_color_blend;
-  ColorBlendAttrib::Mode color_blend_mode = target_color_blend->get_mode();
+  const ColorBlendAttrib *color_blend;
+  _target_rs->get_attrib_def(color_blend);
+  ColorBlendAttrib::Mode color_blend_mode = color_blend->get_mode();
 
-  const TransparencyAttrib *target_transparency = DCAST(TransparencyAttrib, _target_rs->get_attrib_def(TransparencyAttrib::get_class_slot()));
+  const TransparencyAttrib *target_transparency;
+  _target_rs->get_attrib_def(target_transparency);
   TransparencyAttrib::Mode transparency_mode = target_transparency->get_mode();
 
   // Is there a color blend set?
   if (color_blend_mode != ColorBlendAttrib::M_none) {
     set_render_state(D3DRS_ALPHABLENDENABLE, TRUE);
-
-    switch (color_blend_mode) {
-    case ColorBlendAttrib::M_add:
-      set_render_state(D3DRS_BLENDOP, D3DBLENDOP_ADD);
-      break;
-
-    case ColorBlendAttrib::M_subtract:
-      set_render_state(D3DRS_BLENDOP, D3DBLENDOP_SUBTRACT);
-      break;
-
-    case ColorBlendAttrib::M_inv_subtract:
-      set_render_state(D3DRS_BLENDOP, D3DBLENDOP_REVSUBTRACT);
-      break;
-
-    case ColorBlendAttrib::M_min:
-      set_render_state(D3DRS_BLENDOP, D3DBLENDOP_MIN);
-      break;
-
-    case ColorBlendAttrib::M_max:
-      set_render_state(D3DRS_BLENDOP, D3DBLENDOP_MAX);
-      break;
-    }
-
-    set_render_state(D3DRS_SRCBLEND,
-        get_blend_func(color_blend->get_operand_a()));
-    set_render_state(D3DRS_DESTBLEND,
-        get_blend_func(color_blend->get_operand_b()));
+    set_render_state(D3DRS_SEPARATEALPHABLENDENABLE, TRUE);
+    set_render_state(D3DRS_BLENDOP, get_blend_mode(color_blend_mode));
+    set_render_state(D3DRS_BLENDOPALPHA, get_blend_mode(color_blend->get_alpha_mode()));
+    set_render_state(D3DRS_SRCBLEND, get_blend_func(color_blend->get_operand_a()));
+    set_render_state(D3DRS_DESTBLEND, get_blend_func(color_blend->get_operand_b()));
+    set_render_state(D3DRS_SRCBLENDALPHA, get_blend_func(color_blend->get_alpha_operand_a()));
+    set_render_state(D3DRS_DESTBLENDALPHA, get_blend_func(color_blend->get_alpha_operand_b()));
     return;
   }
 
@@ -3817,6 +3798,7 @@ do_issue_blending() {
   case TransparencyAttrib::M_multisample_mask:
   case TransparencyAttrib::M_dual:
     set_render_state(D3DRS_ALPHABLENDENABLE, TRUE);
+    set_render_state(D3DRS_SEPARATEALPHABLENDENABLE, FALSE);
     set_render_state(D3DRS_BLENDOP, D3DBLENDOP_ADD);
     set_render_state(D3DRS_SRCBLEND, D3DBLEND_SRCALPHA);
     set_render_state(D3DRS_DESTBLEND, D3DBLEND_INVSRCALPHA);
@@ -3824,6 +3806,7 @@ do_issue_blending() {
 
   case TransparencyAttrib::M_premultiplied_alpha:
     set_render_state(D3DRS_ALPHABLENDENABLE, TRUE);
+    set_render_state(D3DRS_SEPARATEALPHABLENDENABLE, FALSE);
     set_render_state(D3DRS_BLENDOP, D3DBLENDOP_ADD);
     set_render_state(D3DRS_SRCBLEND, D3DBLEND_ONE);
     set_render_state(D3DRS_DESTBLEND, D3DBLEND_INVSRCALPHA);
@@ -4052,6 +4035,33 @@ get_light_color(Light *light) const {
   return *(D3DCOLORVALUE *)cf.get_data();
 }
 
+/**
+ * Maps from ColorBlendAttrib::Mode to D3DBLENDOP value.
+ *
+ * M_add, M_subtract, M_inv_subtract, M_min and M_max each map to the
+ * corresponding D3DBLENDOP_* constant.  Any mode without an explicit case
+ * falls out of the switch, is reported through dxgsg9_cat.error(), and is
+ * mapped to D3DBLENDOP_ADD.
+ */
+D3DBLENDOP DXGraphicsStateGuardian9::
+get_blend_mode(ColorBlendAttrib::Mode mode) {
+  switch (mode) {
+  case ColorBlendAttrib::M_add:
+    return D3DBLENDOP_ADD;
+
+  case ColorBlendAttrib::M_subtract:
+    return D3DBLENDOP_SUBTRACT;
+
+  case ColorBlendAttrib::M_inv_subtract:
+    return D3DBLENDOP_REVSUBTRACT;
+
+  case ColorBlendAttrib::M_min:
+    return D3DBLENDOP_MIN;
+
+  case ColorBlendAttrib::M_max:
+    return D3DBLENDOP_MAX;
+  }
+
+  dxgsg9_cat.error()
+    << "Unknown color blend mode " << (int)mode << endl;
+  return D3DBLENDOP_ADD;
+}
+
 /**
  * Maps from ColorBlendAttrib::Operand to D3DBLEND value.
  */
@@ -4106,6 +4116,20 @@ get_blend_func(ColorBlendAttrib::Operand operand) {
 
   case ColorBlendAttrib::O_incoming_color_saturate:
     return D3DBLEND_SRCALPHASAT;
+
+  case ColorBlendAttrib::O_incoming1_color:
+    return (D3DBLEND)16; //D3DBLEND_SRCCOLOR2;
+
+  case ColorBlendAttrib::O_one_minus_incoming1_color:
+    return (D3DBLEND)17; //D3DBLEND_INVSRCCOLOR2;
+
+  case ColorBlendAttrib::O_incoming1_alpha:
+    // Not supported by DX9.
+    return (D3DBLEND)18;
+
+  case ColorBlendAttrib::O_one_minus_incoming1_alpha:
+    // Not supported by DX9.
+    return (D3DBLEND)19;
   }
 
   dxgsg9_cat.error()

+ 1 - 0
panda/src/dxgsg9/dxGraphicsStateGuardian9.h

@@ -217,6 +217,7 @@ protected:
   const D3DCOLORVALUE &get_light_color(Light *light) const;
   INLINE static D3DTRANSFORMSTATETYPE get_tex_mat_sym(int stage_index);
 
+  static D3DBLENDOP get_blend_mode(ColorBlendAttrib::Mode mode);
   static D3DBLEND get_blend_func(ColorBlendAttrib::Operand operand);
   void report_texmgr_stats();
 

+ 53 - 0
panda/src/express/zStreamBuf.cxx

@@ -170,6 +170,59 @@ close_write() {
   }
 }
 
+/**
+ * Implements seeking within the stream.  ZStreamBuf only allows seeking back
+ * to the beginning of the stream, and only on the input side.
+ *
+ * Returns 0 if the stream is positioned at the beginning after the call
+ * (this includes the zero-offset ios::cur query that tellg() issues), or -1
+ * if the request cannot be honored.
+ */
+streampos ZStreamBuf::
+seekoff(streamoff off, ios_seekdir dir, ios_openmode which) {
+  if (which != ios::in) {
+    // We can only do this with the input stream.  This must be checked
+    // before anything touches _source: a tellp() on an output stream also
+    // arrives here, as seekoff(0, ios::cur, ios::out).
+    return -1;
+  }
+
+  if (_source == NULL) {
+    // NOTE(review): assumes _source is NULL whenever no input stream is
+    // open; confirm against open_read()/close_read().
+    return -1;
+  }
+
+  // Necessary for tellg() to work after seeking to 0.
+  if (dir == ios::cur && off == 0) {
+    if (_source->tellg() == 0) {
+      return 0;
+    } else {
+      return -1;
+    }
+  }
+
+  if (off != 0 || dir != ios::beg) {
+    // We only know how to reposition to the beginning.
+    return -1;
+  }
+
+  // Skip past whatever is still sitting in the get area, so that none of the
+  // previously decompressed data is handed out after the rewind.
+  size_t n = egptr() - gptr();
+  gbump(n);
+
+  _source->seekg(0, ios::beg);
+  if (_source->tellg() == 0) {
+    // The source really did rewind; reset the inflate state to match.
+    _z_source.next_in = Z_NULL;
+    _z_source.avail_in = 0;
+    _z_source.next_out = Z_NULL;
+    _z_source.avail_out = 0;
+    int result = inflateReset(&_z_source);
+    if (result < 0) {
+      show_zlib_error("inflateReset", result, _z_source);
+    }
+    return 0;
+  }
+
+  // The underlying stream could not be rewound.
+  return -1;
+}
+
+/**
+ * Implements seeking to an absolute position within the stream.  This simply
+ * forwards to seekoff() relative to ios::beg, so the same restriction
+ * applies: ZStreamBuf only allows seeking back to the beginning of the
+ * stream.
+ */
+streampos ZStreamBuf::
+seekpos(streampos pos, ios_openmode which) {
+  return seekoff(pos, ios::beg, which);
+}
+
 /**
  * Called by the system ostream implementation when its internal buffer is
  * filled, plus one character.

+ 3 - 0
panda/src/express/zStreamBuf.h

@@ -35,6 +35,9 @@ public:
   void open_write(ostream *dest, bool owns_dest, int compression_level);
   void close_write();
 
+  virtual streampos seekoff(streamoff off, ios_seekdir dir, ios_openmode which);
+  virtual streampos seekpos(streampos pos, ios_openmode which);
+
 protected:
   virtual int overflow(int c);
   virtual int sync();

+ 15 - 5
panda/src/gles2gsg/gles2gsg.h

@@ -80,8 +80,6 @@ typedef char GLchar;
 #define GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_EXT GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS
 #define GL_FRAMEBUFFER_INCOMPLETE_FORMATS_EXT GL_FRAMEBUFFER_INCOMPLETE_FORMATS
 #define GL_DEPTH_ATTACHMENT_EXT GL_DEPTH_ATTACHMENT
-#define GL_COLOR_ATTACHMENT0_EXT GL_COLOR_ATTACHMENT0
-#define GL_COLOR_ATTACHMENT1_EXT (GL_COLOR_ATTACHMENT0 + 1)
 #define GL_STENCIL_ATTACHMENT_EXT GL_STENCIL_ATTACHMENT
 #define GL_DEPTH_STENCIL GL_DEPTH_STENCIL_OES
 #define GL_DEPTH_STENCIL_EXT GL_DEPTH_STENCIL_OES
@@ -89,7 +87,6 @@ typedef char GLchar;
 #define GL_DEPTH24_STENCIL8_EXT GL_DEPTH24_STENCIL8_OES
 #define GL_DEPTH_COMPONENT24 GL_DEPTH_COMPONENT24_OES
 #define GL_DEPTH_COMPONENT32 GL_DEPTH_COMPONENT32_OES
-#define GL_TEXTURE_3D GL_TEXTURE_3D_OES
 #define GL_MAX_3D_TEXTURE_SIZE GL_MAX_3D_TEXTURE_SIZE_OES
 #define GL_SAMPLER_3D GL_SAMPLER_3D_OES
 #define GL_BGRA GL_BGRA_EXT
@@ -121,8 +118,21 @@ typedef char GLchar;
 #define GL_COMPARE_R_TO_TEXTURE_ARB GL_COMPARE_REF_TO_TEXTURE_EXT
 #define GL_SAMPLER_2D_SHADOW GL_SAMPLER_2D_SHADOW_EXT
 #define GL_MAX_DRAW_BUFFERS GL_MAX_DRAW_BUFFERS_NV
-#define GL_COMPRESSED_RGBA_S3TC_DXT3_EXT GL_COMPRESSED_RGBA_S3TC_DXT3_ANGLE
-#define GL_COMPRESSED_RGBA_S3TC_DXT5_EXT GL_COMPRESSED_RGBA_S3TC_DXT5_ANGLE
+#define GL_SRC1_COLOR GL_SRC1_COLOR_EXT
+#define GL_ONE_MINUS_SRC1_COLOR GL_ONE_MINUS_SRC1_COLOR_EXT
+#define GL_SRC1_ALPHA GL_SRC1_ALPHA_EXT
+#define GL_ONE_MINUS_SRC1_ALPHA GL_ONE_MINUS_SRC1_ALPHA_EXT
+
+#define GL_DEBUG_OUTPUT_SYNCHRONOUS GL_DEBUG_OUTPUT_SYNCHRONOUS_KHR
+#define GL_DEBUG_TYPE_PERFORMANCE GL_DEBUG_TYPE_PERFORMANCE_KHR
+#define GL_DEBUG_SEVERITY_HIGH GL_DEBUG_SEVERITY_HIGH_KHR
+#define GL_DEBUG_SEVERITY_MEDIUM GL_DEBUG_SEVERITY_MEDIUM_KHR
+#define GL_DEBUG_SEVERITY_LOW GL_DEBUG_SEVERITY_LOW_KHR
+#define GL_DEBUG_SEVERITY_NOTIFICATION GL_DEBUG_SEVERITY_NOTIFICATION_KHR
+#define GL_BUFFER GL_BUFFER_KHR
+#define GL_SHADER GL_SHADER_KHR
+#define GL_PROGRAM GL_PROGRAM_KHR
+#define GL_DEBUG_OUTPUT GL_DEBUG_OUTPUT_KHR
 
 // For GLES 3 compat - need a better solution for this
 #define GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT 0x1

File diff suppressed because it is too large
+ 760 - 1039
panda/src/gles2gsg/panda_esgl2ext.h


+ 4 - 5
panda/src/glstuff/glGraphicsBuffer_src.cxx

@@ -902,7 +902,7 @@ bind_slot(int layer, bool rb_resize, Texture **attach, RenderTexturePlane slot,
             }
           } else {
             if (_fb_properties.get_color_bits() > 16 * 3) {
-              gl_format = GL_RGBA32F_ARB;
+              gl_format = GL_RGB32F_ARB;
             } else if (_fb_properties.get_color_bits() > 8 * 3) {
               gl_format = GL_RGB16_EXT;
             } else {
@@ -920,11 +920,11 @@ bind_slot(int layer, bool rb_resize, Texture **attach, RenderTexturePlane slot,
             }
           } else {
             if (_fb_properties.get_color_bits() > 16 * 3) {
-              gl_format = GL_RGB32F_ARB;
+              gl_format = GL_RGBA32F_ARB;
             } else if (_fb_properties.get_color_bits() > 8 * 3) {
-              gl_format = GL_RGB16_EXT;
+              gl_format = GL_RGBA16_EXT;
             } else {
-              gl_format = GL_RGB;
+              gl_format = GL_RGBA;
             }
           }
         }
@@ -1090,7 +1090,6 @@ bind_slot_multisample(bool rb_resize, Texture **attach, RenderTexturePlane slot,
                                         GL_RENDERBUFFER_EXT, _rbm[slot]);
     }
   } else {
-    Texture *Tex = attach[slot];
     GLuint gl_format = GL_RGBA;
 #ifndef OPENGLES
     switch (slot) {

+ 251 - 65
panda/src/glstuff/glGraphicsStateGuardian_src.cxx

@@ -129,16 +129,23 @@ null_glActiveTexture(GLenum gl_texture_stage) {
 
 #ifdef OPENGLES_2
 #define _glBlendEquation glBlendEquation
+#define _glBlendEquationSeparate glBlendEquationSeparate
+#define _glBlendFuncSeparate glBlendFuncSeparate
 #define _glBlendColor glBlendColor
 #else
 static void APIENTRY
 null_glBlendEquation(GLenum) {
 }
-#endif
+
+// Stand-in for glBlendFuncSeparate, installed when the real entry point is
+// not available.  It forwards the first two (RGB) factors to glBlendFunc and
+// ignores the two alpha factors.
+static void APIENTRY
+null_glBlendFuncSeparate(GLenum src, GLenum dest, GLenum, GLenum) {
+  glBlendFunc(src, dest);
+}
 
 static void APIENTRY
 null_glBlendColor(GLclampf, GLclampf, GLclampf, GLclampf) {
 }
+#endif
 
 #ifndef OPENGLES_1
 // We have a default shader that will be applied when there isn't any shader
@@ -1954,6 +1961,9 @@ reset() {
 #endif
 
 #ifdef OPENGLES_1
+  _supports_framebuffer_multisample = false;
+  _supports_framebuffer_blit = false;
+
   if (has_extension("GL_OES_framebuffer_object")) {
     _supports_framebuffer_object = true;
     _glIsRenderbuffer = (PFNGLISRENDERBUFFEROESPROC)
@@ -2011,9 +2021,76 @@ reset() {
   _glGetFramebufferAttachmentParameteriv = glGetFramebufferAttachmentParameteriv;
   _glGenerateMipmap = glGenerateMipmap;
 
-#else
-  // TODO: add ARB3.0 version
-  if (has_extension("GL_EXT_framebuffer_object")) {
+  if (is_at_least_gles_version(3, 0)) {
+    _supports_framebuffer_multisample = true;
+    _supports_framebuffer_blit = true;
+
+    _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)
+      get_extension_func("glRenderbufferStorageMultisample");
+    _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFEREXTPROC)
+      get_extension_func("glBlitFramebuffer");
+  } else {
+    if (has_extension("GL_ANGLE_framebuffer_multisample")) {
+      _supports_framebuffer_multisample = true;
+      _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEANGLEPROC)
+        get_extension_func("glRenderbufferStorageMultisampleANGLE");
+    } else {
+      _supports_framebuffer_multisample = false;
+    }
+    if (has_extension("GL_ANGLE_framebuffer_blit")) {
+      _supports_framebuffer_blit = true;
+      _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFERANGLEPROC)
+        get_extension_func("glBlitFramebufferANGLE");
+    } else {
+      _supports_framebuffer_blit = false;
+    }
+  }
+#else  // Desktop OpenGL case.
+  if (is_at_least_gl_version(3, 0) || has_extension("GL_ARB_framebuffer_object")) {
+    _supports_framebuffer_object = true;
+    _supports_framebuffer_multisample = true;
+    _supports_framebuffer_blit = true;
+
+    _glIsRenderbuffer = (PFNGLISRENDERBUFFERPROC)
+      get_extension_func("glIsRenderbuffer");
+    _glBindRenderbuffer = (PFNGLBINDRENDERBUFFERPROC)
+      get_extension_func("glBindRenderbuffer");
+    _glDeleteRenderbuffers = (PFNGLDELETERENDERBUFFERSPROC)
+      get_extension_func("glDeleteRenderbuffers");
+    _glGenRenderbuffers = (PFNGLGENRENDERBUFFERSPROC)
+      get_extension_func("glGenRenderbuffers");
+    _glRenderbufferStorage = (PFNGLRENDERBUFFERSTORAGEPROC)
+      get_extension_func("glRenderbufferStorage");
+    _glGetRenderbufferParameteriv = (PFNGLGETRENDERBUFFERPARAMETERIVPROC)
+      get_extension_func("glGetRenderbufferParameteriv");
+    _glIsFramebuffer = (PFNGLISFRAMEBUFFERPROC)
+      get_extension_func("glIsFramebuffer");
+    _glBindFramebuffer = (PFNGLBINDFRAMEBUFFERPROC)
+      get_extension_func("glBindFramebuffer");
+    _glDeleteFramebuffers = (PFNGLDELETEFRAMEBUFFERSPROC)
+      get_extension_func("glDeleteFramebuffers");
+    _glGenFramebuffers = (PFNGLGENFRAMEBUFFERSPROC)
+      get_extension_func("glGenFramebuffers");
+    _glCheckFramebufferStatus = (PFNGLCHECKFRAMEBUFFERSTATUSPROC)
+      get_extension_func("glCheckFramebufferStatus");
+    _glFramebufferTexture1D = (PFNGLFRAMEBUFFERTEXTURE1DPROC)
+      get_extension_func("glFramebufferTexture1D");
+    _glFramebufferTexture2D = (PFNGLFRAMEBUFFERTEXTURE2DPROC)
+      get_extension_func("glFramebufferTexture2D");
+    _glFramebufferTexture3D = (PFNGLFRAMEBUFFERTEXTURE3DPROC)
+      get_extension_func("glFramebufferTexture3D");
+    _glFramebufferRenderbuffer = (PFNGLFRAMEBUFFERRENDERBUFFERPROC)
+      get_extension_func("glFramebufferRenderbuffer");
+    _glGetFramebufferAttachmentParameteriv = (PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC)
+      get_extension_func("glGetFramebufferAttachmentParameteriv");
+    _glGenerateMipmap = (PFNGLGENERATEMIPMAPPROC)
+      get_extension_func("glGenerateMipmap");
+    // GL 3.0 / ARB_framebuffer_object export this entry point without a
+    // suffix, like every other function looked up in this branch; the EXT
+    // name only exists when GL_EXT_framebuffer_multisample is present.
+    _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC)
+      get_extension_func("glRenderbufferStorageMultisample");
+    _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFERPROC)
+      get_extension_func("glBlitFramebuffer");
+
+  } else if (has_extension("GL_EXT_framebuffer_object")) {
     _supports_framebuffer_object = true;
     _glIsRenderbuffer = (PFNGLISRENDERBUFFEREXTPROC)
       get_extension_func("glIsRenderbufferEXT");
@@ -2050,14 +2127,25 @@ reset() {
     _glGenerateMipmap = (PFNGLGENERATEMIPMAPEXTPROC)
       get_extension_func("glGenerateMipmapEXT");
 
-  } else if (is_at_least_gl_version(3, 0)) {
-    // This case should go away when we support the ARB/3.0 version of FBOs.
-    _supports_framebuffer_object = false;
-    _glGenerateMipmap = (PFNGLGENERATEMIPMAPPROC)
-      get_extension_func("glGenerateMipmap");
+    if (has_extension("GL_EXT_framebuffer_multisample")) {
+      _supports_framebuffer_multisample = true;
+      _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)
+        get_extension_func("glRenderbufferStorageMultisampleEXT");
+    } else {
+      _supports_framebuffer_multisample = false;
+    }
+    if (has_extension("GL_EXT_framebuffer_blit")) {
+      _supports_framebuffer_blit = true;
+      _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFEREXTPROC)
+        get_extension_func("glBlitFramebufferEXT");
+    } else {
+      _supports_framebuffer_blit = false;
+    }
 
   } else {
     _supports_framebuffer_object = false;
+    _supports_framebuffer_multisample = false;
+    _supports_framebuffer_blit = false;
     _glGenerateMipmap = NULL;
   }
 #endif
@@ -2086,49 +2174,16 @@ reset() {
   }
 #endif  // !OPENGLES_1
 
-  _supports_framebuffer_multisample = false;
-  if (is_at_least_gles_version(3, 0)) {
-    _supports_framebuffer_multisample = true;
-    _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)
-      get_extension_func("glRenderbufferStorageMultisample");
-
-#ifdef OPENGLES
-  } else if (has_extension("GL_APPLE_framebuffer_multisample")) {
-    _supports_framebuffer_multisample = true;
-    _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEAPPLEPROC)
-      get_extension_func("glRenderbufferStorageMultisampleAPPLE");
-#else
-  } else if (has_extension("GL_EXT_framebuffer_multisample")) {
-    _supports_framebuffer_multisample = true;
-    _glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)
-      get_extension_func("glRenderbufferStorageMultisampleEXT");
-#endif
-  }
-
 #ifndef OPENGLES
   _supports_framebuffer_multisample_coverage_nv = false;
-  if (has_extension("GL_NV_framebuffer_multisample_coverage")) {
+  if (_supports_framebuffer_multisample &&
+      has_extension("GL_NV_framebuffer_multisample_coverage")) {
     _supports_framebuffer_multisample_coverage_nv = true;
     _glRenderbufferStorageMultisampleCoverage = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLECOVERAGENVPROC)
       get_extension_func("glRenderbufferStorageMultisampleCoverageNV");
   }
 #endif
 
-#ifndef OPENGLES_1
-  _supports_framebuffer_blit = false;
-
-  if (is_at_least_gles_version(3, 0)) {
-    _supports_framebuffer_blit = true;
-    _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFEREXTPROC)
-      get_extension_func("glBlitFramebuffer");
-
-  } else if (has_extension("GL_EXT_framebuffer_blit")) {
-    _supports_framebuffer_blit = true;
-    _glBlitFramebuffer = (PFNGLBLITFRAMEBUFFEREXTPROC)
-      get_extension_func("glBlitFramebufferEXT");
-  }
-#endif
-
 #if defined(OPENGLES_1)
   _glDrawBuffers = NULL;
   _max_color_targets = 1;
@@ -2296,29 +2351,115 @@ reset() {
   }
 #endif
 
-  // In OpenGL ES 2.x, this is supported in the core.
-#ifndef OPENGLES_2
-  _glBlendEquation = NULL;
-  bool supports_blend_equation = false;
+#ifdef OPENGLES_1
+  // In OpenGL ES 1, blending is supported via extensions.
+  if (has_extension("GL_OES_blend_subtract")) {
+    _glBlendEquation = (PFNGLBLENDEQUATIONPROC)
+      get_extension_func("glBlendEquationOES");
+
+    if (_glBlendEquation == NULL) {
+      _glBlendEquation = null_glBlendEquation;
+      GLCAT.warning()
+        << "BlendEquationOES advertised as supported by OpenGL ES runtime, but "
+           "could not get pointer to extension function.\n";
+    }
+  } else {
+    _glBlendEquation = null_glBlendEquation;
+  }
+
+  if (has_extension("GL_OES_blend_equation_separate")) {
+    _glBlendEquationSeparate = (PFNGLBLENDEQUATIONSEPARATEOESPROC)
+      get_extension_func("glBlendEquationSeparateOES");
+
+    // Check the pointer we just fetched.  (_glBlendEquation has already been
+    // given a non-NULL fallback above, so testing it would never catch a
+    // missing glBlendEquationSeparateOES.)
+    if (_glBlendEquationSeparate == NULL) {
+      _supports_blend_equation_separate = false;
+      GLCAT.warning()
+        << "BlendEquationSeparateOES advertised as supported by OpenGL ES "
+           "runtime, but could not get pointer to extension function.\n";
+    } else {
+      _supports_blend_equation_separate = true;
+    }
+  } else {
+    _supports_blend_equation_separate = false;
+    _glBlendEquationSeparate = NULL;
+  }
+
+  if (has_extension("GL_OES_blend_func_separate")) {
+    _glBlendFuncSeparate = (PFNGLBLENDFUNCSEPARATEOESPROC)
+      get_extension_func("glBlendFuncSeparateOES");
+
+    if (_glBlendFuncSeparate == NULL) {
+      _glBlendFuncSeparate = null_glBlendFuncSeparate;
+      GLCAT.warning()
+        << "BlendFuncSeparateOES advertised as supported by OpenGL ES runtime, but "
+           "could not get pointer to extension function.\n";
+    }
+  } else {
+    _glBlendFuncSeparate = null_glBlendFuncSeparate;
+  }
+
+#elif defined(OPENGLES)
+  // In OpenGL ES 2.x and above, glBlendEquationSeparate is part of the core
+  // API (and _glBlendEquationSeparate is defined to call it directly), so
+  // separate color/alpha blend equations are always available.
+  _supports_blend_equation_separate = true;
+
+#else
   if (is_at_least_gl_version(1, 2)) {
-    supports_blend_equation = true;
     _glBlendEquation = (PFNGLBLENDEQUATIONPROC)
       get_extension_func("glBlendEquation");
-  } else if (has_extension("GL_OES_blend_subtract")) {
-    supports_blend_equation = true;
-    _glBlendEquation = (PFNGLBLENDEQUATIONPROC)
-      get_extension_func("glBlendEquationOES");
+
   } else if (has_extension("GL_EXT_blend_minmax")) {
-    supports_blend_equation = true;
     _glBlendEquation = (PFNGLBLENDEQUATIONPROC)
       get_extension_func("glBlendEquationEXT");
+
+  } else {
+    _glBlendEquation = null_glBlendEquation;
   }
-  if (supports_blend_equation && _glBlendEquation == NULL) {
-    GLCAT.warning()
-      << "BlendEquation advertised as supported by OpenGL runtime, but could not get pointers to extension function.\n";
-  }
+
   if (_glBlendEquation == NULL) {
     _glBlendEquation = null_glBlendEquation;
+    GLCAT.warning()
+      << "BlendEquation advertised as supported by OpenGL runtime, but could "
+         "not get pointer to extension function.\n";
+  }
+
+  if (is_at_least_gl_version(2, 0)) {
+    _supports_blend_equation_separate = true;
+    _glBlendEquationSeparate = (PFNGLBLENDEQUATIONSEPARATEPROC)
+      get_extension_func("glBlendEquationSeparate");
+
+  } else if (has_extension("GL_EXT_blend_equation_separate")) {
+    _supports_blend_equation_separate = true;
+    _glBlendEquationSeparate = (PFNGLBLENDEQUATIONSEPARATEEXTPROC)
+      get_extension_func("glBlendEquationSeparateEXT");
+
+  } else {
+    _supports_blend_equation_separate = false;
+    _glBlendEquationSeparate = NULL;
+  }
+
+  if (_supports_blend_equation_separate && _glBlendEquationSeparate == NULL) {
+    _supports_blend_equation_separate = false;
+    GLCAT.warning()
+      << "BlendEquationSeparate advertised as supported by OpenGL runtime, "
+         "but could not get pointer to extension function.\n";
+  }
+
+  if (is_at_least_gl_version(1, 4)) {
+    _glBlendFuncSeparate = (PFNGLBLENDFUNCSEPARATEPROC)
+      get_extension_func("glBlendFuncSeparate");
+
+  } else if (has_extension("GL_EXT_blend_func_separate")) {
+    _glBlendFuncSeparate = (PFNGLBLENDFUNCSEPARATEEXTPROC)
+      get_extension_func("glBlendFuncSeparateEXT");
+
+  } else {
+    _glBlendFuncSeparate = null_glBlendFuncSeparate;
+  }
+
+  if (_glBlendFuncSeparate == NULL) {
+    _glBlendFuncSeparate = null_glBlendFuncSeparate;
+    GLCAT.warning()
+      << "BlendFuncSeparate advertised as supported by OpenGL runtime, but could not get pointers to extension function.\n";
   }
 #endif
 
@@ -2344,6 +2485,15 @@ reset() {
   }
 #endif
 
+#ifdef OPENGLES_1
+  // OpenGL ES 1 doesn't support dual-source blending.
+#elif defined(OPENGLES)
+  _supports_dual_source_blending = has_extension("GL_EXT_blend_func_extended");
+#else
+  _supports_dual_source_blending =
+    is_at_least_gl_version(3, 3) || has_extension("GL_ARB_blend_func_extended");
+#endif
+
 #ifdef OPENGLES
   _edge_clamp = GL_CLAMP_TO_EDGE;
 #else
@@ -6914,6 +7064,7 @@ do_issue_blending() {
   _target_rs->get_attrib_def(target_color_blend);
   CPT(ColorBlendAttrib) color_blend = target_color_blend;
   ColorBlendAttrib::Mode color_blend_mode = target_color_blend->get_mode();
+  ColorBlendAttrib::Mode alpha_blend_mode = target_color_blend->get_alpha_mode();
 
   const TransparencyAttrib *target_transparency;
   _target_rs->get_attrib_def(target_transparency);
@@ -6926,9 +7077,17 @@ do_issue_blending() {
     enable_multisample_alpha_one(false);
     enable_multisample_alpha_mask(false);
     enable_blend(true);
-    _glBlendEquation(get_blend_equation_type(color_blend_mode));
-    glBlendFunc(get_blend_func(color_blend->get_operand_a()),
-                get_blend_func(color_blend->get_operand_b()));
+
+    if (_supports_blend_equation_separate) {
+      _glBlendEquationSeparate(get_blend_equation_type(color_blend_mode),
+                               get_blend_equation_type(alpha_blend_mode));
+    } else {
+      _glBlendEquation(get_blend_equation_type(color_blend_mode));
+    }
+    _glBlendFuncSeparate(get_blend_func(color_blend->get_operand_a()),
+                         get_blend_func(color_blend->get_operand_b()),
+                         get_blend_func(color_blend->get_alpha_operand_a()),
+                         get_blend_func(color_blend->get_alpha_operand_b()));
 
 #ifndef OPENGLES_1
     LColor c;
@@ -6943,9 +7102,17 @@ do_issue_blending() {
 #endif
 
     if (GLCAT.is_spam()) {
-      GLCAT.spam() << "glBlendEquation(" << color_blend_mode << ")\n";
-      GLCAT.spam() << "glBlendFunc(" << color_blend->get_operand_a()
-                                     << color_blend->get_operand_b() << ")\n";
+      if (_supports_blend_equation_separate) {
+        GLCAT.spam() << "glBlendEquationSeparate(" << color_blend_mode << ", "
+                                                   << alpha_blend_mode << ")\n";
+      } else {
+        GLCAT.spam() << "glBlendEquation(" << color_blend_mode << ")\n";
+      }
+      GLCAT.spam() << "glBlendFuncSeparate("
+                   << color_blend->get_operand_a() << ", "
+                   << color_blend->get_operand_b() << ", "
+                   << color_blend->get_alpha_operand_a() << ", "
+                   << color_blend->get_alpha_operand_b() << ")\n";
 #ifndef OPENGLES_1
       GLCAT.spam() << "glBlendColor(" << c << ")\n";
 #endif
@@ -9325,6 +9492,13 @@ get_blend_func(ColorBlendAttrib::Operand operand) {
   case ColorBlendAttrib::O_one_minus_constant_alpha:
   case ColorBlendAttrib::O_one_minus_alpha_scale:
     break;
+
+  // No dual-source blending, either.
+  case ColorBlendAttrib::O_incoming1_color:
+  case ColorBlendAttrib::O_one_minus_incoming1_color:
+  case ColorBlendAttrib::O_incoming1_alpha:
+  case ColorBlendAttrib::O_one_minus_incoming1_alpha:
+    break;
 #else
   case ColorBlendAttrib::O_constant_color:
   case ColorBlendAttrib::O_color_scale:
@@ -9341,6 +9515,18 @@ get_blend_func(ColorBlendAttrib::Operand operand) {
   case ColorBlendAttrib::O_one_minus_constant_alpha:
   case ColorBlendAttrib::O_one_minus_alpha_scale:
     return GL_ONE_MINUS_CONSTANT_ALPHA;
+
+  case ColorBlendAttrib::O_incoming1_color:
+    return GL_SRC1_COLOR;
+
+  case ColorBlendAttrib::O_one_minus_incoming1_color:
+    return GL_ONE_MINUS_SRC1_COLOR;
+
+  case ColorBlendAttrib::O_incoming1_alpha:
+    return GL_SRC1_ALPHA;
+
+  case ColorBlendAttrib::O_one_minus_incoming1_alpha:
+    return GL_ONE_MINUS_SRC1_ALPHA;
 #endif
 
   case ColorBlendAttrib::O_incoming_color_saturate:
@@ -11337,7 +11523,7 @@ upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps) {
   }
 
   if (needs_reload && gtc->_immutable) {
-    GLCAT.warning() << "Attempt to modify texture with immutable storage, recreating texture.\n";
+    GLCAT.info() << "Attempt to modify texture with immutable storage, recreating texture.\n";
     gtc->reset_data();
     glBindTexture(target, gtc->_index);
 

+ 6 - 0
panda/src/glstuff/glGraphicsStateGuardian_src.h

@@ -141,6 +141,8 @@ typedef void (APIENTRYP PFNGLTEXSTORAGE3DPROC) (GLenum target, GLsizei levels, G
 typedef void (APIENTRYP PFNGLBINDVERTEXARRAYPROC) (GLuint array);
 typedef void (APIENTRYP PFNGLDELETEVERTEXARRAYSPROC) (GLsizei n, const GLuint *arrays);
 typedef void (APIENTRYP PFNGLGENVERTEXARRAYSPROC) (GLsizei n, GLuint *arrays);
+typedef void (APIENTRYP PFNGLBLENDEQUATIONSEPARATEPROC) (GLenum modeRGB, GLenum modeAlpha);
+typedef void (APIENTRYP PFNGLBLENDFUNCSEPARATEPROC) (GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorAlpha, GLenum dfactorAlpha);
 
 #ifndef OPENGLES_1
 // GLSL shader functions
@@ -817,8 +819,12 @@ public:
   PFNGLBUFFERSTORAGEPROC _glBufferStorage;
 #endif
 
+  bool _supports_blend_equation_separate;
 #ifndef OPENGLES_2
+  // OpenGL ES 2+ has these in the core.
   PFNGLBLENDEQUATIONPROC _glBlendEquation;
+  PFNGLBLENDEQUATIONSEPARATEPROC _glBlendEquationSeparate;
+  PFNGLBLENDFUNCSEPARATEPROC _glBlendFuncSeparate;
 #endif
 #ifndef OPENGLES
   PFNGLBLENDCOLORPROC _glBlendColor;

+ 9 - 8
panda/src/glstuff/glShaderContext_src.cxx

@@ -677,23 +677,24 @@ reflect_uniform(int i, char *name_buffer, GLsizei name_buflen) {
   _glgsg->_glGetActiveUniform(_glsl_program, i, name_buflen, NULL, &param_size, &param_type, name_buffer);
   GLint p = _glgsg->_glGetUniformLocation(_glsl_program, name_buffer);
 
+  if (GLCAT.is_debug()) {
+    GLCAT.debug()
+      << "Active uniform " << name_buffer << " with size " << param_size
+      << " and type 0x" << hex << param_type << dec
+      << " is bound to location " << p << "\n";
+  }
 
   // Some NVidia drivers (361.43 for example) (incorrectly) include "internal"
   // uniforms in the list starting with "_main_" (for example,
   // "_main_0_gp5fp[0]") we need to skip those, because we don't know anything
   // about them
   if (strncmp(name_buffer, "_main_", 6) == 0) {
-    GLCAT.warning() << "Ignoring uniform " << name_buffer << " which may be generated by buggy Nvidia driver.\n";
+    if (GLCAT.is_debug()) {
+      GLCAT.debug() << "Ignoring uniform " << name_buffer << " which may be generated by buggy Nvidia driver.\n";
+    }
     return;
   }
 
-  if (GLCAT.is_debug()) {
-    GLCAT.debug()
-      << "Active uniform " << name_buffer << " with size " << param_size
-      << " and type 0x" << hex << param_type << dec
-      << " is bound to location " << p << "\n";
-  }
-
   if (p < 0) {
     // Special meaning, or it's in a uniform block.  Let it go.
     return;

+ 2 - 1
panda/src/gobj/geomPrimitive.cxx

@@ -2231,7 +2231,8 @@ get_num_primitives() const {
  */
 bool GeomPrimitivePipelineReader::
 check_valid(const GeomVertexDataPipelineReader *data_reader) const {
-  if (get_num_vertices() != 0  &&
+  if (get_num_vertices() != 0 &&
+      data_reader->get_num_arrays() > 0 &&
       get_max_vertex() >= data_reader->get_num_rows()) {
 
 #ifndef NDEBUG

+ 9 - 0
panda/src/gobj/geomVertexFormat.I

@@ -235,6 +235,15 @@ get_morph_delta(size_t n) const {
   return _morphs[n]._delta;
 }
 
+/**
+ * Returns a standard vertex format containing no arrays at all, useful for
+ * pull-style vertex rendering.  This is the format held in the registry's
+ * _empty member.
+ */
+INLINE const GeomVertexFormat *GeomVertexFormat::
+get_empty() {
+  return get_registry()->_empty;
+}
+
 /**
  * Returns a standard vertex format with just a 3-component vertex position.
  */

+ 2 - 4
panda/src/gobj/geomVertexFormat.cxx

@@ -890,6 +890,8 @@ Registry() {
  */
 void GeomVertexFormat::Registry::
 make_standard_formats() {
+  _empty = register_format(new GeomVertexFormat);
+
   _v3 = register_format(new GeomVertexArrayFormat
                         (InternalName::get_vertex(), 3,
                          NT_stdfloat, C_point));
@@ -1011,10 +1013,6 @@ register_format(GeomVertexFormat *format) {
     new_format = (*fi);
     if (!new_format->is_registered()) {
       new_format->do_register();
-      if (new_format->get_num_arrays() == 0) {
-        gobj_cat.warning()
-          << "Empty GeomVertexFormat registered.\n";
-      }
     }
   }
 

+ 4 - 0
panda/src/gobj/geomVertexFormat.h

@@ -125,6 +125,8 @@ PUBLISHED:
   void write_with_data(ostream &out, int indent_level,
                        const GeomVertexData *data) const;
 
+  INLINE static const GeomVertexFormat *get_empty();
+
   // Some standard vertex formats.  No particular requirement to use one of
   // these, but the DirectX renderers can use these formats directly, whereas
   // any other format will have to be converted first.
@@ -227,6 +229,8 @@ private:
     Formats _formats;
     LightReMutex _lock;
 
+    CPT(GeomVertexFormat) _empty;
+
     CPT(GeomVertexFormat) _v3;
     CPT(GeomVertexFormat) _v3n3;
     CPT(GeomVertexFormat) _v3t2;

+ 5 - 3
panda/src/gobj/shader.cxx

@@ -2362,15 +2362,14 @@ r_preprocess_source(ostream &out, const Filename &fn,
   bool had_include = false;
   int lineno = 0;
   while (getline(*source, line)) {
-    // We always forward the actual line - the GLSL compiler will silently
-    // ignore #pragma lines anyway.
     ++lineno;
-    out << line << "\n";
 
     // Check if this line contains a #pragma.
     char pragma[64];
     if (line.size() < 8 ||
         sscanf(line.c_str(), " # pragma %63s", pragma) != 1) {
+      // Just pass the line through unmodified.
+      out << line << "\n";
 
       // One exception: check for an #endif after an include.  We have to
       // restore the line number in case the include happened under an #if
@@ -2435,8 +2434,11 @@ r_preprocess_source(ostream &out, const Filename &fn,
 
     } else if (strcmp(pragma, "optionNV") == 0) {
       // This is processed by NVIDIA drivers.  Don't touch it.
+      out << line << "\n";
 
     } else {
+      // Forward it, the driver will ignore it if it doesn't know it.
+      out << line << "\n";
       shader_cat.warning()
         << "Ignoring unknown pragma directive \"" << pragma << "\" at line "
         << lineno << " of file " << fn << ":\n  " << line << "\n";

+ 174 - 78
panda/src/gobj/texture.cxx

@@ -830,8 +830,11 @@ set_ram_image_as(CPTA_uchar image, const string &supplied_format) {
         } else if (format.at(s) == 'R') {
           component = 2;
         } else if (format.at(s) == 'A') {
-          nassertv(cdata->_num_components != 3);
-          component = cdata->_num_components - 1;
+          if (cdata->_num_components != 3) {
+            component = cdata->_num_components - 1;
+          } else {
+            // Ignore.
+          }
         } else if (format.at(s) == '0') {
           // Ignore.
         } else if (format.at(s) == '1') {
@@ -859,8 +862,11 @@ set_ram_image_as(CPTA_uchar image, const string &supplied_format) {
       } else if (format.at(s) == 'R') {
         component = 2;
       } else if (format.at(s) == 'A') {
-        nassertv(cdata->_num_components != 3);
-        component = cdata->_num_components - 1;
+        if (cdata->_num_components != 3) {
+          component = cdata->_num_components - 1;
+        } else {
+          // Ignore.
+        }
       } else if (format.at(s) == '0') {
         // Ignore.
       } else if (format.at(s) == '1') {
@@ -6088,18 +6094,23 @@ do_get_uncompressed_ram_image(CData *cdata) {
  * Rather than just returning a pointer to the data, like
  * get_uncompressed_ram_image, this function first processes the data and
  * reorders the components using the specified format string, and places these
- * into a new char array.  The 'format' argument should specify in which order
- * the components of the texture must be.  For example, valid format strings
- * are "RGBA", "GA", "ABRG" or "AAA". A component can also be written as "0"
- * or "1", which means an empty/black or a full/white channel, respectively.
+ * into a new char array.
+ *
+ * The 'format' argument should specify in which order the components of the
+ * texture must be.  For example, valid format strings are "RGBA", "GA",
+ * "ABRG" or "AAA".  A component can also be written as "0" or "1", which
+ * means an empty/black or a full/white channel, respectively.
+ *
  * This function is particularly useful to copy an image in-memory to a
  * different library (for example, PIL or wxWidgets) that require a different
  * component order than Panda's internal format, BGRA. Note, however, that
  * this conversion can still be too slow if you want to do it every frame, and
- * should thus be avoided for that purpose.  The only requirement for the
- * reordering is that an uncompressed image must be available.  If the RAM
- * image is compressed, it will attempt to re-load the texture from disk, if
- * it doesn't find an uncompressed image there, it will return NULL.
+ * should thus be avoided for that purpose.
+ *
+ * The only requirement for the reordering is that an uncompressed image must
+ * be available.  If the RAM image is compressed, it will attempt to re-load
+ * the texture from disk, if it doesn't find an uncompressed image there, it
+ * will return NULL.
  */
 CPTA_uchar Texture::
 get_ram_image_as(const string &requested_format) {
@@ -6125,92 +6136,177 @@ get_ram_image_as(const string &requested_format) {
     return CPTA_uchar(data);
   }
 
+  // Check if we have an alpha channel, and remember which channel we use.
+  int alpha = -1;
+  if (Texture::has_alpha(cdata->_format)) {
+    alpha = cdata->_num_components - 1;
+  }
+
+  // Validate the format beforehand.
+  for (size_t i = 0; i < format.size(); ++i) {
+    if (format[i] != 'B' && format[i] != 'G' && format[i] != 'R' &&
+        format[i] != 'A' && format[i] != '0' && format[i] != '1') {
+      gobj_cat.error() << "Unexpected component character '"
+        << format[i] << "', expected one of RGBA01!\n";
+      return CPTA_uchar(get_class_type());
+    }
+  }
+
   // Create a new empty array that can hold our image.
   PTA_uchar newdata = PTA_uchar::empty_array(imgsize * format.size() * cdata->_component_width, get_class_type());
 
   // These ifs are for optimization of commonly used image types.
-  if (format == "RGBA" && cdata->_num_components == 4 && cdata->_component_width == 1) {
-    imgsize *= 4;
-    for (int p = 0; p < imgsize; p += 4) {
-      newdata[p    ] = data[p + 2];
-      newdata[p + 1] = data[p + 1];
-      newdata[p + 2] = data[p    ];
-      newdata[p + 3] = data[p + 3];
+  if (cdata->_component_width == 1) {
+    if (format == "RGBA" && cdata->_num_components == 4) {
+      const PN_uint32 *src = (const PN_uint32 *)data.p();
+      PN_uint32 *dst = (PN_uint32 *)newdata.p();
+
+      for (int p = 0; p < imgsize; ++p) {
+        PN_uint32 v = *src++;
+        *dst++ = ((v & 0xff00ff00u)) |
+                 ((v & 0x00ff0000u) >> 16) |
+                 ((v & 0x000000ffu) << 16);
+      }
+      return newdata;
+    }
+    if (format == "RGB" && cdata->_num_components == 4) {
+      const PN_uint32 *src = (const PN_uint32 *)data.p();
+      PN_uint32 *dst = (PN_uint32 *)newdata.p();
+
+      // Convert blocks of 4 pixels at a time, so that we can treat both the
+      // source and destination as 32-bit integers.
+      int blocks = imgsize >> 2;
+      for (int i = 0; i < blocks; ++i) {
+        PN_uint32 v0 = *src++;
+        PN_uint32 v1 = *src++;
+        PN_uint32 v2 = *src++;
+        PN_uint32 v3 = *src++;
+        *dst++ = ((v0 & 0x00ff0000u) >> 16) |
+                 ((v0 & 0x0000ff00u)) |
+                 ((v0 & 0x000000ffu) << 16) |
+                 ((v1 & 0x00ff0000u) << 8);
+        *dst++ = ((v1 & 0x0000ff00u) >> 8) |
+                 ((v1 & 0x000000ffu) << 8) |
+                 ((v2 & 0x00ff0000u)) |
+                 ((v2 & 0x0000ff00u) << 16);
+        *dst++ = ((v2 & 0x000000ffu)) |
+                 ((v3 & 0x00ff0000u) >> 8) |
+                 ((v3 & 0x0000ff00u) << 8) |
+                 ((v3 & 0x000000ffu) << 24);
+      }
+
+      // If the image size wasn't a multiple of 4, we may have a handful of
+      // pixels left over.  Convert those the slower way.
+      PN_uint8 *tail = (PN_uint8 *)dst;
+      for (int i = (imgsize & ~0x3); i < imgsize; ++i) {
+        PN_uint32 v = *src++;
+        *tail++ = (v & 0x00ff0000u) >> 16;
+        *tail++ = (v & 0x0000ff00u) >> 8;
+        *tail++ = (v & 0x000000ffu);
+      }
+      return newdata;
+    }
+    if (format == "BGR" && cdata->_num_components == 4) {
+      const PN_uint32 *src = (const PN_uint32 *)data.p();
+      PN_uint32 *dst = (PN_uint32 *)newdata.p();
+
+      // Convert blocks of 4 pixels at a time, so that we can treat both the
+      // source and destination as 32-bit integers.
+      int blocks = imgsize >> 2;
+      for (int i = 0; i < blocks; ++i) {
+        PN_uint32 v0 = *src++;
+        PN_uint32 v1 = *src++;
+        PN_uint32 v2 = *src++;
+        PN_uint32 v3 = *src++;
+        *dst++ = (v0 & 0x00ffffffu) | ((v1 & 0x000000ffu) << 24);
+        *dst++ = ((v1 & 0x00ffff00u) >> 8) |  ((v2 & 0x0000ffffu) << 16);
+        *dst++ = ((v2 & 0x00ff0000u) >> 16) | ((v3 & 0x00ffffffu) << 8);
+      }
+
+      // If the image size wasn't a multiple of 4, we may have a handful of
+      // pixels left over.  Convert those the slower way.
+      PN_uint8 *tail = (PN_uint8 *)dst;
+      for (int i = (imgsize & ~0x3); i < imgsize; ++i) {
+        PN_uint32 v = *src++;
+        *tail++ = (v & 0x000000ffu);
+        *tail++ = (v & 0x0000ff00u) >> 8;
+        *tail++ = (v & 0x00ff0000u) >> 16;
+      }
+      return newdata;
     }
-    return newdata;
-  }
-  if (format == "RGB" && cdata->_num_components == 3 && cdata->_component_width == 1) {
-    imgsize *= 3;
-    for (int p = 0; p < imgsize; p += 3) {
-      newdata[p    ] = data[p + 2];
-      newdata[p + 1] = data[p + 1];
-      newdata[p + 2] = data[p    ];
+    const PN_uint8 *src = (const PN_uint8 *)data.p();
+    PN_uint8 *dst = (PN_uint8 *)newdata.p();
+
+    if (format == "RGB" && cdata->_num_components == 3) {
+      for (int i = 0; i < imgsize; ++i) {
+        *dst++ = src[2];
+        *dst++ = src[1];
+        *dst++ = src[0];
+        src += 3;
+      }
+      return newdata;
     }
-    return newdata;
-  }
-  if (format == "A" && cdata->_component_width == 1 && cdata->_num_components != 3) {
-    // We can generally rely on alpha to be the last component.
-    int component = cdata->_num_components - 1;
-    for (int p = 0; p < imgsize; ++p) {
-      newdata[p] = data[component];
+    if (format == "A" && cdata->_num_components != 3) {
+      // We can generally rely on alpha to be the last component.
+      for (int p = 0; p < imgsize; ++p) {
+        dst[p] = src[alpha];
+        src += cdata->_num_components;
+      }
+      return newdata;
     }
-    return newdata;
-  }
-  if (cdata->_component_width == 1) {
+    // Fallback case for other 8-bit-per-channel formats.
     for (int p = 0; p < imgsize; ++p) {
-      for (uchar s = 0; s < format.size(); ++s) {
-        signed char component = -1;
-        if (format.at(s) == 'B' || (cdata->_num_components <= 2 && format.at(s) != 'A')) {
-          component = 0;
-        } else if (format.at(s) == 'G') {
-          component = 1;
-        } else if (format.at(s) == 'R') {
-          component = 2;
-        } else if (format.at(s) == 'A') {
-          nassertr(cdata->_num_components != 3, CPTA_uchar(get_class_type()));
-          component = cdata->_num_components - 1;
-        } else if (format.at(s) == '0') {
-          newdata[p * format.size() + s] = 0x00;
-        } else if (format.at(s) == '1') {
-          newdata[p * format.size() + s] = 0xff;
+      for (size_t i = 0; i < format.size(); ++i) {
+        if (format[i] == 'B' || (cdata->_num_components <= 2 && format[i] != 'A')) {
+          *dst++ = src[0];
+        } else if (format[i] == 'G') {
+          *dst++ = src[1];
+        } else if (format[i] == 'R') {
+          *dst++ = src[2];
+        } else if (format[i] == 'A') {
+          if (alpha >= 0) {
+            *dst++ = src[alpha];
+          } else {
+            *dst++ = 0xff;
+          }
+        } else if (format[i] == '1') {
+          *dst++ = 0xff;
         } else {
-          gobj_cat.error() << "Unexpected component character '"
-            << format.at(s) << "', expected one of RGBA!\n";
-          return CPTA_uchar(get_class_type());
-        }
-        if (component >= 0) {
-          newdata[p * format.size() + s] = data[p * cdata->_num_components + component];
+          *dst++ = 0x00;
         }
       }
+      src += cdata->_num_components;
     }
     return newdata;
   }
+
+  // The slow and general case.
   for (int p = 0; p < imgsize; ++p) {
-    for (uchar s = 0; s < format.size(); ++s) {
-      signed char component = -1;
-      if (format.at(s) == 'B' || (cdata->_num_components <= 2 && format.at(s) != 'A')) {
+    for (size_t i = 0; i < format.size(); ++i) {
+      int component = 0;
+      if (format[i] == 'B' || (cdata->_num_components <= 2 && format[i] != 'A')) {
         component = 0;
-      } else if (format.at(s) == 'G') {
+      } else if (format[i] == 'G') {
         component = 1;
-      } else if (format.at(s) == 'R') {
+      } else if (format[i] == 'R') {
         component = 2;
-      } else if (format.at(s) == 'A') {
-        nassertr(cdata->_num_components != 3, CPTA_uchar(get_class_type()));
-        component = cdata->_num_components - 1;
-      } else if (format.at(s) == '0') {
-        memset((void*)(newdata + (p * format.size() + s) * cdata->_component_width),  0, cdata->_component_width);
-      } else if (format.at(s) == '1') {
-        memset((void*)(newdata + (p * format.size() + s) * cdata->_component_width), -1, cdata->_component_width);
+      } else if (format[i] == 'A') {
+        if (alpha >= 0) {
+          component = alpha;
+        } else {
+          memset((void*)(newdata + (p * format.size() + i) * cdata->_component_width), -1, cdata->_component_width);
+          continue;
+        }
+      } else if (format[i] == '1') {
+        memset((void*)(newdata + (p * format.size() + i) * cdata->_component_width), -1, cdata->_component_width);
+        continue;
       } else {
-        gobj_cat.error() << "Unexpected component character '"
-          << format.at(s) << "', expected one of RGBA!\n";
-        return CPTA_uchar(get_class_type());
-      }
-      if (component >= 0) {
-        memcpy((void*)(newdata + (p * format.size() + s) * cdata->_component_width),
-               (void*)(data + (p * cdata->_num_components + component) * cdata->_component_width),
-               cdata->_component_width);
+        memset((void*)(newdata + (p * format.size() + i) * cdata->_component_width),  0, cdata->_component_width);
+        continue;
       }
+      memcpy((void*)(newdata + (p * format.size() + i) * cdata->_component_width),
+             (void*)(data + (p * cdata->_num_components + component) * cdata->_component_width),
+             cdata->_component_width);
     }
   }
   return newdata;

+ 2 - 0
panda/src/grutil/config_grutil.cxx

@@ -23,6 +23,7 @@
 #include "nodeVertexTransform.h"
 #include "rigidBodyCombiner.h"
 #include "pipeOcclusionCullTraverser.h"
+#include "shaderTerrainMesh.h"
 
 #include "dconfig.h"
 
@@ -123,6 +124,7 @@ init_libgrutil() {
   RigidBodyCombiner::init_type();
   PipeOcclusionCullTraverser::init_type();
   SceneGraphAnalyzerMeter::init_type();
+  ShaderTerrainMesh::init_type();
 
 #ifdef HAVE_AUDIO
   MovieTexture::init_type();

+ 1 - 0
panda/src/grutil/p3grutil_composite1.cxx

@@ -1,6 +1,7 @@
 #include "cardMaker.cxx"
 #include "heightfieldTesselator.cxx"
 #include "geoMipTerrain.cxx"
+#include "shaderTerrainMesh.cxx"
 #include "config_grutil.cxx"
 #include "lineSegs.cxx"
 #include "fisheyeMaker.cxx"

+ 191 - 0
panda/src/grutil/shaderTerrainMesh.I

@@ -0,0 +1,191 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file shaderTerrainMesh.I
+ * @author tobspr
+ * @date 2016-02-16
+ */
+
+/**
+ * @brief Sets the path to the heightfield
+ * @details This sets the path to the terrain heightfield. It should be 16bit
+ *   single channel, square, and have a power-of-two resolution of at least 32.
+ *   Common sizes are 2048x2048 or 4096x4096.
+ *
+ * @param filename Path to the heightfield
+ */
+INLINE void ShaderTerrainMesh::set_heightfield_filename(const Filename& filename) {
+  _heightfield_source = filename;
+}
+
+/**
+ * @brief Returns the heightfield path
+ * @details This returns the path of the terrain heightfield, previously set with
+ *   set_heightfield_filename()
+ *
+ * @return Path to the heightfield
+ */
+INLINE const Filename& ShaderTerrainMesh::get_heightfield_filename() const {
+  return _heightfield_source;
+}
+
+/**
+ * @brief Sets the chunk size
+ * @details This sets the chunk size of the terrain. A chunk is basically the
+ *   smallest unit in LOD. If the chunk size is too small, the terrain will
+ *   perform badly, since there will be way too many chunks. If the chunk size
+ *   is too big, you will not get proper LOD, and might also get bad performance.
+ *
+ *   For terrains of the size 4096x4096 or 8192x8192, a chunk size of 32 seems
+ *   to produce good results. For smaller resolutions, you should try out a
+ *   size of 16 or even 8 for very small terrains.
+ *
+ *   The number of chunks generated for the last level equals
+ *   (heightfield_size / chunk_size) ** 2. The chunk size has to be a power
+ *   of two, at least 8 and at most a quarter of the heightfield size.
+ *
+ * @param chunk_size Size of the chunks, has to be a power of two
+ */
+INLINE void ShaderTerrainMesh::set_chunk_size(size_t chunk_size) {
+  _chunk_size = chunk_size;
+}
+
+/**
+ * @brief Returns the chunk size
+ * @details This returns the chunk size, previously set with set_chunk_size()
+ * @return Chunk size
+ */
+INLINE size_t ShaderTerrainMesh::get_chunk_size() const {
+  return _chunk_size;
+}
+
+/**
+ * @brief Sets whether to generate patches
+ * @details If this option is set to true, GeomPatches will be used instead of
+ *   GeomTriangles. This is required when the terrain is used with tessellation
+ *   shaders, since patches are required for tessellation, whereas triangles
+ *   are required for regular rendering.
+ *
+ *   If this option is set to true while not using a tessellation shader, the
+ *   terrain will not get rendered, or even produce errors. The same applies
+ *   when this option is not set, but the terrain is used with tessellation
+ *   shaders.
+ *
+ * @param generate_patches Whether to generate GeomPatches instead of GeomTriangles
+ */
+INLINE void ShaderTerrainMesh::set_generate_patches(bool generate_patches) {
+  _generate_patches = generate_patches;
+}
+
+/**
+ * @brief Returns whether to generate patches
+ * @details This returns whether patches are generated, previously set with
+ *   set_generate_patches()
+ *
+ * @return Whether to generate patches
+ */
+INLINE bool ShaderTerrainMesh::get_generate_patches() const {
+  return _generate_patches;
+}
+
+
+/**
+ * @brief Sets the desired triangle width
+ * @details This sets the desired width a triangle should have in pixels.
+ *   A value of 10.0 for example will make the terrain tesselate everything
+ *   in a way that each triangle edge roughly is 10 pixels wide.
+ *   Of course this will not always accurately match, however you can use this
+ *   setting to control the LOD algorithm of the terrain.
+ *
+ * @param target_triangle_width Desired triangle width in pixels
+ */
+INLINE void ShaderTerrainMesh::set_target_triangle_width(PN_stdfloat target_triangle_width) {
+  _target_triangle_width = target_triangle_width;
+}
+
+/**
+ * @brief Returns the target triangle width
+ * @details This returns the target triangle width, previously set with
+ *   ShaderTerrainMesh::set_target_triangle_width()
+ *
+ * @return Target triangle width
+ */
+INLINE PN_stdfloat ShaderTerrainMesh::get_target_triangle_width() const {
+  return _target_triangle_width;
+}
+
+
+/**
+ * @brief Sets whether to enable terrain updates
+ * @details This flag controls whether the terrain should be updated. If this value
+ *   is set to false, no updating of the terrain will happen. This can be useful
+ *   to debug the culling algorithm used by the terrain.
+ *
+ * @param update_enabled Whether to update the terrain
+ */
+INLINE void ShaderTerrainMesh::set_update_enabled(bool update_enabled) {
+  _update_enabled = update_enabled;
+}
+
+/**
+ * @brief Returns whether the terrain is getting updated
+ * @details This returns whether the terrain is getting updated, previously set with
+ *   set_update_enabled()
+ *
+ * @return Whether to update the terrain
+ */
+INLINE bool ShaderTerrainMesh::get_update_enabled() const {
+  return _update_enabled;
+}
+
+/**
+ * @brief Returns a handle to the heightfield texture
+ * @details This returns a handle to the internally used heightfield texture. This
+ *   can be used to set the heightfield as a shader input.
+ *
+ * @return Handle to the heightfield texture, or NULL if generate() has not succeeded yet
+ */
+INLINE Texture* ShaderTerrainMesh::get_heightfield_tex() const {
+  return _heightfield_tex;
+}
+
+/**
+ * @brief Clears all children
+ * @details This clears all children on the chunk and sets them to NULL. This will
+ *   effectively free all memory consumed by this chunk and its children.
+ */
+INLINE void ShaderTerrainMesh::Chunk::clear_children() {
+  for (size_t i = 0; i < 4; ++i) {
+    delete children[i];
+    children[i] = NULL;
+  }
+}
+
+/**
+ * @brief Chunk constructor
+ * @details This constructs a new chunk, and sets all children to NULL.
+ */
+INLINE ShaderTerrainMesh::Chunk::Chunk() {
+  for (size_t i = 0; i < 4; ++i)
+    children[i] = NULL;
+}
+
+/**
+ * @brief Chunk destructor
+ * @details This destructs the chunk, freeing all used resources
+ */
+INLINE ShaderTerrainMesh::Chunk::~Chunk() {
+  clear_children();
+}
+
+/**
+ * @see ShaderTerrainMesh::uv_to_world(LTexCoord)
+ */
+INLINE LPoint3 ShaderTerrainMesh::uv_to_world(PN_stdfloat u, PN_stdfloat v) const {
+  return uv_to_world(LTexCoord(u, v));
+}

+ 715 - 0
panda/src/grutil/shaderTerrainMesh.cxx

@@ -0,0 +1,715 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file shaderTerrainMesh.cxx
+ * @author tobspr
+ * @date 2016-02-16
+ */
+
+
+#include "shaderTerrainMesh.h"
+#include "geom.h"
+#include "geomVertexFormat.h"
+#include "geomVertexData.h"
+#include "geomVertexWriter.h"
+#include "geomNode.h"
+#include "geomTriangles.h"
+#include "geomPatches.h"
+#include "omniBoundingVolume.h"
+#include "cullableObject.h"
+#include "cullTraverser.h"
+#include "cullHandler.h"
+#include "cullTraverserData.h"
+#include "clockObject.h"
+#include "shaderAttrib.h"
+#include "renderAttrib.h"
+#include "shaderInput.h"
+#include "boundingBox.h"
+#include "samplerState.h"
+#include "config_grutil.h"
+#include "typeHandle.h"
+
+ConfigVariableBool stm_use_hexagonal_layout
+("stm-use-hexagonal-layout", true,
+ PRC_DESC("Set this to true to use a hexagonal vertex layout. This approximates "
+          "the heightfield in a better way, however the CLOD transitions might be "
+          "visible due to the vertices not matching exactly."));
+
+ConfigVariableInt stm_max_chunk_count
+("stm-max-chunk-count", 2048,
+ PRC_DESC("Controls the maximum amount of chunks the Terrain can display. If you use "
+          "a high LOD, you might have to increment this value. The lower this value is "
+          "the less data has to be transferred to the GPU."));
+
+ConfigVariableInt stm_max_views
+("stm-max-views", 8,
+ PRC_DESC("Controls the maximum amount of different views the Terrain can be rendered "
+          "with. Each camera rendering the terrain corresponds to a view. Lowering this "
+          "value will reduce the data that has to be transferred to the GPU."));
+
+PStatCollector ShaderTerrainMesh::_basic_collector("Cull:ShaderTerrainMesh:Setup");
+PStatCollector ShaderTerrainMesh::_lod_collector("Cull:ShaderTerrainMesh:CollectLOD");
+
+NotifyCategoryDef(shader_terrain, "");
+
+TypeHandle ShaderTerrainMesh::_type_handle;
+
+/**
+ * @brief Helper function to check for a power of two
+ * @details This function checks for a power of two by isolating the lowest set bit
+ *
+ * @param x Number to check
+ * @return 1 if x is a power of two, 0 otherwise (zero is not a power of two)
+ */
+int check_power_of_two(size_t x)
+{
+  return ((x != 0) && ((x & (~x + 1)) == x)); // x & (~x + 1) keeps only the lowest set bit of x
+}
+
+/**
+ * @brief Constructs a new Terrain Mesh
+ * @details This constructs a new terrain mesh. By default, no transform is set
+ *   on the mesh, causing it to range over the unit box from (0, 0, 0) to
+ *   (1, 1, 1). Usually you want to set a custom transform with NodePath::set_scale()
+ */
+ShaderTerrainMesh::ShaderTerrainMesh() :
+  PandaNode("ShaderTerrainMesh"),
+  _size(0),
+  _chunk_size(32),
+  _heightfield_source(""),
+  _generate_patches(false),
+  _data_texture(NULL),
+  _chunk_geom(NULL),
+  _current_view_index(0),
+  _last_frame_count(-1),
+  _target_triangle_width(10.0f),
+  _update_enabled(true),
+  _heightfield_tex(NULL)
+{
+  set_final(true);
+  set_bounds(new OmniBoundingVolume());
+}
+
+/**
+ * @brief Generates the terrain mesh
+ * @details This generates the terrain mesh, initializing all chunks of the
+ *   internal used quadtree. At this point, a heightfield and a chunk size should
+ *   have been set, otherwise generation fails.
+ *
+ *   If anything goes wrong, like a missing heightfield, then an error is printed
+ *   and false is returned.
+ *
+ * @return true if the terrain was initialized, false if an error occurred
+ */
+bool ShaderTerrainMesh::generate() {
+  if (!do_load_heightfield())
+    return false;
+
+  if (_chunk_size < 8 || !check_power_of_two(_chunk_size)) {
+    shader_terrain_cat.error() << "Invalid chunk size! Has to be >= 8 and a power of two!" << endl;
+    return false;
+  }
+
+  if (_chunk_size > _size / 4) { // chunks may be at most a quarter of the heightfield edge length
+    shader_terrain_cat.error() << "Chunk size too close or greater than the actual terrain size!" << endl;
+    return false;
+  }
+
+  do_create_chunks();
+  do_compute_bounds(&_base_chunk);
+  do_create_chunk_geom();
+  do_init_data_texture();
+  do_convert_heightfield(); // must come last: it clears the PNMImage that do_compute_bounds() reads
+
+  return true;
+}
+
+/**
+ * @brief Converts the internal used PNMImage to a Texture
+ * @details This converts the internal used PNMImage to a texture object. The
+ *   reason for this is, that we need the PNMImage for computing the chunk
+ *   bounds, but don't need it afterwards. However, since we have it in ram,
+ *   we can just put its contents into a Texture object, which enables the
+ *   user to call get_heightfield_tex() instead of manually loading the texture
+ *   from disk again to set it as shader input (Panda does not cache PNMImages)
+ */
+void ShaderTerrainMesh::do_convert_heightfield() {
+  _heightfield_tex = new Texture();
+  _heightfield_tex->load(_heightfield);
+  _heightfield_tex->set_keep_ram_image(true);
+
+  if (_heightfield.get_maxval() != 65535) {
+    shader_terrain_cat.warning() << "Using non 16-bit heightfield!" << endl;
+  } else {
+    _heightfield_tex->set_format(Texture::F_r16); // only forced for 16-bit sources
+  }
+  _heightfield_tex->set_minfilter(SamplerState::FT_linear);
+  _heightfield_tex->set_magfilter(SamplerState::FT_linear);
+  _heightfield.clear(); // the PNMImage is no longer needed once the texture holds the data
+}
+
+/**
+ * @brief Internal method to load the heightfield
+ * @details This method loads the heightfield from the heightfield path,
+ *   and performs some basic checks, including a check for a power of two,
+ *   and same width and height. On success the edge length is stored in _size.
+ *
+ * @return true if the heightfield was loaded and meets the requirements
+ */
+bool ShaderTerrainMesh::do_load_heightfield() {
+
+  if (!_heightfield.read(_heightfield_source)) {
+    shader_terrain_cat.error() << "Could not load heightfield from " << _heightfield_source << endl;
+    return false;
+  }
+
+  if (_heightfield.get_x_size() != _heightfield.get_y_size()) {
+    shader_terrain_cat.error() << "Only square heightfields are supported!" << endl;
+    return false;
+  }
+
+  _size = _heightfield.get_x_size(); // width equals height at this point
+
+  if (_size < 32 || !check_power_of_two(_size)) {
+    shader_terrain_cat.error() << "Invalid heightfield! Needs to be >= 32 and a power of two (was: "
+         << _size << ")!" << endl;
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * @brief Internal method to init the terrain data texture
+ * @details This method creates the data texture, used to store all chunk data.
+ *   The data texture is set as a shader input later on, and stores the position
+ *   and scale of each chunk. Every row in the data texture denotes a view on
+ *   the terrain.
+ */
+void ShaderTerrainMesh::do_init_data_texture() {
+  _data_texture = new Texture("TerrainDataTexture");
+  _data_texture->setup_2d_texture(stm_max_chunk_count, stm_max_views, Texture::T_float, Texture::F_rgba32);
+  _data_texture->set_clear_color(LVector4(0));
+  _data_texture->clear_image();
+}
+
+/**
+ * @brief Internal method to init the quadtree
+ * @details This method creates the base chunk and then inits all chunks recursively
+ *   by using ShaderTerrainMesh::do_init_chunk().
+ */
+void ShaderTerrainMesh::do_create_chunks() {
+
+  // Release any previously stored children
+  _base_chunk.clear_children();
+
+  // Create the base chunk
+  _base_chunk.depth = 0;
+  _base_chunk.x = 0;
+  _base_chunk.y = 0;
+  _base_chunk.size = _size;
+  _base_chunk.edges.set(0, 0, 0, 0);
+  _base_chunk.avg_height = 0.5;
+  _base_chunk.min_height = 0.0;
+  _base_chunk.max_height = 1.0;
+  _base_chunk.last_clod = 0.0;
+  do_init_chunk(&_base_chunk);
+}
+
+/**
+ * @brief Internal method to recursively init the quadtree
+ * @details This method inits the quadtree. Starting from a given node, it
+ *   first examines if that node should be subdivided.
+ *
+ *   If the node should be subdivided, four children are created and this method
+ *   is called on the children again. If the node is a leaf, all children are
+ *   set to NULL and nothing else happens.
+ *
+ *   The chunk parameter must not be NULL, otherwise undefined behaviour occurs.
+ *
+ * @param chunk The parent chunk
+ */
+void ShaderTerrainMesh::do_init_chunk(Chunk* chunk) {
+  if (chunk->size > _chunk_size) {
+
+    // Compute children chunk size
+    size_t child_chunk_size = chunk->size / 2;
+
+    // Subdivide chunk into 4 children, stored in the order (0, 0) (1, 0) (0, 1) (1, 1)
+    for (size_t y = 0; y < 2; ++y) {
+      for (size_t x = 0; x < 2; ++x) {
+        Chunk* child = new Chunk();
+        child->size = child_chunk_size;
+        child->depth = chunk->depth + 1;
+        child->x = chunk->x + x * child_chunk_size;
+        child->y = chunk->y + y * child_chunk_size;
+        do_init_chunk(child);
+        chunk->children[x + 2*y] = child;
+      }
+    }
+  } else {
+    // Leaf chunk, set all children to NULL
+    for (size_t i = 0; i < 4; ++i) {
+      chunk->children[i] = NULL;
+    }
+  }
+}
+
+/**
+ * @brief Recursively computes the bounds for a given chunk
+ * @details This method takes a parent chunk, and computes the bounds recursively,
+ *   depending on whether the chunk is a leaf or a node.
+ *
+ *   If the chunk is a leaf, then the average, min and max values for that chunk
+ *   are computed by iterating over the heightfield region of that chunk.
+ *
+ *   If the chunk is a node, this method is called recursively on all children
+ *   first, and after that, the average, min and max values for that chunk
+ *   are computed by merging those values of the children.
+ *
+ *   If chunk is NULL, undefined behaviour occurs.
+ *
+ * @param chunk The parent chunk
+ */
+void ShaderTerrainMesh::do_compute_bounds(Chunk* chunk) {
+
+  // Final chunk (Leaf)
+  if (chunk->size == _chunk_size) {
+
+    // Get a pointer to the PNMImage data, this is faster than using get_xel()
+    // for all pixels, since get_xel() also includes bounds checks and so on.
+    xel* data = _heightfield.get_array();
+
+    // Pixel getter function. Note that we have to flip the Y-component, since
+    // panda itself also flips it. Only the blue channel is read, divided by PGM_MAXMAXVAL.
+    // auto get_xel = [&](size_t x, size_t y){ return data[x + (_size - 1 - y) * _size].b / (PN_stdfloat)PGM_MAXMAXVAL; };
+    #define get_xel(x, y) (data[(x) + (_size - 1 - (y)) * _size].b / (PN_stdfloat)PGM_MAXMAXVAL)
+
+    // Iterate over all pixels of this chunk
+    PN_stdfloat avg_height = 0.0, min_height = 1.0, max_height = 0.0;
+    for (size_t x = 0; x < _chunk_size; ++x) {
+      for (size_t y = 0; y < _chunk_size; ++y) {
+
+        // Access data directly, to improve performance
+        PN_stdfloat height = get_xel(chunk->x + x, chunk->y + y);
+        avg_height += height;
+        min_height = min(min_height, height);
+        max_height = max(max_height, height);
+      }
+    }
+
+    // Normalize average height
+    avg_height /= _chunk_size * _chunk_size;
+
+    // Store values
+    chunk->min_height = min_height;
+    chunk->max_height = max_height;
+    chunk->avg_height = avg_height;
+
+    // Get the heights at the four chunk corners in the order (0, 0) (1, 0) (0, 1) (1, 1)
+    for (size_t y = 0; y < 2; ++y) {
+      for (size_t x = 0; x < 2; ++x) {
+        chunk->edges.set_cell(x + 2 * y, get_xel(
+            chunk->x + x * (_chunk_size - 1),
+            chunk->y + y * (_chunk_size - 1)
+          ));
+      }
+    }
+
+    #undef get_xel
+
+  } else {
+
+    // Reset heights
+    chunk->avg_height = 0.0;
+    chunk->min_height = 1.0;
+    chunk->max_height = 0.0;
+
+    // Perform bounds computation for every child and merge the children values
+    for (size_t i = 0; i < 4; ++i) {
+      do_compute_bounds(chunk->children[i]);
+      chunk->avg_height += chunk->children[i]->avg_height / 4.0;
+      chunk->min_height = min(chunk->min_height, chunk->children[i]->min_height);
+      chunk->max_height = max(chunk->max_height, chunk->children[i]->max_height);
+    }
+
+    // Also take each corner height from the child that sits in that corner
+    chunk->edges.set_x(chunk->children[0]->edges.get_x());
+    chunk->edges.set_y(chunk->children[1]->edges.get_y());
+    chunk->edges.set_z(chunk->children[2]->edges.get_z());
+    chunk->edges.set_w(chunk->children[3]->edges.get_w());
+  }
+}
+
+/**
+ * @brief Internal method to create the chunk geom
+ * @details This method generates the internal used base chunk. The base chunk geom
+ *   is used to render the actual terrain, and will get instanced for every chunk.
+ *
+ *   The chunk has a size of (size+3) * (size+3), since additional triangles are
+ *   inserted at the borders to prevent holes between chunks of a different LOD.
+ *
+ *   If the generate patches option is set, patches will be generated instead
+ *   of triangles, which allows the terrain to use a tessellation shader.
+ */
+void ShaderTerrainMesh::do_create_chunk_geom() {
+
+  // Convert chunk size to an integer, because we operate on integers and get
+  // signed/unsigned mismatches otherwise
+  int size = (int)_chunk_size;
+
+  // Create vertex data
+  PT(GeomVertexData) gvd = new GeomVertexData("vertices", GeomVertexFormat::get_v3(), Geom::UH_static);
+  gvd->reserve_num_rows( (size + 3) * (size + 3) );
+  GeomVertexWriter vertex_writer(gvd, "vertex");
+
+  // Create primitive
+  PT(GeomPrimitive) triangles = NULL;
+  if (_generate_patches) {
+    triangles = new GeomPatches(3, Geom::UH_static);
+  } else {
+    triangles = new GeomTriangles(Geom::UH_static);
+  }
+
+  // Insert chunk vertices
+  for (int y = -1; y <= size + 1; ++y) {
+    for (int x = -1; x <= size + 1; ++x) {
+      LVector3 vtx_pos(x / (PN_stdfloat)size, y / (PN_stdfloat)size, 0.0f);
+      // Stitched vertices on the outer border ring: clamped to the unit square and lowered
+      if (x == -1 || y == -1 || x == size + 1 || y == size + 1) {
+        vtx_pos.set_z(-1.0f / (PN_stdfloat)size);
+        vtx_pos.set_x(max(0.0f, min(1.0f, vtx_pos.get_x())));
+        vtx_pos.set_y(max(0.0f, min(1.0f, vtx_pos.get_y())));
+      }
+      vertex_writer.add_data3f(vtx_pos);
+    }
+  }
+
+  // It's important to use int and not size_t here, since we do store negative values
+  // auto get_point_index = [&size](int x, int y){ return (x + 1) + (size + 3) * (y + 1); };
+  #define get_point_index(x, y) (((x) + 1) + (size + 3) * ((y) + 1))
+
+  // Create triangles
+  for (int y = -1; y <= size; ++y) {
+    for (int x = -1; x <= size; ++x) {
+      // Get point indices of the quad vertices
+      int tl = get_point_index(x, y);
+      int tr = get_point_index(x + 1, y);
+      int bl = get_point_index(x, y + 1);
+      int br = get_point_index(x + 1, y + 1);
+
+      // With the hexagonal layout, flip the diagonal on quads where x + y is even
+      if (stm_use_hexagonal_layout && (x + y) % 2 == 0 ) {
+        triangles->add_vertices(tl, tr, br);
+        triangles->add_vertices(tl, br, bl);
+      } else {
+        triangles->add_vertices(tl, tr, bl);
+        triangles->add_vertices(bl, tr, br);
+      }
+    }
+  }
+
+  #undef get_point_index
+
+  // Construct geom
+  PT(Geom) geom = new Geom(gvd);
+  geom->add_primitive(triangles);
+
+  // Do not set any bounds, we do culling ourselves
+  geom->clear_bounds();
+  geom->set_bounds(new OmniBoundingVolume());
+  _chunk_geom = geom;
+}
+
+/**
+ * @copydoc PandaNode::is_renderable()
+ */
+bool ShaderTerrainMesh::is_renderable() const {
+  return true; // Unconditionally true; the drawing itself happens in add_for_draw()
+}
+
+/**
+ * @copydoc PandaNode::safe_to_flatten()
+ */
+bool ShaderTerrainMesh::safe_to_flatten() const {
+  return false; // Unconditionally false for this node type
+}
+
+/**
+ * @copydoc PandaNode::safe_to_combine()
+ */
+bool ShaderTerrainMesh::safe_to_combine() const {
+  return false; // Unconditionally false for this node type
+}
+
+/**
+ * @copydoc PandaNode::add_for_draw()
+ *
+ * @details Every call within the same frame is treated as a separate view.
+ *   For each view the quadtree is traversed (unless updates are disabled), the
+ *   visible chunks are written into the row of the data texture which belongs
+ *   to that view, and a single instanced CullableObject is recorded.
+ *
+ *   Views beyond the amount of rows in the data texture are not rendered, an
+ *   error is printed instead.
+ *
+ * @param trav Current cull traverser
+ * @param data Current cull traverser data
+ */
+void ShaderTerrainMesh::add_for_draw(CullTraverser *trav, CullTraverserData &data) {
+
+  // Make sure the terrain was properly initialized, and the geom was created
+  // successfully
+  nassertv(_data_texture != NULL);
+  nassertv(_chunk_geom != NULL);
+
+  _basic_collector.start();
+
+  // Get current frame count
+  int frame_count = ClockObject::get_global_clock()->get_frame_count();
+
+  if (_last_frame_count != frame_count) {
+    // Frame count changed, this means we are at the beginning of a new frame.
+    // In this case, update the frame count and reset the view index.
+    _last_frame_count = frame_count;
+    _current_view_index = 0;
+  }
+
+  // Every view writes into its own row of the data texture (see the write
+  // pointer offset below). A view index beyond the last row would make the
+  // traversal write past the end of the ram image, so refuse to render any
+  // surplus view instead of corrupting memory.
+  // NOTE: This assumes the texture stores one ChunkDataEntry per texel.
+  if (_current_view_index >= (size_t)_data_texture->get_y_size()) {
+    shader_terrain_cat.error() << "More views than supported! Increase the stm-max-views config variable!" << endl;
+    _basic_collector.stop();
+    return;
+  }
+
+  // Get transform and render state for this render pass
+  CPT(TransformState) modelview_transform = data.get_internal_transform(trav);
+  CPT(RenderState) state = data._state->compose(get_state());
+
+  // Store a handle to the scene setup
+  const SceneSetup* scene = trav->get_scene();
+
+  // Get the MVP matrix, this is required for the LOD
+  const Lens* current_lens = scene->get_lens();
+  const LMatrix4& projection_mat = current_lens->get_projection_mat();
+
+  // Get the current lens bounds
+  PT(BoundingVolume) cam_bounds = scene->get_cull_bounds();
+
+  // Transform the camera bounds with the main camera transform
+  DCAST(GeometricBoundingVolume, cam_bounds)->xform(scene->get_camera_transform()->get_mat());
+
+  TraversalData traversal_data;
+  traversal_data.cam_bounds = cam_bounds;
+  traversal_data.model_mat = get_transform()->get_mat();
+  traversal_data.mvp_mat = modelview_transform->get_mat() * projection_mat;
+  traversal_data.emitted_chunks = 0;
+  traversal_data.storage_ptr = (ChunkDataEntry*)_data_texture->modify_ram_image().p();
+  traversal_data.screen_size.set(scene->get_viewport_width(), scene->get_viewport_height());
+
+  // Move write pointer so it points to the beginning of the current view
+  traversal_data.storage_ptr += _data_texture->get_x_size() * _current_view_index;
+
+  if (_update_enabled) {
+    // Traverse recursively
+    _lod_collector.start();
+    do_traverse(&_base_chunk, &traversal_data);
+    _lod_collector.stop();
+  } else {
+    // Do a rough guess of the emitted chunks, we don't know the actual count
+    // (we would have to store it). This is only for debugging anyways, so
+    // its not important we get an accurate count here.
+    traversal_data.emitted_chunks = _data_texture->get_x_size();
+  }
+
+  // do_emit_chunk() bumps the counter one past the capacity to remember that
+  // the overflow was already reported. Never draw more instances than there
+  // are entries in one row of the data texture.
+  int instance_count = traversal_data.emitted_chunks;
+  if (instance_count > _data_texture->get_x_size()) {
+    instance_count = _data_texture->get_x_size();
+  }
+
+  // Set shader inputs
+  CPT(RenderAttrib) current_shader_attrib = state->get_attrib_def(ShaderAttrib::get_class_slot());
+
+  // Should never happen. This has to be checked before the attrib gets
+  // dereferenced for the first time.
+  nassertv(current_shader_attrib != NULL);
+
+  // Make sure the user didn't forget to set a shader
+  if (!DCAST(ShaderAttrib, current_shader_attrib)->has_shader()) {
+    shader_terrain_cat.warning() << "No shader set on the terrain! You need to set the appropriate shader!" << endl;
+  }
+
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_shader_input(
+    new ShaderInput("ShaderTerrainMesh.terrain_size", LVecBase2i(_size)) );
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_shader_input(
+    new ShaderInput("ShaderTerrainMesh.chunk_size", LVecBase2i(_chunk_size)));
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_shader_input(
+    new ShaderInput("ShaderTerrainMesh.view_index", LVecBase2i(_current_view_index)));
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_shader_input(
+    new ShaderInput("ShaderTerrainMesh.data_texture", _data_texture));
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_shader_input(
+    new ShaderInput("ShaderTerrainMesh.heightfield", _heightfield_tex));
+  current_shader_attrib = DCAST(ShaderAttrib, current_shader_attrib)->set_instance_count(
+    instance_count);
+
+  state = state->set_attrib(current_shader_attrib, 10000);
+
+  // Emit chunk
+  CullableObject *object = new CullableObject(_chunk_geom, state, modelview_transform);
+  trav->get_cull_handler()->record_object(object, trav);
+
+  // After rendering, increment the view index
+  ++_current_view_index;
+
+  if (_current_view_index > stm_max_views) {
+    shader_terrain_cat.error() << "More views than supported! Increase the stm-max-views config variable!" << endl;
+  }
+
+  _basic_collector.stop();
+}
+
+/**
+ * @brief Recursively walks the chunk quadtree
+ * @details The chunk is first tested against the camera bounds, unless a
+ *   parent chunk was already found to be entirely visible. Chunks which do not
+ *   intersect the camera bounds are dropped, including all of their children.
+ *
+ *   Afterwards the chunk is either emitted with
+ *   ShaderTerrainMesh::do_emit_chunk(), which is the case when the LOD check
+ *   passes or the chunk has the size of a single chunk, or the method recurses
+ *   into the four children of the chunk.
+ *
+ * @param chunk Chunk to traverse
+ * @param data Traversal data
+ * @param fully_visible Pass true to skip the bounds check for this chunk and
+ *   all of its children
+ */
+void ShaderTerrainMesh::do_traverse(Chunk* chunk, TraversalData* data, bool fully_visible) {
+
+  if (!fully_visible) {
+
+    // Build the bounding box of the chunk and transform it with the model matrix
+    PN_stdfloat inv_size = 1.0 / (PN_stdfloat)_size;
+    LPoint3 lower_corner(chunk->x * inv_size, chunk->y * inv_size, chunk->min_height);
+    LPoint3 upper_corner((chunk->x + chunk->size) * inv_size, (chunk->y + chunk->size) * inv_size, chunk->max_height);
+
+    BoundingBox chunk_bounds = BoundingBox(lower_corner, upper_corner);
+    DCAST(GeometricBoundingVolume, &chunk_bounds)->xform(data->model_mat);
+
+    int cull_flags = data->cam_bounds->contains(&chunk_bounds);
+    if (cull_flags == BoundingVolume::IF_no_intersection) {
+      // Entirely outside of the camera bounds, nothing to do
+      return;
+    }
+
+    // Once a chunk is entirely inside of the camera bounds, so are all of its
+    // children, and we can skip the bounds test for them.
+    fully_visible = (cull_flags & BoundingVolume::IF_all) != 0;
+  }
+
+  // NOTE: The LOD check is evaluated first on purpose, even for chunks which
+  // can not be subdivided any further, because it also stores the CLOD factor
+  // on the chunk.
+  bool lod_sufficient = do_check_lod_matches(chunk, data);
+  if (lod_sufficient || chunk->size == _chunk_size) {
+    do_emit_chunk(chunk, data);
+    return;
+  }
+
+  // LOD not sufficient, continue with the children
+  for (size_t child = 0; child < 4; ++child) {
+    do_traverse(chunk->children[child], data, fully_visible);
+  }
+}
+
+/**
+ * @brief Decides whether the detail of a chunk is sufficient
+ * @details The four corner points of the chunk are projected with the MVP
+ *   matrix and scaled by the screen size. The longest of the four projected
+ *   edges determines the tesselation factor of the chunk, taking the target
+ *   triangle width and the chunk size into account.
+ *
+ *   As a side effect, the CLOD factor derived from the tesselation factor is
+ *   stored on the chunk.
+ *
+ * @param chunk Chunk to check
+ * @param data Traversal data
+ *
+ * @return true if the chunk can be rendered as it is, false if the chunk
+ *   should be subdivided
+ */
+bool ShaderTerrainMesh::do_check_lod_matches(Chunk* chunk, TraversalData* data) {
+
+  // Project the corners to screen space, in the order (0, 0) (1, 0) (0, 1) (1, 1)
+  LVector2 corners[4];
+  for (size_t i = 0; i < 4; ++i) {
+    size_t x = i % 2;
+    size_t y = i / 2;
+
+    // Corner position in model space (0,0,0 to 1,1,1)
+    LVector3 corner_pos = LVector3(
+      (PN_stdfloat)(chunk->x + x * (chunk->size - 1)) / (PN_stdfloat)_size,
+      (PN_stdfloat)(chunk->y + y * (chunk->size - 1)) / (PN_stdfloat)_size,
+      chunk->edges.get_cell(i)
+    );
+
+    LVector4 clip_pos = data->mvp_mat.xform(LVector4(corner_pos, 1.0));
+    if (clip_pos.get_w() == 0.0) {
+      // Avoid the division by zero below
+      clip_pos.set(0.0, 0.0, -1.0, 1.0f);
+    }
+    clip_pos *= 1.0 / clip_pos.get_w();
+
+    corners[i].set(
+      clip_pos.get_x() * data->screen_size.get_x(),
+      clip_pos.get_y() * data->screen_size.get_y());
+  }
+
+  // Squared lengths of the four outer edges in screen space
+  PN_stdfloat len_sq_1_3 = (corners[1] - corners[3]).length_squared();
+  PN_stdfloat len_sq_0_2 = (corners[0] - corners[2]).length_squared();
+  PN_stdfloat len_sq_2_3 = (corners[2] - corners[3]).length_squared();
+  PN_stdfloat len_sq_0_1 = (corners[0] - corners[1]).length_squared();
+
+  // Only the longest edge matters, so it is enough to take a single square
+  // root of the biggest squared length.
+  PN_stdfloat longest_sq = max(len_sq_1_3, max(len_sq_0_2, max(len_sq_2_3, len_sq_0_1)));
+  PN_stdfloat longest = csqrt(longest_sq);
+
+  PN_stdfloat tesselation_factor = (longest / _target_triangle_width) / (PN_stdfloat)_chunk_size;
+
+  // Remember the CLOD factor on the chunk
+  chunk->last_clod = max(0.0, min(1.0, 2.0 - tesselation_factor));
+
+  return tesselation_factor <= 2.0;
+}
+
+/**
+ * @brief Internal method to spawn a chunk
+ * @details Writes the position, size and CLOD factor of the chunk to the
+ *   current write position in the data texture, and advances both the write
+ *   pointer and the amount of emitted chunks.
+ *
+ *   If the data texture has no space left, nothing is written. An error is
+ *   printed for the first chunk which does not fit.
+ *
+ * @param chunk Chunk to spawn
+ * @param data Traversal data
+ */
+void ShaderTerrainMesh::do_emit_chunk(Chunk* chunk, TraversalData* data) {
+  const int capacity = _data_texture->get_x_size();
+
+  if (data->emitted_chunks >= capacity) {
+    // Bumping the counter past the capacity makes sure the error is only
+    // printed once per traversal.
+    if (data->emitted_chunks == capacity) {
+      shader_terrain_cat.error() << "Too many chunks in the terrain! Consider lowering the desired LOD, or increase the stm-max-chunk-count variable." << endl;
+      ++data->emitted_chunks;
+    }
+    return;
+  }
+
+  ChunkDataEntry* entry = data->storage_ptr;
+  entry->x = chunk->x;
+  entry->y = chunk->y;
+  entry->size = chunk->size / _chunk_size;
+  entry->clod = chunk->last_clod;
+
+  ++data->emitted_chunks;
+  ++data->storage_ptr;
+}
+
+/**
+ * @brief Transforms a texture coordinate to world space
+ * @details This transforms a texture coordinate from uv-space (0 to 1) to
+ *   world space. The height is sampled bilinearly from the heightfield texture,
+ *   and the resulting point is transformed with the transform of the terrain.
+ *   This method should be called after generate().
+ *
+ * @param coord Coordinate in uv-space from 0, 0 to 1, 1
+ * @return World-Space point, or the origin if the coordinate could not be
+ *   looked up
+ */
+LPoint3 ShaderTerrainMesh::uv_to_world(const LTexCoord& coord) const {
+  nassertr(_heightfield_tex != NULL, LPoint3(0));
+  PT(TexturePeeker) peeker = _heightfield_tex->peek();
+  nassertr(peeker != NULL, LPoint3(0));
+
+  LColor texel;
+  bool in_range = peeker->lookup_bilinear(texel, coord.get_x(), coord.get_y());
+  if (in_range) {
+    // The first component of the sampled texel is used as height
+    LPoint3 unit_point(coord.get_x(), coord.get_y(), texel.get_x());
+    return get_transform()->get_mat().xform_point_general(unit_point);
+  }
+
+  shader_terrain_cat.error() << "UV out of range, cant transform to world!" << endl;
+  return LPoint3(0);
+}

+ 205 - 0
panda/src/grutil/shaderTerrainMesh.h

@@ -0,0 +1,205 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file shaderTerrainMesh.h
+ * @author tobspr
+ * @date 2016-02-16
+ */
+
+#ifndef SHADER_TERRAIN_MESH_H
+#define SHADER_TERRAIN_MESH_H
+
+#include "pandabase.h"
+#include "luse.h"
+#include "pnmImage.h"
+#include "geom.h"
+#include "pandaNode.h"
+#include "texture.h"
+#include "texturePeeker.h"
+#include "configVariableBool.h"
+#include "configVariableInt.h"
+#include "pStatCollector.h"
+#include "filename.h"
+#include <stdint.h>
+
+extern ConfigVariableBool stm_use_hexagonal_layout;
+extern ConfigVariableInt stm_max_chunk_count;
+extern ConfigVariableInt stm_max_views;
+
+
+NotifyCategoryDecl(shader_terrain, EXPCL_PANDA_GRUTIL, EXPTP_PANDA_GRUTIL);
+
+
+/**
+ * @brief Terrain Renderer class utilizing the GPU
+ * @details This class provides functionality to render heightfields of large
+ *   sizes utilizing the GPU. Internally a quadtree is used to generate the LODs.
+ *   The final terrain is then rendered using instancing on the GPU. This makes
+ *   it possible to use very large heightfields (8192+) with very reasonable
+ *   performance. The terrain provides options to control the LOD using a
+ *   target triangle width, see ShaderTerrainMesh::set_target_triangle_width().
+ *
+ *   Because the Terrain is rendered entirely on the GPU, it needs a special
+ *   vertex shader. There is a default vertex shader available, which you can
+ *   use in your own shaders. IMPORTANT: If you don't set an appropriate shader
+ *   on the terrain, nothing will be visible.
+ */
+class EXPCL_PANDA_GRUTIL ShaderTerrainMesh : public PandaNode {
+
+PUBLISHED:
+
+  ShaderTerrainMesh();
+
+  INLINE void set_heightfield_filename(const Filename& filename);
+  INLINE const Filename& get_heightfield_filename() const;
+  MAKE_PROPERTY(heightfield_filename, get_heightfield_filename, set_heightfield_filename);
+
+  INLINE void set_chunk_size(size_t chunk_size);
+  INLINE size_t get_chunk_size() const;
+  MAKE_PROPERTY(chunk_size, get_chunk_size, set_chunk_size);
+
+  INLINE void set_generate_patches(bool generate_patches);
+  INLINE bool get_generate_patches() const;
+  MAKE_PROPERTY(generate_patches, get_generate_patches, set_generate_patches);
+
+  INLINE void set_update_enabled(bool update_enabled);
+  INLINE bool get_update_enabled() const;
+  MAKE_PROPERTY(update_enabled, get_update_enabled, set_update_enabled);
+
+  INLINE void set_target_triangle_width(PN_stdfloat target_triangle_width);
+  INLINE PN_stdfloat get_target_triangle_width() const;
+  MAKE_PROPERTY(target_triangle_width, get_target_triangle_width, set_target_triangle_width);
+
+  INLINE Texture* get_heightfield_tex() const;
+  MAKE_PROPERTY(heightfield_tex, get_heightfield_tex);
+
+  LPoint3 uv_to_world(const LTexCoord& coord) const;
+  INLINE LPoint3 uv_to_world(PN_stdfloat u, PN_stdfloat v) const;
+
+  bool generate();
+
+public:
+
+  // Methods derived from PandaNode
+  virtual bool is_renderable() const;
+  virtual bool safe_to_flatten() const;
+  virtual bool safe_to_combine() const;
+  virtual void add_for_draw(CullTraverser *trav, CullTraverserData &data);
+
+private:
+
+  // Chunk data
+  struct Chunk {
+    // Depth, starting at 0
+    size_t depth;
+
+    // Chunk position in heightfield space
+    size_t x, y;
+
+    // Chunk size in heightfield space
+    size_t size;
+
+    // Children, in the order (0, 0) (1, 0) (0, 1) (1, 1)
+    Chunk* children[4];
+
+    // Chunk heights, used for culling
+    PN_stdfloat avg_height, min_height, max_height;
+
+    // Heights at the four chunk corners ("edges"), used for lod computation, in the same order as the children
+    LVector4 edges;
+
+    // Last CLOD factor, stored while computing LOD, used for seamless transitions between lods
+    PN_stdfloat last_clod;
+
+    INLINE void clear_children();
+    INLINE Chunk();
+    INLINE ~Chunk();
+  };
+
+
+  // Single entry in the data block
+  struct ChunkDataEntry {
+    // float x, y, size, clod;
+
+    // Panda uses BGRA, the above layout shows how its actually in texture memory,
+    // the layout below makes it work with BGRA.
+    PN_float32 size, y, x, clod;
+  };
+
+  // Data used while traversing all chunks
+  struct TraversalData {
+    // Global MVP used for LOD
+    LMatrix4 mvp_mat;
+
+    // Local model matrix used for culling
+    LMatrix4 model_mat;
+
+    // Camera bounds in world space
+    BoundingVolume* cam_bounds; // Raw pointer, add_for_draw() keeps the bounds alive during the traversal
+
+    // Amount of emitted chunks so far, bumped one past the data texture width once the capacity is exceeded
+    int emitted_chunks;
+
+    // Screen resolution, used for LOD
+    LVector2i screen_size;
+
+    // Pointer to the texture memory, where each chunk is written to
+    ChunkDataEntry* storage_ptr;
+  };
+
+  bool do_load_heightfield();
+  void do_convert_heightfield();
+  void do_init_data_texture();
+  void do_create_chunks();
+  void do_init_chunk(Chunk* chunk);
+  void do_compute_bounds(Chunk* chunk);
+  void do_create_chunk_geom();
+  void do_traverse(Chunk* chunk, TraversalData* data, bool fully_visible = false);
+  void do_emit_chunk(Chunk* chunk, TraversalData* data);
+  bool do_check_lod_matches(Chunk* chunk, TraversalData* data);
+
+  Chunk _base_chunk;
+  Filename _heightfield_source;
+  size_t _size;
+  size_t _chunk_size;
+  bool _generate_patches;
+  PNMImage _heightfield;
+  PT(Texture) _heightfield_tex;
+  PT(Geom) _chunk_geom; // Geometry of a single chunk, recorded once per view with an instance count
+  PT(Texture) _data_texture; // Holds the ChunkDataEntry records passed to the shader, one row per view
+  size_t _current_view_index; // Index of the next view within the current frame, reset when the frame count changes
+  int _last_frame_count; // Frame count seen by the most recent add_for_draw() call
+  PN_stdfloat _target_triangle_width;
+  bool _update_enabled; // If false, add_for_draw() skips the quadtree traversal
+
+  // PStats stuff
+  static PStatCollector _lod_collector;
+  static PStatCollector _basic_collector;
+
+
+// Type handle stuff
+public:
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    PandaNode::init_type();
+    register_type(_type_handle, "ShaderTerrainMesh", PandaNode::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#include "shaderTerrainMesh.I"
+
+#endif // SHADER_TERRAIN_MESH_H

+ 5 - 0
panda/src/movies/config_movies.cxx

@@ -13,6 +13,8 @@
 
 #include "config_movies.h"
 #include "dconfig.h"
+#include "flacAudio.h"
+#include "flacAudioCursor.h"
 #include "inkblotVideo.h"
 #include "inkblotVideoCursor.h"
 #include "microphoneAudio.h"
@@ -75,6 +77,8 @@ init_libmovies() {
   }
   initialized = true;
 
+  FlacAudio::init_type();
+  FlacAudioCursor::init_type();
   InkblotVideo::init_type();
   InkblotVideoCursor::init_type();
   MicrophoneAudio::init_type();
@@ -93,6 +97,7 @@ init_libmovies() {
 #endif
 
   MovieTypeRegistry *reg = MovieTypeRegistry::get_global_ptr();
+  reg->register_audio_type(&FlacAudio::make, "flac");
   reg->register_audio_type(&WavAudio::make, "wav wave");
 
 #ifdef HAVE_VORBIS

+ 2976 - 0
panda/src/movies/dr_flac.h

@@ -0,0 +1,2976 @@
+// Public domain. See "unlicense" statement at the end of this file.
+//NB: modified by rdb to use 16-bit instead of 32-bit samples.
+
+// ABOUT
+//
+// This is a simple library for decoding FLAC files.
+//
+//
+//
+// USAGE
+//
+// This is a single-file library. To use it, do something like the following in one .c file.
+//   #define DR_FLAC_IMPLEMENTATION
+//   #include "dr_flac.h"
+//
+// You can then #include this file in other parts of the program as you would with any other header file. To decode audio data,
+// do something like the following:
+//
+//     drflac* pFlac = drflac_open_file("MySong.flac");
+//     if (pFlac == NULL) {
+//         ... Failed to open FLAC file ...
+//     }
+//
+//     int16_t* pSamples = malloc(pFlac->totalSampleCount * sizeof(int16_t));
+//     uint64_t numberOfSamplesActuallyRead = drflac_read_s16(pFlac, pFlac->totalSampleCount, pSamples);
+//
+//     ... pSamples now contains the decoded samples as interleaved signed 16-bit PCM ...
+//
+// The drflac object represents the decoder. It is a transparent type so all the information you need, such as the number of
+// channels and the bits per sample, should be directly accessible - just make sure you don't change their values.
+//
+// You do not need to decode the entire stream in one go - you just specify how many samples you'd like at any given time and
+// the decoder will give you as many samples as it can, up to the amount requested. Later on when you need the next batch of
+// samples, just call it again. Example:
+//
+//     while (drflac_read_s16(pFlac, chunkSize, pChunkSamples) > 0) {
+//         do_something();
+//     }
+//
+// You can seek to a specific sample with drflac_seek_to_sample(). The given sample is based on interleaving. So for example,
+// if you were to seek to the sample at index 0 in a stereo stream, you'll be seeking to the first sample of the left channel.
+// The sample at index 1 will be the first sample of the right channel. The sample at index 2 will be the second sample of the
+// left channel, etc.
+//
+//
+//
+// OPTIONS
+// #define these options before including this file.
+//
+// #define DR_FLAC_NO_STDIO
+//   Disable drflac_open_file().
+//
+// #define DR_FLAC_NO_WIN32_IO
+//   Don't use the Win32 API internally for drflac_open_file(). Setting this will force stdio FILE APIs instead. This is
+//   mainly for testing, but it's left here in case somebody might find use for it. dr_flac will use the Win32 API by
+//   default. Ignored when DR_FLAC_NO_STDIO is #defined.
+//
+// #define DR_FLAC_BUFFER_SIZE <number>
+//   Defines the size of the internal buffer to store data from onRead(). This buffer is used to reduce the number of calls
+//   back to the client for more data. Larger values means more memory, but better performance. My tests show diminishing
+//   returns after about 4KB (which is the default). Consider reducing this if you have a very efficient implementation of
+//   onRead(), or increase it if it's very inefficient.
+//
+//
+//
+// QUICK NOTES
+//
+// - Based on my own tests, the 32-bit build is about 1.1x-1.25x slower than the reference implementation. The 64-bit
+//   build is at about parity.
+// - This should work fine with valid native FLAC files, but it won't work very well when the STREAMINFO block is unavailable
+//   and when a stream starts in the middle of a frame. This is something I plan on addressing.
+// - Audio data is retrieved as signed 16-bit PCM, regardless of the bits per sample the FLAC stream is encoded as.
+// - This has not been tested on big-endian architectures.
+// - Rice codes in unencoded binary form (see https://xiph.org/flac/format.html#rice_partition) has not been tested. If anybody
+//   knows where I can find some test files for this, let me know.
+// - Perverse and erroneous files have not been tested. Again, if you know where I can get some test files let me know.
+// - dr_flac is not thread-safe, but its APIs can be called from any thread so long as you do your own synchronization.
+// - dr_flac does not currently do any CRC checks.
+// - Ogg encapsulation is not supported, but I want to add it at some point.
+//
+//
+//
+// TODO
+// - Implement a proper test suite.
+// - Add support for initializing the decoder without a STREAMINFO block. Build a synthetic test to get support working at at least
+//   a basic level.
+// - Add support for retrieving metadata blocks so applications can retrieve the album art or whatnot.
+// - Add support for Ogg encapsulation.
+
+#ifndef dr_flac_h
+#define dr_flac_h
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+// As data is read from the client it is placed into an internal buffer for fast access. This controls the
+// size of that buffer. Larger values means more speed, but also more memory. In my testing there is diminishing
+// returns after about 4KB, but you can fiddle with this to suit your own needs. Must be a multiple of 8.
+#ifndef DR_FLAC_BUFFER_SIZE
+#define DR_FLAC_BUFFER_SIZE   4096
+#endif
+
+// Check if we can enable 64-bit optimizations.
+#if defined(_WIN64)
+#define DRFLAC_64BIT
+#endif
+
+#if defined(__GNUC__)
+#if defined(__x86_64__) || defined(__ppc64__)
+#define DRFLAC_64BIT
+#endif
+#endif
+
+#ifdef DRFLAC_64BIT
+typedef uint64_t drflac_cache_t;
+#else
+typedef uint32_t drflac_cache_t;
+#endif
+
+
+
+// Callback for when data is read. Return value is the number of bytes actually read.
+typedef size_t (* drflac_read_proc)(void* userData, void* bufferOut, size_t bytesToRead);
+
+// Callback for when data needs to be seeked. Offset is always relative to the current position. Return value is false on failure, true on success.
+typedef bool (* drflac_seek_proc)(void* userData, int offset);
+
+
+typedef struct
+{
+    // The absolute position of the first byte of the data of the block. This is just past the block's header.
+    long long pos;
+
+    // The size in bytes of the block's data.
+    unsigned int sizeInBytes;
+
+} drflac_block; // Location and size of the data of a single block, as stored per block type in drflac
+
+typedef struct
+{
+    // The type of the subframe: SUBFRAME_CONSTANT, SUBFRAME_VERBATIM, SUBFRAME_FIXED or SUBFRAME_LPC.
+    unsigned char subframeType;
+
+    // The number of wasted bits per sample as specified by the sub-frame header.
+    unsigned char wastedBitsPerSample;
+
+    // The order to use for the prediction stage for SUBFRAME_FIXED and SUBFRAME_LPC.
+    unsigned char lpcOrder;
+
+    // The number of bits per sample for this subframe. This is not always equal to the current frame's bits per sample because
+    // an extra bit is required for side channels when interchannel decorrelation is being used.
+    int bitsPerSample;
+
+    // A pointer to the buffer containing the decoded samples in the subframe. This pointer is an offset from drflac::pHeap, or
+    // NULL if the heap is not being used. Note that it's a signed 32-bit integer for each value.
+    // NOTE(review): the drflac struct in this file has no pHeap member; presumably drflac::pDecodedSamples is meant - confirm.
+    int32_t* pDecodedSamples;
+
+} drflac_subframe;
+
+typedef struct
+{
+    // If the stream uses variable block sizes, this will be set to the index of the first sample. If fixed block sizes are used, this will
+    // always be set to 0.
+    unsigned long long sampleNumber;
+
+    // If the stream uses fixed block sizes, this will be set to the frame number. If variable block sizes are used, this will always be 0.
+    unsigned int frameNumber;
+
+    // The sample rate of this frame.
+    unsigned int sampleRate;
+
+    // The number of samples in each sub-frame within this frame.
+    unsigned short blockSize;
+
+    // The channel assignment of this frame. This is not always set to the channel count. If interchannel decorrelation is being used this
+    // will be set to DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE, DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE or DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE.
+    unsigned char channelAssignment;
+
+    // The number of bits per sample within this frame.
+    unsigned char bitsPerSample;
+
+    // The frame's CRC. This is set, but unused at the moment.
+    unsigned char crc8;
+
+    // The number of samples left to be read in this frame. This is initially set to the block size multiplied by the channel count. As samples
+    // are read, this will be decremented. When it reaches 0, the decoder will see this frame as fully consumed and load the next frame.
+    unsigned int samplesRemaining;
+
+    // The list of sub-frames within the frame. There is one sub-frame for each channel, and there's a maximum of 8 channels.
+    drflac_subframe subframes[8];
+
+} drflac_frame; // Header fields and per-channel sub-frames of one frame, see drflac::currentFrame
+
+typedef struct
+{
+    // The function to call when more data needs to be read. This is set by drflac_open().
+    drflac_read_proc onRead;
+
+    // The function to call when the current read position needs to be moved.
+    drflac_seek_proc onSeek;
+
+    // The user data to pass around to onRead and onSeek.
+    void* pUserData;
+
+
+    // The sample rate. Will be set to something like 44100.
+    unsigned int sampleRate;
+
+    // The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. Maximum 8. This is set based on the
+    // value specified in the STREAMINFO block.
+    unsigned char channels;
+
+    // The bits per sample. Will be set to something like 16, 24, etc.
+    unsigned char bitsPerSample;
+
+    // The maximum block size, in samples. This number represents the number of samples in each channel (not combined).
+    unsigned short maxBlockSize;
+
+    // The total number of samples making up the stream. This includes every channel. For example, if the stream has 2 channels,
+    // with each channel having a total of 4096, this value will be set to 2*4096 = 8192.
+    uint64_t totalSampleCount;
+
+
+    // The location and size of the APPLICATION block.
+    drflac_block applicationBlock;
+
+    // The location and size of the SEEKTABLE block.
+    drflac_block seektableBlock;
+
+    // The location and size of the VORBIS_COMMENT block.
+    drflac_block vorbisCommentBlock;
+
+    // The location and size of the CUESHEET block.
+    drflac_block cuesheetBlock;
+
+    // The location and size of the PICTURE block.
+    drflac_block pictureBlock;
+
+
+    // Information about the frame the decoder is currently sitting on.
+    drflac_frame currentFrame;
+
+    // The position of the first frame in the stream. This is only ever used for seeking.
+    unsigned long long firstFramePos;
+
+
+
+    // The current byte position in the client's data stream.
+    uint64_t currentBytePos;
+
+    // The index of the next valid cache line in the "L2" cache.
+    size_t nextL2Line;
+
+    // The number of bits that have been consumed by the cache. This is used to determine how many valid bits are remaining.
+    size_t consumedBits;
+
+    // Unused L2 lines. This will always be 0 until the end of the stream is hit. Used for correctly calculating the current byte
+    // position of the read pointer in the stream.
+    size_t unusedL2Lines;
+
+    // The cached data which was most recently read from the client. When data is read from the client, it is placed within this
+    // variable. As data is read, it's bit-shifted such that the next valid bit is sitting on the most significant bit.
+    drflac_cache_t cache;
+    drflac_cache_t cacheL2[DR_FLAC_BUFFER_SIZE/sizeof(drflac_cache_t)]; // DR_FLAC_BUFFER_SIZE bytes in total
+
+
+    // A pointer to the decoded sample data. This is an offset of pExtraData.
+    int32_t* pDecodedSamples;
+
+    // Variable length extra data. We attach this to the end of the object so we avoid unnecessary mallocs.
+    char pExtraData[1];
+
+} drflac;
+
+
+
+
+// Opens a FLAC decoder.
+//
+// This is the lowest level function for opening a FLAC stream. You can also use drflac_open_file() and drflac_open_memory()
+// to open the stream from a file or from a block of memory respectively.
+//
+// At the moment the STREAMINFO block must be present for this to succeed.
+//
+// The onRead and onSeek callbacks are used to read and seek data provided by the client.
+static drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData);
+
+// Closes the given FLAC decoder.
+static void drflac_close(drflac* pFlac);
+
+// Reads sample data from the given FLAC decoder, output as interleaved signed 16-bit PCM.
+//
+// Returns the number of samples actually read.
+static uint64_t drflac_read_s16(drflac* pFlac, uint64_t samplesToRead, int16_t* pBufferOut);
+
+// Seeks to the sample at the given index.
+static bool drflac_seek_to_sample(drflac* pFlac, uint64_t sampleIndex);
+
+
+
+#ifndef DR_FLAC_NO_STDIO
+// Opens a flac decoder from the file at the given path.
+static drflac* drflac_open_file(const char* pFile);
+#endif
+
+// Helper for opening a file from a pre-allocated memory buffer.
+//
+// This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for
+// the lifetime of the decoder.
+static drflac* drflac_open_memory(const void* data, size_t dataSize);
+
+#endif  //dr_flac_h
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// IMPLEMENTATION
+//
+///////////////////////////////////////////////////////////////////////////////
+#ifdef DR_FLAC_IMPLEMENTATION
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#ifdef _MSC_VER
+#include <intrin.h>     // For _byteswap_ulong and _byteswap_uint64
+#endif
+
+#ifdef __linux__
+#define _BSD_SOURCE
+#include <endian.h>
+#endif
+
+#ifdef _MSC_VER
+#define DRFLAC_INLINE __forceinline
+#else
+#define DRFLAC_INLINE inline
+#endif
+
+// Metadata block type identifiers. NOTE(review): the values match the BLOCK_TYPE numbering of the FLAC format
+// (0 = STREAMINFO ... 6 = PICTURE, 127 = invalid) - confirm against the metadata parser.
+#define DRFLAC_BLOCK_TYPE_STREAMINFO                    0
+#define DRFLAC_BLOCK_TYPE_PADDING                       1
+#define DRFLAC_BLOCK_TYPE_APPLICATION                   2
+#define DRFLAC_BLOCK_TYPE_SEEKTABLE                     3
+#define DRFLAC_BLOCK_TYPE_VORBIS_COMMENT                4
+#define DRFLAC_BLOCK_TYPE_CUESHEET                      5
+#define DRFLAC_BLOCK_TYPE_PICTURE                       6
+#define DRFLAC_BLOCK_TYPE_INVALID                       127
+
+// Subframe type identifiers. NOTE(review): 8 and 32 look like the base bit patterns (001xxx / 1xxxxx) of the FLAC
+// subframe-type field - confirm against the subframe header parser.
+#define DRFLAC_SUBFRAME_CONSTANT                        0
+#define DRFLAC_SUBFRAME_VERBATIM                        1
+#define DRFLAC_SUBFRAME_FIXED                           8
+#define DRFLAC_SUBFRAME_LPC                             32
+#define DRFLAC_SUBFRAME_RESERVED                        255
+
+// Residual coding method identifiers.
+#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE  0
+#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2 1
+
+// Channel assignment identifiers (independent channels or one of the three stereo decorrelation modes).
+#define DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT           0
+#define DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE             8
+#define DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE            9
+#define DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE              10
+
+// Describes one seek point. NOTE(review): the three fields presumably mirror an entry of the FLAC SEEKTABLE
+// metadata block - confirm against the code that fills this structure in.
+typedef struct
+{
+    uint64_t firstSample;   // Presumably the index of the first sample in the frame this point refers to.
+    uint64_t frameOffset;   // The offset from the first byte of the header of the first frame.
+    uint16_t sampleCount;   // Presumably the number of samples in the frame this point refers to.
+} drflac_seekpoint;
+
+#ifndef DR_FLAC_NO_STDIO
+#if defined(DR_FLAC_NO_WIN32_IO) || !defined(_WIN32)
+#include <stdio.h>
+
+// Read callback backed by a C stdio stream. pUserData is the FILE* to read from.
+//
+// Returns the number of bytes that were actually stored in bufferOut.
+static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    FILE* pStream = (FILE*)pUserData;
+    size_t bytesStored = fread(bufferOut, 1, bytesToRead, pStream);
+    return bytesStored;
+}
+
+// Seek callback backed by a C stdio stream. pUserData is the FILE* to reposition.
+//
+// The offset is relative to the current position. Returns true when fseek() reports success.
+static bool drflac__on_seek_stdio(void* pUserData, int offset)
+{
+    FILE* pStream = (FILE*)pUserData;
+    if (fseek(pStream, offset, SEEK_CUR) != 0) {
+        return false;
+    }
+
+    return true;
+}
+
+// Opens a FLAC decoder that reads from the file at the given path.
+//
+// Returns NULL if the file cannot be opened or if drflac_open() fails.
+static drflac* drflac_open_file(const char* filename)
+{
+    FILE* pFile;
+#ifdef _MSC_VER
+    if (fopen_s(&pFile, filename, "rb") != 0) {
+        return NULL;
+    }
+#else
+    pFile = fopen(filename, "rb");
+    if (pFile == NULL) {
+        return NULL;
+    }
+#endif
+
+    drflac* pFlac = drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, pFile);
+    if (pFlac == NULL) {
+        // drflac_open() only receives the stream as opaque user data, so the file has to be released here or it
+        // would stay open with nothing left that owns it.
+        fclose(pFile);
+        return NULL;
+    }
+
+    return pFlac;
+}
+#else
+#include <windows.h>
+
+// Read callback backed by a Win32 file handle. pUserData is the HANDLE to read from.
+//
+// Returns the byte count reported by ReadFile().
+static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    // dr_flac will never request huge amounts of data at a time, so narrowing the request to a DWORD is safe.
+    assert(bytesToRead < 0xFFFFFFFF);
+
+    HANDLE hFile = (HANDLE)pUserData;
+    DWORD bytesStored;
+    ReadFile(hFile, bufferOut, (DWORD)bytesToRead, &bytesStored, NULL);
+
+    return (size_t)bytesStored;
+}
+
+// Seek callback backed by a Win32 file handle. pUserData is the HANDLE to reposition.
+//
+// The offset is relative to the current file pointer. Returns false when SetFilePointer() reports failure.
+static bool drflac__on_seek_stdio(void* pUserData, int offset)
+{
+    HANDLE hFile = (HANDLE)pUserData;
+    DWORD newPosition = SetFilePointer(hFile, offset, NULL, FILE_CURRENT);
+    return newPosition != INVALID_SET_FILE_POINTER;
+}
+
+// Opens a FLAC decoder that reads from the file at the given path (Win32 file API variant).
+//
+// Returns NULL if the file cannot be opened or if drflac_open() fails.
+static drflac* drflac_open_file(const char* filename)
+{
+    HANDLE hFile = CreateFileA(filename, FILE_GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+    if (hFile == INVALID_HANDLE_VALUE) {
+        return NULL;
+    }
+
+    drflac* pFlac = drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, (void*)hFile);
+    if (pFlac == NULL) {
+        // drflac_open() only receives the handle as opaque user data, so it has to be released here or it would
+        // stay open with nothing left that owns it.
+        CloseHandle(hFile);
+        return NULL;
+    }
+
+    return pFlac;
+}
+#endif
+#endif  //DR_FLAC_NO_STDIO
+
+
+// State for the in-memory read/seek callbacks: a borrowed buffer plus a read cursor. The buffer itself is not
+// copied by drflac_open_memory(), so it is owned by the caller.
+typedef struct
+{
+    /// A pointer to the beginning of the data. We use a char as the type here for easy offsetting.
+    const unsigned char* data;
+
+    /// The size of the data, in bytes.
+    size_t dataSize;
+
+    /// The position we're currently sitting at, as a byte offset from the start of the data.
+    size_t currentReadPos;
+
+} drflac_memory;
+
+// Read callback for in-memory streams. pUserData is the drflac_memory describing the buffer.
+//
+// Copies up to bytesToRead bytes from the current read position into bufferOut, advances the position, and
+// returns the number of bytes copied (less than requested once the end of the buffer is reached).
+static size_t drflac__on_read_memory(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    drflac_memory* pStream = (drflac_memory*)pUserData;
+    assert(pStream != NULL);
+    assert(pStream->dataSize >= pStream->currentReadPos);
+
+    // Clamp the request to whatever is left in the buffer.
+    const size_t bytesAvailable = pStream->dataSize - pStream->currentReadPos;
+    const size_t bytesToCopy = (bytesToRead < bytesAvailable) ? bytesToRead : bytesAvailable;
+    if (bytesToCopy == 0) {
+        return 0;
+    }
+
+    memcpy(bufferOut, pStream->data + pStream->currentReadPos, bytesToCopy);
+    pStream->currentReadPos += bytesToCopy;
+    return bytesToCopy;
+}
+
+// Seek callback for in-memory streams. pUserData is the drflac_memory describing the buffer.
+//
+// Moves the read position by offset bytes relative to where it currently is. Seeks that would leave the buffer
+// are clamped to its start or end rather than rejected, so this always returns true.
+static bool drflac__on_seek_memory(void* pUserData, int offset)
+{
+    drflac_memory* memory = (drflac_memory*)pUserData;
+    assert(memory != NULL);
+    assert(memory->dataSize >= memory->currentReadPos);
+
+    if (offset > 0) {
+        size_t bytesForward = (size_t)offset;
+        size_t bytesRemaining = memory->dataSize - memory->currentReadPos;
+        if (bytesForward > bytesRemaining) {
+            bytesForward = bytesRemaining;      // Trying to seek too far forward.
+        }
+
+        memory->currentReadPos += bytesForward;
+    } else {
+        // Take the magnitude with unsigned arithmetic. Negating the int directly is undefined when offset is
+        // INT_MIN, which is a value the decoder can pass when stepping backwards through a large stream.
+        size_t bytesBackward = (size_t)0 - (size_t)offset;
+        if (bytesBackward > memory->currentReadPos) {
+            bytesBackward = memory->currentReadPos;     // Trying to seek too far backwards.
+        }
+
+        memory->currentReadPos -= bytesBackward;
+    }
+
+    return true;
+}
+
+// Opens a FLAC decoder that reads from a caller-owned block of memory.
+//
+// The data is not copied; it must remain valid for as long as the decoder is in use. Returns NULL if the
+// bookkeeping structure cannot be allocated or if drflac_open() fails.
+static drflac* drflac_open_memory(const void* data, size_t dataSize)
+{
+    drflac_memory* pUserData = (drflac_memory*)malloc(sizeof(*pUserData));
+    if (pUserData == NULL) {
+        return NULL;
+    }
+
+    pUserData->data = (const unsigned char*)data;
+    pUserData->dataSize = dataSize;
+    pUserData->currentReadPos = 0;
+
+    drflac* pFlac = drflac_open(drflac__on_read_memory, drflac__on_seek_memory, pUserData);
+    if (pFlac == NULL) {
+        // No decoder was created, so nothing else will ever release the bookkeeping structure.
+        free(pUserData);
+        return NULL;
+    }
+
+    return pFlac;
+}
+
+
+//// Endian Management ////
+// Returns true when the host stores the least significant byte of an int at the lowest address.
+static DRFLAC_INLINE bool drflac__is_little_endian()
+{
+    int probe = 1;
+    char* pFirstByte = (char*)&probe;
+    return pFirstByte[0] == 1;
+}
+
+// Reverses the byte order of a 32-bit value.
+//
+// Uses the compiler intrinsic where one is known to exist (MSVC, Clang, GCC 4.3 and newer) and a portable
+// shift-and-mask fallback everywhere else.
+static DRFLAC_INLINE uint32_t drflac__swap_endian_uint32(uint32_t n)
+{
+#ifdef _MSC_VER
+    return _byteswap_ulong(n);
+#elif defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
+    // The minor version has to be checked: __builtin_bswap32 first appeared in GCC 4.3. Clang is tested
+    // separately because it identifies itself as an older GCC while still providing the builtin.
+    return __builtin_bswap32(n);
+#else
+    return ((n & 0xFF000000) >> 24) |
+           ((n & 0x00FF0000) >>  8) |
+           ((n & 0x0000FF00) <<  8) |
+           ((n & 0x000000FF) << 24);
+#endif
+}
+
+// Reverses the byte order of a 64-bit value.
+//
+// Uses the compiler intrinsic where one is known to exist (MSVC, Clang, GCC 4.3 and newer) and a portable
+// shift-and-mask fallback everywhere else.
+static DRFLAC_INLINE uint64_t drflac__swap_endian_uint64(uint64_t n)
+{
+#ifdef _MSC_VER
+    return _byteswap_uint64(n);
+#elif defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
+    // The minor version has to be checked: __builtin_bswap64 first appeared in GCC 4.3. Clang is tested
+    // separately because it identifies itself as an older GCC while still providing the builtin.
+    return __builtin_bswap64(n);
+#else
+    return ((n & 0xFF00000000000000ULL) >> 56) |
+           ((n & 0x00FF000000000000ULL) >> 40) |
+           ((n & 0x0000FF0000000000ULL) >> 24) |
+           ((n & 0x000000FF00000000ULL) >>  8) |
+           ((n & 0x00000000FF000000ULL) <<  8) |
+           ((n & 0x0000000000FF0000ULL) << 24) |
+           ((n & 0x000000000000FF00ULL) << 40) |
+           ((n & 0x00000000000000FFULL) << 56);
+#endif
+}
+
+
+// Converts a 32-bit big-endian value to host byte order.
+static DRFLAC_INLINE uint32_t drflac__be2host_32(uint32_t n)
+{
+#ifdef __linux__
+    return be32toh(n);
+#else
+    // Only little-endian hosts need the bytes reversed.
+    return drflac__is_little_endian() ? drflac__swap_endian_uint32(n) : n;
+#endif
+}
+
+// Converts a 64-bit big-endian value to host byte order.
+static DRFLAC_INLINE uint64_t drflac__be2host_64(uint64_t n)
+{
+#ifdef __linux__
+    return be64toh(n);
+#else
+    // Only little-endian hosts need the bytes reversed.
+    return drflac__is_little_endian() ? drflac__swap_endian_uint64(n) : n;
+#endif
+}
+
+#ifdef DRFLAC_64BIT
+#define drflac__be2host__cache_line drflac__be2host_64
+#else
+#define drflac__be2host__cache_line drflac__be2host_32
+#endif
+
+
+// BIT READING ATTEMPT #2
+//
+// This uses a 32- or 64-bit bit-shifted cache - as bits are read, the cache is shifted such that the first valid bit is sitting
+// on the most significant bit. It uses the notion of an L1 and L2 cache (borrowed from CPU architecture), where the L1 cache
+// is a 32- or 64-bit unsigned integer (depending on whether or not a 32- or 64-bit build is being compiled) and the L2 is an
+// array of "cache lines", with each cache line being the same size as the L1. The L2 is a buffer of about 4KB and is where data
+// from onRead() is read into.
+//
+// Every macro below expands to an expression that reads a variable named pFlac from the enclosing scope, so the value of a
+// macro can change between two statements whenever pFlac's cache fields are modified in between.
+
+// Size of the L1 cache, and the number of bits in it that have not been consumed yet.
+#define DRFLAC_CACHE_L1_SIZE_BYTES                  (sizeof(pFlac->cache))
+#define DRFLAC_CACHE_L1_SIZE_BITS                   (sizeof(pFlac->cache)*8)
+#define DRFLAC_CACHE_L1_BITS_REMAINING              (DRFLAC_CACHE_L1_SIZE_BITS - (pFlac->consumedBits))
+// Mask with the top _bitCount bits set. _bitCount must be smaller than the width of the mask type, because the
+// expansion shifts an all-ones value right by _bitCount.
+#ifdef DRFLAC_64BIT
+#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)   (~(((uint64_t)-1LL) >> (_bitCount)))
+#else
+#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)   (~(((uint32_t)-1) >> (_bitCount)))
+#endif
+// SELECT keeps the top _bitCount bits of the L1 in place; SELECT_AND_SHIFT additionally moves them down to the
+// least significant end. _bitCount must be non-zero for SELECT_AND_SHIFT, because the shift distance is the L1
+// width minus _bitCount.
+#define DRFLAC_CACHE_L1_SELECTION_SHIFT(_bitCount)  (DRFLAC_CACHE_L1_SIZE_BITS - (_bitCount))
+#define DRFLAC_CACHE_L1_SELECT(_bitCount)           ((pFlac->cache) & DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount))
+#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT(_bitCount) (DRFLAC_CACHE_L1_SELECT(_bitCount) >> DRFLAC_CACHE_L1_SELECTION_SHIFT(_bitCount))
+// Size of the L2 cache in bytes and in lines, and the number of lines after nextL2Line.
+#define DRFLAC_CACHE_L2_SIZE_BYTES                  (sizeof(pFlac->cacheL2))
+#define DRFLAC_CACHE_L2_LINE_COUNT                  (DRFLAC_CACHE_L2_SIZE_BYTES / sizeof(pFlac->cacheL2[0]))
+#define DRFLAC_CACHE_L2_LINES_REMAINING             (DRFLAC_CACHE_L2_LINE_COUNT - pFlac->nextL2Line)
+
+// Loads the next line of the L2 cache into the L1 cache, refilling the L2 from the client when it has run dry.
+//
+// The value placed in pFlac->cache is left in stream byte order. Returns false when not even one whole L1-sized
+// line could be obtained; in that case any bytes that were read are handed back to the client by seeking
+// backwards over them.
+static DRFLAC_INLINE bool drflac__reload_l1_cache_from_l2(drflac* pFlac)
+{
+    // Common case: there is still an unread line sitting in the L2.
+    if (pFlac->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT) {
+        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
+        return true;
+    }
+
+    // The L2 is exhausted, so ask the client to fill it again.
+    size_t bytesFetched = pFlac->onRead(pFlac->pUserData, pFlac->cacheL2, DRFLAC_CACHE_L2_SIZE_BYTES);
+    pFlac->currentBytePos += bytesFetched;
+    pFlac->nextL2Line = 0;
+
+    if (bytesFetched == DRFLAC_CACHE_L2_SIZE_BYTES) {
+        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
+        return true;
+    }
+
+    // A short read, most likely because the end of the stream was reached. Only whole L1-sized lines can live in
+    // the L2, so work out how many of those were obtained.
+    size_t wholeLineCount = bytesFetched / DRFLAC_CACHE_L1_SIZE_BYTES;
+    if (wholeLineCount == 0) {
+        // Not a single whole line. Give the bytes back to the client and report failure.
+        if (bytesFetched > 0) {
+            pFlac->onSeek(pFlac->pUserData, -(int)bytesFetched);
+            pFlac->currentBytePos -= bytesFetched;
+        }
+
+        pFlac->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT;
+        return false;
+    }
+
+    // Slide the valid lines up so that they finish at the end of the L2. The copy runs from the last line to the
+    // first so that an overlapping destination never overwrites a line that is still to be moved.
+    size_t firstValidLine = DRFLAC_CACHE_L2_LINE_COUNT - wholeLineCount;
+    for (size_t iLine = wholeLineCount; iLine-- > 0; ) {
+        pFlac->cacheL2[iLine + firstValidLine] = pFlac->cacheL2[iLine];
+    }
+
+    pFlac->nextL2Line = firstValidLine;
+    pFlac->unusedL2Lines = firstValidLine;
+
+    // Bytes beyond the last whole line are handed back to the client so that they are not lost.
+    size_t leftoverBytes = bytesFetched - (wholeLineCount * DRFLAC_CACHE_L1_SIZE_BYTES);
+    if (leftoverBytes > 0) {
+        pFlac->onSeek(pFlac->pUserData, -(int)leftoverBytes);
+        pFlac->currentBytePos -= leftoverBytes;
+    }
+
+    pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
+    return true;
+}
+
+// Refills the L1 cache and converts it to host byte order.
+//
+// Returns false only when no more data could be obtained from either the L2 cache or the client.
+static bool drflac__reload_cache(drflac* pFlac)
+{
+    if (!drflac__reload_l1_cache_from_l2(pFlac)) {
+        // Slow path. The L2 could not supply a whole line, which most likely means only a few trailing bytes of the
+        // stream are left. Read them straight from the client into the L1. This should only really happen once per
+        // stream so efficiency is not important.
+        size_t tailBytes = pFlac->onRead(pFlac->pUserData, &pFlac->cache, DRFLAC_CACHE_L1_SIZE_BYTES);
+        if (tailBytes == 0) {
+            return false;
+        }
+
+        pFlac->currentBytePos += tailBytes;
+
+        assert(tailBytes < DRFLAC_CACHE_L1_SIZE_BYTES);
+        pFlac->consumedBits = (DRFLAC_CACHE_L1_SIZE_BYTES - tailBytes) * 8;
+
+        pFlac->cache = drflac__be2host__cache_line(pFlac->cache);
+
+        // Make sure the consumed bits are always set to zero. Other parts of the library depend on this property.
+        pFlac->cache &= DRFLAC_CACHE_L1_SELECTION_MASK(DRFLAC_CACHE_L1_SIZE_BITS - pFlac->consumedBits);
+        return true;
+    }
+
+    // Fast path. A whole line was moved from the L2 into the L1.
+    pFlac->cache = drflac__be2host__cache_line(pFlac->cache);
+    pFlac->consumedBits = 0;
+    return true;
+}
+
+// Skips bitsToSeek bits of the stream.
+//
+// Returns false if the cache has to be reloaded to finish the seek and the reload fails. The DRFLAC_CACHE_* macros
+// used here re-read pFlac's fields each time they appear, so the order of the statements below matters.
+static bool drflac__seek_bits(drflac* pFlac, size_t bitsToSeek)
+{
+    if (bitsToSeek <= DRFLAC_CACHE_L1_BITS_REMAINING) {
+        // Everything to skip is already sitting in the L1.
+        // NOTE(review): when bitsToSeek equals the full L1 width this shifts by the width of the type - confirm
+        // whether callers can ever reach that case.
+        pFlac->consumedBits += bitsToSeek;
+        pFlac->cache <<= bitsToSeek;
+        return true;
+    } else {
+        // It straddles the cached data. This function isn't called too frequently so I'm favouring simplicity here.
+        // The first statement uses the remaining-bit count before consumedBits is updated; after the second
+        // statement the L1 is fully consumed.
+        bitsToSeek -= DRFLAC_CACHE_L1_BITS_REMAINING;
+        pFlac->consumedBits += DRFLAC_CACHE_L1_BITS_REMAINING;
+        pFlac->cache = 0;
+
+        size_t wholeBytesRemaining = bitsToSeek/8;
+        if (wholeBytesRemaining > 0)
+        {
+            // The next bytes to seek will be located in the L2 cache. The problem is that the L2 cache is not byte aligned,
+            // but rather DRFLAC_CACHE_L1_SIZE_BYTES aligned (usually 4 or 8). If, for example, the number of bytes to seek is
+            // 3, we'll need to handle it in a special way.
+            size_t wholeCacheLinesRemaining = wholeBytesRemaining / DRFLAC_CACHE_L1_SIZE_BYTES;
+            if (wholeCacheLinesRemaining < DRFLAC_CACHE_L2_LINES_REMAINING)
+            {
+                // The whole lines to skip all live in the L2, so stepping nextL2Line forward is enough.
+                wholeBytesRemaining -= wholeCacheLinesRemaining * DRFLAC_CACHE_L1_SIZE_BYTES;
+                bitsToSeek -= wholeCacheLinesRemaining * DRFLAC_CACHE_L1_SIZE_BITS;
+                pFlac->nextL2Line += wholeCacheLinesRemaining;
+            }
+            else
+            {
+                // The seek goes past what the L2 holds: discard the rest of the L2 (nextL2Line is moved last because
+                // the macro reads it), then ask the client to skip the whole bytes that are still outstanding.
+                wholeBytesRemaining -= DRFLAC_CACHE_L2_LINES_REMAINING * DRFLAC_CACHE_L1_SIZE_BYTES;
+                bitsToSeek -= DRFLAC_CACHE_L2_LINES_REMAINING * DRFLAC_CACHE_L1_SIZE_BITS;
+                pFlac->nextL2Line += DRFLAC_CACHE_L2_LINES_REMAINING;
+
+                // NOTE(review): the result of onSeek() is not checked here.
+                pFlac->onSeek(pFlac->pUserData, (int)wholeBytesRemaining);
+                pFlac->currentBytePos += wholeBytesRemaining;
+                bitsToSeek -= wholeBytesRemaining*8;
+            }
+        }
+
+
+        // Whatever is left is finished off by reloading the L1 and seeking again.
+        if (bitsToSeek > 0) {
+            if (!drflac__reload_cache(pFlac)) {
+                return false;
+            }
+
+            return drflac__seek_bits(pFlac, bitsToSeek);
+        }
+
+        return true;
+    }
+}
+
+// Reads the next bitCount bits (1 to 32) of the stream as an unsigned value.
+//
+// On success the value is written to *pResultOut and true is returned. Returns false if the cache had to be
+// reloaded and no more data was available.
+static bool drflac__read_uint32(drflac* pFlac, unsigned int bitCount, uint32_t* pResultOut)
+{
+    assert(pFlac != NULL);
+    assert(pResultOut != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 32);
+
+    // Start from a non-empty L1.
+    if (pFlac->consumedBits == DRFLAC_CACHE_L1_SIZE_BITS && !drflac__reload_cache(pFlac)) {
+        return false;
+    }
+
+    if (bitCount > DRFLAC_CACHE_L1_BITS_REMAINING) {
+        // The value straddles the cached data. It never covers more than the next line, so it is read as a high
+        // part from the current L1 and a low part from the reloaded L1.
+        size_t bitsFromCurrent = DRFLAC_CACHE_L1_BITS_REMAINING;
+        size_t bitsFromNext = bitCount - bitsFromCurrent;
+        uint32_t highPart = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitsFromCurrent);
+
+        if (!drflac__reload_cache(pFlac)) {
+            return false;
+        }
+
+        *pResultOut = (highPart << bitsFromNext) | DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitsFromNext);
+        pFlac->consumedBits += bitsFromNext;
+        pFlac->cache <<= bitsFromNext;
+        return true;
+    }
+
+    if (bitCount >= DRFLAC_CACHE_L1_SIZE_BITS) {
+        // The request takes the entire L1, so hand it over whole instead of shifting by the full width.
+        *pResultOut = (uint32_t)pFlac->cache;
+        pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
+        pFlac->cache = 0;
+        return true;
+    }
+
+    *pResultOut = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCount);
+    pFlac->consumedBits += bitCount;
+    pFlac->cache <<= bitCount;
+    return true;
+}
+
+// Reads the next bitCount bits (1 to 32) of the stream as a two's complement signed value.
+//
+// On success the sign-extended value is written to *pResult and true is returned. Returns false if the
+// underlying unsigned read fails.
+static bool drflac__read_int32(drflac* pFlac, unsigned int bitCount, int32_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 32);
+
+    uint32_t result;
+    if (!drflac__read_uint32(pFlac, bitCount, &result)) {
+        return false;
+    }
+
+    // Sign-extend from bitCount bits. A full 32-bit value needs no extension, and skipping it avoids shifting by
+    // the width of the type, which is undefined and on common hardware turned every negative value into -1.
+    // The xor/subtract form relies on the bits above bitCount being zero, which the unsigned read guarantees.
+    if (bitCount < 32) {
+        const uint32_t signBit = (uint32_t)1 << (bitCount - 1);
+        result = (result ^ signBit) - signBit;
+    }
+
+    *pResult = (int32_t)result;
+    return true;
+}
+
+// Reads the next bitCount bits (33 to 64) of the stream as an unsigned value.
+//
+// The value is assembled from two 32-bit-or-smaller reads: first the upper (bitCount - 32) bits, then the lower
+// 32 bits. Returns false if either read fails.
+static bool drflac__read_uint64(drflac* pFlac, unsigned int bitCount, uint64_t* pResultOut)
+{
+    assert(bitCount <= 64);
+    assert(bitCount >  32);
+
+    uint32_t upperBits;
+    uint32_t lowerBits;
+    if (!drflac__read_uint32(pFlac, bitCount - 32, &upperBits) || !drflac__read_uint32(pFlac, 32, &lowerBits)) {
+        return false;
+    }
+
+    uint64_t combined = (uint64_t)upperBits;
+    combined = (combined << 32) | (uint64_t)lowerBits;
+
+    *pResultOut = combined;
+    return true;
+}
+
+// Reads the next bitCount bits of the stream as a two's complement signed value.
+//
+// bitCount must be in the range accepted by drflac__read_uint64() (33 to 64). On success the sign-extended
+// value is written to *pResultOut and true is returned. Returns false if the underlying unsigned read fails.
+static bool drflac__read_int64(drflac* pFlac, unsigned int bitCount, int64_t* pResultOut)
+{
+    assert(bitCount <= 64);
+
+    uint64_t result;
+    if (!drflac__read_uint64(pFlac, bitCount, &result)) {
+        return false;
+    }
+
+    // Sign-extend from bitCount bits. A full 64-bit value needs no extension, and skipping it avoids shifting by
+    // the width of the type, which is undefined. The xor/subtract form relies on the bits above bitCount being
+    // zero, which the unsigned read guarantees.
+    if (bitCount < 64) {
+        const uint64_t signBit = (uint64_t)1 << (bitCount - 1);
+        result = (result ^ signBit) - signBit;
+    }
+
+    *pResultOut = (int64_t)result;
+    return true;
+}
+
+// Reads the next bitCount bits (1 to 16) of the stream as an unsigned 16-bit value.
+//
+// *pResult is only written when the read succeeds.
+static bool drflac__read_uint16(drflac* pFlac, unsigned int bitCount, uint16_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 16);
+
+    uint32_t wideValue;
+    const bool wasRead = drflac__read_uint32(pFlac, bitCount, &wideValue);
+    if (wasRead) {
+        *pResult = (uint16_t)wideValue;
+    }
+
+    return wasRead;
+}
+
+// Reads the next bitCount bits (1 to 16) of the stream as a signed 16-bit value.
+//
+// *pResult is only written when the read succeeds.
+static bool drflac__read_int16(drflac* pFlac, unsigned int bitCount, int16_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 16);
+
+    int32_t wideValue;
+    const bool wasRead = drflac__read_int32(pFlac, bitCount, &wideValue);
+    if (wasRead) {
+        *pResult = (int16_t)wideValue;
+    }
+
+    return wasRead;
+}
+
+// Reads the next bitCount bits (1 to 8) of the stream as an unsigned 8-bit value.
+//
+// *pResult is only written when the read succeeds.
+static bool drflac__read_uint8(drflac* pFlac, unsigned int bitCount, uint8_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 8);
+
+    uint32_t wideValue;
+    const bool wasRead = drflac__read_uint32(pFlac, bitCount, &wideValue);
+    if (wasRead) {
+        *pResult = (uint8_t)wideValue;
+    }
+
+    return wasRead;
+}
+
+// Reads the next bitCount bits (1 to 8) of the stream as a signed 8-bit value.
+//
+// *pResult is only written when the read succeeds.
+static bool drflac__read_int8(drflac* pFlac, unsigned int bitCount, int8_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 8);
+
+    int32_t wideValue;
+    const bool wasRead = drflac__read_int32(pFlac, bitCount, &wideValue);
+    if (wasRead) {
+        *pResult = (int8_t)wideValue;
+    }
+
+    return wasRead;
+}
+
+
+// Consumes bits up to and including the next set bit.
+//
+// On success *pOffsetOut receives the number of zero bits that preceded the set bit and true is returned.
+// Returns false if the stream runs out before a set bit is found.
+static inline bool drflac__seek_past_next_set_bit(drflac* pFlac, unsigned int* pOffsetOut)
+{
+    // Skip over cache lines that contain nothing but zeros, counting the bits as we go.
+    unsigned int zeroCounter = 0;
+    while (pFlac->cache == 0) {
+        zeroCounter += (unsigned int)DRFLAC_CACHE_L1_BITS_REMAINING;
+        if (!drflac__reload_cache(pFlac)) {
+            return false;
+        }
+    }
+
+    // At this point the cache should not be zero, in which case we know the first set bit should be somewhere in here. There is
+    // no need for us to perform any cache reloading logic here which should make things much faster.
+    assert(pFlac->cache != 0);
+
+    // Indexed by the top four bits of the cache: the 1-based position of the first set bit among them, or 0 when
+    // all four are clear. Static so that the table is not rebuilt on every call.
+    static const unsigned int bitOffsetTable[] = {
+        0,
+        4,
+        3, 3,
+        2, 2, 2, 2,
+        1, 1, 1, 1, 1, 1, 1, 1
+    };
+
+    unsigned int setBitOffsetPlus1 = bitOffsetTable[DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4)];
+    if (setBitOffsetPlus1 == 0) {
+        if (pFlac->cache == 1) {
+            // Only the very last bit is set.
+            setBitOffsetPlus1 = DRFLAC_CACHE_L1_SIZE_BITS;
+        } else {
+            // Widen the selection one bit at a time until it contains the set bit. This stops before the selection
+            // reaches the full width because the last-bit-only case was handled above.
+            setBitOffsetPlus1 = 5;
+            while (DRFLAC_CACHE_L1_SELECT(setBitOffsetPlus1) == 0) {
+                setBitOffsetPlus1 += 1;
+            }
+        }
+    }
+
+    pFlac->consumedBits += setBitOffsetPlus1;
+    if (setBitOffsetPlus1 < DRFLAC_CACHE_L1_SIZE_BITS) {
+        pFlac->cache <<= setBitOffsetPlus1;
+    } else {
+        // Shifting by the full width of the type is undefined and on common hardware leaves the value unchanged,
+        // which would make the bit that was just consumed show up again. Clear the cache explicitly instead.
+        pFlac->cache = 0;
+    }
+
+    *pOffsetOut = zeroCounter + setBitOffsetPlus1 - 1;
+    return true;
+}
+
+
+
+// Moves the client stream to the given byte offset (measured against pFlac->currentBytePos) and discards both
+// cache levels.
+//
+// The client's seek callback only accepts an int, so large distances are covered in several int-sized steps.
+// Returns false if one of the intermediate seeks fails; otherwise returns the result of the final seek.
+static bool drflac__seek_to_byte(drflac* pFlac, long long offsetFromStart)
+{
+    assert(pFlac != NULL);
+
+    long long distance = offsetFromStart - pFlac->currentBytePos;
+    if (distance == 0) {
+        return true;
+    }
+
+    if (distance > 0x7FFFFFFF) {
+        // Forwards, in the largest steps an int can express.
+        do {
+            if (!pFlac->onSeek(pFlac->pUserData, 0x7FFFFFFF)) {
+                return false;
+            }
+
+            pFlac->currentBytePos += 0x7FFFFFFF;
+            distance -= 0x7FFFFFFF;
+        } while (distance > 0x7FFFFFFF);
+    } else {
+        // Backwards, in the largest steps an int can express. Does nothing when the distance already fits.
+        while (distance < (int)0x80000000) {
+            if (!pFlac->onSeek(pFlac->pUserData, (int)0x80000000)) {
+                return false;
+            }
+
+            pFlac->currentBytePos += (int)0x80000000;
+            distance -= (int)0x80000000;
+        }
+    }
+
+    assert(distance <= 0x7FFFFFFF && distance >= (int)0x80000000);
+
+    // The remaining distance fits in an int as per the assert above.
+    bool seekSucceeded = pFlac->onSeek(pFlac->pUserData, (int)distance);
+    pFlac->currentBytePos += (int)distance;
+
+    // Mark the L1 as fully consumed and the L2 as empty so that the next read fetches fresh data.
+    pFlac->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT;
+    pFlac->cache = 0;
+    pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
+
+    return seekSucceeded;
+}
+
+// Returns pFlac->currentBytePos reduced by the bytes counted as still unread in the L1 and L2 caches.
+static long long drflac__tell(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+
+    // Whole bytes of the L1 that have not been consumed yet.
+    size_t pendingL1Bytes = (DRFLAC_CACHE_L1_SIZE_BYTES - (pFlac->consumedBits/8));
+
+    // The L2 size minus the lines that have already been handed to the L1 (not counting the unused lines).
+    size_t pendingL2Bytes = (DRFLAC_CACHE_L2_SIZE_BYTES - ((pFlac->nextL2Line - pFlac->unusedL2Lines)*DRFLAC_CACHE_L1_SIZE_BYTES));
+
+    return pFlac->currentBytePos - pendingL1Bytes - pendingL2Bytes;
+}
+
+
+
+// Reads a number stored with the UTF-8 style variable-length coding (1 to 7 bytes).
+//
+// On success the decoded value is written to *pNumberOut and true is returned. On a failed read or an invalid
+// leading byte, *pNumberOut is set to 0 and false is returned.
+static bool drflac__read_utf8_coded_number(drflac* pFlac, unsigned long long* pNumberOut)
+{
+    assert(pFlac != NULL);
+    assert(pNumberOut != NULL);
+
+    // We should never need to read UTF-8 data while not being aligned to a byte boundary.
+    assert((pFlac->consumedBits & 7) == 0);
+
+    unsigned char leadByte = 0;
+    if (!drflac__read_uint8(pFlac, 8, &leadByte)) {
+        *pNumberOut = 0;
+        return false;
+    }
+
+    // A clear top bit means the value is stored in this single byte.
+    if ((leadByte & 0x80) == 0) {
+        *pNumberOut = leadByte;
+        return true;
+    }
+
+    // The number of leading one bits gives the total length of the sequence in bytes.
+    int byteCount = 0;
+    for (unsigned char probe = leadByte; (probe & 0x80) != 0; probe = (unsigned char)(probe << 1)) {
+        byteCount += 1;
+    }
+
+    // One leading one bit is a continuation byte and eight is not a valid leading byte at all.
+    if (byteCount < 2 || byteCount > 7) {
+        *pNumberOut = 0;
+        return false;     // Bad UTF-8 encoding.
+    }
+
+    // The leading byte contributes whatever bits follow its length marker; every extra byte contributes six more.
+    unsigned long long decoded = ((long long)(leadByte & (0xFF >> (byteCount + 1))));
+    for (int iByte = 1; iByte < byteCount; ++iByte) {
+        unsigned char extraByte = 0;
+        if (!drflac__read_uint8(pFlac, 8, &extraByte)) {
+            *pNumberOut = 0;
+            return false;
+        }
+
+        decoded = (decoded << 6) | (extraByte & 0x3F);
+    }
+
+    *pNumberOut = decoded;
+    return true;
+}
+
+
+
+// Skips over one Rice-coded value without decoding it: first everything up to and including the next set bit,
+// then the m bits that follow.
+//
+// Returns false if either step fails.
+static DRFLAC_INLINE bool drflac__read_and_seek_rice(drflac* pFlac, unsigned char m)
+{
+    unsigned int ignoredZeroCount;
+    if (!drflac__seek_past_next_set_bit(pFlac, &ignoredZeroCount)) {
+        return false;
+    }
+
+    // With no trailing bits there is nothing more to skip.
+    if (m == 0) {
+        return true;
+    }
+
+    return drflac__seek_bits(pFlac, m);
+}
+
+
+// The next two functions are responsible for calculating the prediction.
+//
+// When the bits per sample is >16 we need to use 64-bit integer arithmetic because otherwise we'll run out of precision. It's
+// safe to assume this will be slower on 32-bit platforms so we use a more optimal solution when the bits per sample is <=16.
+//
+//
+// Optimization Experiment #1
+//
+// The first optimization experiment I'm trying here is a loop unroll for the most common LPC orders. I've done a little test
+// and the results are as follows, in order of most common:
+// 1)  order = 8  : 93.1M
+// 2)  order = 7  : 36.6M
+// 3)  order = 3  : 33.2M
+// 4)  order = 6  : 20.9M
+// 5)  order = 5  : 18.1M
+// 6)  order = 4  : 15.8M
+// 7)  order = 12 : 10.8M
+// 8)  order = 2  :  9.8M
+// 9)  order = 1  :  1.6M
+// 10) order = 10 :  1.0M
+// 11) order = 9  :  0.8M
+// 12) order = 11 :  0.8M
+//
+// We'll experiment with unrolling the top 8 most common ones. We'll ignore the least common ones since there seems to be a
+// large drop off there.
+//
+// Result: There's a tiny improvement in some cases, but it could just be within margin of error so unsure if it's worthwhile
+// just yet.
+// Calculates the prediction for one sample, accumulating the sum in an int.
+//
+// order           - The number of coefficient/sample pairs to sum. Must be <= 32.
+// shift           - The number of bits the accumulated sum is shifted right by before it is returned.
+// coefficients    - The predictor coefficients; entries [0, order) are read.
+// pDecodedSamples - Points at the sample being predicted; the entries at [-order, -1] are read.
+//
+// Returns the sum of coefficients[i] * pDecodedSamples[-i-1] for i in [0, order), shifted right by shift.
+static DRFLAC_INLINE int32_t drflac__calculate_prediction_32(unsigned int order, int shift, const short* coefficients, int32_t* pDecodedSamples)
+{
+    assert(order <= 32);
+
+    // 32-bit version.
+
+    // This method is slower on both 32- and 64-bit builds with VC++. Leaving this here for now just in case we need it later
+    // for whatever reason.
+    // (Compiled out: an if/else chain with hand-unrolled sums for orders 1 to 12 and a generic loop for the rest.)
+#if 0
+    int prediction;
+    if (order == 8)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+        prediction += coefficients[5] * pDecodedSamples[-6];
+        prediction += coefficients[6] * pDecodedSamples[-7];
+        prediction += coefficients[7] * pDecodedSamples[-8];
+    }
+    else if (order == 7)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+        prediction += coefficients[5] * pDecodedSamples[-6];
+        prediction += coefficients[6] * pDecodedSamples[-7];
+    }
+    else if (order == 3)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+    }
+    else if (order == 6)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+        prediction += coefficients[5] * pDecodedSamples[-6];
+    }
+    else if (order == 5)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+    }
+    else if (order == 4)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+    }
+    else if (order == 12)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+        prediction += coefficients[9]  * pDecodedSamples[-10];
+        prediction += coefficients[10] * pDecodedSamples[-11];
+        prediction += coefficients[11] * pDecodedSamples[-12];
+    }
+    else if (order == 2)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+    }
+    else if (order == 1)
+    {
+        prediction = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+    }
+    else if (order == 10)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+        prediction += coefficients[9]  * pDecodedSamples[-10];
+    }
+    else if (order == 9)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+    }
+    else if (order == 11)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+        prediction += coefficients[9]  * pDecodedSamples[-10];
+        prediction += coefficients[10] * pDecodedSamples[-11];
+    }
+    else
+    {
+        prediction = 0;
+        for (int j = 0; j < (int)order; ++j) {
+            prediction += coefficients[j] * pDecodedSamples[-j-1];
+        }
+    }
+#endif
+
+    // Experiment #2. See if we can use a switch and let the compiler optimize it to a jump table.
+    // Result: VC++ definitely optimizes this to a single jmp as expected. I expect other compilers should do the same, but I've
+    // not verified yet.
+#if 1
+    int prediction = 0;
+
+    // Every case deliberately falls through to the one below it, so entering at `order` adds the terms for
+    // order, order-1, ..., 1. An order of 0 matches no case and leaves the prediction at 0.
+    switch (order)
+    {
+    case 32: prediction += coefficients[31] * pDecodedSamples[-32];
+    case 31: prediction += coefficients[30] * pDecodedSamples[-31];
+    case 30: prediction += coefficients[29] * pDecodedSamples[-30];
+    case 29: prediction += coefficients[28] * pDecodedSamples[-29];
+    case 28: prediction += coefficients[27] * pDecodedSamples[-28];
+    case 27: prediction += coefficients[26] * pDecodedSamples[-27];
+    case 26: prediction += coefficients[25] * pDecodedSamples[-26];
+    case 25: prediction += coefficients[24] * pDecodedSamples[-25];
+    case 24: prediction += coefficients[23] * pDecodedSamples[-24];
+    case 23: prediction += coefficients[22] * pDecodedSamples[-23];
+    case 22: prediction += coefficients[21] * pDecodedSamples[-22];
+    case 21: prediction += coefficients[20] * pDecodedSamples[-21];
+    case 20: prediction += coefficients[19] * pDecodedSamples[-20];
+    case 19: prediction += coefficients[18] * pDecodedSamples[-19];
+    case 18: prediction += coefficients[17] * pDecodedSamples[-18];
+    case 17: prediction += coefficients[16] * pDecodedSamples[-17];
+    case 16: prediction += coefficients[15] * pDecodedSamples[-16];
+    case 15: prediction += coefficients[14] * pDecodedSamples[-15];
+    case 14: prediction += coefficients[13] * pDecodedSamples[-14];
+    case 13: prediction += coefficients[12] * pDecodedSamples[-13];
+    case 12: prediction += coefficients[11] * pDecodedSamples[-12];
+    case 11: prediction += coefficients[10] * pDecodedSamples[-11];
+    case 10: prediction += coefficients[ 9] * pDecodedSamples[-10];
+    case  9: prediction += coefficients[ 8] * pDecodedSamples[- 9];
+    case  8: prediction += coefficients[ 7] * pDecodedSamples[- 8];
+    case  7: prediction += coefficients[ 6] * pDecodedSamples[- 7];
+    case  6: prediction += coefficients[ 5] * pDecodedSamples[- 6];
+    case  5: prediction += coefficients[ 4] * pDecodedSamples[- 5];
+    case  4: prediction += coefficients[ 3] * pDecodedSamples[- 4];
+    case  3: prediction += coefficients[ 2] * pDecodedSamples[- 3];
+    case  2: prediction += coefficients[ 1] * pDecodedSamples[- 2];
+    case  1: prediction += coefficients[ 0] * pDecodedSamples[- 1];
+    }
+#endif
+
+    return (int32_t)(prediction >> shift);
+}
+
+// Computes the linear prediction for the sample located at pDecodedSamples[0] using 64-bit accumulation.
+//
+// The result is the sum of coefficients[j] * pDecodedSamples[-(j+1)] for every j in [0, order), right-shifted by <shift>
+// and truncated to 32 bits. This means at least <order> already-decoded samples must sit immediately before
+// pDecodedSamples, and <order> must not exceed 32 (asserted below).
+//
+// NOTE(review): <shift> is used directly as the shift count of a 64-bit value, so callers are presumably expected to
+// pass a non-negative value smaller than 64 - confirm at the call sites.
+static DRFLAC_INLINE int32_t drflac__calculate_prediction(unsigned int order, int shift, const short* coefficients, int32_t* pDecodedSamples)
+{
+    assert(order <= 32);
+
+    // 64-bit version.
+
+    // This method is faster on the 32-bit build when compiling with VC++. See note below.
+    // Orders 1 through 12 each get their own fully unrolled chain of multiply-adds; any other order is handled by the
+    // generic loop in the final else branch. Exactly one of this block and the switch further down is compiled in,
+    // depending on whether DRFLAC_64BIT is defined.
+#ifndef DRFLAC_64BIT
+    long long prediction;
+    if (order == 8)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6] * (long long)pDecodedSamples[-7];
+        prediction += (long long)coefficients[7] * (long long)pDecodedSamples[-8];
+    }
+    else if (order == 7)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6] * (long long)pDecodedSamples[-7];
+    }
+    else if (order == 3)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+    }
+    else if (order == 6)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
+    }
+    else if (order == 5)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
+    }
+    else if (order == 4)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+    }
+    else if (order == 12)
+    {
+        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
+        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
+        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
+        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
+        prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
+        prediction += (long long)coefficients[11] * (long long)pDecodedSamples[-12];
+    }
+    else if (order == 2)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+    }
+    else if (order == 1)
+    {
+        prediction = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+    }
+    else if (order == 10)
+    {
+        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
+        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
+        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
+        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
+    }
+    else if (order == 9)
+    {
+        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
+        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
+        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
+    }
+    else if (order == 11)
+    {
+        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
+        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
+        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
+        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
+        prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
+    }
+    else
+    {
+        // Generic fallback for order 0 and for orders 13 through 32.
+        prediction = 0;
+        for (int j = 0; j < (int)order; ++j) {
+            prediction += (long long)coefficients[j] * (long long)pDecodedSamples[-j-1];
+        }
+    }
+#endif
+
+    // Experiment #2. See if we can use a switch and let the compiler optimize it to a single jmp instruction.
+    // Result: VC++ optimizes this to a single jmp on the 64-bit build, but for some reason the 32-bit version compiles to less efficient
+    // code. Thus, we use this version on the 64-bit build and the uglier version above for the 32-bit build. If anyone has an idea on how
+    // I can get VC++ to generate an efficient jump table for the 32-bit build let me know.
+#ifdef DRFLAC_64BIT
+    long long prediction = 0;
+
+    // Every case deliberately falls through to the one below it: entering at "case N" accumulates taps N, N-1, ..., 1.
+    // An order of 0 matches no case and leaves the prediction at 0.
+    switch (order)
+    {
+    case 32: prediction += (long long)coefficients[31] * (long long)pDecodedSamples[-32];
+    case 31: prediction += (long long)coefficients[30] * (long long)pDecodedSamples[-31];
+    case 30: prediction += (long long)coefficients[29] * (long long)pDecodedSamples[-30];
+    case 29: prediction += (long long)coefficients[28] * (long long)pDecodedSamples[-29];
+    case 28: prediction += (long long)coefficients[27] * (long long)pDecodedSamples[-28];
+    case 27: prediction += (long long)coefficients[26] * (long long)pDecodedSamples[-27];
+    case 26: prediction += (long long)coefficients[25] * (long long)pDecodedSamples[-26];
+    case 25: prediction += (long long)coefficients[24] * (long long)pDecodedSamples[-25];
+    case 24: prediction += (long long)coefficients[23] * (long long)pDecodedSamples[-24];
+    case 23: prediction += (long long)coefficients[22] * (long long)pDecodedSamples[-23];
+    case 22: prediction += (long long)coefficients[21] * (long long)pDecodedSamples[-22];
+    case 21: prediction += (long long)coefficients[20] * (long long)pDecodedSamples[-21];
+    case 20: prediction += (long long)coefficients[19] * (long long)pDecodedSamples[-20];
+    case 19: prediction += (long long)coefficients[18] * (long long)pDecodedSamples[-19];
+    case 18: prediction += (long long)coefficients[17] * (long long)pDecodedSamples[-18];
+    case 17: prediction += (long long)coefficients[16] * (long long)pDecodedSamples[-17];
+    case 16: prediction += (long long)coefficients[15] * (long long)pDecodedSamples[-16];
+    case 15: prediction += (long long)coefficients[14] * (long long)pDecodedSamples[-15];
+    case 14: prediction += (long long)coefficients[13] * (long long)pDecodedSamples[-14];
+    case 13: prediction += (long long)coefficients[12] * (long long)pDecodedSamples[-13];
+    case 12: prediction += (long long)coefficients[11] * (long long)pDecodedSamples[-12];
+    case 11: prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
+    case 10: prediction += (long long)coefficients[ 9] * (long long)pDecodedSamples[-10];
+    case  9: prediction += (long long)coefficients[ 8] * (long long)pDecodedSamples[- 9];
+    case  8: prediction += (long long)coefficients[ 7] * (long long)pDecodedSamples[- 8];
+    case  7: prediction += (long long)coefficients[ 6] * (long long)pDecodedSamples[- 7];
+    case  6: prediction += (long long)coefficients[ 5] * (long long)pDecodedSamples[- 6];
+    case  5: prediction += (long long)coefficients[ 4] * (long long)pDecodedSamples[- 5];
+    case  4: prediction += (long long)coefficients[ 3] * (long long)pDecodedSamples[- 4];
+    case  3: prediction += (long long)coefficients[ 2] * (long long)pDecodedSamples[- 3];
+    case  2: prediction += (long long)coefficients[ 1] * (long long)pDecodedSamples[- 2];
+    case  1: prediction += (long long)coefficients[ 0] * (long long)pDecodedSamples[- 1];
+    }
+#endif
+
+    // Scale the accumulated sum back down and narrow it to the 32-bit sample type.
+    return (int32_t)(prediction >> shift);
+}
+
+
+// Reads and decodes a string of residual values as Rice codes. The decoder should be sitting on the first bit of the Rice codes.
+//
+// This is the most frequently called function in the library. It does both the Rice decoding and the prediction in a single loop
+// iteration.
+//
+// For each of the <count> output slots the function decodes one Rice-coded residual with parameter <riceParam>, adds the
+// prediction computed from the <order> samples preceding that slot (using <coefficients> and <shift>) and stores the sum in
+// pSamplesOut[i]. Returns false as soon as the bit cache cannot be reloaded, true once all <count> samples are written.
+static bool drflac__decode_samples_with_residual__rice(drflac* pFlac, unsigned int count, unsigned char riceParam, unsigned int order, int shift, const short* coefficients, int* pSamplesOut)
+{
+    assert(pFlac != NULL);
+    assert(count > 0);
+    assert(pSamplesOut != NULL);
+
+    // Lookup table indexed by a 4-bit value. Each entry is 1 + the position (counted from the most significant of the four
+    // bits) of the first set bit, or 0 when none of the four bits is set.
+    static unsigned int bitOffsetTable[] = {
+        0,
+        4,
+        3, 3,
+        2, 2, 2, 2,
+        1, 1, 1, 1, 1, 1, 1, 1
+    };
+
+    // Both values depend only on riceParam, so they are computed once outside the per-sample loop.
+    drflac_cache_t riceParamMask = DRFLAC_CACHE_L1_SELECTION_MASK(riceParam);
+    drflac_cache_t resultHiShift = DRFLAC_CACHE_L1_SIZE_BITS - riceParam;
+
+    for (int i = 0; i < (int)count; ++i)
+    {
+        // Unary part: while the cache holds no set bit at all, count every remaining cache bit as a zero and pull in more data.
+        unsigned int zeroCounter = 0;
+        while (pFlac->cache == 0) {
+            zeroCounter += (unsigned int)DRFLAC_CACHE_L1_BITS_REMAINING;
+            if (!drflac__reload_cache(pFlac)) {
+                return false;
+            }
+        }
+
+        // At this point the cache should not be zero, in which case we know the first set bit should be somewhere in here. There is
+        // no need for us to perform any cache reloading logic here which should make things much faster.
+        assert(pFlac->cache != 0);
+        unsigned int decodedRice;
+
+        // Fast path: look the first set bit up in the table. NOTE(review): DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4) presumably
+        // yields the four most significant cache bits as a value in [0, 15] - confirm against the macro definition.
+        unsigned int setBitOffsetPlus1 = bitOffsetTable[DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4)];
+        if (setBitOffsetPlus1 > 0) {
+            decodedRice = (zeroCounter + (setBitOffsetPlus1-1)) << riceParam;
+        } else {
+            if (pFlac->cache == 1) {
+                // Only the least significant bit is set, so the terminating bit is the very last one in the cache.
+                setBitOffsetPlus1 = DRFLAC_CACHE_L1_SIZE_BITS;
+                decodedRice = (zeroCounter + (DRFLAC_CACHE_L1_SIZE_BITS-1)) << riceParam;
+            } else {
+                // The table reported no set bit in the first four positions, so resume the search at position five. The cache
+                // is known to be non-zero, which is what lets this loop run without an explicit upper bound.
+                setBitOffsetPlus1 = 5;
+                for (;;)
+                {
+                    if ((pFlac->cache & DRFLAC_CACHE_L1_SELECT(setBitOffsetPlus1))) {
+                        decodedRice = (zeroCounter + (setBitOffsetPlus1-1)) << riceParam;
+                        break;
+                    }
+
+                    setBitOffsetPlus1 += 1;
+                }
+            }
+        }
+
+
+        // Binary part: the <riceParam> bits that follow the terminating set bit. riceLength is the total number of bits
+        // (unary prefix within this cache word plus the binary part) that must be consumed.
+        unsigned int bitsLo = 0;
+        unsigned int riceLength = setBitOffsetPlus1 + riceParam;
+        if (riceLength < DRFLAC_CACHE_L1_BITS_REMAINING)
+        {
+            // Everything needed is already in the cache: extract the low bits and discard the whole code in one shift.
+            bitsLo = (unsigned int)((pFlac->cache & (riceParamMask >> setBitOffsetPlus1)) >> (DRFLAC_CACHE_L1_SIZE_BITS - riceLength));
+
+            pFlac->consumedBits += riceLength;
+            pFlac->cache <<= riceLength;
+        }
+        else
+        {
+            pFlac->consumedBits += riceLength;
+            pFlac->cache <<= setBitOffsetPlus1;
+
+            // It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them.
+            size_t bitCountLo = pFlac->consumedBits - DRFLAC_CACHE_L1_SIZE_BITS;
+            drflac_cache_t resultHi = pFlac->cache & riceParamMask;    // <-- This mask is OK because all bits after the first bits are always zero.
+
+
+            // Refill the L1 cache, preferring the next line that is already buffered in L2.
+            if (pFlac->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT) {
+                pFlac->cache = drflac__be2host__cache_line(pFlac->cacheL2[pFlac->nextL2Line++]);
+            } else {
+                // Slow path. We need to fetch more data from the client.
+                if (!drflac__reload_cache(pFlac)) {
+                    return false;
+                }
+            }
+
+            // Combine the part taken from the old cache word with the first bitCountLo bits of the fresh one.
+            bitsLo = (unsigned int)((resultHi >> resultHiShift) | DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountLo));
+            pFlac->consumedBits = bitCountLo;
+            pFlac->cache <<= bitCountLo;
+        }
+
+
+        // Merge quotient and remainder, then undo the sign folding: an odd value maps to ~(value >> 1) (negative), an even
+        // value maps to value >> 1 (non-negative).
+        decodedRice |= bitsLo;
+        if ((decodedRice & 0x01)) {
+            decodedRice = ~(decodedRice >> 1);
+        } else {
+            decodedRice = (decodedRice >> 1);
+        }
+
+
+        // In order to properly calculate the prediction when the bits per sample is >16 we need to do it using 64-bit arithmetic. We can assume this
+        // is probably going to be slower on 32-bit systems so we'll do a more optimized 32-bit version when the bits per sample is low enough.
+        if (pFlac->currentFrame.bitsPerSample > 16) {
+            pSamplesOut[i] = ((int)decodedRice + drflac__calculate_prediction(order, shift, coefficients, pSamplesOut + i));
+        } else {
+            pSamplesOut[i] = ((int)decodedRice + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + i));
+        }
+    }
+
+    return true;
+}
+
+
+// Skips over <count> consecutive Rice-coded residual values without decoding them into samples. The decoder should be
+// sitting on the first bit of the Rice codes. Returns false as soon as one code cannot be read, true otherwise.
+static bool drflac__read_and_seek_residual__rice(drflac* pFlac, unsigned int count, unsigned char riceParam)
+{
+    assert(pFlac != NULL);
+    assert(count > 0);
+
+    unsigned int codesLeft = count;
+    while (codesLeft > 0)
+    {
+        // Each call consumes exactly one Rice code from the stream.
+        if (!drflac__read_and_seek_rice(pFlac, riceParam)) {
+            return false;
+        }
+
+        codesLeft -= 1;
+    }
+
+    return true;
+}
+
+// Decodes <count> residuals that are stored as plain signed integers of <unencodedBitsPerSample> bits each (the "escape"
+// form of a residual partition), adds the prediction computed from the <order> samples preceding each slot and writes the
+// result to pSamplesOut.
+//
+// A width of 0 bits is valid for an escaped partition and means every residual in it is zero; in that case no bits are
+// consumed from the stream for the residuals and each output sample is just the prediction.
+//
+// Returns false if the stream runs out of data, true once all <count> samples have been written.
+static bool drflac__decode_samples_with_residual__unencoded(drflac* pFlac, unsigned int count, unsigned char unencodedBitsPerSample, unsigned int order, int shift, const short* coefficients, int* pSamplesOut)
+{
+    assert(pFlac != NULL);
+    assert(count > 0);
+    assert(unencodedBitsPerSample <= 32);
+    assert(pSamplesOut != NULL);
+
+    for (unsigned int i = 0; i < count; ++i)
+    {
+        if (unencodedBitsPerSample > 0) {
+            if (!drflac__read_int32(pFlac, unencodedBitsPerSample, pSamplesOut + i)) {
+                return false;
+            }
+        } else {
+            // Zero-width residual: nothing to read, the residual is simply 0.
+            pSamplesOut[i] = 0;
+        }
+
+        // The 64-bit predictor is always used here, regardless of the frame's bit depth.
+        pSamplesOut[i] += drflac__calculate_prediction(order, shift, coefficients, pSamplesOut + i);
+    }
+
+    return true;
+}
+
+
+// Reads and decodes the residual for the sub-frame the decoder is currently sitting on. This function should be called
+// when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be ignored. The
+// <blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
+//
+// pDecodedSamples must already contain the <order> warm-up samples; the decoded samples are written after them. Returns
+// false if the stream ends early, uses an unsupported residual coding method, or declares a partition layout whose first
+// partition is too small to hold the <order> warm-up samples.
+static bool drflac__decode_samples_with_residual(drflac* pFlac, unsigned int blockSize, unsigned int order, int shift, const short* coefficients, int* pDecodedSamples)
+{
+    assert(pFlac != NULL);
+    assert(blockSize != 0);
+    assert(pDecodedSamples != NULL);       // <-- Should we allow NULL, in which case we just seek past the residual rather than do a full decode?
+
+    unsigned char residualMethod;
+    if (!drflac__read_uint8(pFlac, 2, &residualMethod)) {
+        return false;
+    }
+
+    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+        return false;    // Unknown or unsupported residual coding method.
+    }
+
+    unsigned char partitionOrder;
+    if (!drflac__read_uint8(pFlac, 4, &partitionOrder)) {
+        return false;
+    }
+
+    // The block is split into 2^partitionOrder partitions of equal size; the first one is shortened by the warm-up samples.
+    unsigned int partitionCount      = 1u << partitionOrder;
+    unsigned int samplesPerPartition = blockSize / partitionCount;
+
+    // Validation. Without this check the unsigned subtraction below would wrap around on a malformed stream and the
+    // decoder would write far past the end of the sample buffer.
+    if (samplesPerPartition < order) {
+        return false;
+    }
+
+    // Ignore the first <order> values.
+    pDecodedSamples += order;
+
+    unsigned int samplesInPartition = samplesPerPartition - order;
+    for (unsigned int partitionsRemaining = partitionCount; partitionsRemaining > 0; --partitionsRemaining)
+    {
+        // The all-ones parameter value (15 for the 4-bit form, 31 for the 5-bit form) is the escape code which signals
+        // that the partition is stored unencoded. It is normalized to 0xFF here.
+        unsigned char riceParam = 0;
+        if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
+            if (!drflac__read_uint8(pFlac, 4, &riceParam)) {
+                return false;
+            }
+            if (riceParam == 15) {
+                riceParam = 0xFF;
+            }
+        } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+            if (!drflac__read_uint8(pFlac, 5, &riceParam)) {
+                return false;
+            }
+            if (riceParam == 31) {
+                riceParam = 0xFF;
+            }
+        }
+
+        if (riceParam != 0xFF) {
+            // An empty first partition carries no codes, so there is nothing to decode for it.
+            if (samplesInPartition > 0) {
+                if (!drflac__decode_samples_with_residual__rice(pFlac, samplesInPartition, riceParam, order, shift, coefficients, pDecodedSamples)) {
+                    return false;
+                }
+            }
+        } else {
+            // The width of the unencoded residuals always follows the escape code, even for an empty partition.
+            unsigned char unencodedBitsPerSample = 0;
+            if (!drflac__read_uint8(pFlac, 5, &unencodedBitsPerSample)) {
+                return false;
+            }
+
+            if (samplesInPartition > 0) {
+                if (!drflac__decode_samples_with_residual__unencoded(pFlac, samplesInPartition, unencodedBitsPerSample, order, shift, coefficients, pDecodedSamples)) {
+                    return false;
+                }
+            }
+        }
+
+        pDecodedSamples += samplesInPartition;
+
+        // Every partition after the first one holds the full number of samples.
+        samplesInPartition = samplesPerPartition;
+    }
+
+    return true;
+}
+
+// Reads and seeks past the residual for the sub-frame the decoder is currently sitting on. This function should be called
+// when the decoder is sitting at the very start of the RESIDUAL block. Nothing is decoded or stored; the <blockSize> and
+// <order> parameters are only used to determine how many residual values need to be skipped.
+//
+// Returns false if the stream ends early, uses an unsupported residual coding method, or declares a partition layout whose
+// first partition is too small to hold the <order> warm-up samples.
+static bool drflac__read_and_seek_residual(drflac* pFlac, unsigned int blockSize, unsigned int order)
+{
+    assert(pFlac != NULL);
+    assert(blockSize != 0);
+
+    unsigned char residualMethod;
+    if (!drflac__read_uint8(pFlac, 2, &residualMethod)) {
+        return false;
+    }
+
+    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+        return false;    // Unknown or unsupported residual coding method.
+    }
+
+    unsigned char partitionOrder;
+    if (!drflac__read_uint8(pFlac, 4, &partitionOrder)) {
+        return false;
+    }
+
+    // The block is split into 2^partitionOrder partitions of equal size; the first one is shortened by the warm-up samples.
+    unsigned int partitionCount      = 1u << partitionOrder;
+    unsigned int samplesPerPartition = blockSize / partitionCount;
+
+    // Validation. Without this check the unsigned subtraction below would wrap around on a malformed stream and produce a
+    // huge bogus residual count.
+    if (samplesPerPartition < order) {
+        return false;
+    }
+
+    unsigned int samplesInPartition = samplesPerPartition - order;
+    for (unsigned int partitionsRemaining = partitionCount; partitionsRemaining > 0; --partitionsRemaining)
+    {
+        // The all-ones parameter value (15 for the 4-bit form, 31 for the 5-bit form) is the escape code which signals
+        // that the partition is stored unencoded. It is normalized to 0xFF here.
+        unsigned char riceParam = 0;
+        if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
+            if (!drflac__read_uint8(pFlac, 4, &riceParam)) {
+                return false;
+            }
+            if (riceParam == 15) {
+                riceParam = 0xFF;
+            }
+        } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+            if (!drflac__read_uint8(pFlac, 5, &riceParam)) {
+                return false;
+            }
+            if (riceParam == 31) {
+                riceParam = 0xFF;
+            }
+        }
+
+        if (riceParam != 0xFF) {
+            // An empty first partition carries no codes, so there is nothing to skip for it.
+            if (samplesInPartition > 0) {
+                if (!drflac__read_and_seek_residual__rice(pFlac, samplesInPartition, riceParam)) {
+                    return false;
+                }
+            }
+        } else {
+            // The width of the unencoded residuals always follows the escape code, even for an empty partition.
+            unsigned char unencodedBitsPerSample = 0;
+            if (!drflac__read_uint8(pFlac, 5, &unencodedBitsPerSample)) {
+                return false;
+            }
+
+            // Unencoded residuals have a fixed width, so the whole partition can be skipped in one go.
+            unsigned int bitsToSkip = unencodedBitsPerSample * samplesInPartition;
+            if (bitsToSkip > 0) {
+                if (!drflac__seek_bits(pFlac, bitsToSkip)) {
+                    return false;
+                }
+            }
+        }
+
+        // Every partition after the first one holds the full number of samples.
+        samplesInPartition = samplesPerPartition;
+    }
+
+    return true;
+}
+
+
+// Decodes a CONSTANT subframe: a single value of pSubframe->bitsPerSample bits is read from the stream and every sample
+// of the current block is set to it. Returns false if the value cannot be read.
+static bool drflac__decode_samples__constant(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    // The stream stores just one value for the whole block.
+    int constantValue;
+    if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &constantValue)) {
+        return false;
+    }
+
+    // Expanding the value into the full buffer is not strictly required, but it lets the sample-reading code treat every
+    // subframe type the same way. Revisit only if this ever shows up as a performance problem (unlikely).
+    unsigned int sampleIndex = 0;
+    while (sampleIndex < pFlac->currentFrame.blockSize)
+    {
+        pSubframe->pDecodedSamples[sampleIndex] = constantValue;
+        sampleIndex += 1;
+    }
+
+    return true;
+}
+
+// Decodes a VERBATIM subframe: one value of pSubframe->bitsPerSample bits is read from the stream for every sample of the
+// current block and stored as-is. Returns false as soon as a value cannot be read.
+static bool drflac__decode_samples__verbatim(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    unsigned int sampleIndex = 0;
+    while (sampleIndex < pFlac->currentFrame.blockSize)
+    {
+        // Read into a temporary so the output slot is only touched when the read succeeded.
+        int rawValue;
+        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &rawValue)) {
+            return false;
+        }
+
+        pSubframe->pDecodedSamples[sampleIndex] = rawValue;
+        sampleIndex += 1;
+    }
+
+    return true;
+}
+
+// Decodes a FIXED subframe: pSubframe->lpcOrder warm-up samples are read verbatim, then the residual is decoded using the
+// predictor coefficients that belong to that order (with a shift of 0). Returns false if any read fails.
+static bool drflac__decode_samples__fixed(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    // One row of predictor coefficients per order, indexed by pSubframe->lpcOrder (0 through 4).
+    short lpcCoefficientsTable[5][4] = {
+        {0,  0, 0,  0},
+        {1,  0, 0,  0},
+        {2, -1, 0,  0},
+        {3, -3, 1,  0},
+        {4, -6, 4, -1}
+    };
+
+    // Warm-up samples are stored unencoded at the start of the subframe.
+    unsigned int warmupIndex = 0;
+    while (warmupIndex < pSubframe->lpcOrder)
+    {
+        int warmupSample;
+        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &warmupSample)) {
+            return false;
+        }
+
+        pSubframe->pDecodedSamples[warmupIndex] = warmupSample;
+        warmupIndex += 1;
+    }
+
+    // Everything after the warm-up samples is prediction + residual.
+    return drflac__decode_samples_with_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder, 0, lpcCoefficientsTable[pSubframe->lpcOrder], pSubframe->pDecodedSamples) ? true : false;
+}
+
+// Decodes an LPC subframe. The layout read here is: pSubframe->lpcOrder warm-up samples, a 4-bit coefficient precision
+// (stored minus one), a signed 5-bit shift, lpcOrder coefficients of that precision, and finally the residual.
+//
+// Returns false if any read fails, if the precision field holds the invalid value 15, or if the shift is negative.
+static bool drflac__decode_samples__lpc(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    // Warm up samples.
+    for (unsigned int i = 0; i < pSubframe->lpcOrder; ++i) {
+        int sample;
+        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &sample)) {
+            return false;
+        }
+
+        pSubframe->pDecodedSamples[i] = sample;
+    }
+
+    unsigned char lpcPrecision;
+    if (!drflac__read_uint8(pFlac, 4, &lpcPrecision)) {
+        return false;
+    }
+    if (lpcPrecision == 15) {
+        return false;    // Invalid.
+    }
+    lpcPrecision += 1;   // The field stores the precision minus one.
+
+
+    signed char lpcShift;
+    if (!drflac__read_int8(pFlac, 5, &lpcShift)) {
+        return false;
+    }
+
+    // The shift is stored as a signed number, but it ends up as the right-hand operand of ">>" in the prediction
+    // routines, where a negative count is undefined behavior. Reject such streams instead of decoding garbage.
+    if (lpcShift < 0) {
+        return false;
+    }
+
+
+    // Sized for the largest order the subframe header can express (32); only the first lpcOrder entries are filled in.
+    short coefficients[32];
+    for (unsigned int i = 0; i < pSubframe->lpcOrder; ++i) {
+        if (!drflac__read_int16(pFlac, lpcPrecision, coefficients + i)) {
+            return false;
+        }
+    }
+
+    if (!drflac__decode_samples_with_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder, lpcShift, coefficients, pSubframe->pDecodedSamples)) {
+        return false;
+    }
+
+    return true;
+}
+
+
+// Reads the header of the next frame into pFlac->currentFrame and clears the frame's subframe array.
+//
+// The fields are read in stream order: 14-bit sync code, a reserved bit, the blocking strategy, the 4-bit block size,
+// sample rate and channel assignment codes, the 3-bit bits-per-sample code, another reserved bit, the UTF-8 coded
+// frame/sample number, the optional explicit block size and sample rate, and the 8-bit CRC.
+//
+// Returns false if the stream ends early, the sync code does not match, or one of the block size, sample rate or
+// bits-per-sample fields holds a reserved/invalid value.
+static bool drflac__read_next_frame_header(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+    assert(pFlac->onRead != NULL);
+
+    // At the moment the sync code is as a form of basic validation. The CRC is stored, but is unused at the moment. This
+    // should probably be handled better in the future.
+
+    const int sampleRateTable[12]       = {0, 88200, 176400, 192000, 8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000};
+    const uint8_t bitsPerSampleTable[8] = {0, 8, 12, (uint8_t)-1, 16, 20, 24, (uint8_t)-1};   // -1 = reserved.
+
+    unsigned short syncCode = 0;
+    if (!drflac__read_uint16(pFlac, 14, &syncCode)) {
+        return false;
+    }
+
+    if (syncCode != 0x3FFE) {
+        // TODO: Try and recover by attempting to seek to and read the next frame?
+        return false;
+    }
+
+    unsigned char reserved;
+    if (!drflac__read_uint8(pFlac, 1, &reserved)) {
+        return false;
+    }
+
+    unsigned char blockingStrategy = 0;
+    if (!drflac__read_uint8(pFlac, 1, &blockingStrategy)) {
+        return false;
+    }
+
+
+
+    unsigned char blockSize = 0;
+    if (!drflac__read_uint8(pFlac, 4, &blockSize)) {
+        return false;
+    }
+
+    unsigned char sampleRate = 0;
+    if (!drflac__read_uint8(pFlac, 4, &sampleRate)) {
+        return false;
+    }
+
+    unsigned char channelAssignment = 0;
+    if (!drflac__read_uint8(pFlac, 4, &channelAssignment)) {
+        return false;
+    }
+
+    unsigned char bitsPerSample = 0;
+    if (!drflac__read_uint8(pFlac, 3, &bitsPerSample)) {
+        return false;
+    }
+
+    if (!drflac__read_uint8(pFlac, 1, &reserved)) {
+        return false;
+    }
+
+    // Block size code 0 is reserved. It must be rejected here because the power-of-two formula further down would
+    // otherwise be evaluated with a negative shift count.
+    if (blockSize == 0) {
+        return false;
+    }
+
+    // Bits-per-sample codes 3 and 7 are the reserved entries of the table above.
+    if (bitsPerSample == 3 || bitsPerSample == 7) {
+        return false;
+    }
+
+
+    // Variable block size streams store a sample number here, fixed block size streams store a frame number.
+    unsigned char isVariableBlockSize = blockingStrategy == 1;
+    if (isVariableBlockSize) {
+        pFlac->currentFrame.frameNumber = 0;
+        if (!drflac__read_utf8_coded_number(pFlac, &pFlac->currentFrame.sampleNumber)) {
+            return false;
+        }
+    } else {
+        unsigned long long frameNumber = 0;
+        if (!drflac__read_utf8_coded_number(pFlac, &frameNumber)) {
+            return false;
+        }
+        pFlac->currentFrame.frameNumber  = (unsigned int)frameNumber;   // <-- Safe cast.
+        pFlac->currentFrame.sampleNumber = 0;
+    }
+
+
+    // Translate the 4-bit block size code into an actual sample count. Codes 6 and 7 mean the count (minus one) is
+    // stored explicitly as an 8-bit or 16-bit number right here in the header.
+    if (blockSize == 1) {
+        pFlac->currentFrame.blockSize = 192;
+    } else if (blockSize >= 2 && blockSize <= 5) {
+        pFlac->currentFrame.blockSize = 576 * (1 << (blockSize - 2));
+    } else if (blockSize == 6) {
+        if (!drflac__read_uint16(pFlac, 8, &pFlac->currentFrame.blockSize)) {
+            return false;
+        }
+        pFlac->currentFrame.blockSize += 1;
+    } else if (blockSize == 7) {
+        if (!drflac__read_uint16(pFlac, 16, &pFlac->currentFrame.blockSize)) {
+            return false;
+        }
+        // A stored value of 0xFFFF would wrap around to a block size of 0 in the 16-bit field after adding one.
+        if (pFlac->currentFrame.blockSize == 0xFFFF) {
+            return false;
+        }
+        pFlac->currentFrame.blockSize += 1;
+    } else {
+        pFlac->currentFrame.blockSize = 256 * (1 << (blockSize - 8));
+    }
+
+
+    // Translate the 4-bit sample rate code. Codes 12 to 14 store the rate explicitly in kHz, Hz and tens of Hz.
+    if (sampleRate <= 11) {
+        pFlac->currentFrame.sampleRate = sampleRateTable[sampleRate];
+    } else if (sampleRate == 12) {
+        if (!drflac__read_uint32(pFlac, 8, &pFlac->currentFrame.sampleRate)) {
+            return false;
+        }
+        pFlac->currentFrame.sampleRate *= 1000;
+    } else if (sampleRate == 13) {
+        if (!drflac__read_uint32(pFlac, 16, &pFlac->currentFrame.sampleRate)) {
+            return false;
+        }
+    } else if (sampleRate == 14) {
+        if (!drflac__read_uint32(pFlac, 16, &pFlac->currentFrame.sampleRate)) {
+            return false;
+        }
+        pFlac->currentFrame.sampleRate *= 10;
+    } else {
+        return false;  // Invalid.
+    }
+
+
+    pFlac->currentFrame.channelAssignment = channelAssignment;
+
+    // A table value of 0 means the frame does not specify the bit depth itself, so the stream-level value is used.
+    pFlac->currentFrame.bitsPerSample = bitsPerSampleTable[bitsPerSample];
+    if (pFlac->currentFrame.bitsPerSample == 0) {
+        pFlac->currentFrame.bitsPerSample = pFlac->bitsPerSample;
+    }
+
+    if (!drflac__read_uint8(pFlac, 8, &pFlac->currentFrame.crc8)) {
+        return false;
+    }
+
+    // Start every frame with clean subframe state.
+    memset(pFlac->currentFrame.subframes, 0, sizeof(pFlac->currentFrame.subframes));
+
+    return true;
+}
+
+// Reads the header of a subframe and fills in pSubframe->subframeType, pSubframe->lpcOrder (for FIXED and LPC subframes)
+// and pSubframe->wastedBitsPerSample.
+//
+// The header byte consists of a zero padding bit, a 6-bit type code and a flag telling whether a wasted-bits count
+// follows. Returns false if the stream ends early, the padding bit is set, or the type code is reserved.
+static bool drflac__read_subframe_header(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    unsigned char header;
+    if (!drflac__read_uint8(pFlac, 8, &header)) {
+        return false;
+    }
+
+    // First bit should always be 0.
+    if ((header & 0x80) != 0) {
+        return false;
+    }
+
+    // Type code layout: 000000 = constant, 000001 = verbatim, 001xxx = fixed predictor of order xxx (0 to 4),
+    // 1xxxxx = LPC of order xxxxx + 1. Every other pattern is reserved.
+    int type = (header & 0x7E) >> 1;
+    if (type == 0) {
+        pSubframe->subframeType = DRFLAC_SUBFRAME_CONSTANT;
+    } else if (type == 1) {
+        pSubframe->subframeType = DRFLAC_SUBFRAME_VERBATIM;
+    } else {
+        if ((type & 0x20) != 0) {
+            pSubframe->subframeType = DRFLAC_SUBFRAME_LPC;
+            pSubframe->lpcOrder = (type & 0x1F) + 1;
+        } else if ((type & 0x38) == 0x08) {
+            // All three upper bits are compared so that reserved codes of the form 01xxxx are not mistaken for a fixed
+            // predictor merely because bit 3 happens to be set.
+            pSubframe->subframeType = DRFLAC_SUBFRAME_FIXED;
+            pSubframe->lpcOrder = (type & 0x07);
+            if (pSubframe->lpcOrder > 4) {
+                pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
+                pSubframe->lpcOrder = 0;
+            }
+        } else {
+            pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
+        }
+    }
+
+    if (pSubframe->subframeType == DRFLAC_SUBFRAME_RESERVED) {
+        return false;
+    }
+
+    // Wasted bits per sample. When the flag bit is set the count follows the header; the value stored is the number
+    // reported by drflac__seek_past_next_set_bit() plus one.
+    pSubframe->wastedBitsPerSample = 0;
+    if ((header & 0x01) == 1) {
+        unsigned int wastedBitsPerSample;
+        if (!drflac__seek_past_next_set_bit(pFlac, &wastedBitsPerSample)) {
+            return false;
+        }
+        pSubframe->wastedBitsPerSample = (unsigned char)wastedBitsPerSample + 1;
+    }
+
+    return true;
+}
+
+// Decodes subframe number subframeIndex of the current frame into its slice of pFlac->pDecodedSamples.
+//
+// The subframe header is read first, then the effective sample width is worked out (one extra bit for
+// a side channel, minus any wasted bits) and finally the type-specific sample decoder is run.
+// Returns false when the header cannot be read or the subframe type is not one of the four known types.
+static bool drflac__decode_subframe(drflac* pFlac, int subframeIndex)
+{
+    assert(pFlac != NULL);
+
+    drflac_subframe* pSubframe = &pFlac->currentFrame.subframes[subframeIndex];
+    if (!drflac__read_subframe_header(pFlac, pSubframe)) {
+        return false;
+    }
+
+    // Work out whether this subframe carries the "side" (difference) signal of a stereo pair. The side
+    // signal sits in subframe 1 for left/side and mid/side, and in subframe 0 for right/side.
+    bool isSideChannel = false;
+    if (subframeIndex == 1) {
+        isSideChannel = (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE ||
+                         pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE);
+    } else if (subframeIndex == 0) {
+        isSideChannel = (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE);
+    }
+
+    // Side channels need one more bit per sample than the frame advertises.
+    pSubframe->bitsPerSample = pFlac->currentFrame.bitsPerSample;
+    if (isSideChannel) {
+        pSubframe->bitsPerSample += 1;
+    }
+
+    // Wasted bits are not stored in the stream, so they do not count towards the coded width.
+    pSubframe->bitsPerSample -= pSubframe->wastedBitsPerSample;
+
+    // Each subframe owns a blockSize-sized slice of the shared decode buffer.
+    pSubframe->pDecodedSamples = pFlac->pDecodedSamples + (pFlac->currentFrame.blockSize * subframeIndex);
+
+    if (pSubframe->subframeType == DRFLAC_SUBFRAME_CONSTANT) {
+        drflac__decode_samples__constant(pFlac, pSubframe);
+    } else if (pSubframe->subframeType == DRFLAC_SUBFRAME_VERBATIM) {
+        drflac__decode_samples__verbatim(pFlac, pSubframe);
+    } else if (pSubframe->subframeType == DRFLAC_SUBFRAME_FIXED) {
+        drflac__decode_samples__fixed(pFlac, pSubframe);
+    } else if (pSubframe->subframeType == DRFLAC_SUBFRAME_LPC) {
+        drflac__decode_samples__lpc(pFlac, pSubframe);
+    } else {
+        return false;
+    }
+
+    return true;
+}
+/*
+ * Skips over subframe number subframeIndex of the current frame without decoding its samples.
+ *
+ * The subframe header is still parsed, because the subframe type, predictor order and wasted-bit
+ * count determine how many bits the subframe occupies. The effective sample width is computed the
+ * same way as in drflac__decode_subframe().
+ *
+ * Returns false when the header cannot be read, when any seek or read fails, when an LPC precision
+ * of 15 is encountered, or when the subframe type is not one of the four handled types.
+ */
+static bool drflac__seek_subframe(drflac* pFlac, int subframeIndex)
+{
+    assert(pFlac != NULL);
+
+    drflac_subframe* pSubframe = pFlac->currentFrame.subframes + subframeIndex;
+    if (!drflac__read_subframe_header(pFlac, pSubframe)) {
+        return false;
+    }
+
+    // Side channels require an extra bit per sample. Took a while to figure that one out...
+    pSubframe->bitsPerSample = pFlac->currentFrame.bitsPerSample;
+    if ((pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
+        pSubframe->bitsPerSample += 1;
+    } else if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
+        pSubframe->bitsPerSample += 1;
+    }
+
+    // Need to handle wasted bits per sample.
+    pSubframe->bitsPerSample -= pSubframe->wastedBitsPerSample;
+    pSubframe->pDecodedSamples = pFlac->pDecodedSamples + (pFlac->currentFrame.blockSize * subframeIndex);
+
+    switch (pSubframe->subframeType)
+    {
+        case DRFLAC_SUBFRAME_CONSTANT:
+        {
+            if (!drflac__seek_bits(pFlac, pSubframe->bitsPerSample)) {    // Only one sample-width worth of bits is skipped here.
+                return false;
+            }
+        } break;
+
+        case DRFLAC_SUBFRAME_VERBATIM:
+        {
+            unsigned int bitsToSeek = pFlac->currentFrame.blockSize * pSubframe->bitsPerSample;    // One raw sample per block position.
+            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
+                return false;
+            }
+        } break;
+
+        case DRFLAC_SUBFRAME_FIXED:
+        {
+            unsigned int bitsToSeek = pSubframe->lpcOrder * pSubframe->bitsPerSample;    // lpcOrder samples stored at full width.
+            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
+                return false;
+            }
+
+            if (!drflac__read_and_seek_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder)) {
+                return false;
+            }
+        } break;
+
+        case DRFLAC_SUBFRAME_LPC:
+        {
+            unsigned int bitsToSeek = pSubframe->lpcOrder * pSubframe->bitsPerSample;    // lpcOrder samples stored at full width.
+            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
+                return false;
+            }
+
+            unsigned char lpcPrecision;    // 4-bit field; the stored value is one less than the precision used below.
+            if (!drflac__read_uint8(pFlac, 4, &lpcPrecision)) {
+                return false;
+            }
+            if (lpcPrecision == 15) {
+                return false;    // Invalid.
+            }
+            lpcPrecision += 1;
+
+
+            bitsToSeek = (pSubframe->lpcOrder * lpcPrecision) + 5;    // +5 for shift.
+            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
+                return false;
+            }
+
+            if (!drflac__read_and_seek_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder)) {
+                return false;
+            }
+        } break;
+
+        default: return false;
+    }
+
+    return true;
+}
+
+
+// Maps a frame's channel assignment code to the number of channels (and therefore subframes) in the
+// frame. Codes 0 to 7 map to 1 to 8 channels; codes 8, 9 and 10 all map to 2 channels.
+// The caller must pass a code no greater than 10; this is only enforced by the assert.
+static DRFLAC_INLINE int drflac__get_channel_count_from_channel_assignment(int channelAssignment)
+{
+    const int channelCountByAssignment[11] = {
+        1, 2, 3, 4, 5, 6, 7, 8,     // Codes 0..7.
+        2, 2, 2                     // Codes 8..10.
+    };
+
+    assert(!(channelAssignment > 10));
+    return channelCountByAssignment[channelAssignment];
+}
+
+// Decodes every subframe of the current frame into pFlac->pDecodedSamples and sets
+// currentFrame.samplesRemaining to the total number of samples (block size times channel count).
+//
+// Returns false when the frame does not fit the decode buffer, when a subframe fails to decode, or
+// when the trailing padding and CRC cannot be skipped.
+static bool drflac__decode_frame(drflac* pFlac)
+{
+    // This function should be called while the stream is sitting on the first byte after the frame header.
+
+    int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+
+    // Subframe i is written at pDecodedSamples + blockSize*i, and drflac_open() sizes that buffer for
+    // maxBlockSize * channels samples. A frame header claiming more channels or a larger block than the
+    // stream announced would write past the end of the buffer, so treat it as a decode failure.
+    if (channelCount > (int)pFlac->channels || pFlac->currentFrame.blockSize > pFlac->maxBlockSize) {
+        return false;
+    }
+
+    for (int i = 0; i < channelCount; ++i)
+    {
+        if (!drflac__decode_subframe(pFlac, i)) {
+            return false;
+        }
+    }
+
+    // At the end of the frame sits the padding and CRC. We don't use these so we can just seek past.
+    // The "& 7" term is the number of bits left in the current byte; the 16 is the size of the CRC.
+    if (!drflac__seek_bits(pFlac, (DRFLAC_CACHE_L1_BITS_REMAINING & 7) + 16)) {
+        return false;
+    }
+
+
+    pFlac->currentFrame.samplesRemaining = pFlac->currentFrame.blockSize * channelCount;
+
+    return true;
+}
+
+// Skips the body of the current frame (all of its subframes, then the padding and CRC) without
+// decoding any samples. Returns false as soon as any subframe or the trailer cannot be skipped.
+static bool drflac__seek_frame(drflac* pFlac)
+{
+    const int subframeCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+
+    int subframeIndex = 0;
+    while (subframeIndex < subframeCount)
+    {
+        if (!drflac__seek_subframe(pFlac, subframeIndex)) {
+            return false;
+        }
+        subframeIndex += 1;
+    }
+
+    // Padding up to the next byte boundary, followed by the 16 bits of CRC.
+    if (!drflac__seek_bits(pFlac, (DRFLAC_CACHE_L1_BITS_REMAINING & 7) + 16)) {
+        return false;
+    }
+
+    return true;
+}
+
+// Reads the next frame header from the stream and, if that succeeds, decodes the frame that follows it.
+// Returns true only when both steps succeed.
+static bool drflac__read_and_decode_next_frame(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+
+    // The frame is only decoded when its header was read successfully.
+    return drflac__read_next_frame_header(pFlac) && drflac__decode_frame(pFlac);
+}
+
+// Reads a metadata block header: a 1-bit "last block" flag, a 7-bit block type and a 24-bit size.
+//
+// pBlockSizeOut and pIsLastBlockOut are optional and may be NULL. When any of the three reads fails,
+// the header is reported as DRFLAC_BLOCK_TYPE_INVALID with a size of 0 and the "last block" flag set,
+// so that a caller looping over metadata blocks stops instead of acting on a half-read header.
+static unsigned int drflac__read_block_header(drflac* pFlac, unsigned int* pBlockSizeOut, bool* pIsLastBlockOut)    // Returns the block type.
+{
+    assert(pFlac != NULL);
+
+    unsigned char isLastBlock = 1;
+    unsigned char blockType = DRFLAC_BLOCK_TYPE_INVALID;
+    unsigned int blockSize = 0;
+
+    if (!drflac__read_uint8(pFlac, 1, &isLastBlock) ||
+        !drflac__read_uint8(pFlac, 7, &blockType)   ||
+        !drflac__read_uint32(pFlac, 24, &blockSize)) {
+        // Discard whatever fields were read before the failure so the result is self-consistent.
+        isLastBlock = 1;
+        blockType = DRFLAC_BLOCK_TYPE_INVALID;
+        blockSize = 0;
+    }
+
+    if (pBlockSizeOut) {
+        *pBlockSizeOut = blockSize;
+    }
+
+    if (pIsLastBlockOut) {
+        *pIsLastBlockOut = (isLastBlock != 0);
+    }
+
+    return blockType;
+}
+
+
+// Computes the zero-based indices of the first and last sample covered by the current frame.
+//
+// The first index is currentFrame.sampleNumber when that is non-zero; otherwise it is derived from
+// currentFrame.frameNumber, the stream's maximum block size and the frame's channel count. The last
+// index is the first index plus (block size * channel count), minus one.
+// Either output pointer may be NULL if the caller does not need that value.
+static void drflac__get_current_frame_sample_range(drflac* pFlac, uint64_t* pFirstSampleInFrameOut, uint64_t* pLastSampleInFrameOut)
+{
+    assert(pFlac != NULL);
+
+    unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+
+    uint64_t firstSampleInFrame = pFlac->currentFrame.sampleNumber;
+    if (firstSampleInFrame == 0) {
+        // Widen before multiplying so the product cannot wrap in a narrower integer type on long streams.
+        firstSampleInFrame = (uint64_t)pFlac->currentFrame.frameNumber * pFlac->maxBlockSize * channelCount;
+    }
+
+    uint64_t lastSampleInFrame = firstSampleInFrame + ((uint64_t)pFlac->currentFrame.blockSize * channelCount);
+    if (lastSampleInFrame > 0) {
+        lastSampleInFrame -= 1; // Needs to be zero based.
+    }
+
+
+    if (pFirstSampleInFrameOut) {
+        *pFirstSampleInFrameOut = firstSampleInFrame;
+    }
+    if (pLastSampleInFrameOut) {
+        *pLastSampleInFrameOut = lastSampleInFrame;
+    }
+}
+
+// Moves the stream back to the recorded position of the first frame and clears all per-frame state.
+// The bit cache and the current-frame record are reset whether or not the seek succeeded; the return
+// value is the result of the seek itself.
+static bool drflac__seek_to_first_frame(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+
+    const bool seekSucceeded = drflac__seek_to_byte(pFlac, (long long)pFlac->firstFramePos);
+
+    // Mark the whole cache as consumed so the next read fetches fresh data.
+    pFlac->cache = 0;
+    pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
+
+    // Forget everything about whichever frame was being read before.
+    memset(&pFlac->currentFrame, 0, sizeof(pFlac->currentFrame));
+
+    return seekSucceeded;
+}
+
+// Skips the remainder of the current frame so the stream ends up at the start of the next one.
+// Must only be called while the decoder sits on the first byte past the FRAME_HEADER section.
+static DRFLAC_INLINE bool drflac__seek_to_next_frame(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+
+    const bool frameSkipped = drflac__seek_frame(pFlac);
+    return frameSkipped;
+}
+
+// Rewinds to the first frame and walks forward one frame header at a time until it finds the frame
+// whose sample range contains sampleIndex. On success the stream is left just after that frame's
+// header. Returns false when the rewind fails, a header cannot be read, or a frame cannot be skipped.
+static bool drflac__seek_to_frame_containing_sample(drflac* pFlac, uint64_t sampleIndex)
+{
+    assert(pFlac != NULL);
+
+    if (!drflac__seek_to_first_frame(pFlac)) {
+        return false;
+    }
+
+    // Each header has to be read, since it is the only way to learn which samples the frame covers.
+    while (drflac__read_next_frame_header(pFlac))
+    {
+        uint64_t rangeBegin = 0;
+        uint64_t rangeEnd = 0;
+        drflac__get_current_frame_sample_range(pFlac, &rangeBegin, &rangeEnd);
+
+        if (rangeBegin <= sampleIndex && sampleIndex <= rangeEnd) {
+            return true;    // Found it; we are sitting right after this frame's header.
+        }
+
+        // Not this one, so jump over its body and try the next frame.
+        if (!drflac__seek_to_next_frame(pFlac)) {
+            return false;
+        }
+    }
+
+    // Ran out of readable frame headers before reaching the sample.
+    return false;
+}
+
+// Seeks to sampleIndex by scanning frames from the start of the stream, decoding the frame that
+// contains the sample and then discarding the samples that precede it within that frame.
+//
+// Returns true only when the frame was found and decoded and exactly the required number of leading
+// samples was skipped, matching the behaviour of drflac__seek_to_sample__seek_table().
+static bool drflac__seek_to_sample__brute_force(drflac* pFlac, uint64_t sampleIndex)
+{
+    if (!drflac__seek_to_frame_containing_sample(pFlac, sampleIndex)) {
+        return false;
+    }
+
+    // At this point we should be sitting on the first byte of the frame containing the sample. We need to decode every sample up to (but
+    // not including) the sample we're seeking to.
+    uint64_t firstSampleInFrame = 0;
+    drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, NULL);
+
+    assert(firstSampleInFrame <= sampleIndex);
+    size_t samplesToDecode = (size_t)(sampleIndex - firstSampleInFrame);    // <-- Safe cast because the maximum number of samples in a frame is 65535.
+
+    // At this point we are just sitting on the byte after the frame header. The frame has to be decoded
+    // even when the target is its very first sample: otherwise the next read would find no samples
+    // remaining and would try to parse a frame header from the middle of this frame's data.
+    if (!drflac__decode_frame(pFlac)) {
+        return false;
+    }
+
+    // Skipping zero samples trivially succeeds (drflac_read_s16() returns 0 for a zero-length request).
+    return drflac_read_s16(pFlac, samplesToDecode, NULL) == samplesToDecode;
+}
+
+// Seeks to sampleIndex using the stream's SEEKTABLE block.
+//
+// The seek points are scanned in order to find the last one that does not lie past sampleIndex, the
+// stream is moved to that seek point's frame, and frames are then walked forward exactly like the
+// brute force method until the frame containing the sample is found and decoded.
+//
+// Returns false when there is no seek table, when any seek/read/decode step fails, or when the
+// leading samples of the target frame could not all be skipped.
+static bool drflac__seek_to_sample__seek_table(drflac* pFlac, uint64_t sampleIndex)
+{
+    assert(pFlac != NULL);
+
+    // A position of 0 means no SEEKTABLE block was recorded when the stream was opened.
+    if (pFlac->seektableBlock.pos == 0) {
+        return false;
+    }
+
+    // NOTE(review): drflac__seek_to_first_frame() resets pFlac->cache and pFlac->consumedBits after
+    // calling drflac__seek_to_byte(); the seeks in this function do not. Confirm whether
+    // drflac__seek_to_byte() takes care of that itself.
+    if (!drflac__seek_to_byte(pFlac, pFlac->seektableBlock.pos)) {
+        return false;
+    }
+
+    // The number of seek points is derived from the size of the SEEKTABLE block.
+    unsigned int seekpointCount = pFlac->seektableBlock.sizeInBytes / 18;   // 18 = the size of each seek point.
+    if (seekpointCount == 0) {
+        return false;   // Would this ever happen?
+    }
+
+
+    // Zero-initialised so that, if even the first seek point is past the target, we fall back to the
+    // very first frame (offset 0).
+    drflac_seekpoint closestSeekpoint = {0};
+
+    unsigned int seekpointsRemaining = seekpointCount;
+    while (seekpointsRemaining > 0)
+    {
+        drflac_seekpoint seekpoint;
+        if (!drflac__read_uint64(pFlac, 64, &seekpoint.firstSample)) {
+            break;
+        }
+        if (!drflac__read_uint64(pFlac, 64, &seekpoint.frameOffset)) {
+            break;
+        }
+        if (!drflac__read_uint16(pFlac, 16, &seekpoint.sampleCount)) {
+            break;
+        }
+
+        // The seek point's first sample is scaled by the channel count before comparing with sampleIndex.
+        if (seekpoint.firstSample * pFlac->channels > sampleIndex) {
+            break;
+        }
+
+        closestSeekpoint = seekpoint;
+        seekpointsRemaining -= 1;
+    }
+
+    // At this point we should have found the seekpoint closest to our sample. We need to seek to it using basically the same
+    // technique as we use with the brute force method. Frame offsets are relative to the first frame.
+    if (!drflac__seek_to_byte(pFlac, pFlac->firstFramePos + closestSeekpoint.frameOffset)) {
+        return false;
+    }
+
+    uint64_t firstSampleInFrame = 0;
+    uint64_t lastSampleInFrame = 0;
+    for (;;)
+    {
+        // We need to read the frame's header in order to determine the range of samples it contains.
+        if (!drflac__read_next_frame_header(pFlac)) {
+            return false;
+        }
+
+        drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, &lastSampleInFrame);
+        if (sampleIndex >= firstSampleInFrame && sampleIndex <= lastSampleInFrame) {
+            break;  // The sample is in this frame.
+        }
+
+        if (!drflac__seek_to_next_frame(pFlac)) {
+            return false;
+        }
+    }
+
+    assert(firstSampleInFrame <= sampleIndex);
+
+    // At this point we are just sitting on the byte after the frame header. We need to decode the frame before reading anything from it.
+    if (!drflac__decode_frame(pFlac)) {
+        return false;
+    }
+
+    size_t samplesToDecode = (size_t)(sampleIndex - firstSampleInFrame);    // <-- Safe cast because the maximum number of samples in a frame is 65535.
+    return drflac_read_s16(pFlac, samplesToDecode, NULL) == samplesToDecode;
+}
+
+
+// Opens a FLAC decoder on top of the given read/seek callbacks.
+//
+// The "fLaC" signature and the STREAMINFO block are parsed, the positions and sizes of the other
+// metadata blocks of interest are recorded, and the stream is left at the start of the first frame.
+// The returned object is a single heap allocation that also holds the decoded-sample buffer
+// (maxBlockSize * channels 32-bit samples); release it with drflac_close().
+//
+// Returns NULL when a callback is missing, the stream is not FLAC, the first metadata block is not a
+// 34-byte STREAMINFO block, any read or seek fails, or memory cannot be allocated.
+static drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData)
+{
+    if (onRead == NULL || onSeek == NULL) {
+        return NULL;
+    }
+
+    unsigned char id[4];
+    if (onRead(pUserData, id, 4) != 4 || id[0] != 'f' || id[1] != 'L' || id[2] != 'a' || id[3] != 'C') {
+        return NULL;    // Not a FLAC stream.
+    }
+
+    // Parse into a stack object first; the heap object is only allocated once the buffer size is known.
+    drflac tempFlac;
+    memset(&tempFlac, 0, sizeof(tempFlac));
+    tempFlac.onRead         = onRead;
+    tempFlac.onSeek         = onSeek;
+    tempFlac.pUserData      = pUserData;
+    tempFlac.currentBytePos = 4;
+    tempFlac.nextL2Line     = sizeof(tempFlac.cacheL2) / sizeof(tempFlac.cacheL2[0]); // <-- Initialize to this to force a client-side data retrieval right from the start.
+    tempFlac.consumedBits   = sizeof(tempFlac.cache)*8;
+
+    // The first metadata block should be the STREAMINFO block. We don't care about everything in here.
+    // Both conditions must hold: the fields parsed below assume a STREAMINFO block of exactly 34 bytes.
+    unsigned int blockSize;
+    bool isLastBlock;
+    int blockType = drflac__read_block_header(&tempFlac, &blockSize, &isLastBlock);
+    if (blockType != DRFLAC_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
+        return NULL;
+    }
+
+    if (!drflac__seek_bits(&tempFlac, 16)) {   // minBlockSize
+        return NULL;
+    }
+    if (!drflac__read_uint16(&tempFlac, 16, &tempFlac.maxBlockSize)) {
+        return NULL;
+    }
+    if (!drflac__seek_bits(&tempFlac, 48)) {   // minFrameSize + maxFrameSize
+        return NULL;
+    }
+    if (!drflac__read_uint32(&tempFlac, 20, &tempFlac.sampleRate)) {
+        return NULL;
+    }
+    if (!drflac__read_uint8(&tempFlac, 3, &tempFlac.channels)) {
+        return NULL;
+    }
+    if (!drflac__read_uint8(&tempFlac, 5, &tempFlac.bitsPerSample)) {
+        return NULL;
+    }
+    if (!drflac__read_uint64(&tempFlac, 36, &tempFlac.totalSampleCount)) {
+        return NULL;
+    }
+    if (!drflac__seek_bits(&tempFlac, 128)) {  // MD5
+        return NULL;
+    }
+
+    // Channel count and bit depth are stored minus one; the total is converted from per-channel
+    // samples to a count across all channels.
+    tempFlac.channels += 1;
+    tempFlac.bitsPerSample += 1;
+    tempFlac.totalSampleCount *= tempFlac.channels;
+
+    // Walk the remaining metadata blocks, remembering where the interesting ones live.
+    while (!isLastBlock)
+    {
+        blockType = drflac__read_block_header(&tempFlac, &blockSize, &isLastBlock);
+
+        switch (blockType)
+        {
+            case DRFLAC_BLOCK_TYPE_APPLICATION:
+            {
+                tempFlac.applicationBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.applicationBlock.sizeInBytes = blockSize;
+            } break;
+
+            case DRFLAC_BLOCK_TYPE_SEEKTABLE:
+            {
+                tempFlac.seektableBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.seektableBlock.sizeInBytes = blockSize;
+            } break;
+
+            case DRFLAC_BLOCK_TYPE_VORBIS_COMMENT:
+            {
+                tempFlac.vorbisCommentBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.vorbisCommentBlock.sizeInBytes = blockSize;
+            } break;
+
+            case DRFLAC_BLOCK_TYPE_CUESHEET:
+            {
+                tempFlac.cuesheetBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.cuesheetBlock.sizeInBytes = blockSize;
+            } break;
+
+            case DRFLAC_BLOCK_TYPE_PICTURE:
+            {
+                tempFlac.pictureBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.pictureBlock.sizeInBytes = blockSize;
+            } break;
+
+
+            // These blocks we either don't care about or aren't supporting.
+            case DRFLAC_BLOCK_TYPE_PADDING:
+            case DRFLAC_BLOCK_TYPE_INVALID:
+            default: break;
+        }
+
+        // Block contents are never parsed here; skip straight to the next block header.
+        if (!drflac__seek_bits(&tempFlac, blockSize*8)) {
+            return NULL;
+        }
+    }
+
+
+    // At this point we should be sitting right at the start of the very first frame.
+    tempFlac.firstFramePos = drflac__tell(&tempFlac);
+
+    // The decode buffer replaces the pExtraData placeholder at the tail of the structure, hence the
+    // subtraction of its size from both the allocation and the copy.
+    drflac* pFlac = (drflac*)malloc(sizeof(*pFlac) - sizeof(pFlac->pExtraData) + (tempFlac.maxBlockSize * tempFlac.channels * sizeof(int32_t)));
+    if (pFlac == NULL) {
+        return NULL;    // Out of memory.
+    }
+
+    memcpy(pFlac, &tempFlac, sizeof(tempFlac) - sizeof(pFlac->pExtraData));
+    pFlac->pDecodedSamples = (int32_t*)pFlac->pExtraData;
+
+    return pFlac;
+}
+
+// Releases a decoder returned by one of the open functions. Passing NULL is a no-op.
+//
+// Which open function was used is inferred from the read callback: the stdio callback means the user
+// data is a file handle that has to be closed, the memory callback means the user data was allocated
+// by the library and has to be freed. The decoder object itself is freed last.
+static void drflac_close(drflac* pFlac)
+{
+    if (pFlac != NULL) {
+#ifndef DR_FLAC_NO_STDIO
+        // Opened through drflac_open_file(): close the underlying file.
+        if (pFlac->onRead == drflac__on_read_stdio) {
+            // NOTE(review): the macro tested here is spelled DR_OPUS_NO_WIN32_IO although the other
+            // option in this function is DR_FLAC_NO_STDIO; confirm this matches the check used when
+            // the file is opened.
+#if !defined(DR_OPUS_NO_WIN32_IO) && defined(_WIN32)
+            CloseHandle((HANDLE)pFlac->pUserData);
+#else
+            fclose((FILE*)pFlac->pUserData);
+#endif
+        }
+#endif
+
+        // Opened through drflac_open_memory(): the user data belongs to us.
+        if (pFlac->onRead == drflac__on_read_memory) {
+            free(pFlac->pUserData);
+        }
+
+        free(pFlac);
+    }
+}
+/*
+ * Reads samplesToRead individual samples, one at a time, from the already-decoded current frame and
+ * writes them to bufferOut as 16-bit values (bufferOut may be NULL, in which case the samples are
+ * consumed but not stored).
+ *
+ * The position inside the frame is derived from currentFrame.samplesRemaining: the remainder modulo
+ * the channel count selects the channel, the quotient selects the sample within each subframe. For
+ * the left/side, right/side and mid/side assignments the requested channel is reconstructed from the
+ * two stored subframes; otherwise the subframe's sample is used directly.
+ *
+ * Returns the number of samples consumed.
+ */
+static uint64_t drflac__read_s16__misaligned(drflac* pFlac, uint64_t samplesToRead, int16_t* bufferOut)
+{
+    unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+
+    // We should never be calling this when the number of samples to read is >= the sample count.
+    assert(samplesToRead < channelCount);
+    assert(pFlac->currentFrame.samplesRemaining > 0 && samplesToRead <= pFlac->currentFrame.samplesRemaining);
+
+
+    uint64_t samplesRead = 0;
+    while (samplesToRead > 0)
+    {
+        uint64_t totalSamplesInFrame = pFlac->currentFrame.blockSize * channelCount;
+        uint64_t samplesReadFromFrameSoFar = totalSamplesInFrame - pFlac->currentFrame.samplesRemaining;
+        unsigned int channelIndex = samplesReadFromFrameSoFar % channelCount;    // Which channel the next interleaved sample belongs to.
+
+        unsigned long long nextSampleInFrame = samplesReadFromFrameSoFar / channelCount;    // Index of that sample within each subframe.
+
+        int decodedSample = 0;
+        switch (pFlac->currentFrame.channelAssignment)
+        {
+            case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+            {
+                if (channelIndex == 0) {
+                    decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
+                } else {
+                    int side = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
+                    int left = pFlac->currentFrame.subframes[channelIndex - 1].pDecodedSamples[nextSampleInFrame];
+                    decodedSample = left - side;    // Second channel is rebuilt as left - side.
+                }
+
+            } break;
+
+            case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+            {
+                if (channelIndex == 0) {
+                    int side  = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
+                    int right = pFlac->currentFrame.subframes[channelIndex + 1].pDecodedSamples[nextSampleInFrame];
+                    decodedSample = side + right;    // First channel is rebuilt as side + right.
+                } else {
+                    decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
+                }
+
+            } break;
+
+            case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+            {
+                int mid;
+                int side;
+                if (channelIndex == 0) {
+                    mid  = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
+                    side = pFlac->currentFrame.subframes[channelIndex + 1].pDecodedSamples[nextSampleInFrame];
+
+                    mid = (((unsigned int)mid) << 1) | (side & 0x01);    // Double mid and take its lowest bit from side.
+                    decodedSample = (mid + side) >> 1;
+                } else {
+                    mid  = pFlac->currentFrame.subframes[channelIndex - 1].pDecodedSamples[nextSampleInFrame];
+                    side = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
+
+                    mid = (((unsigned int)mid) << 1) | (side & 0x01);    // Same adjustment as for the first channel.
+                    decodedSample = (mid - side) >> 1;
+                }
+
+            } break;
+
+            case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+            default:
+            {
+                decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
+            } break;
+        }
+
+        int shift = (16 - pFlac->bitsPerSample) + pFlac->currentFrame.subframes[channelIndex].wastedBitsPerSample;    // Scale to 16 bits; may be negative for deeper streams.
+        if (shift >= 0) {
+            decodedSample <<= shift;
+        } else {
+            decodedSample >>= -shift;
+        }
+
+        if (bufferOut) {
+            *bufferOut++ = decodedSample;
+        }
+
+        samplesRead += 1;
+        pFlac->currentFrame.samplesRemaining -= 1;
+        samplesToRead -= 1;
+    }
+
+    return samplesRead;
+}
+
+// Advances the read position by up to samplesToRead samples without producing any output.
+//
+// Frames are still read and decoded as they are crossed, but the samples inside each frame are
+// skipped in one step rather than one at a time. Returns the number of samples actually skipped,
+// which is smaller than samplesToRead when the next frame cannot be read.
+static uint64_t drflac__seek_forward_by_samples(drflac* pFlac, uint64_t samplesToRead)
+{
+    uint64_t samplesRead = 0;
+    while (samplesToRead > 0)
+    {
+        if (pFlac->currentFrame.samplesRemaining == 0)
+        {
+            if (!drflac__read_and_decode_next_frame(pFlac)) {
+                break;  // Couldn't read the next frame, so just break from the loop and return.
+            }
+        }
+        else
+        {
+            // Skip as much of the current frame as is available, capped at what is still wanted.
+            uint64_t samplesToSkip = pFlac->currentFrame.samplesRemaining;
+            if (samplesToSkip > samplesToRead) {
+                samplesToSkip = samplesToRead;
+            }
+
+            samplesRead += samplesToSkip;
+            pFlac->currentFrame.samplesRemaining -= samplesToSkip;
+            samplesToRead -= samplesToSkip;
+        }
+    }
+
+    return samplesRead;
+}
+
+static uint64_t drflac_read_s16(drflac* pFlac, uint64_t samplesToRead, int16_t* bufferOut)
+{
+    // Note that <bufferOut> is allowed to be null, in which case this will be treated as something like a seek.
+    if (pFlac == NULL || samplesToRead == 0) {
+        return 0;
+    }
+
+    if (bufferOut == NULL) {
+        return drflac__seek_forward_by_samples(pFlac, samplesToRead);
+    }
+
+
+    uint64_t samplesRead = 0;
+    while (samplesToRead > 0)
+    {
+        // If we've run out of samples in this frame, go to the next.
+        if (pFlac->currentFrame.samplesRemaining == 0)
+        {
+            if (!drflac__read_and_decode_next_frame(pFlac)) {
+                break;  // Couldn't read the next frame, so just break from the loop and return.
+            }
+        }
+        else
+        {
+            // Here is where we grab the samples and interleave them.
+
+            unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+            uint64_t totalSamplesInFrame = pFlac->currentFrame.blockSize * channelCount;
+            uint64_t samplesReadFromFrameSoFar = totalSamplesInFrame - pFlac->currentFrame.samplesRemaining;
+
+            int misalignedSampleCount = samplesReadFromFrameSoFar % channelCount;
+            if (misalignedSampleCount > 0) {
+                uint64_t misalignedSamplesRead = drflac__read_s16__misaligned(pFlac, misalignedSampleCount, bufferOut);
+                samplesRead   += misalignedSamplesRead;
+                samplesReadFromFrameSoFar += misalignedSamplesRead;
+                bufferOut     += misalignedSamplesRead;
+                samplesToRead -= misalignedSamplesRead;
+            }
+
+
+            uint64_t alignedSampleCountPerChannel = samplesToRead / channelCount;
+            if (alignedSampleCountPerChannel > pFlac->currentFrame.samplesRemaining / channelCount) {
+                alignedSampleCountPerChannel = pFlac->currentFrame.samplesRemaining / channelCount;
+            }
+
+            uint64_t firstAlignedSampleInFrame = samplesReadFromFrameSoFar / channelCount;
+            int unusedBitsPerSample = 16 - pFlac->bitsPerSample;
+
+            if (unusedBitsPerSample >= 0) {
+                int lshift0 = unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
+                int lshift1 = unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+
+                switch (pFlac->currentFrame.channelAssignment)
+                {
+                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int left  = pDecodedSamples0[i];
+                            int side  = pDecodedSamples1[i];
+                            int right = left - side;
+
+                            bufferOut[i*2+0] = left  << lshift0;
+                            bufferOut[i*2+1] = right << lshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side  = pDecodedSamples0[i];
+                            int right = pDecodedSamples1[i];
+                            int left  = right + side;
+
+                            bufferOut[i*2+0] = left  << lshift0;
+                            bufferOut[i*2+1] = right << lshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side = pDecodedSamples1[i];
+                            int mid  = (((uint32_t)pDecodedSamples0[i]) << 1) | (side & 0x01);
+
+                            bufferOut[i*2+0] = ((mid + side) >> 1) << lshift0;
+                            bufferOut[i*2+1] = ((mid - side) >> 1) << lshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+                    default:
+                    {
+                        if (pFlac->currentFrame.channelAssignment == 1) // 1 = Stereo
+                        {
+                            // Stereo optimized inner loop unroll.
+                            const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                            const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                bufferOut[i*2+0] = pDecodedSamples0[i] << lshift0;
+                                bufferOut[i*2+1] = pDecodedSamples1[i] << lshift1;
+                            }
+                        }
+                        else
+                        {
+                            // Generic interleaving.
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                for (unsigned int j = 0; j < channelCount; ++j) {
+                                    bufferOut[(i*channelCount)+j] = (pFlac->currentFrame.subframes[j].pDecodedSamples[firstAlignedSampleInFrame + i]) << (unusedBitsPerSample + pFlac->currentFrame.subframes[j].wastedBitsPerSample);
+                                }
+                            }
+                        }
+                    } break;
+                }
+            } else {
+                int rshift0 = -unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
+                int rshift1 = -unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+
+                switch (pFlac->currentFrame.channelAssignment)
+                {
+                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int left  = pDecodedSamples0[i];
+                            int side  = pDecodedSamples1[i];
+                            int right = left - side;
+
+                            bufferOut[i*2+0] = left  >> rshift0;
+                            bufferOut[i*2+1] = right >> rshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side  = pDecodedSamples0[i];
+                            int right = pDecodedSamples1[i];
+                            int left  = right + side;
+
+                            bufferOut[i*2+0] = left  >> rshift0;
+                            bufferOut[i*2+1] = right >> rshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side = pDecodedSamples1[i];
+                            int mid  = (((uint32_t)pDecodedSamples0[i]) << 1) | (side & 0x01);
+
+                            bufferOut[i*2+0] = ((mid + side) >> 1) >> rshift0;
+                            bufferOut[i*2+1] = ((mid - side) >> 1) >> rshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+                    default:
+                    {
+                        if (pFlac->currentFrame.channelAssignment == 1) // 1 = Stereo
+                        {
+                            // Stereo optimized inner loop unroll.
+                            const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                            const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                bufferOut[i*2+0] = pDecodedSamples0[i] >> rshift0;
+                                bufferOut[i*2+1] = pDecodedSamples1[i] >> rshift1;
+                            }
+                        }
+                        else
+                        {
+                            // Generic interleaving.
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                for (unsigned int j = 0; j < channelCount; ++j) {
+                                    bufferOut[(i*channelCount)+j] = (pFlac->currentFrame.subframes[j].pDecodedSamples[firstAlignedSampleInFrame + i]) >> (pFlac->currentFrame.subframes[j].wastedBitsPerSample - unusedBitsPerSample);
+                                }
+                            }
+                        }
+                    } break;
+                }
+            }
+
+            uint64_t alignedSamplesRead = alignedSampleCountPerChannel * channelCount;
+            samplesRead   += alignedSamplesRead;
+            samplesReadFromFrameSoFar += alignedSamplesRead;
+            bufferOut     += alignedSamplesRead;
+            samplesToRead -= alignedSamplesRead;
+            pFlac->currentFrame.samplesRemaining -= (unsigned int)alignedSamplesRead;
+
+
+
+            // At this point we may still have some excess samples left to read.
+            if (samplesToRead > 0 && pFlac->currentFrame.samplesRemaining > 0)
+            {
+                uint64_t excessSamplesRead = 0;
+                if (samplesToRead < pFlac->currentFrame.samplesRemaining) {
+                    excessSamplesRead = drflac__read_s16__misaligned(pFlac, samplesToRead, bufferOut);
+                } else {
+                    excessSamplesRead = drflac__read_s16__misaligned(pFlac, pFlac->currentFrame.samplesRemaining, bufferOut);
+                }
+
+                samplesRead   += excessSamplesRead;
+                samplesReadFromFrameSoFar += excessSamplesRead;
+                bufferOut     += excessSamplesRead;
+                samplesToRead -= excessSamplesRead;
+            }
+        }
+    }
+
+    return samplesRead;
+}
+
+// Seeks the decoder to the given sample index.
+//
+// pFlac       - The decoder to seek. NULL is rejected.
+// sampleIndex - The sample to seek to. An index of 0 rewinds to the first frame. When the total sample count is
+//               non-zero, an index at or past the end is clamped to the last sample.
+//
+// Returns true on success; false if pFlac is NULL or if neither the seek table nor the brute force seek succeeded.
+static bool drflac_seek_to_sample(drflac* pFlac, uint64_t sampleIndex)
+{
+    if (pFlac == NULL) {
+        return false;
+    }
+
+    if (sampleIndex == 0) {
+        return drflac__seek_to_first_frame(pFlac);
+    }
+
+    // Clamp the sample to the end. The clamp is skipped when totalSampleCount is 0, because subtracting 1 from it
+    // would wrap around and turn every request into a seek to an unreachable sample index.
+    if (pFlac->totalSampleCount > 0 && sampleIndex >= pFlac->totalSampleCount) {
+        sampleIndex = pFlac->totalSampleCount - 1;
+    }
+
+
+    // First try seeking via the seek table. If this fails, fall back to a brute force seek which is much slower.
+    if (!drflac__seek_to_sample__seek_table(pFlac, sampleIndex)) {
+        return drflac__seek_to_sample__brute_force(pFlac, sampleIndex);
+    }
+
+    return true;
+}
+
+
+#endif  //DR_FLAC_IMPLEMENTATION
+
+
+/*
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
+*/

+ 12 - 0
panda/src/movies/flacAudio.I

@@ -0,0 +1,12 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudio.I
+ * @author rdb
+ * @date 2016-04-27
+ */

+ 64 - 0
panda/src/movies/flacAudio.cxx

@@ -0,0 +1,64 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudio.cxx
+ * @author rdb
+ * @date 2016-04-27
+ */
+
+#include "flacAudio.h"
+#include "flacAudioCursor.h"
+#include "virtualFileSystem.h"
+#include "dcast.h"
+
+TypeHandle FlacAudio::_type_handle;
+
+/**
+ * Constructs a FlacAudio that refers to the given file.  The name is passed
+ * on to the MovieAudio constructor and is also stored in _filename.
+ */
+FlacAudio::
+FlacAudio(const Filename &name) :
+  MovieAudio(name)
+{
+  _filename = name;
+}
+
+/**
+ * Destructor.  The body is empty; nothing is released explicitly here.
+ */
+FlacAudio::
+~FlacAudio() {
+}
+
+/**
+ * Opens the file named by _filename through the VirtualFileSystem and
+ * returns a MovieAudioCursor that reads from it.  Returns NULL if the file
+ * cannot be opened or if the cursor reports that it is not valid.
+ */
+PT(MovieAudioCursor) FlacAudio::
+open() {
+  VirtualFileSystem *vfs = VirtualFileSystem::get_global_ptr();
+  istream *stream = vfs->open_read_file(_filename, true);
+  if (stream == NULL) {
+    return NULL;
+  }
+
+  PT(FlacAudioCursor) cursor = new FlacAudioCursor(this, stream);
+  if (cursor == NULL || !cursor->_is_valid) {
+    // No cursor is handed back to the caller on this path, so nothing else
+    // would ever release the stream.  Close it here rather than leak it.
+    vfs->close_read_file(stream);
+    return NULL;
+  }
+
+  return DCAST(MovieAudioCursor, cursor);
+}
+
+/**
+ * Factory function: creates a new FlacAudio for the given file and returns
+ * it as a MovieAudio.
+ */
+PT(MovieAudio) FlacAudio::
+make(const Filename &name) {
+  FlacAudio *audio = new FlacAudio(name);
+  return DCAST(MovieAudio, audio);
+}

+ 54 - 0
panda/src/movies/flacAudio.h

@@ -0,0 +1,54 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudio.h
+ * @author rdb
+ * @date 2016-04-27
+ */
+
+#ifndef FLACAUDIO_H
+#define FLACAUDIO_H
+
+#include "pandabase.h"
+#include "movieAudio.h"
+
+class FlacAudioCursor;
+
+/**
+ * Reads FLAC audio files.  Ogg-encapsulated FLAC files are not supported.
+ *
+ * This is the MovieAudio subclass for FLAC; FlacAudioCursor is declared a
+ * friend of this class.
+ */
+class EXPCL_PANDA_MOVIES FlacAudio : public MovieAudio {
+PUBLISHED:
+  // Creates a FlacAudio that refers to the named file.
+  FlacAudio(const Filename &name);
+  virtual ~FlacAudio();
+  // Opens the file and returns a cursor for it.
+  virtual PT(MovieAudioCursor) open();
+
+  // Factory function that returns a new FlacAudio as a MovieAudio.
+  static PT(MovieAudio) make(const Filename &name);
+
+private:
+  friend class FlacAudioCursor;
+
+public:
+  // Type registration: FlacAudio is registered under the name "FlacAudio"
+  // with MovieAudio as its parent type.
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    MovieAudio::init_type();
+    register_type(_type_handle, "FlacAudio",
+                  MovieAudio::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#endif // FLACAUDIO_H

+ 12 - 0
panda/src/movies/flacAudioCursor.I

@@ -0,0 +1,12 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudioCursor.I
+ * @author rdb
+ * @date 2013-08-23
+ */

+ 120 - 0
panda/src/movies/flacAudioCursor.cxx

@@ -0,0 +1,120 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudioCursor.cxx
+ * @author rdb
+ * @date 2013-08-23
+ */
+
+#include "flacAudioCursor.h"
+#include "virtualFileSystem.h"
+
+#define DR_FLAC_IMPLEMENTATION
+#define DR_FLAC_NO_STDIO
+extern "C" {
+  #include "dr_flac.h"
+}
+
+/**
+ * Read callback passed to dr_flac so that file I/O goes through the
+ * VirtualFileSystem.  Reads up to size bytes from the istream given as user
+ * data into buffer, and returns the number of bytes actually read.
+ */
+static size_t cb_read_proc(void *user, void *buffer, size_t size) {
+  istream *in = (istream *)user;
+  nassertr(in != NULL, 0);
+
+  in->read((char *)buffer, size);
+  size_t num_read = in->gcount();
+
+  // Reaching EOF leaves error bits set on the stream; reset them so that
+  // EOF is handled gracefully.
+  if (in->eof()) {
+    in->clear();
+  }
+
+  return num_read;
+}
+
+/**
+ * Seek callback passed to dr_flac so that file I/O goes through the
+ * VirtualFileSystem.  Moves the read position of the istream given as user
+ * data by offset, relative to its current position.  Returns true if the
+ * stream is not in a failed state afterwards.
+ */
+static bool cb_seek_proc(void *user, int offset) {
+  istream *in = (istream *)user;
+  nassertr(in != NULL, false);
+
+  in->seekg(offset, ios::cur);
+  if (in->fail()) {
+    return false;
+  }
+  return true;
+}
+
+TypeHandle FlacAudioCursor::_type_handle;
+
+/**
+ * Opens a dr_flac decoder that reads from the indicated stream via the read
+ * and seek callbacks, then fills in the length, channel count and sample
+ * rate from the decoder.  If the decoder cannot be opened, an error is
+ * logged and _is_valid is left false.
+ */
+FlacAudioCursor::
+FlacAudioCursor(FlacAudio *src, istream *stream) :
+  MovieAudioCursor(src),
+  _is_valid(false),
+  _drflac(NULL)
+{
+  nassertv(stream != NULL);
+  nassertv(stream->good());
+
+  _drflac = drflac_open(&cb_read_proc, &cb_seek_proc, (void *)stream);
+
+  if (_drflac == NULL) {
+    movies_cat.error()
+      << "Failed to open FLAC file.\n";
+    _is_valid = false;
+    // Stop here: everything below dereferences _drflac.
+    return;
+  }
+
+  // The total sample count is divided by the channel count before it is
+  // divided by the sample rate.
+  _length = (_drflac->totalSampleCount / _drflac->channels) / (double)_drflac->sampleRate;
+
+  _audio_channels = _drflac->channels;
+  _audio_rate = _drflac->sampleRate;
+
+  _can_seek = true;
+  _can_seek_fast = _can_seek;
+
+  _is_valid = true;
+}
+
+/**
+ * Closes the dr_flac decoder, if one was opened.
+ */
+FlacAudioCursor::
+~FlacAudioCursor() {
+  if (_drflac == NULL) {
+    return;
+  }
+  drflac_close(_drflac);
+}
+
+/**
+ * Seeks to the target time t; negative values are treated as 0.  The time
+ * is multiplied by the sample rate to get a per-channel sample position,
+ * which is then multiplied by the channel count before being handed to
+ * dr_flac.  On success, _last_seek is set to the position actually
+ * requested and _samples_read is reset; on failure neither is touched.
+ */
+void FlacAudioCursor::
+seek(double t) {
+  t = max(t, 0.0);
+
+  uint64_t position = t * _drflac->sampleRate;
+  uint64_t target = position * _drflac->channels;
+
+  if (!drflac_seek_to_sample(_drflac, target)) {
+    return;
+  }
+
+  _last_seek = position / (double)_drflac->sampleRate;
+  _samples_read = 0;
+}
+
+/**
+ * Reads audio samples from the stream into data.  n is the number of
+ * samples requested per channel, so the buffer must have room for
+ * n * channels values; multiple-channel audio is interleaved.
+ * _samples_read is advanced by the number of values dr_flac reports having
+ * read, divided by the channel count.
+ */
+void FlacAudioCursor::
+read_samples(int n, PN_int16 *data) {
+  int num_values = n * _audio_channels;
+  _samples_read += drflac_read_s16(_drflac, num_values, data) / _audio_channels;
+}

+ 65 - 0
panda/src/movies/flacAudioCursor.h

@@ -0,0 +1,65 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudioCursor.h
+ * @author rdb
+ * @date 2013-08-23
+ */
+
+#ifndef FLACAUDIOCURSOR_H
+#define FLACAUDIOCURSOR_H
+
+#include "pandabase.h"
+#include "movieAudioCursor.h"
+
+#define DR_FLAC_NO_STDIO
+extern "C" {
+  #include "dr_flac.h"
+}
+
+class FlacAudio;
+
+/**
+ * Interfaces with the dr_flac library to implement decoding of FLAC audio
+ * files.
+ */
+class EXPCL_PANDA_MOVIES FlacAudioCursor : public MovieAudioCursor {
+PUBLISHED:
+  // Opens a decoder that reads from the given stream.
+  FlacAudioCursor(FlacAudio *src, istream *stream);
+  virtual ~FlacAudioCursor();
+  virtual void seek(double offset);
+
+public:
+  virtual void read_samples(int n, PN_int16 *data);
+
+  // Set by the constructor: true only once the decoder has been opened and
+  // the stream properties have been read.
+  bool _is_valid;
+
+protected:
+  // The dr_flac decoder handle, or NULL if none was opened.
+  drflac *_drflac;
+
+public:
+  // Type registration: FlacAudioCursor is registered under the name
+  // "FlacAudioCursor" with MovieAudioCursor as its parent type.
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    MovieAudioCursor::init_type();
+    register_type(_type_handle, "FlacAudioCursor",
+                  MovieAudioCursor::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#include "flacAudioCursor.I"
+
+#endif // FLACAUDIOCURSOR_H

+ 2 - 0
panda/src/movies/p3movies_composite1.cxx

@@ -1,4 +1,6 @@
 #include "config_movies.cxx"
+#include "flacAudio.cxx"
+#include "flacAudioCursor.cxx"
 #include "inkblotVideo.cxx"
 #include "inkblotVideoCursor.cxx"
 #include "microphoneAudio.cxx"

+ 4 - 0
panda/src/pgraph/alphaTestAttrib.h

@@ -36,6 +36,10 @@ PUBLISHED:
   INLINE PN_stdfloat get_reference_alpha() const;
   INLINE PandaCompareFunc get_mode() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(reference_alpha, get_reference_alpha);
+  MAKE_PROPERTY(mode, get_mode);
+
 public:
   virtual void output(ostream &out) const;
 

+ 5 - 0
panda/src/pgraph/antialiasAttrib.h

@@ -52,6 +52,11 @@ PUBLISHED:
   INLINE unsigned short get_mode_type() const;
   INLINE unsigned short get_mode_quality() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_mode);
+  MAKE_PROPERTY(mode_type, get_mode_type);
+  MAKE_PROPERTY(mode_quality, get_mode_quality);
+
 public:
   virtual void output(ostream &out) const;
 

+ 3 - 0
panda/src/pgraph/audioVolumeAttrib.h

@@ -40,6 +40,9 @@ PUBLISHED:
   INLINE PN_stdfloat get_volume() const;
   CPT(RenderAttrib) set_volume(PN_stdfloat volume) const;
 
+PUBLISHED:
+  MAKE_PROPERTY2(volume, has_volume, get_volume);
+
 public:
   virtual void output(ostream &out) const;
 

+ 3 - 0
panda/src/pgraph/auxBitplaneAttrib.h

@@ -63,6 +63,9 @@ PUBLISHED:
 
   INLINE int get_outputs() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(outputs, get_outputs);
+
 public:
   virtual void output(ostream &out) const;
 

+ 8 - 3
panda/src/pgraph/camera.cxx

@@ -272,8 +272,10 @@ write_datagram(BamWriter *manager, Datagram &dg) {
   dg.add_bool(_active);
   dg.add_uint32(_camera_mask.get_word());
 
-  manager->write_pointer(dg, _initial_state);
-  dg.add_stdfloat(_lod_scale);
+  if (manager->get_file_minor_ver() >= 41) {
+    manager->write_pointer(dg, _initial_state);
+    dg.add_stdfloat(_lod_scale);
+  }
 }
 
 ////////////////////////////////////////////////////////////////////
@@ -286,7 +288,10 @@ write_datagram(BamWriter *manager, Datagram &dg) {
 int Camera::
 complete_pointers(TypedWritable **p_list, BamReader *manager) {
   int pi = LensNode::complete_pointers(p_list, manager);
-  _initial_state = DCAST(RenderState, p_list[pi++]);
+
+  if (manager->get_file_minor_ver() >= 41) {
+    _initial_state = DCAST(RenderState, p_list[pi++]);
+  }
   return pi;
 }
 

+ 4 - 0
panda/src/pgraph/colorAttrib.h

@@ -42,6 +42,10 @@ PUBLISHED:
   INLINE Type get_color_type() const;
   INLINE const LColor &get_color() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(color_type, get_color_type);
+  MAKE_PROPERTY(color, get_color);
+
 public:
   virtual void output(ostream &out) const;
 

+ 44 - 15
panda/src/pgraph/colorBlendAttrib.I

@@ -19,6 +19,9 @@ ColorBlendAttrib() :
   _mode(M_none),
   _a(O_one),
   _b(O_one),
+  _alpha_mode(M_none),
+  _alpha_a(O_one),
+  _alpha_b(O_one),
   _color(LColor::zero()),
   _involves_constant_color(false),
   _involves_color_scale(false)
@@ -31,18 +34,29 @@ ColorBlendAttrib() :
 INLINE ColorBlendAttrib::
 ColorBlendAttrib(ColorBlendAttrib::Mode mode,
                  ColorBlendAttrib::Operand a, ColorBlendAttrib::Operand b,
+                 ColorBlendAttrib::Mode alpha_mode,
+                 ColorBlendAttrib::Operand alpha_a, ColorBlendAttrib::Operand alpha_b,
                  const LColor &color) :
   _mode(mode),
   _a(a),
   _b(b),
+  _alpha_mode(alpha_mode),
+  _alpha_a(alpha_a),
+  _alpha_b(alpha_b),
   _color(color),
-  _involves_constant_color(involves_constant_color(a) || involves_constant_color(b)),
-  _involves_color_scale(involves_color_scale(a) || involves_color_scale(b))
+  _involves_constant_color(involves_constant_color(a) ||
+                           involves_constant_color(b) ||
+                           involves_constant_color(alpha_a) ||
+                           involves_constant_color(alpha_b)),
+  _involves_color_scale(involves_color_scale(a) ||
+                        involves_color_scale(b) ||
+                        involves_color_scale(alpha_a) ||
+                        involves_color_scale(alpha_b))
 {
 }
 
 /**
- * Returns the colorBlend mode.
+ * Returns the blending mode for the RGB channels.
  */
 INLINE ColorBlendAttrib::Mode ColorBlendAttrib::
 get_mode() const {
@@ -50,7 +64,7 @@ get_mode() const {
 }
 
 /**
- * Returns the multiplier for the first component.
+ * Returns the RGB multiplier for the first component.
  */
 INLINE ColorBlendAttrib::Operand ColorBlendAttrib::
 get_operand_a() const {
@@ -58,13 +72,37 @@ get_operand_a() const {
 }
 
 /**
- * Returns the multiplier for the second component.
+ * Returns the RGB multiplier for the second component.
  */
 INLINE ColorBlendAttrib::Operand ColorBlendAttrib::
 get_operand_b() const {
   return _b;
 }
 
+/**
+ * Returns the blending mode for the alpha channel.
+ */
+INLINE ColorBlendAttrib::Mode ColorBlendAttrib::
+get_alpha_mode() const {
+  return _alpha_mode;
+}
+
+/**
+ * Returns the alpha multiplier for the first component.
+ */
+INLINE ColorBlendAttrib::Operand ColorBlendAttrib::
+get_alpha_operand_a() const {
+  return _alpha_a;
+}
+
+/**
+ * Returns the alpha multiplier for the second component.
+ */
+INLINE ColorBlendAttrib::Operand ColorBlendAttrib::
+get_alpha_operand_b() const {
+  return _alpha_b;
+}
+
 /**
  * Returns the constant color associated with the attrib.
  */
@@ -114,14 +152,5 @@ involves_constant_color(ColorBlendAttrib::Operand operand) {
  */
 INLINE bool ColorBlendAttrib::
 involves_color_scale(ColorBlendAttrib::Operand operand) {
-  switch (operand) {
-  case O_color_scale:
-  case O_one_minus_color_scale:
-  case O_alpha_scale:
-  case O_one_minus_alpha_scale:
-    return true;
-
-  default:
-    return false;
-  }
+  return (operand >= O_color_scale);
 }

+ 67 - 5
panda/src/pgraph/colorBlendAttrib.cxx

@@ -39,19 +39,38 @@ make_off() {
 CPT(RenderAttrib) ColorBlendAttrib::
 make(ColorBlendAttrib::Mode mode) {
   ColorBlendAttrib *attrib = new ColorBlendAttrib(mode, O_one, O_one,
+                                                  mode, O_one, O_one,
                                                   LColor::zero());
   return return_new(attrib);
 }
 
 /**
  * Constructs a new ColorBlendAttrib object that enables special-effect
- * blending.  This supercedes transparency.
+ * blending.  This supercedes transparency.  The given mode and operands are
+ * used for both the RGB and alpha channels.
  */
 CPT(RenderAttrib) ColorBlendAttrib::
 make(ColorBlendAttrib::Mode mode,
      ColorBlendAttrib::Operand a, ColorBlendAttrib::Operand b,
      const LColor &color) {
-  ColorBlendAttrib *attrib = new ColorBlendAttrib(mode, a, b, color);
+  ColorBlendAttrib *attrib = new ColorBlendAttrib(mode, a, b, mode, a, b, color);
+  return return_new(attrib);
+}
+
+/**
+ * Constructs a new ColorBlendAttrib object that enables special-effect
+ * blending.  This supercedes transparency.  This form is used to specify
+ * separate blending parameters for the RGB and alpha channels.
+ */
+CPT(RenderAttrib) ColorBlendAttrib::
+make(ColorBlendAttrib::Mode mode,
+     ColorBlendAttrib::Operand a, ColorBlendAttrib::Operand b,
+     ColorBlendAttrib::Mode alpha_mode,
+     ColorBlendAttrib::Operand alpha_a, ColorBlendAttrib::Operand alpha_b,
+     const LColor &color) {
+  ColorBlendAttrib *attrib = new ColorBlendAttrib(mode, a, b,
+                                                  alpha_mode, alpha_a, alpha_b,
+                                                  color);
   return return_new(attrib);
 }
 
@@ -156,6 +175,13 @@ write_datagram(BamWriter *manager, Datagram &dg) {
   dg.add_uint8(_mode);
   dg.add_uint8(_a);
   dg.add_uint8(_b);
+
+  if (manager->get_file_minor_ver() >= 42) {
+    dg.add_uint8(_alpha_mode);
+    dg.add_uint8(_alpha_a);
+    dg.add_uint8(_alpha_b);
+  }
+
   _color.write_datagram(dg);
 }
 
@@ -187,10 +213,34 @@ fillin(DatagramIterator &scan, BamReader *manager) {
   _mode = (Mode)scan.get_uint8();
   _a = (Operand)scan.get_uint8();
   _b = (Operand)scan.get_uint8();
+
+  if (manager->get_file_minor_ver() >= 42) {
+    _alpha_mode = (Mode)scan.get_uint8();
+    _alpha_a = (Operand)scan.get_uint8();
+    _alpha_b = (Operand)scan.get_uint8();
+  } else {
+    // Before bam 6.42, these were shifted by four.
+    if (_a >= O_incoming1_color) {
+      _a = (Operand)(_a + 4);
+    }
+    if (_b >= O_incoming1_color) {
+      _b = (Operand)(_b + 4);
+    }
+
+    // And there was only one set of blend constants for both RGB and alpha.
+    _alpha_mode = _mode;
+    _alpha_a = _a;
+    _alpha_b = _b;
+  }
+
   _color.read_datagram(scan);
 
-  _involves_constant_color = involves_constant_color(_a) || involves_constant_color(_b);
-  _involves_color_scale = involves_color_scale(_a) || involves_color_scale(_b);
+  _involves_constant_color =
+    involves_constant_color(_a) || involves_constant_color(_alpha_a) ||
+    involves_constant_color(_b) || involves_constant_color(_alpha_b);
+  _involves_color_scale =
+    involves_color_scale(_a) || involves_color_scale(_alpha_a) ||
+    involves_color_scale(_b) || involves_color_scale(_alpha_b);
 }
 
 /**
@@ -234,7 +284,7 @@ operator << (ostream &out, ColorBlendAttrib::Operand operand) {
     return out << "one";
 
   case ColorBlendAttrib::O_incoming_color:
-    return out << "incomfing_color";
+    return out << "incoming_color";
 
   case ColorBlendAttrib::O_one_minus_incoming_color:
     return out << "one_minus_incoming_color";
@@ -283,6 +333,18 @@ operator << (ostream &out, ColorBlendAttrib::Operand operand) {
 
   case ColorBlendAttrib::O_one_minus_alpha_scale:
     return out << "one_minus_alpha_scale";
+
+  case ColorBlendAttrib::O_incoming1_color:
+    return out << "incoming1_color";
+
+  case ColorBlendAttrib::O_one_minus_incoming1_color:
+    return out << "one_minus_incoming1_color";
+
+  case ColorBlendAttrib::O_incoming1_alpha:
+    return out << "incoming1_alpha";
+
+  case ColorBlendAttrib::O_one_minus_incoming1_alpha:
+    return out << "one_minus_incoming1_alpha";
   }
 
   return out << "**invalid ColorBlendAttrib::Operand(" << (int)operand << ")**";

+ 36 - 5
panda/src/pgraph/colorBlendAttrib.h

@@ -52,11 +52,20 @@ PUBLISHED:
     O_one_minus_constant_alpha,
     O_incoming_color_saturate,  // valid only for operand a
 
-    // If you set either of the operands to any of the below, the blend color
-    // is taken from the current ColorScaleAttrib.  This also inhibits the
-    // normal behavior of the ColorScaleAttrib; it no longer directly scales
-    // the vertex colors, on the assumption that you will instead take care of
-    // the scale here, in the blend mode.
+    // The following are used for dual-source blending, where the fragment
+    // shader outputs a second color that will be used for blending.
+    O_incoming1_color,
+    O_one_minus_incoming1_color,
+    O_incoming1_alpha,
+    O_one_minus_incoming1_alpha,
+
+    // If you set any of the operands to any of the below, the blend color is
+    // taken from the current ColorScaleAttrib.  This also inhibits the normal
+    // behavior of the ColorScaleAttrib; it no longer directly scales the
+    // vertex colors, on the assumption that you will instead take care of the
+    // scale here, in the blend mode.
+    //
+    // These modes are being considered for deprecation.
     O_color_scale,
     O_one_minus_color_scale,
     O_alpha_scale,
@@ -66,6 +75,7 @@ PUBLISHED:
 private:
   INLINE ColorBlendAttrib();
   INLINE ColorBlendAttrib(Mode mode, Operand a, Operand b,
+                          Mode alpha_mode, Operand alpha_a, Operand alpha_b,
                           const LColor &color);
 
 PUBLISHED:
@@ -73,11 +83,19 @@ PUBLISHED:
   static CPT(RenderAttrib) make(Mode mode);
   static CPT(RenderAttrib) make(Mode mode, Operand a, Operand b,
                                 const LColor &color = LColor::zero());
+  static CPT(RenderAttrib) make(Mode rgb_mode, Operand rgb_a, Operand rgb_b,
+                                Mode alpha_mode, Operand alpha_a, Operand alpha_b,
+                                const LColor &color = LColor::zero());
   static CPT(RenderAttrib) make_default();
 
   INLINE Mode get_mode() const;
   INLINE Operand get_operand_a() const;
   INLINE Operand get_operand_b() const;
+
+  INLINE Mode get_alpha_mode() const;
+  INLINE Operand get_alpha_operand_a() const;
+  INLINE Operand get_alpha_operand_b() const;
+
   INLINE LColor get_color() const;
 
   INLINE bool involves_constant_color() const;
@@ -86,6 +104,17 @@ PUBLISHED:
   INLINE static bool involves_constant_color(Operand operand);
   INLINE static bool involves_color_scale(Operand operand);
 
+PUBLISHED:
+  MAKE_PROPERTY(rgb_mode, get_mode);
+  MAKE_PROPERTY(rgb_operand_a, get_operand_a);
+  MAKE_PROPERTY(rgb_operand_b, get_operand_b);
+
+  MAKE_PROPERTY(alpha_mode, get_alpha_mode);
+  MAKE_PROPERTY(alpha_operand_a, get_alpha_operand_a);
+  MAKE_PROPERTY(alpha_operand_b, get_alpha_operand_b);
+
+  MAKE_PROPERTY(color, get_color);
+
 public:
   virtual void output(ostream &out) const;
 
@@ -97,6 +126,8 @@ protected:
 private:
   Mode _mode;
   Operand _a, _b;
+  Mode _alpha_mode;
+  Operand _alpha_a, _alpha_b;
   LColor _color;
   bool _involves_constant_color;
   bool _involves_color_scale;

+ 3 - 0
panda/src/pgraph/colorScaleAttrib.h

@@ -43,6 +43,9 @@ PUBLISHED:
   INLINE const LVecBase4 &get_scale() const;
   CPT(RenderAttrib) set_scale(const LVecBase4 &scale) const;
 
+PUBLISHED:
+  MAKE_PROPERTY2(scale, has_scale, get_scale);
+
 public:
   virtual bool lower_attrib_can_override() const;
   virtual void output(ostream &out) const;

+ 3 - 0
panda/src/pgraph/colorWriteAttrib.h

@@ -48,6 +48,9 @@ PUBLISHED:
 
   INLINE unsigned int get_channels() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(channels, get_channels);
+
 public:
   virtual void output(ostream &out) const;
 

+ 4 - 0
panda/src/pgraph/cullBinAttrib.h

@@ -35,6 +35,10 @@ PUBLISHED:
   INLINE const string &get_bin_name() const;
   INLINE int get_draw_order() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(bin_name, get_bin_name);
+  MAKE_PROPERTY(draw_order, get_draw_order);
+
 public:
   virtual void output(ostream &out) const;
 

+ 5 - 0
panda/src/pgraph/cullFaceAttrib.h

@@ -44,6 +44,11 @@ PUBLISHED:
   INLINE bool get_reverse() const;
   Mode get_effective_mode() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_actual_mode);
+  MAKE_PROPERTY(reverse, get_reverse);
+  MAKE_PROPERTY(effective_mode, get_effective_mode);
+
 public:
   virtual void output(ostream &out) const;
 

+ 5 - 0
panda/src/pgraph/depthOffsetAttrib.h

@@ -60,6 +60,11 @@ PUBLISHED:
   INLINE PN_stdfloat get_min_value() const;
   INLINE PN_stdfloat get_max_value() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(offset, get_offset);
+  MAKE_PROPERTY(min_value, get_min_value);
+  MAKE_PROPERTY(max_value, get_max_value);
+
 public:
   virtual void output(ostream &out) const;
 

+ 3 - 0
panda/src/pgraph/depthTestAttrib.h

@@ -33,6 +33,9 @@ PUBLISHED:
 
   INLINE PandaCompareFunc get_mode() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_mode);
+
 public:
   virtual void output(ostream &out) const;
 

+ 3 - 0
panda/src/pgraph/depthWriteAttrib.h

@@ -39,6 +39,9 @@ PUBLISHED:
 
   INLINE Mode get_mode() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_mode);
+
 public:
   virtual void output(ostream &out) const;
 

+ 3 - 0
panda/src/pgraph/fogAttrib.h

@@ -34,6 +34,9 @@ PUBLISHED:
   INLINE bool is_off() const;
   INLINE Fog *get_fog() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(fog, get_fog);
+
 public:
   virtual void output(ostream &out) const;
 

+ 3 - 0
panda/src/pgraph/lightRampAttrib.h

@@ -52,6 +52,9 @@ PUBLISHED:
   INLINE PN_stdfloat get_level(int n) const;
   INLINE PN_stdfloat get_threshold(int n) const;
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_mode);
+
 public:
   virtual void output(ostream &out) const;
 

+ 3 - 0
panda/src/pgraph/materialAttrib.h

@@ -36,6 +36,9 @@ PUBLISHED:
   INLINE bool is_off() const;
   INLINE Material *get_material() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(material, get_material);
+
 public:
   virtual void output(ostream &out) const;
 

+ 6 - 1
panda/src/pgraph/renderModeAttrib.h

@@ -63,9 +63,14 @@ PUBLISHED:
   INLINE PN_stdfloat get_thickness() const;
   INLINE bool get_perspective() const;
   INLINE const LColor &get_wireframe_color() const;
-
   INLINE int get_geom_rendering(int geom_rendering) const;
 
+PUBLISHED:
+  MAKE_PROPERTY(mode, get_mode);
+  MAKE_PROPERTY(thickness, get_thickness);
+  MAKE_PROPERTY(perspective, get_perspective);
+  MAKE_PROPERTY(wireframe_color, get_wireframe_color);
+
 public:
   virtual void output(ostream &out) const;
 

+ 1 - 0
panda/src/pgraph/rescaleNormalAttrib.h

@@ -49,6 +49,7 @@ PUBLISHED:
   INLINE static CPT(RenderAttrib) make_default();
 
   INLINE Mode get_mode() const;
+  MAKE_PROPERTY(mode, get_mode);
 
 public:
   virtual void output(ostream &out) const;

+ 3 - 0
panda/src/pgraph/scissorAttrib.h

@@ -47,6 +47,9 @@ PUBLISHED:
 
   INLINE const LVecBase4 &get_frame() const;
 
+PUBLISHED:
+  MAKE_PROPERTY(frame, get_frame);
+
 public:
   virtual void output(ostream &out) const;
 

+ 1 - 0
panda/src/pgraph/shadeModelAttrib.h

@@ -39,6 +39,7 @@ PUBLISHED:
   static CPT(RenderAttrib) make_default();
 
   INLINE Mode get_mode() const;
+  MAKE_PROPERTY(mode, get_mode);
 
 public:
   virtual void output(ostream &out) const;

+ 4 - 0
panda/src/pgraph/shaderAttrib.h

@@ -114,6 +114,10 @@ PUBLISHED:
 
   static void register_with_read_factory();
 
+PUBLISHED:
+  MAKE_PROPERTY(shader, get_shader);
+  MAKE_PROPERTY(instance_count, get_instance_count);
+
 public:
   virtual void output(ostream &out) const;
 

+ 1 - 0
panda/src/pgraph/transparencyAttrib.h

@@ -51,6 +51,7 @@ PUBLISHED:
   static CPT(RenderAttrib) make_default();
 
   INLINE Mode get_mode() const;
+  MAKE_PROPERTY(mode, get_mode);
 
 public:
   virtual void output(ostream &out) const;

+ 3 - 0
panda/src/pgraphnodes/config_pgraphnodes.cxx

@@ -29,6 +29,7 @@
 #include "selectiveChildNode.h"
 #include "sequenceNode.h"
 #include "shaderGenerator.h"
+#include "sphereLight.h"
 #include "spotlight.h"
 #include "switchNode.h"
 #include "uvScrollNode.h"
@@ -123,6 +124,7 @@ init_libpgraphnodes() {
   SelectiveChildNode::init_type();
   SequenceNode::init_type();
   ShaderGenerator::init_type();
+  SphereLight::init_type();
   Spotlight::init_type();
   SwitchNode::init_type();
   UvScrollNode::init_type();
@@ -137,6 +139,7 @@ init_libpgraphnodes() {
   PointLight::register_with_read_factory();
   SelectiveChildNode::register_with_read_factory();
   SequenceNode::register_with_read_factory();
+  SphereLight::register_with_read_factory();
   Spotlight::register_with_read_factory();
   SwitchNode::register_with_read_factory();
   UvScrollNode::register_with_read_factory();

+ 2 - 0
panda/src/pgraphnodes/p3pgraphnodes_composite1.cxx

@@ -7,3 +7,5 @@
 #include "fadeLodNodeData.cxx"
 #include "lightLensNode.cxx"
 #include "lightNode.cxx"
+#include "lodNode.cxx"
+#include "lodNodeType.cxx"

+ 1 - 2
panda/src/pgraphnodes/p3pgraphnodes_composite2.cxx

@@ -1,11 +1,10 @@
-#include "lodNode.cxx"
-#include "lodNodeType.cxx"
 #include "nodeCullCallbackData.cxx"
 #include "pointLight.cxx"
 #include "sceneGraphAnalyzer.cxx"
 #include "selectiveChildNode.cxx"
 #include "sequenceNode.cxx"
 #include "shaderGenerator.cxx"
+#include "sphereLight.cxx"
 #include "spotlight.cxx"
 #include "switchNode.cxx"
 #include "uvScrollNode.cxx"

+ 48 - 0
panda/src/pgraphnodes/sphereLight.I

@@ -0,0 +1,48 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file sphereLight.I
+ * @author rdb
+ * @date 2016-04-15
+ */
+
+/**
+ * Default constructor for the cycled data; starts the sphere out with a small
+ * radius of 0.01.
+ */
+INLINE SphereLight::CData::
+CData() :
+  _radius(0.01f)
+{
+}
+
+/**
+ * Copy constructor for the cycled data; duplicates the radius of the other
+ * object.
+ */
+INLINE SphereLight::CData::
+CData(const SphereLight::CData &copy) :
+  _radius(copy._radius)
+{
+}
+
+/**
+ * Reports the radius of the light's sphere, as read from the cycled data.
+ */
+INLINE PN_stdfloat SphereLight::
+get_radius() const {
+  CDReader reader(_cycler);
+  const PN_stdfloat radius = reader->_radius;
+  return radius;
+}
+
+/**
+ * Stores a new radius for the light's sphere in the cycled data.
+ */
+INLINE void SphereLight::
+set_radius(PN_stdfloat radius) {
+  CDWriter writer(_cycler);
+  writer->_radius = radius;
+}

+ 146 - 0
panda/src/pgraphnodes/sphereLight.cxx

@@ -0,0 +1,146 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file sphereLight.cxx
+ * @author rdb
+ * @date 2016-04-15
+ */
+
+#include "sphereLight.h"
+#include "graphicsStateGuardianBase.h"
+#include "bamWriter.h"
+#include "bamReader.h"
+#include "datagram.h"
+#include "datagramIterator.h"
+
+TypeHandle SphereLight::_type_handle;
+
+/**
+ * Allocates and returns a new CData that is a copy of this one.
+ */
+CycleData *SphereLight::CData::
+make_copy() const {
+  CData *duplicate = new CData(*this);
+  return duplicate;
+}
+
+/**
+ * Writes the contents of this object to the datagram for shipping out to a
+ * Bam file.  The only value stored is the radius; the manager is not used
+ * here.
+ */
+void SphereLight::CData::
+write_datagram(BamWriter *manager, Datagram &dg) const {
+  dg.add_stdfloat(_radius);
+}
+
+/**
+ * Reads the cycled data back in from the Bam stream: a single stdfloat
+ * holding the radius, mirroring what write_datagram() stored.
+ */
+void SphereLight::CData::
+fillin(DatagramIterator &scan, BamReader *manager) {
+  const PN_stdfloat radius = scan.get_stdfloat();
+  _radius = radius;
+}
+
+/**
+ * Constructs a new SphereLight with the indicated node name.  The name is
+ * passed straight through to the PointLight base class.
+ */
+SphereLight::
+SphereLight(const string &name) :
+  PointLight(name)
+{
+}
+
+/**
+ * Do not call the copy constructor directly; instead, use make_copy() or
+ * copy_subgraph() to make a copy of a node.
+ *
+ * Copies the PointLight portion and the cycler holding the radius.
+ */
+SphereLight::
+SphereLight(const SphereLight &copy) :
+  PointLight(copy),
+  _cycler(copy._cycler)
+{
+}
+
+/**
+ * Creates a new SphereLight via the copy constructor and returns it as a
+ * PandaNode.  The result is a distinct object from this one; no children are
+ * copied.
+ */
+PandaNode *SphereLight::
+make_copy() const {
+  SphereLight *duplicate = new SphereLight(*this);
+  return duplicate;
+}
+
+/**
+ * Applies the indicated matrix to this light.  After the PointLight base
+ * class has been transformed, the radius is replaced by the length of the
+ * vector (0, 0, radius) as transformed by the matrix, and the visualization
+ * is marked stale.
+ */
+void SphereLight::
+xform(const LMatrix4 &mat) {
+  PointLight::xform(mat);
+
+  CDWriter writer(_cycler);
+  const LVector3 radius_vec(0, 0, writer->_radius);
+  writer->_radius = mat.xform_vec(radius_vec).length();
+
+  mark_viz_stale();
+}
+
+/**
+ * Writes a description of this light to the indicated stream: first whatever
+ * PointLight::write() emits, then a header line for this node followed by
+ * the radius, indented two further columns.
+ */
+void SphereLight::
+write(ostream &out, int indent_level) const {
+  PointLight::write(out, indent_level);
+
+  const PN_stdfloat radius = get_radius();
+  indent(out, indent_level) << *this << ":\n";
+  indent(out, indent_level + 2) << "radius " << radius << "\n";
+}
+
+/**
+ * Tells the BamReader how to create objects of type SphereLight, by
+ * registering make_from_bam() as the factory function for this class's
+ * TypeHandle.
+ */
+void SphereLight::
+register_with_read_factory() {
+  BamReader::get_factory()->register_factory(get_class_type(), make_from_bam);
+}
+
+/**
+ * Writes the contents of this object to the datagram for shipping out to a
+ * Bam file.  The PointLight data is written first, followed by this class's
+ * cycled data; fillin() reads them back in the same order.
+ */
+void SphereLight::
+write_datagram(BamWriter *manager, Datagram &dg) {
+  PointLight::write_datagram(manager, dg);
+  manager->write_cdata(dg, _cycler);
+}
+
+/**
+ * Factory callback registered with the BamReader.  Unpacks the factory
+ * parameters, creates an unnamed SphereLight, lets it read itself from the
+ * datagram, and returns it.
+ */
+TypedWritable *SphereLight::
+make_from_bam(const FactoryParams &params) {
+  DatagramIterator scan;
+  BamReader *manager;
+  parse_params(params, scan, manager);
+
+  SphereLight *light = new SphereLight("");
+  light->fillin(scan, manager);
+  return light;
+}
+
+/**
+ * This internal function is called by make_from_bam to read in all of the
+ * relevant data from the BamFile for the new SphereLight.  The PointLight
+ * data is read first, then this class's cycled data, matching the order used
+ * by write_datagram().
+ */
+void SphereLight::
+fillin(DatagramIterator &scan, BamReader *manager) {
+  PointLight::fillin(scan, manager);
+
+  manager->read_cdata(scan, _cycler);
+}

+ 90 - 0
panda/src/pgraphnodes/sphereLight.h

@@ -0,0 +1,90 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file sphereLight.h
+ * @author rdb
+ * @date 2016-04-15
+ */
+
+#ifndef SPHERELIGHT_H
+#define SPHERELIGHT_H
+
+#include "pandabase.h"
+
+#include "lightLensNode.h"
+// SphereLight derives from PointLight, so its declaration must be visible
+// here; without this the header only compiles when pointLight.h happens to
+// have been included earlier.
+#include "pointLight.h"
+
+/**
+ * A sphere light is like a point light, except that it represents a sphere
+ * with a radius, rather than being an infinitely thin point in space.
+ */
+class EXPCL_PANDA_PGRAPHNODES SphereLight : public PointLight {
+PUBLISHED:
+  SphereLight(const string &name);
+
+protected:
+  // Used by make_copy(); not meant to be called directly.
+  SphereLight(const SphereLight &copy);
+
+public:
+  virtual PandaNode *make_copy() const;
+  virtual void xform(const LMatrix4 &mat);
+  virtual void write(ostream &out, int indent_level) const;
+
+PUBLISHED:
+  // The radius of the sphere, also exposed as the "radius" property.
+  INLINE PN_stdfloat get_radius() const;
+  INLINE void set_radius(PN_stdfloat radius);
+  MAKE_PROPERTY(radius, get_radius, set_radius);
+
+private:
+  // This is the data that must be cycled between pipeline stages.
+  class EXPCL_PANDA_PGRAPHNODES CData : public CycleData {
+  public:
+    INLINE CData();
+    INLINE CData(const CData &copy);
+    virtual CycleData *make_copy() const;
+    virtual void write_datagram(BamWriter *manager, Datagram &dg) const;
+    virtual void fillin(DatagramIterator &scan, BamReader *manager);
+    virtual TypeHandle get_parent_type() const {
+      return SphereLight::get_class_type();
+    }
+
+    PN_stdfloat _radius;
+  };
+
+  PipelineCycler<CData> _cycler;
+  typedef CycleDataReader<CData> CDReader;
+  typedef CycleDataWriter<CData> CDWriter;
+
+public:
+  // Bam file support.
+  static void register_with_read_factory();
+  virtual void write_datagram(BamWriter *manager, Datagram &dg);
+
+protected:
+  static TypedWritable *make_from_bam(const FactoryParams &params);
+  void fillin(DatagramIterator &scan, BamReader *manager);
+
+public:
+  // Run-time type registration.
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    PointLight::init_type();
+    register_type(_type_handle, "SphereLight",
+                  PointLight::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#include "sphereLight.I"
+
+#endif

+ 7 - 0
panda/src/pnmimagetypes/config_pnmimagetypes.cxx

@@ -22,6 +22,7 @@
 #include "pnmFileTypePNM.h"
 #include "pnmFileTypePfm.h"
 #include "pnmFileTypeTIFF.h"
+#include "pnmFileTypeStbImage.h"
 #include "sgi.h"
 
 #include "config_pnmimage.h"
@@ -240,6 +241,12 @@ init_libpnmimagetypes() {
   tr->register_type(new PNMFileTypeTIFF);
 #endif
 
+#ifdef HAVE_STB_IMAGE
+  PNMFileTypeStbImage::init_type();
+  PNMFileTypeStbImage::register_with_read_factory();
+  tr->register_type(new PNMFileTypeStbImage);
+#endif
+
   // And register with the PandaSystem.
   PandaSystem *ps = PandaSystem::get_global_ptr();
 

+ 1 - 0
panda/src/pnmimagetypes/p3pnmimagetypes_composite2.cxx

@@ -6,5 +6,6 @@
 #include "pnmFileTypeSGIReader.cxx"
 #include "pnmFileTypeSGIWriter.cxx"
 #include "pnmFileTypeSoftImage.cxx"
+#include "pnmFileTypeStbImage.cxx"
 #include "pnmFileTypeTGA.cxx"
 

+ 509 - 0
panda/src/pnmimagetypes/pnmFileTypeStbImage.cxx

@@ -0,0 +1,509 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file pnmFileTypeStbImage.cxx
+ * @author rdb
+ * @date 2016-03-31
+ */
+
+#include "pnmFileTypeStbImage.h"
+
+#ifdef HAVE_STB_IMAGE
+
+#include "config_pnmimagetypes.h"
+#include "pnmFileTypeRegistry.h"
+#include "bamReader.h"
+
+// We use the public domain stb_image library for loading images.  Define the
+// stb_image implementation.  We only use it in this unit.
+#define STB_IMAGE_STATIC
+#define STB_IMAGE_IMPLEMENTATION
+
+// Disable the stb_image implementation of these formats if we already support
+// it through different loaders.  Each STBI_ONLY_xxx below is defined only
+// when the corresponding HAVE_xxx symbol is absent, i.e. only when no other
+// loader for that format has been compiled in.
+#ifndef HAVE_JPEG
+#define STBI_ONLY_JPEG
+#endif
+#ifndef HAVE_PNG
+#define STBI_ONLY_PNG
+#endif
+#ifndef HAVE_BMP
+#define STBI_ONLY_BMP
+#endif
+#ifndef HAVE_TGA
+#define STBI_ONLY_TGA
+#endif
+#ifndef HAVE_SOFTIMAGE_PIC
+#define STBI_ONLY_PIC
+#endif
+#ifndef HAVE_PNM
+#define STBI_ONLY_PNM
+#endif
+
+// These are always enabled because we don't support these via other means.
+#define STBI_ONLY_PSD
+#define STBI_ONLY_HDR
+#define STBI_ONLY_GIF
+
+#ifndef NDEBUG
+// Get friendlier error messages in development builds.
+#define STBI_FAILURE_USERMSG
+#endif
+
+// We read via callbacks, so no need for stbi_load_from_file.
+#define STBI_NO_STDIO
+
+// This must come after all of the configuration macros above.
+#include "stb_image.h"
+
+// The filename extensions handled by this file type.  The conditional entries
+// follow the same HAVE_xxx tests as the STBI_ONLY_xxx configuration, so an
+// extension is listed only when no other loader claims it.
+static const char *const stb_extensions[] = {
+#ifndef HAVE_JPEG
+  "jpg", "jpeg",
+#endif
+#ifndef HAVE_PNG
+  "png",
+#endif
+#ifndef HAVE_BMP
+  "bmp",
+#endif
+#ifndef HAVE_TGA
+  "tga",
+#endif
+#ifndef HAVE_SOFTIMAGE_PIC
+  "pic",
+#endif
+#ifndef HAVE_PNM
+  "ppm", "pgm",
+#endif
+
+  // No other loaders exist for these, so they are always listed.
+  "psd",
+  "hdr",
+  "gif",
+};
+
+// Number of entries in stb_extensions.
+static const int num_stb_extensions =
+  sizeof(stb_extensions) / sizeof(stb_extensions[0]);
+
+// Callbacks to allow stb_image to read from VFS.
+
+// Read callback: fills data with up to size bytes from the istream passed as
+// user data and returns the number of bytes actually read.
+static int cb_read(void *user, char *data, int size) {
+  istream *in = (istream *)user;
+  nassertr(in != NULL, 0);
+
+  in->read(data, size);
+
+  // Hitting the end of the file is not an error for our purposes; reset the
+  // stream state so that the stream remains usable afterwards.
+  const bool hit_eof = in->eof();
+  if (hit_eof) {
+    in->clear();
+  }
+
+  int num_read = (int)in->gcount();
+  return num_read;
+}
+
+// Skip callback: advances the istream passed as user data by n bytes.  A
+// relative seek is attempted first; if the stream reports failure, the bytes
+// are read and thrown away instead.
+static void cb_skip(void *user, int n) {
+  istream *in = (istream *)user;
+  nassertv(in != NULL);
+
+  in->seekg(n, ios::cur);
+
+  if (in->fail()) {
+    in->clear();
+
+    // Implement skip by just reading and discarding the result.
+    static const int size = 4096;
+    char data[size];
+    while (n > size) {
+      in->read(data, size);
+      if (in->fail()) {
+        // The stream ran out (or broke); every further read would be a
+        // no-op, so there is no point in looping over the remainder.
+        return;
+      }
+      n -= size;
+    }
+    if (n > 0) {
+      in->read(data, n);
+    }
+  }
+}
+
+// End-of-file callback: returns nonzero if the istream passed as user data
+// has its eof flag set.  A missing stream is reported as end-of-file.
+static int cb_eof(void *user) {
+  istream *in = (istream *)user;
+  nassertr(in != NULL, 1);
+
+  const bool at_end = in->eof();
+  return at_end;
+}
+
+// The callback table handed to stbi__start_callbacks().
+static stbi_io_callbacks io_callbacks = {cb_read, cb_skip, cb_eof};
+
+/**
+ * The PNMReader implementation that decodes images through stb_image.
+ *
+ * This is defined in the .cxx file so we have access to stbi_context.
+ */
+class StbImageReader : public PNMReader {
+public:
+  StbImageReader(PNMFileType *type, istream *file, bool owns_file, string magic_number);
+
+  virtual bool is_floating_point();
+  virtual bool read_pfm(PfmFile &pfm);
+  virtual int read_data(xel *array, xelval *alpha);
+
+private:
+  // True if the file was recognized as HDR; see is_floating_point().
+  bool _is_float;
+  // The stb_image decoding state, bound to the input stream via io_callbacks.
+  stbi__context _context;
+};
+
+TypeHandle PNMFileTypeStbImage::_type_handle;
+
+/**
+ * Default constructor; there is no state to initialize.
+ */
+PNMFileTypeStbImage::
+PNMFileTypeStbImage() {
+}
+
+/**
+ * Returns a few words describing the file type; for this type, always the
+ * string "stb_image".
+ */
+string PNMFileTypeStbImage::
+get_name() const {
+  return "stb_image";
+}
+
+/**
+ * Returns the number of different possible filename extensions associated
+ * with this particular file type.  This is the size of the stb_extensions
+ * table, which depends on the compile-time HAVE_xxx settings.
+ */
+int PNMFileTypeStbImage::
+get_num_extensions() const {
+  return num_stb_extensions;
+}
+
+/**
+ * Returns the nth possible filename extension associated with this particular
+ * file type, without a leading dot.  An out-of-range n triggers an assertion
+ * with the empty string as its return value.
+ */
+string PNMFileTypeStbImage::
+get_extension(int n) const {
+  nassertr(n >= 0 && n < num_stb_extensions, string());
+  return stb_extensions[n];
+}
+
+/**
+ * Returns true if this particular file type uses a magic number to identify
+ * it, false otherwise.  This type always returns false.
+ */
+bool PNMFileTypeStbImage::
+has_magic_number() const {
+  return false;
+}
+
+/**
+ * Returns true if the indicated "magic number" byte stream (the initial few
+ * bytes read from the file) matches this particular file type, false
+ * otherwise.  This type always returns false, consistent with
+ * has_magic_number().
+ */
+bool PNMFileTypeStbImage::
+matches_magic_number(const string &magic_number) const {
+  return false;
+}
+
+/**
+ * Creates a StbImageReader for the given stream and returns it as a
+ * PNMReader.  Any magic number bytes that were already consumed from the
+ * stream are passed along to the reader.
+ */
+PNMReader *PNMFileTypeStbImage::
+make_reader(istream *file, bool owns_file, const string &magic_number) {
+  init_pnm();
+
+  PNMReader *reader = new StbImageReader(this, file, owns_file, magic_number);
+  return reader;
+}
+
+/**
+ * Prepares to read an image from the indicated stream.  The magic number
+ * bytes already consumed by the caller are pushed back onto the stream, and
+ * stb_image is then asked for the image dimensions and channel count.  The
+ * reader is marked invalid if either step fails.
+ */
+StbImageReader::
+StbImageReader(PNMFileType *type, istream *file, bool owns_file, string magic_number) :
+  PNMReader(type, file, owns_file),
+  _is_float(false)
+{
+  // Return the magic number to the stream, last byte first.  This relies on
+  // the stream allowing more than one character to be put back.
+  for (size_t remaining = magic_number.size(); remaining > 0; --remaining) {
+    _file->putback(magic_number[remaining - 1]);
+  }
+  if (_file->fail()) {
+    pnmimage_cat.error()
+      << "Unable to put back magic number.\n";
+    _is_valid = false;
+    return;
+  }
+
+  stbi__start_callbacks(&_context, &io_callbacks, (void *)file);
+
+  // A file starting with "#?" is first probed as HDR, which is read as
+  // floating-point data; anything else goes through the generic probe.
+  const bool has_hdr_magic = (strncmp(magic_number.c_str(), "#?", 2) == 0);
+  if (has_hdr_magic &&
+      stbi__hdr_info(&_context, &_x_size, &_y_size, &_num_channels)) {
+    _is_float = true;
+    _is_valid = true;
+  } else if (stbi__info_main(&_context, &_x_size, &_y_size, &_num_channels)) {
+    _is_valid = true;
+  } else {
+    _is_valid = false;
+    pnmimage_cat.error()
+      << "stb_info failure: " << stbi_failure_reason() << "\n";
+  }
+
+  _maxval = 255;
+}
+
+/**
+ * Returns true if this PNMFileType represents a floating-point image type,
+ * false if it is a normal, integer type.  If this returns true, read_pfm() is
+ * implemented instead of read_data().
+ *
+ * The flag is set by the constructor when the file is recognized as HDR.
+ */
+bool StbImageReader::
+is_floating_point() {
+  return _is_float;
+}
+
+/**
+ * Reads RGBE pixels stored without run-length encoding, filling the pixels
+ * with linear index first through num_pixels - 1 of hdr_data (three floats
+ * per pixel).
+ */
+static void
+read_flat_hdr_pixels(stbi__context *context, float *hdr_data,
+                     size_t first, size_t num_pixels) {
+  for (size_t pi = first; pi < num_pixels; ++pi) {
+    // Zero-initialized so that a short read is not converted from
+    // uninitialized bytes.
+    stbi_uc rgbe[4] = {0, 0, 0, 0};
+    stbi__getn(context, rgbe, 4);
+    stbi__hdr_convert(hdr_data + pi * 3, rgbe, 3);
+  }
+}
+
+/**
+ * Decodes the pixel data of a Radiance HDR file into hdr_data, which must
+ * have room for width * height * 3 floats.  Both dimensions must be
+ * positive.  Returns true on success, false if the data is corrupt or the
+ * scanline buffer could not be allocated.
+ *
+ * This follows the decoding logic of stbi__hdr_load.
+ */
+static bool
+decode_hdr_pixels(stbi__context *context, float *hdr_data,
+                  int width, int height) {
+  const size_t num_pixels = (size_t)width * (size_t)height;
+
+  // Load image data
+  // image data is stored as some number of scanlines
+  if (width < 8 || width >= 32768) {
+    // Read flat data
+    read_flat_hdr_pixels(context, hdr_data, 0, num_pixels);
+    return true;
+  }
+
+  // Read RLE-encoded data
+  stbi_uc *scanline = (stbi_uc *) stbi__malloc((size_t)width * 4);
+  if (scanline == NULL) {
+    pnmimage_cat.error() << "Out of memory while reading HDR file.\n";
+    return false;
+  }
+
+  for (int j = 0; j < height; ++j) {
+    int c1 = stbi__get8(context);
+    int c2 = stbi__get8(context);
+    int len = stbi__get8(context);
+    if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+      // not run-length encoded, so we have to actually use THIS data as a decoded
+      // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+      stbi_uc rgbe[4];
+      rgbe[0] = (stbi_uc) c1;
+      rgbe[1] = (stbi_uc) c2;
+      rgbe[2] = (stbi_uc) len;
+      rgbe[3] = (stbi_uc) stbi__get8(context);
+      stbi__hdr_convert(hdr_data, rgbe, 3);
+      STBI_FREE(scanline);
+
+      // As in stb_image, the rest of the stream is then treated as flat
+      // data, continuing with the second pixel of the image.
+      read_flat_hdr_pixels(context, hdr_data, 1, num_pixels);
+      return true;
+    }
+    len <<= 8;
+    len |= stbi__get8(context);
+    if (len != width) {
+      STBI_FREE(scanline);
+      pnmimage_cat.error() << "Corrupt HDR: invalid decoded scanline length.\n";
+      return false;
+    }
+
+    // Each of the four RGBE components is stored as a separate sequence of
+    // runs and dumps covering the whole scanline.
+    for (int k = 0; k < 4; ++k) {
+      int i = 0;
+      while (i < width) {
+        const int nleft = width - i;
+        int count = stbi__get8(context);
+        if (count > 128) {
+          // Run
+          stbi_uc value = stbi__get8(context);
+          count -= 128;
+          if (count > nleft) {
+            // The run would write past the end of the scanline buffer.
+            STBI_FREE(scanline);
+            pnmimage_cat.error() << "Corrupt HDR: bad RLE data.\n";
+            return false;
+          }
+          for (int z = 0; z < count; ++z) {
+            scanline[i++ * 4 + k] = value;
+          }
+        } else {
+          // Dump
+          if (count == 0 || count > nleft) {
+            // A zero count would never advance (and is what we get at the
+            // end of the file); a large count would overrun the buffer.
+            STBI_FREE(scanline);
+            pnmimage_cat.error() << "Corrupt HDR: bad RLE data.\n";
+            return false;
+          }
+          for (int z = 0; z < count; ++z) {
+            scanline[i++ * 4 + k] = stbi__get8(context);
+          }
+        }
+      }
+    }
+    for (int i = 0; i < width; ++i) {
+      stbi__hdr_convert(hdr_data + ((size_t)j * width + i) * 3,
+                        scanline + i * 4, 3);
+    }
+  }
+
+  STBI_FREE(scanline);
+  return true;
+}
+
+/**
+ * Reads floating-point data directly into the indicated PfmFile.  Returns
+ * true on success, false on failure.
+ *
+ * The file is rewound and parsed as a Radiance HDR image: the header is
+ * validated, the PfmFile is resized to the dimensions given in the file, and
+ * the pixels are decoded straight into its table.
+ */
+bool StbImageReader::
+read_pfm(PfmFile &pfm) {
+  if (!is_valid()) {
+    return false;
+  }
+
+  // Reposition the file at the beginning.
+  _file->seekg(0, ios::beg);
+  if (_file->tellg() != 0) {
+    pnmimage_cat.error()
+      << "Could not reposition file pointer to the beginning.\n";
+    return false;
+  }
+
+  stbi__start_callbacks(&_context, &io_callbacks, (void *)_file);
+
+  nassertr(_num_channels == 3, false);
+
+  // This next bit is copied and pasted from stbi__hdr_load so that we can
+  // avoid making an unnecessary extra copy of the data.
+  char buffer[STBI__HDR_BUFLEN];
+  char *token;
+  int valid = 0;
+  int width, height;
+
+  // Check identifier
+  if (strcmp(stbi__hdr_gettoken(&_context, buffer), "#?RADIANCE") != 0) {
+    pnmimage_cat.error()
+      << "Missing #?RADIANCE header.\n";
+    return false;
+  }
+
+  // Parse header
+  for(;;) {
+    token = stbi__hdr_gettoken(&_context, buffer);
+    if (token[0] == 0) break;
+    if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+  }
+
+  if (!valid) {
+    pnmimage_cat.error() << "Unsupported HDR format.\n";
+    return false;
+  }
+
+  // Parse width and height
+  // can't use sscanf() if we're not using stdio!
+  token = stbi__hdr_gettoken(&_context, buffer);
+  if (strncmp(token, "-Y ", 3)) {
+    pnmimage_cat.error() << "Unsupported HDR data layout.\n";
+    return false;
+  }
+  token += 3;
+  height = (int) strtol(token, &token, 10);
+  while (*token == ' ') ++token;
+  if (strncmp(token, "+X ", 3)) {
+    pnmimage_cat.error() << "Unsupported HDR data layout.\n";
+    return false;
+  }
+  token += 3;
+  width = (int) strtol(token, NULL, 10);
+
+  // Reject nonsensical sizes before they are used to size and index the
+  // table.
+  if (width <= 0 || height <= 0) {
+    pnmimage_cat.error() << "Invalid HDR image dimensions.\n";
+    return false;
+  }
+
+  // Read data.  The table is temporarily taken out of the PfmFile so that
+  // the pixels can be decoded directly into it.
+  pfm.clear(width, height, 3);
+  vector_float table;
+  pfm.swap_table(table);
+
+  bool okay = false;
+  if (table.size() >= (size_t)width * (size_t)height * 3) {
+    float *hdr_data = (float *)&table[0];
+    okay = decode_hdr_pixels(&_context, hdr_data, width, height);
+  } else {
+    pnmimage_cat.error() << "Could not allocate table for HDR image.\n";
+  }
+
+  // Always hand the table back, so the PfmFile is left in a consistent state
+  // even if decoding failed partway through.
+  pfm.swap_table(table);
+  return okay;
+}
+
+/**
+ * Reads in an entire image all at once, storing it in the pre-allocated
+ * _x_size * _y_size array and alpha pointers.  (If the image type has no
+ * alpha channel, alpha is ignored.)  Returns the number of rows correctly
+ * read.
+ *
+ * Derived classes need not override this if they instead provide
+ * supports_read_row() and read_row(), below.
+ *
+ * The file is rewound and decoded in full by stb_image; the decoded bytes are
+ * then copied into the output arrays according to the channel count.  Returns
+ * 0 if the reader is invalid, the file cannot be rewound, decoding fails, or
+ * the decoded size does not fit the size reported when the file was opened.
+ */
+int StbImageReader::
+read_data(xel *array, xelval *alpha) {
+  if (!is_valid()) {
+    return 0;
+  }
+
+  // Reposition the file at the beginning.
+  _file->seekg(0, ios::beg);
+  if (_file->tellg() != 0) {
+    pnmimage_cat.error()
+      << "Could not reposition file pointer to the beginning.\n";
+    return 0;
+  }
+
+  stbi__start_callbacks(&_context, &io_callbacks, (void *)_file);
+
+  int cols = 0;
+  int rows = 0;
+  stbi_uc *data = stbi__load_main(&_context, &cols, &rows, NULL, _num_channels);
+
+  if (data == NULL) {
+    pnmimage_cat.error()
+      << "stbi_load failure: " << stbi_failure_reason() << "\n";
+    return 0;
+  }
+
+  // The output arrays were sized from the dimensions found when the file was
+  // opened.  Refuse to copy anything that would not fit, and make sure the
+  // decoded buffer is released on this path as well.
+  if (cols != _x_size || rows < 0 || rows > _y_size) {
+    pnmimage_cat.error()
+      << "stb_image decoded a " << cols << " by " << rows
+      << " image, but expected " << _x_size << " by " << _y_size << ".\n";
+    stbi_image_free(data);
+    return 0;
+  }
+
+  size_t pixels = (size_t)_x_size * (size_t)rows;
+  stbi_uc *ptr = data;
+  switch (_num_channels) {
+  case 1:
+    // Grayscale: replicate the single value into all three components.
+    for (size_t i = 0; i < pixels; ++i) {
+      PPM_ASSIGN(array[i], ptr[i], ptr[i], ptr[i]);
+    }
+    break;
+
+  case 2:
+    // Grayscale plus alpha.
+    for (size_t i = 0; i < pixels; ++i) {
+      PPM_ASSIGN(array[i], ptr[0], ptr[0], ptr[0]);
+      alpha[i] = ptr[1];
+      ptr += 2;
+    }
+    break;
+
+  case 3:
+    // RGB.
+    for (size_t i = 0; i < pixels; ++i) {
+      PPM_ASSIGN(array[i], ptr[0], ptr[1], ptr[2]);
+      ptr += 3;
+    }
+    break;
+
+  case 4:
+    // RGB plus alpha.
+    for (size_t i = 0; i < pixels; ++i) {
+      PPM_ASSIGN(array[i], ptr[0], ptr[1], ptr[2]);
+      alpha[i] = ptr[3];
+      ptr += 4;
+    }
+    break;
+  }
+
+  stbi_image_free(data);
+  return rows;
+}
+
+/**
+ * Registers the current object as something that can be read from a Bam file,
+ * by installing make_PNMFileTypeStbImage() as the factory function for this
+ * class's TypeHandle.
+ */
+void PNMFileTypeStbImage::
+register_with_read_factory() {
+  BamReader::get_factory()->
+    register_factory(get_class_type(), make_PNMFileTypeStbImage);
+}
+
+/**
+ * Factory callback invoked by the BamReader when an object of this type is
+ * encountered in a Bam file.
+ *
+ * PNMFileType objects are shared, so nothing is read from the file here;
+ * the object registered for this class's TypeHandle is simply looked up in
+ * the global PNMFileTypeRegistry and returned.
+ */
+TypedWritable *PNMFileTypeStbImage::
+make_PNMFileTypeStbImage(const FactoryParams &params) {
+  PNMFileTypeRegistry *registry = PNMFileTypeRegistry::get_global_ptr();
+  return registry->get_type_by_handle(get_class_type());
+}
+
+#endif  // HAVE_STB_IMAGE

+ 73 - 0
panda/src/pnmimagetypes/pnmFileTypeStbImage.h

@@ -0,0 +1,73 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file pnmFileTypeStbImage.h
+ * @author rdb
+ * @date 2016-03-31
+ */
+
+#ifndef PNMFILETYPESTBIMAGE_H
+#define PNMFILETYPESTBIMAGE_H
+
+#include "pandabase.h"
+
+#ifdef HAVE_STB_IMAGE
+
+#include "pnmFileType.h"
+#include "pnmReader.h"
+#include "pnmWriter.h"
+
+#include "stb_image.h"
+
+/**
+ * For reading images via the public domain stb_image.h library.  This is used
+ * when compiling without support for more specific libraries that are more
+ * full-featured, such as libpng or libjpeg.
+ *
+ * Only reading is provided; this class declares no writer.
+ */
+class EXPCL_PANDA_PNMIMAGETYPES PNMFileTypeStbImage : public PNMFileType {
+public:
+  PNMFileTypeStbImage();
+
+  virtual string get_name() const;
+
+  // The set of extensions is fixed at compile time; see the .cxx file.
+  virtual int get_num_extensions() const;
+  virtual string get_extension(int n) const;
+
+  virtual bool has_magic_number() const;
+  virtual bool matches_magic_number(const string &magic_number) const;
+
+  virtual PNMReader *make_reader(istream *file, bool owns_file = true,
+                                 const string &magic_number = string());
+
+public:
+  // Bam file support.
+  static void register_with_read_factory();
+
+protected:
+  static TypedWritable *make_PNMFileTypeStbImage(const FactoryParams &params);
+
+public:
+  // Run-time type registration.
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    PNMFileType::init_type();
+    register_type(_type_handle, "PNMFileTypeStbImage",
+                  PNMFileType::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#endif  // HAVE_STB_IMAGE
+
+#endif

+ 6755 - 0
panda/src/pnmimagetypes/stb_image.h

@@ -0,0 +1,6755 @@
+/* stb_image - v2.12 - public domain image loader - http://nothings.org/stb_image.h
+                                     no warranty implied; use at your own risk
+
+   Do this:
+      #define STB_IMAGE_IMPLEMENTATION
+   before you include this file in *one* C or C++ file to create the implementation.
+
+   // i.e. it should look like this:
+   #include ...
+   #include ...
+   #include ...
+   #define STB_IMAGE_IMPLEMENTATION
+   #include "stb_image.h"
+
+   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
+   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free
+
+
+   QUICK NOTES:
+      Primarily of interest to game developers and other people who can
+          avoid problematic images and only need the trivial interface
+
+      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
+      PNG 1/2/4/8-bit-per-channel (16 bpc not supported)
+
+      TGA (not sure what subset, if a subset)
+      BMP non-1bpp, non-RLE
+      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
+
+      GIF (*comp always reports as 4-channel)
+      HDR (radiance rgbE format)
+      PIC (Softimage PIC)
+      PNM (PPM and PGM binary only)
+
+      Animated GIF still needs a proper API, but here's one way to do it:
+          http://gist.github.com/urraka/685d9a6340b26b830d49
+
+      - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
+      - decode from arbitrary I/O callbacks
+      - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
+
+   Full documentation under "DOCUMENTATION" below.
+
+
+   Revision 2.00 release notes:
+
+      - Progressive JPEG is now supported.
+
+      - PPM and PGM binary formats are now supported, thanks to Ken Miller.
+
+      - x86 platforms now make use of SSE2 SIMD instructions for
+        JPEG decoding, and ARM platforms can use NEON SIMD if requested.
+        This work was done by Fabian "ryg" Giesen. SSE2 is used by
+        default, but NEON must be enabled explicitly; see docs.
+
+        With other JPEG optimizations included in this version, we see
+        2x speedup on a JPEG on an x86 machine, and a 1.5x speedup
+        on a JPEG on an ARM machine, relative to previous versions of this
+        library. The same results will not obtain for all JPGs and for all
+        x86/ARM machines. (Note that progressive JPEGs are significantly
+        slower to decode than regular JPEGs.) This doesn't mean that this
+        is the fastest JPEG decoder in the land; rather, it brings it
+        closer to parity with standard libraries. If you want the fastest
+        decode, look elsewhere. (See "Philosophy" section of docs below.)
+
+        See final bullet items below for more info on SIMD.
+
+      - Added STBI_MALLOC, STBI_REALLOC, and STBI_FREE macros for replacing
+        the memory allocator. Unlike other STBI libraries, these macros don't
+        support a context parameter, so if you need to pass a context in to
+        the allocator, you'll have to store it in a global or a thread-local
+        variable.
+
+      - Split existing STBI_NO_HDR flag into two flags, STBI_NO_HDR and
+        STBI_NO_LINEAR.
+            STBI_NO_HDR:     suppress implementation of .hdr reader format
+            STBI_NO_LINEAR:  suppress high-dynamic-range light-linear float API
+
+      - You can suppress implementation of any of the decoders to reduce
+        your code footprint by #defining one or more of the following
+        symbols before creating the implementation.
+
+            STBI_NO_JPEG
+            STBI_NO_PNG
+            STBI_NO_BMP
+            STBI_NO_PSD
+            STBI_NO_TGA
+            STBI_NO_GIF
+            STBI_NO_HDR
+            STBI_NO_PIC
+            STBI_NO_PNM   (.ppm and .pgm)
+
+      - You can request *only* certain decoders and suppress all other ones
+        (this will be more forward-compatible, as addition of new decoders
+        doesn't require you to disable them explicitly):
+
+            STBI_ONLY_JPEG
+            STBI_ONLY_PNG
+            STBI_ONLY_BMP
+            STBI_ONLY_PSD
+            STBI_ONLY_TGA
+            STBI_ONLY_GIF
+            STBI_ONLY_HDR
+            STBI_ONLY_PIC
+            STBI_ONLY_PNM   (.ppm and .pgm)
+
+         Note that you can define multiples of these, and you will get all
+         of them ("only x" and "only y" is interpreted to mean "only x&y").
+
+       - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
+         want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
+
+      - Compilation of all SIMD code can be suppressed with
+            #define STBI_NO_SIMD
+        It should not be necessary to disable SIMD unless you have issues
+        compiling (e.g. using an x86 compiler which doesn't support SSE
+        intrinsics or that doesn't support the method used to detect
+        SSE2 support at run-time), and even those can be reported as
+        bugs so I can refine the built-in compile-time checking to be
+        smarter.
+
+      - The old STBI_SIMD system which allowed installing a user-defined
+        IDCT etc. has been removed. If you need this, don't upgrade. My
+        assumption is that almost nobody was doing this, and those who
+        were will find the built-in SIMD more satisfactory anyway.
+
+      - RGB values computed for JPEG images are slightly different from
+        previous versions of stb_image. (This is due to using less
+        integer precision in SIMD.) The C code has been adjusted so
+        that the same RGB values will be computed regardless of whether
+        SIMD support is available, so your app should always produce
+        consistent results. But these results are slightly different from
+        previous versions. (Specifically, about 3% of available YCbCr values
+        will compute different RGB results from pre-1.49 versions by +-1;
+        most of the deviating values are one smaller in the G channel.)
+
+      - If you must produce consistent results with previous versions of
+        stb_image, #define STBI_JPEG_OLD and you will get the same results
+        you used to; however, you will not get the SIMD speedups for
+        the YCbCr-to-RGB conversion step (although you should still see
+        significant JPEG speedup from the other changes).
+
+        Please note that STBI_JPEG_OLD is a temporary feature; it will be
+        removed in future versions of the library. It is only intended for
+        near-term back-compatibility use.
+
+
+   Latest revision history:
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
+                         RGB-format JPEG; remove white matting in PSD;
+                         allocate large structures on the stack; 
+                         correct channel count for PNG & BMP
+      2.10  (2016-01-22) avoid warning introduced in 2.09
+      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) partial animated GIF support
+                         limited 16-bit PSD support
+                         minor bugs, code cleanup, and compiler warnings
+      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
+      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
+      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
+      2.03  (2015-04-12) additional corruption checking
+                         stbi_set_flip_vertically_on_load
+                         fix NEON support; fix mingw support
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPEG, including x86 SSE2 & ARM NEON SIMD
+                         progressive JPEG
+                         PGM/PPM support
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         STBI_NO_*, STBI_ONLY_*
+                         GIF bugfix
+
+   See end of file for full revision history.
+
+
+ ============================    Contributors    =========================
+
+ Image formats                          Extensions, features
+    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
+    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
+    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
+    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
+    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
+    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
+    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
+    urraka@github (animated gif)           Junggon Kim (PNM comments)
+                                           Daniel Gibson (16-bit TGA)
+
+ Optimizations & bugfixes
+    Fabian "ryg" Giesen
+    Arseny Kapoulkine
+
+ Bug & warning fixes
+    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
+    Christpher Lloyd        Martin Golini      Jerry Jansson      Joseph Thomson
+    Dave Moore              Roy Eltham         Hayaki Saito       Phil Jordan
+    Won Chun                Luke Graham        Johan Duparc       Nathan Reed
+    the Horde3D community   Thomas Ruf         Ronny Chevalier    Nick Verigakis
+    Janez Zemva             John Bartholomew   Michal Cichon      svdijk@github
+    Jonathan Blow           Ken Hamada         Tero Hanninen      Baldur Karlsson
+    Laurent Gomila          Cort Stratton      Sergio Gonzalez    romigrou@github
+    Aruelien Pocheville     Thibault Reuille   Cass Everitt       Matthew Gregan
+    Ryamond Barbiero        Paul Du Bois       Engin Manap        snagar@github
+    Michaelangel007@github  Oriol Ferrer Mesia socks-the-fox
+    Blazej Dariusz Roszkowski
+
+
+LICENSE
+
+This software is dual-licensed to the public domain and under the following
+license: you are granted a perpetual, irrevocable license to copy, modify,
+publish, and distribute this file as you see fit.
+
+*/
+
+#ifndef STBI_INCLUDE_STB_IMAGE_H
+#define STBI_INCLUDE_STB_IMAGE_H
+
+// DOCUMENTATION
+//
+// Limitations:
+//    - no 16-bit-per-channel PNG
+//    - no 12-bit-per-channel JPEG
+//    - no JPEGs with arithmetic coding
+//    - no 1-bit BMP
+//    - GIF always returns *comp=4
+//
+// Basic usage (see HDR discussion below for HDR usage):
+//    int x,y,n;
+//    unsigned char *data = stbi_load(filename, &x, &y, &n, 0);
+//    // ... process data if not NULL ...
+//    // ... x = width, y = height, n = # 8-bit components per pixel ...
+//    // ... replace '0' with '1'..'4' to force that many components per pixel
+//    // ... but 'n' will always be the number that it would have been if you said 0
+//    stbi_image_free(data)
+//
+// Standard parameters:
+//    int *x       -- outputs image width in pixels
+//    int *y       -- outputs image height in pixels
+//    int *comp    -- outputs # of image components in image file
+//    int req_comp -- if non-zero, # of image components requested in result
+//
+// The return value from an image loader is an 'unsigned char *' which points
+// to the pixel data, or NULL on an allocation failure or if the image is
+// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
+// with each pixel consisting of N interleaved 8-bit components; the first
+// pixel pointed to is top-left-most in the image. There is no padding between
+// image scanlines or between pixels, regardless of format. The number of
+// components N is 'req_comp' if req_comp is non-zero, or *comp otherwise.
+// If req_comp is non-zero, *comp has the number of components that _would_
+// have been output otherwise. E.g. if you set req_comp to 4, you will always
+// get RGBA output, but you can check *comp to see if it's trivially opaque
+// because e.g. there were only 3 channels in the source image.
+//
+// An output image with N components has the following components interleaved
+// in this order in each pixel:
+//
+//     N=#comp     components
+//       1           grey
+//       2           grey, alpha
+//       3           red, green, blue
+//       4           red, green, blue, alpha
+//
+// If image loading fails for any reason, the return value will be NULL,
+// and *x, *y, *comp will be unchanged. The function stbi_failure_reason()
+// can be queried for an extremely brief, end-user unfriendly explanation
+// of why the load failed. Define STBI_NO_FAILURE_STRINGS to avoid
+// compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
+// more user-friendly ones.
+//
+// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
+//
+// ===========================================================================
+//
+// Philosophy
+//
+// stb libraries are designed with the following priorities:
+//
+//    1. easy to use
+//    2. easy to maintain
+//    3. good performance
+//
+// Sometimes I let "good performance" creep up in priority over "easy to maintain",
+// and for best performance I may provide less-easy-to-use APIs that give higher
+// performance, in addition to the easy to use ones. Nevertheless, it's important
+// to keep in mind that from the standpoint of you, a client of this library,
+// all you care about is #1 and #3, and stb libraries do not emphasize #3 above all.
+//
+// Some secondary priorities arise directly from the first two, some of which
+// make more explicit reasons why performance can't be emphasized.
+//
+//    - Portable ("ease of use")
+//    - Small footprint ("easy to maintain")
+//    - No dependencies ("ease of use")
+//
+// ===========================================================================
+//
+// I/O callbacks
+//
+// I/O callbacks allow you to read from arbitrary sources, like packaged
+// files or some other source. Data read from callbacks are processed
+// through a small internal buffer (currently 128 bytes) to try to reduce
+// overhead.
+//
+// The three functions you must define are "read" (reads some bytes of data),
+// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
+//
+// ===========================================================================
+//
+// SIMD support
+//
+// The JPEG decoder will try to automatically use SIMD kernels on x86 when
+// supported by the compiler. For ARM Neon support, you must explicitly
+// request it.
+//
+// (The old do-it-yourself SIMD API is no longer supported in the current
+// code.)
+//
+// On x86, SSE2 will automatically be used when available based on a run-time
+// test; if not, the generic C versions are used as a fall-back. On ARM targets,
+// the typical path is to have separate builds for NEON and non-NEON devices
+// (at least this is true for iOS and Android). Therefore, the NEON support is
+// toggled by a build flag: define STBI_NEON to get NEON loops.
+//
+// The output of the JPEG decoder is slightly different from versions where
+// SIMD support was introduced (that is, for versions before 1.49). The
+// difference is only +-1 in the 8-bit RGB channels, and only on a small
+// fraction of pixels. You can force the pre-1.49 behavior by defining
+// STBI_JPEG_OLD, but this will disable some of the SIMD decoding path
+// and hence cost some performance.
+//
+// If for some reason you do not want to use any of SIMD code, or if
+// you have issues compiling it, you can disable it entirely by
+// defining STBI_NO_SIMD.
+//
+// ===========================================================================
+//
+// HDR image support   (disable by defining STBI_NO_HDR)
+//
+// stb_image now supports loading HDR images in general, and currently
+// the Radiance .HDR file format, although the support is provided
+// generically. You can still load any file through the existing interface;
+// if you attempt to load an HDR file, it will be automatically remapped to
+// LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
+// both of these constants can be reconfigured through this interface:
+//
+//     stbi_hdr_to_ldr_gamma(2.2f);
+//     stbi_hdr_to_ldr_scale(1.0f);
+//
+// (note, do not use _inverse_ constants; stbi_image will invert them
+// appropriately).
+//
+// Additionally, there is a new, parallel interface for loading files as
+// (linear) floats to preserve the full dynamic range:
+//
+//    float *data = stbi_loadf(filename, &x, &y, &n, 0);
+//
+// If you load LDR images through this interface, those images will
+// be promoted to floating point values, run through the inverse of
+// constants corresponding to the above:
+//
+//     stbi_ldr_to_hdr_scale(1.0f);
+//     stbi_ldr_to_hdr_gamma(2.2f);
+//
+// Finally, given a filename (or an open file or memory block--see header
+// file for details) containing image data, you can query for the "most
+// appropriate" interface to use (that is, whether the image is HDR or
+// not), using:
+//
+//     stbi_is_hdr(char const *filename);
+//
+// ===========================================================================
+//
+// iPhone PNG support:
+//
+// By default we convert iphone-formatted PNGs back to RGB, even though
+// they are internally encoded differently. You can disable this conversion
+// by calling stbi_convert_iphone_png_to_rgb(0), in which case
+// you will always just get the native iphone "format" through (which
+// is BGR stored in RGB).
+//
+// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
+// pixel to remove any premultiplied alpha *only* if the image file explicitly
+// says there's premultiplied data (currently only happens in iPhone images,
+// and only if iPhone convert-to-rgb processing is on).
+//
+
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif // STBI_NO_STDIO
+
+#define STBI_VERSION 1
+
+// Component-count constants for the req_comp / *comp parameters.  Each value
+// equals the number of interleaved components per pixel, in the order listed
+// in the "N=#comp" table in the documentation above.
+enum
+{
+   STBI_default = 0, // only used for req_comp
+
+   STBI_grey       = 1,
+   STBI_grey_alpha = 2,
+   STBI_rgb        = 3,
+   STBI_rgb_alpha  = 4
+};
+
+typedef unsigned char stbi_uc;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef STB_IMAGE_STATIC
+#define STBIDEF static
+#else
+#define STBIDEF extern
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PRIMARY API - works on images of any type
+//
+
+//
+// load image by filename, open file, or memory buffer
+//
+
+// Callback table accepted by the *_from_callbacks entry points.  Every
+// callback receives the opaque 'user' pointer that was handed to the entry
+// point alongside this table.
+typedef struct
+{
+   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
+} stbi_io_callbacks;
+
+STBIDEF stbi_uc *stbi_load               (char              const *filename,           int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *comp, int req_comp);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f,                  int *x, int *y, int *comp, int req_comp);
+// for stbi_load_from_file, file pointer is left pointing immediately after image
+#endif
+
+#ifndef STBI_NO_LINEAR
+   STBIDEF float *stbi_loadf                 (char const *filename,           int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp);
+
+   #ifndef STBI_NO_STDIO
+   STBIDEF float *stbi_loadf_from_file  (FILE *f,                int *x, int *y, int *comp, int req_comp);
+   #endif
+#endif
+
+#ifndef STBI_NO_HDR
+   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
+   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_LINEAR
+   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
+   STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
+#endif // STBI_NO_LINEAR
+
+// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
+STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
+STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_is_hdr          (char const *filename);
+STBIDEF int      stbi_is_hdr_from_file(FILE *f);
+#endif // STBI_NO_STDIO
+
+
+// get a VERY brief reason for failure
+// NOT THREADSAFE
+STBIDEF const char *stbi_failure_reason  (void);
+
+// free the loaded image -- this is just free()
+STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
+
+// get image dimensions & components without fully decoding
+STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
+
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_info            (char const *filename,     int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_file  (FILE *f,                  int *x, int *y, int *comp);
+
+#endif
+
+
+
+// for image formats that explicitly notate that they have premultiplied alpha,
+// we just return the colors as stored in the file. set this flag to force
+// unpremultiplication. results are undefined if the unpremultiply overflows.
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
+
+// indicate whether we should process iphone images back to canonical format,
+// or just pass them through "as-is"
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
+
+// flip the image vertically, so the first pixel in the output array is the bottom left
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
+
+// ZLIB client - used by PNG, available for other purposes
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header);
+STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+//
+//
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBI_INCLUDE_STB_IMAGE_H
+
+#ifdef STB_IMAGE_IMPLEMENTATION
+
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
+  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
+  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
+  || defined(STBI_ONLY_ZLIB)
+   #ifndef STBI_ONLY_JPEG
+   #define STBI_NO_JPEG
+   #endif
+   #ifndef STBI_ONLY_PNG
+   #define STBI_NO_PNG
+   #endif
+   #ifndef STBI_ONLY_BMP
+   #define STBI_NO_BMP
+   #endif
+   #ifndef STBI_ONLY_PSD
+   #define STBI_NO_PSD
+   #endif
+   #ifndef STBI_ONLY_TGA
+   #define STBI_NO_TGA
+   #endif
+   #ifndef STBI_ONLY_GIF
+   #define STBI_NO_GIF
+   #endif
+   #ifndef STBI_ONLY_HDR
+   #define STBI_NO_HDR
+   #endif
+   #ifndef STBI_ONLY_PIC
+   #define STBI_NO_PIC
+   #endif
+   #ifndef STBI_ONLY_PNM
+   #define STBI_NO_PNM
+   #endif
+#endif
+
+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
+#define STBI_NO_ZLIB
+#endif
+
+
+#include <stdarg.h>
+#include <stddef.h> // ptrdiff_t on osx
+#include <stdlib.h>
+#include <string.h>
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#include <math.h>  // ldexp
+#endif
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif
+
+#ifndef STBI_ASSERT
+#include <assert.h>
+#define STBI_ASSERT(x) assert(x)
+#endif
+
+
+#ifndef _MSC_VER
+   #ifdef __cplusplus
+   #define stbi_inline inline
+   #else
+   #define stbi_inline
+   #endif
+#else
+   #define stbi_inline __forceinline
+#endif
+
+
+#ifdef _MSC_VER
+typedef unsigned short stbi__uint16;
+typedef   signed short stbi__int16;
+typedef unsigned int   stbi__uint32;
+typedef   signed int   stbi__int32;
+#else
+#include <stdint.h>
+typedef uint16_t stbi__uint16;
+typedef int16_t  stbi__int16;
+typedef uint32_t stbi__uint32;
+typedef int32_t  stbi__int32;
+#endif
+
+// should produce compiler error if size is wrong
+typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
+
+#ifdef _MSC_VER
+#define STBI_NOTUSED(v)  (void)(v)
+#else
+#define STBI_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#ifdef _MSC_VER
+#define STBI_HAS_LROTL
+#endif
+
+#ifdef STBI_HAS_LROTL
+   #define stbi_lrot(x,y)  _lrotl(x,y)
+#else
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
+#endif
+
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
+// ok
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
+#endif
+
+#ifndef STBI_MALLOC
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
+#endif
+
+// x86/x64 detection
+#if defined(__x86_64__) || defined(_M_X64)
+#define STBI__X64_TARGET
+#elif defined(__i386) || defined(_M_IX86)
+#define STBI__X86_TARGET
+#endif
+
+#if defined(__GNUC__) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+// NOTE: it is not clear whether we actually need this for the 64-bit path.
+// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
+// (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
+// this is just broken and gcc are jerks for not fixing it properly
+// http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
+#define STBI_NO_SIMD
+#endif
+
+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
+// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
+//
+// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
+// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
+// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
+// simultaneously enabling "-mstackrealign".
+//
+// See https://github.com/nothings/stb/issues/81 for more information.
+//
+// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
+// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
+#define STBI_NO_SIMD
+#endif
+
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
+#define STBI_SSE2
+#include <emmintrin.h>
+
+#ifdef _MSC_VER
+
+#if _MSC_VER >= 1400  // not VC6
+#include <intrin.h> // __cpuid
+// Runs CPUID with function id 1 through the MSVC __cpuid intrinsic and returns
+// the fourth output word (info[3]); stbi__sse2_available() tests bit 26 of it.
+// NOTE(review): per the MSVC documentation info[] is {EAX, EBX, ECX, EDX}, so
+// this should be EDX, matching the inline-asm variant below -- confirm.
+static int stbi__cpuid3(void)
+{
+   int info[4];
+   __cpuid(info,1);
+   return info[3];
+}
+#else
+// Variant for _MSC_VER < 1400, where <intrin.h> is not included: issues CPUID
+// with EAX=1 through inline assembly and returns the resulting EDX value.
+static int stbi__cpuid3(void)
+{
+   int res;
+   __asm {
+      mov  eax,1
+      cpuid
+      mov  res,edx
+   }
+   return res;
+}
+#endif
+
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+
+// Run-time SSE2 check (MSVC build): tests bit 26 of the value returned by
+// stbi__cpuid3().  Returns 1 when the bit is set and 0 otherwise.
+// Declared with an explicit (void) parameter list so that it is a proper
+// prototype in C as well, consistent with stbi__cpuid3(void).
+static int stbi__sse2_available(void)
+{
+   int info3 = stbi__cpuid3();
+   return ((info3 >> 26) & 1) != 0;
+}
+#else // assume GCC-style if not VC++
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+
+// Run-time SSE2 check for non-MSVC compilers.  Returns nonzero when SSE2 may
+// be used.  Compilers that do not report a GNUC version of at least 4.8 take
+// the fallback branch and therefore never enable the SSE2 kernels.
+// Declared with an explicit (void) parameter list so that it is a proper
+// prototype in C as well, consistent with stbi__cpuid3(void).
+static int stbi__sse2_available(void)
+{
+#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 // GCC 4.8 or later
+   // GCC 4.8+ has a nice way to do this
+   return __builtin_cpu_supports("sse2");
+#else
+   // portable way to do this, preferably without using GCC inline ASM?
+   // just bail for now.
+   return 0;
+#endif
+}
+#endif
+#endif
+
+// ARM NEON
+#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
+#undef STBI_NEON
+#endif
+
+#ifdef STBI_NEON
+#include <arm_neon.h>
+// assume GCC or Clang on ARM targets
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+
+#ifndef STBI_SIMD_ALIGN
+#define STBI_SIMD_ALIGN(type, name) type name
+#endif
+
+///////////////////////////////////////////////
+//
+//  stbi__context struct and start_xxx functions
+
+// stbi__context structure is our basic context used by all images, so it
+// contains all the IO context, plus some basic image information
+typedef struct
+{
+   // NOTE(review): presumably the image width/height and component counts
+   // filled in by the format decoders; nothing in this part of the file
+   // writes them -- confirm.
+   stbi__uint32 img_x, img_y;
+   int img_n, img_out_n;
+
+   // callback table and opaque user pointer, copied in by stbi__start_callbacks
+   stbi_io_callbacks io;
+   void *io_user_data;
+
+   // 0 when set up by stbi__start_mem, 1 when set up by stbi__start_callbacks
+   int read_from_callbacks;
+   // set to sizeof(buffer_start) by stbi__start_callbacks
+   int buflen;
+   // scratch buffer that callback-based contexts point their read window at
+   stbi_uc buffer_start[128];
+
+   // current read window, followed by the initial window saved by the start
+   // functions so that stbi__rewind can restore it
+   stbi_uc *img_buffer, *img_buffer_end;
+   stbi_uc *img_buffer_original, *img_buffer_original_end;
+} stbi__context;
+
+
+static void stbi__refill_buffer(stbi__context *s);
+
+// set up a context that decodes directly out of a caller-supplied memory block
+// of 'len' bytes; no callbacks are involved, so the read callback is cleared
+static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
+{
+   stbi_uc *first    = (stbi_uc *) buffer;
+   stbi_uc *past_end = first + len;
+
+   s->io.read = NULL;
+   s->read_from_callbacks = 0;
+
+   // the current window and the saved "original" window both span the block
+   s->img_buffer_original     = first;
+   s->img_buffer              = first;
+   s->img_buffer_original_end = past_end;
+   s->img_buffer_end          = past_end;
+}
+
+// initialize a callback-based context
+// Copies the callback table and user pointer into the context, points the
+// saved "original" window at the context's own buffer_start scratch array,
+// and performs an initial stbi__refill_buffer() call.
+static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
+{
+   s->io = *c;
+   s->io_user_data = user;
+   s->buflen = sizeof(s->buffer_start);
+   s->read_from_callbacks = 1;
+   s->img_buffer_original = s->buffer_start;
+   // NOTE(review): stbi__refill_buffer is defined outside this view; it is
+   // presumably what sets img_buffer/img_buffer_end, since the next line reads
+   // img_buffer_end without this function having assigned it.
+   stbi__refill_buffer(s);
+   s->img_buffer_original_end = s->img_buffer_end;
+}
+
+#ifndef STBI_NO_STDIO
+
+// 'read' callback for FILE-backed decoding: 'user' is the FILE*.  Reads up to
+// 'size' bytes into 'data' and returns the number of bytes fread() delivered.
+static int stbi__stdio_read(void *user, char *data, int size)
+{
+   FILE *stream = (FILE*) user;
+   size_t got = fread(data, 1, size, stream);
+   return (int) got;
+}
+
+// 'skip' callback for FILE-backed decoding: moves the file position by 'n'
+// bytes relative to the current position (negative 'n' moves backwards).
+// The fseek() result is not checked.
+static void stbi__stdio_skip(void *user, int n)
+{
+   FILE *stream = (FILE*) user;
+   fseek(stream, n, SEEK_CUR);
+}
+
+// 'eof' callback for FILE-backed decoding: 'user' is the FILE*.
+// Returns nonzero once the stream can yield no more data, i.e. when either
+// the end-of-file indicator or the error indicator is set.  A failed read
+// sets only the error indicator, so testing feof() alone would keep reporting
+// "more data available" on a broken stream.
+static int stbi__stdio_eof(void *user)
+{
+   FILE *stream = (FILE*) user;
+   return feof(stream) || ferror(stream);
+}
+
+// Callback table that adapts a stdio FILE* to the stbi_io_callbacks
+// interface; the entries follow the struct's declaration order
+// (read, skip, eof).
+static stbi_io_callbacks stbi__stdio_callbacks =
+{
+   stbi__stdio_read,
+   stbi__stdio_skip,
+   stbi__stdio_eof,
+};
+
+// initialize a context that reads from an open FILE*, by routing it through
+// the callback-based setup with the stdio callback table
+static void stbi__start_file(stbi__context *s, FILE *f)
+{
+   void *handle = (void *) f;
+   stbi__start_callbacks(s, &stbi__stdio_callbacks, handle);
+}
+
+//static void stop_file(stbi__context *s) { }
+
+#endif // !STBI_NO_STDIO
+
+static void stbi__rewind(stbi__context *s)
+{
+   // A true rewind would go back to the start of the stream.  Restoring the
+   // read window that the start functions saved is sufficient here, because
+   // this is only used after a format 'test', and a test never looks at more
+   // than 92 bytes.
+   s->img_buffer_end = s->img_buffer_original_end;
+   s->img_buffer     = s->img_buffer_original;
+}
+
+#ifndef STBI_NO_JPEG
+static int      stbi__jpeg_test(stbi__context *s);
+static stbi_uc *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNG
+static int      stbi__png_test(stbi__context *s);
+static stbi_uc *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_BMP
+static int      stbi__bmp_test(stbi__context *s);
+static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_TGA
+static int      stbi__tga_test(stbi__context *s);
+static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PSD
+static int      stbi__psd_test(stbi__context *s);
+static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_HDR
+static int      stbi__hdr_test(stbi__context *s);
+static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PIC
+static int      stbi__pic_test(stbi__context *s);
+static stbi_uc *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_GIF
+static int      stbi__gif_test(stbi__context *s);
+static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNM
+static int      stbi__pnm_test(stbi__context *s);
+static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+// this is not threadsafe
+static const char *stbi__g_failure_reason;
+
+STBIDEF const char *stbi_failure_reason(void)
+{
+   return stbi__g_failure_reason;
+}
+
+// Record `str` as the current failure reason (readable through
+// stbi_failure_reason) and return 0, so callers can write
+// `return stbi__err(...)` as their failure exit.
+static int stbi__err(const char *str)
+{
+   stbi__g_failure_reason = str;
+   return 0;
+}
+
+// Allocate `size` bytes through the configurable STBI_MALLOC hook.
+static void *stbi__malloc(size_t size)
+{
+    void *block = STBI_MALLOC(size);
+    return block;
+}
+
+// stbi__err - error
+// stbi__errpf - error returning pointer to float
+// stbi__errpuc - error returning pointer to unsigned char
+//
+// From here on stbi__err is a two-argument macro that shadows the
+// one-argument function above. Call sites pass two messages (x, y) and the
+// build configuration selects which one, if any, is recorded.
+
+#ifdef STBI_NO_FAILURE_STRINGS
+   // no message is stored at all; the expression is just the constant 0
+   #define stbi__err(x,y)  0
+#elif defined(STBI_FAILURE_USERMSG)
+   // record the second message
+   #define stbi__err(x,y)  stbi__err(y)
+#else
+   // record the first message
+   #define stbi__err(x,y)  stbi__err(x)
+#endif
+
+// Both arms of the conditional are NULL: stbi__err is evaluated only for its
+// side effect and the whole expression is a NULL pointer of the wanted type.
+#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
+
+// Release a pixel buffer previously returned by one of the stbi_load*
+// functions, using the configurable STBI_FREE hook.
+STBIDEF void stbi_image_free(void *retval_from_stbi_load)
+{
+   STBI_FREE(retval_from_stbi_load);
+}
+
+#ifndef STBI_NO_LINEAR
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
+#endif
+
+#ifndef STBI_NO_HDR
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
+#endif
+
+// Global switch read by stbi__load_flip and stbi__float_postprocess: when
+// nonzero, decoded images have their rows swapped top-to-bottom.
+static int stbi__vertically_flip_on_load = 0;
+
+// Set the global vertical-flip switch; it affects subsequent loads.
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
+{
+    stbi__vertically_flip_on_load = flag_true_if_should_flip;
+}
+
+// Probe the stream against each compiled-in decoder in turn and decode with
+// the first one whose test accepts it.
+//   s          input context
+//   x, y, comp out: width, height and channel count reported by the decoder
+//   req_comp   requested channel count, or 0 to keep the file's own
+// Returns the decoded 8-bit pixel buffer, or NULL on failure (the reason is
+// recorded through stbi__errpuc when no decoder matches).
+static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PNG
+   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp);
+   #endif
+
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp);
+      // A failed HDR decode must not reach the converter: it would read
+      // through the NULL buffer, using *x / *y / *comp values the decoder
+      // may never have written.
+      if (hdr == NULL) return NULL;
+      return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
+   }
+   #endif
+
+   #ifndef STBI_NO_TGA
+   // test tga last because it's a crappy test!
+   if (stbi__tga_test(s))
+      return stbi__tga_load(s,x,y,comp,req_comp);
+   #endif
+
+   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
+}
+
+// Decode via stbi__load_main and, when the global flip switch is set, swap
+// the image's rows top-to-bottom in place before returning it.
+static unsigned char *stbi__load_flip(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *result = stbi__load_main(s, x, y, comp, req_comp);
+
+   if (stbi__vertically_flip_on_load && result != NULL) {
+      int width = *x, height = *y;
+      // channels actually present in the returned buffer
+      int channels = req_comp ? req_comp : *comp;
+      int top, bottom, col, ch;
+
+      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
+      // walk the upper half; `bottom` is the mirror row of `top`
+      for (top = 0, bottom = height - 1; top < (height>>1); ++top, --bottom) {
+         for (col = 0; col < width; ++col) {
+            for (ch = 0; ch < channels; ++ch) {
+               int upper = (top * width + col) * channels + ch;
+               int lower = (bottom * width + col) * channels + ch;
+               stbi_uc saved = result[upper];
+               result[upper] = result[lower];
+               result[lower] = saved;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+#ifndef STBI_NO_HDR
+// Float counterpart of the flip done in stbi__load_flip: when the global
+// flip switch is set, swap the rows of `result` top-to-bottom in place.
+static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
+{
+   if (stbi__vertically_flip_on_load && result != NULL) {
+      int width = *x, height = *y;
+      // channels actually present in the buffer
+      int channels = req_comp ? req_comp : *comp;
+      int top, bottom, col, ch;
+
+      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
+      // walk the upper half; `bottom` is the mirror row of `top`
+      for (top = 0, bottom = height - 1; top < (height>>1); ++top, --bottom) {
+         for (col = 0; col < width; ++col) {
+            for (ch = 0; ch < channels; ++ch) {
+               int upper = (top * width + col) * channels + ch;
+               int lower = (bottom * width + col) * channels + ch;
+               float saved = result[upper];
+               result[upper] = result[lower];
+               result[lower] = saved;
+            }
+         }
+      }
+   }
+}
+#endif
+
+#ifndef STBI_NO_STDIO
+
+// Open `filename` with `mode`, using fopen_s on MSVC 2005+ and plain fopen
+// elsewhere. Returns a null FILE* when the open fails.
+static FILE *stbi__fopen(char const *filename, char const *mode)
+{
+   FILE *handle;
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+   if (fopen_s(&handle, filename, mode) != 0)
+      handle = 0;
+#else
+   handle = fopen(filename, mode);
+#endif
+   return handle;
+}
+
+
+// Open `filename`, decode it as 8-bit pixels and close the file again.
+// Returns NULL (with a failure reason recorded) if the file cannot be
+// opened; otherwise returns whatever stbi_load_from_file produced.
+STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+   if (fp == NULL) return stbi__errpuc("can't fopen", "Unable to open file");
+   pixels = stbi_load_from_file(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
+// Decode 8-bit pixels from an already-open stream. The stream is left open;
+// on success it is repositioned just past the bytes the decoder consumed.
+STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   unsigned char *pixels;
+   stbi__start_file(&ctx,f);
+   pixels = stbi__load_flip(&ctx,x,y,comp,req_comp);
+   if (pixels != NULL) {
+      // need to 'unget' all the characters still sitting in the IO buffer
+      int unread = (int) (ctx.img_buffer_end - ctx.img_buffer);
+      fseek(f, -unread, SEEK_CUR);
+   }
+   return pixels;
+}
+#endif //!STBI_NO_STDIO
+
+// Decode 8-bit pixels from the `len`-byte in-memory image at `buffer`.
+STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   unsigned char *pixels;
+   stbi__start_mem(&ctx,buffer,len);
+   pixels = stbi__load_flip(&ctx,x,y,comp,req_comp);
+   return pixels;
+}
+
+// Decode 8-bit pixels from a stream accessed through the callbacks in
+// `clbk`; `user` is passed back to each callback.
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   unsigned char *pixels;
+   // the cast drops const; stbi__start_callbacks copies the table by value
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) clbk, user);
+   pixels = stbi__load_flip(&ctx,x,y,comp,req_comp);
+   return pixels;
+}
+
+#ifndef STBI_NO_LINEAR
+// Decode an image as floats: HDR files are decoded directly, everything
+// else is decoded as 8-bit and converted with stbi__ldr_to_hdr.
+static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *ldr;
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      float *hdr = stbi__hdr_load(s,x,y,comp,req_comp);
+      // the optional vertical flip only applies to a successful decode
+      if (hdr != NULL)
+         stbi__float_postprocess(hdr,x,y,comp,req_comp);
+      return hdr;
+   }
+   #endif
+   ldr = stbi__load_flip(s, x, y, comp, req_comp);
+   if (ldr == NULL)
+      return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
+   return stbi__ldr_to_hdr(ldr, *x, *y, req_comp ? req_comp : *comp);
+}
+
+// Decode float pixels from the `len`-byte in-memory image at `buffer`.
+STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   float *pixels;
+   stbi__start_mem(&ctx,buffer,len);
+   pixels = stbi__loadf_main(&ctx,x,y,comp,req_comp);
+   return pixels;
+}
+
+// Decode float pixels from a stream accessed through the callbacks in
+// `clbk`; `user` is passed back to each callback.
+STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   float *pixels;
+   // the cast drops const; stbi__start_callbacks copies the table by value
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) clbk, user);
+   pixels = stbi__loadf_main(&ctx,x,y,comp,req_comp);
+   return pixels;
+}
+
+#ifndef STBI_NO_STDIO
+// Open `filename`, decode it as float pixels and close the file again.
+// Returns NULL (with a failure reason recorded) if the file cannot be opened.
+STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   float *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+   if (fp == NULL) return stbi__errpf("can't fopen", "Unable to open file");
+   pixels = stbi_loadf_from_file(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
+// Decode float pixels from an already-open stream; the stream is not closed.
+STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   float *pixels;
+   stbi__start_file(&ctx,f);
+   pixels = stbi__loadf_main(&ctx,x,y,comp,req_comp);
+   return pixels;
+}
+#endif // !STBI_NO_STDIO
+
+#endif // !STBI_NO_LINEAR
+
+// the is-hdr-or-not queries below are always defined, whatever the build
+// configuration, for API simplicity; if STBI_NO_HDR is defined, they always
+// report false!
+
+// Report whether the in-memory image passes the HDR format test.
+// Always 0 when HDR support is compiled out.
+STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context ctx;
+   int is_hdr;
+   stbi__start_mem(&ctx,buffer,len);
+   is_hdr = stbi__hdr_test(&ctx);
+   return is_hdr;
+   #else
+   STBI_NOTUSED(buffer);
+   STBI_NOTUSED(len);
+   return 0;
+   #endif
+}
+
+#ifndef STBI_NO_STDIO
+// Report whether the file at `filename` passes the HDR format test.
+// Returns 0 when the file cannot be opened.
+STBIDEF int      stbi_is_hdr          (char const *filename)
+{
+   int is_hdr;
+   FILE *fp = stbi__fopen(filename, "rb");
+   if (!fp)
+      return 0;
+   is_hdr = stbi_is_hdr_from_file(fp);
+   fclose(fp);
+   return is_hdr;
+}
+
+// Report whether the open stream `f` passes the HDR format test.
+// Always 0 when HDR support is compiled out.
+STBIDEF int      stbi_is_hdr_from_file(FILE *f)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context ctx;
+   int is_hdr;
+   stbi__start_file(&ctx,f);
+   is_hdr = stbi__hdr_test(&ctx);
+   return is_hdr;
+   #else
+   STBI_NOTUSED(f);
+   return 0;
+   #endif
+}
+#endif // !STBI_NO_STDIO
+
+// Report whether the callback-backed stream passes the HDR format test.
+// Always 0 when HDR support is compiled out.
+STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context ctx;
+   int is_hdr;
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) clbk, user);
+   is_hdr = stbi__hdr_test(&ctx);
+   return is_hdr;
+   #else
+   STBI_NOTUSED(clbk);
+   STBI_NOTUSED(user);
+   return 0;
+   #endif
+}
+
+#ifndef STBI_NO_LINEAR
+static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
+
+STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
+STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
+#endif
+
+static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+
+STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
+STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Common code used by all image loaders
+//
+
+// Scan modes shared by the image loaders; their consumers are outside this
+// part of the file.
+// NOTE(review): presumably these select how much of a file a decoder
+// processes (whole image / type probe / header only) -- confirm at the use sites.
+enum
+{
+   STBI__SCAN_load=0,
+   STBI__SCAN_type,
+   STBI__SCAN_header
+};
+
+// Refill the context's staging buffer from the read callback and reset the
+// read cursor to its start. When the callback delivers nothing, the context
+// is switched to "memory" mode with a single zero byte so later reads stay
+// inside valid storage.
+static void stbi__refill_buffer(stbi__context *s)
+{
+   int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
+   // A negative count (a callback signalling an error) is treated like end
+   // of file; otherwise img_buffer_end would land before img_buffer.
+   if (n <= 0) {
+      // at end of file, treat same as if from memory, but need to handle case
+      // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
+      s->read_from_callbacks = 0;
+      s->img_buffer = s->buffer_start;
+      s->img_buffer_end = s->buffer_start+1;
+      *s->img_buffer = 0;
+   } else {
+      s->img_buffer = s->buffer_start;
+      s->img_buffer_end = s->buffer_start + n;
+   }
+}
+
+// Return the next input byte, refilling from the callbacks when the buffer
+// is exhausted. Yields 0 once a memory-backed context has run dry.
+stbi_inline static stbi_uc stbi__get8(stbi__context *s)
+{
+   if (s->img_buffer >= s->img_buffer_end) {
+      // buffer is empty: with no callbacks left to ask there is no more data
+      if (!s->read_from_callbacks)
+         return 0;
+      stbi__refill_buffer(s);
+   }
+   return *s->img_buffer++;
+}
+
+// Nonzero when no more input is available from the context.
+stbi_inline static int stbi__at_eof(stbi__context *s)
+{
+   if (s->io.read) {
+      int source_done = (s->io.eof)(s->io_user_data);
+      // the underlying source still has data, so we cannot be at the end
+      if (!source_done) return 0;
+      // source is exhausted; if refill already switched callbacks off, the
+      // buffer holds only the special 0 character at the end
+      if (!s->read_from_callbacks) return 1;
+   }
+
+   // otherwise we are at the end exactly when the buffer is used up
+   return s->img_buffer >= s->img_buffer_end;
+}
+
+// Advance the read position by `n` bytes. A negative `n` discards whatever
+// is left in the buffer instead of moving backwards.
+static void stbi__skip(stbi__context *s, int n)
+{
+   if (n < 0) {
+      s->img_buffer = s->img_buffer_end;
+      return;
+   }
+   if (s->io.read) {
+      int buffered = (int) (s->img_buffer_end - s->img_buffer);
+      if (n > buffered) {
+         // drop the buffered bytes and let the skip callback cover the rest
+         s->img_buffer = s->img_buffer_end;
+         (s->io.skip)(s->io_user_data, n - buffered);
+         return;
+      }
+   }
+   // the whole skip fits in what is already buffered (or the context is
+   // memory-backed)
+   s->img_buffer += n;
+}
+
+// Copy exactly `n` bytes of input into `buffer`. Returns 1 when all `n`
+// bytes were obtained, 0 otherwise.
+static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
+{
+   if (s->io.read) {
+      int buffered = (int) (s->img_buffer_end - s->img_buffer);
+      if (n > buffered) {
+         int got;
+
+         // hand over what is buffered, then read the remainder straight
+         // from the callback into the caller's buffer
+         memcpy(buffer, s->img_buffer, buffered);
+
+         got = (s->io.read)(s->io_user_data, (char*) buffer + buffered, n - buffered);
+         s->img_buffer = s->img_buffer_end;
+         return got == (n-buffered);
+      }
+   }
+
+   // everything must come from the current buffer
+   if (s->img_buffer+n > s->img_buffer_end)
+      return 0;
+
+   memcpy(buffer, s->img_buffer, n);
+   s->img_buffer += n;
+   return 1;
+}
+
+// Read a 16-bit big-endian value (high byte first).
+static int stbi__get16be(stbi__context *s)
+{
+   int hi = stbi__get8(s);
+   int lo = stbi__get8(s);
+   return (hi << 8) + lo;
+}
+
+// Read a 32-bit big-endian value (high half first).
+static stbi__uint32 stbi__get32be(stbi__context *s)
+{
+   stbi__uint32 hi = stbi__get16be(s);
+   stbi__uint32 lo = stbi__get16be(s);
+   return (hi << 16) + lo;
+}
+
+#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
+// nothing
+#else
+// Read a 16-bit little-endian value (low byte first).
+static int stbi__get16le(stbi__context *s)
+{
+   int lo = stbi__get8(s);
+   int hi = stbi__get8(s);
+   return lo + (hi << 8);
+}
+#endif
+
+#ifndef STBI_NO_BMP
+// Read a 32-bit little-endian value (low half first).
+static stbi__uint32 stbi__get32le(stbi__context *s)
+{
+   stbi__uint32 z = stbi__get16le(s);
+   // widen to unsigned before shifting: shifting the int result left by 16
+   // would overflow into the sign bit for values >= 0x8000
+   return z + ((stbi__uint32) stbi__get16le(s) << 16);
+}
+#endif
+
+// Keep only the low 8 bits of x and cast the result to stbi_uc.
+#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  generic converter from built-in img_n to req_comp
+//    individual types do this automatically as much as possible (e.g. jpeg
+//    does all cases internally since it needs to colorspace convert anyway,
+//    and it never has alpha, so very few cases ). png can automatically
+//    interleave an alpha=255 channel, but falls back to this for other cases
+//
+//  assume data buffer is malloced, so malloc a new one and free that one
+//  only failure mode is malloc failing
+
+// Collapse an RGB triple to one luminance byte with fixed-point weights
+// 77/256, 150/256 and 29/256 (the weights sum to 256).
+static stbi_uc stbi__compute_y(int r, int g, int b)
+{
+   int weighted = (r*77) + (g*150) +  (29*b);
+   return (stbi_uc) (weighted >> 8);
+}
+
+// Convert an x-by-y image from img_n to req_comp channels per pixel.
+//   data      source pixels; ownership passes to this function
+//   img_n     channels per pixel in `data`
+//   req_comp  channels per pixel wanted (asserted to be 1..4)
+// Returns `data` itself when no conversion is needed. Otherwise returns a
+// newly allocated buffer and frees `data`; on allocation failure `data` is
+// freed and NULL is returned with "outofmem" recorded.
+static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   unsigned char *good;
+
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+   good = (unsigned char *) stbi__malloc(req_comp * x * y);
+   if (good == NULL) {
+      STBI_FREE(data);
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      // start of scanline j in the source and destination layouts
+      unsigned char *src  = data + j * x * img_n   ;
+      unsigned char *dest = good + j * x * req_comp;
+
+      // COMBO packs a (source, destination) channel-count pair into one
+      // switch label; CASE expands to that label plus a loop over the x
+      // pixels of the scanline, stepping src by a and dest by b per pixel.
+      // The statement after each CASE(...) is the per-pixel loop body.
+      #define COMBO(a,b)  ((a)*8+(b))
+      #define CASE(a,b)   case COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (COMBO(img_n, req_comp)) {
+         CASE(1,2) dest[0]=src[0], dest[1]=255; break;
+         CASE(1,3) dest[0]=dest[1]=dest[2]=src[0]; break;
+         CASE(1,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=255; break;
+         CASE(2,1) dest[0]=src[0]; break;
+         CASE(2,3) dest[0]=dest[1]=dest[2]=src[0]; break;
+         CASE(2,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1]; break;
+         CASE(3,4) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255; break;
+         CASE(3,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
+         CASE(3,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = 255; break;
+         CASE(4,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
+         CASE(4,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; break;
+         CASE(4,3) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2]; break;
+         default: STBI_ASSERT(0);
+      }
+      // only CASE is undefined here; COMBO stays defined after this function
+      #undef CASE
+   }
+
+   STBI_FREE(data);
+   return good;
+}
+
+#ifndef STBI_NO_LINEAR
+// Convert an 8-bit image to floats, applying the configured gamma and scale
+// to the colour channels and a plain /255 mapping to the alpha channel.
+//   data  source pixels; freed by this function
+//   x, y  image dimensions
+//   comp  channels per pixel (an even count means the last one is alpha)
+// Returns a newly allocated float buffer, or NULL if `data` is NULL or the
+// allocation fails.
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
+{
+   int i,k,n;
+   float *output;
+   // nothing to convert (and nothing to free) for a failed upstream decode
+   if (data == NULL) return NULL;
+   output = (float *) stbi__malloc(x * y * comp * sizeof(float));
+   if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
+   // compute number of non-alpha components
+   if (comp & 1) n = comp; else n = comp-1;
+   for (i=0; i < x*y; ++i) {
+      for (k=0; k < n; ++k) {
+         output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
+      }
+      // k == n here; a remaining channel is alpha and stays linear
+      if (k < comp) output[i*comp + k] = data[i*comp+k]/255.0f;
+   }
+   STBI_FREE(data);
+   return output;
+}
+#endif
+
+#ifndef STBI_NO_HDR
+#define stbi__float2int(x)   ((int) (x))
+// Convert a float image to 8-bit, applying the configured inverse gamma and
+// scale to the colour channels and a plain *255 mapping to alpha; results
+// are rounded and clamped to 0..255.
+//   data  source pixels; freed by this function
+//   x, y  image dimensions
+//   comp  channels per pixel (an even count means the last one is alpha)
+// Returns a newly allocated byte buffer, or NULL if `data` is NULL or the
+// allocation fails.
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
+{
+   int i,k,n;
+   stbi_uc *output;
+   // a failed HDR decode hands us NULL; do not read through it
+   if (data == NULL) return NULL;
+   output = (stbi_uc *) stbi__malloc(x * y * comp);
+   if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
+   // compute number of non-alpha components
+   if (comp & 1) n = comp; else n = comp-1;
+   for (i=0; i < x*y; ++i) {
+      for (k=0; k < n; ++k) {
+         float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
+         if (z < 0) z = 0;
+         if (z > 255) z = 255;
+         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
+      }
+      // k == n here; a remaining channel is alpha and stays linear
+      if (k < comp) {
+         float z = data[i*comp+k] * 255 + 0.5f;
+         if (z < 0) z = 0;
+         if (z > 255) z = 255;
+         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
+      }
+   }
+   STBI_FREE(data);
+   return output;
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  "baseline" JPEG/JFIF decoder
+//
+//    simple implementation
+//      - doesn't support delayed output of y-dimension
+//      - simple interface (only one output format: 8-bit interleaved RGB)
+//      - doesn't try to recover corrupt jpegs
+//      - doesn't allow partial loading, loading multiple at once
+//      - still fast on x86 (copying globals into locals doesn't help x86)
+//      - allocates lots of intermediate memory (full size of all components)
+//        - non-interleaved case requires this anyway
+//        - allows good upsampling (see next)
+//    high-quality
+//      - upsampled channels are bilinearly interpolated, even across blocks
+//      - quality integer IDCT derived from IJG's 'slow'
+//    performance
+//      - fast huffman; reasonable integer IDCT
+//      - some SIMD kernels for common paths on targets with SSE2/NEON
+//      - uses a lot of intermediate memory, could cache poorly
+
+#ifndef STBI_NO_JPEG
+
+// huffman decoding acceleration
+#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
+
+// One JPEG Huffman table in decode-ready form; the derived fields are
+// filled in by stbi__build_huffman.
+typedef struct
+{
+   // indexed by the next FAST_BITS bits of input: symbol index for codes of
+   // at most FAST_BITS bits, or 255 when the slow path is needed
+   stbi_uc  fast[1 << FAST_BITS];
+   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
+   stbi__uint16 code[256];     // Huffman code assigned to each symbol index
+   stbi_uc  values[256];       // decoded value for each symbol index
+   stbi_uc  size[257];         // code length per symbol index, 0-terminated
+   unsigned int maxcode[18];   // per length: largest code + 1, shifted left to 16 bits
+   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
+} stbi__huffman;
+
+// Complete state of one JPEG decode: input context, Huffman and quantization
+// tables, per-component buffers, the entropy-decoder bit buffer,
+// progressive-scan parameters and the selected processing kernels.
+typedef struct
+{
+   stbi__context *s;            // input stream being decoded
+   stbi__huffman huff_dc[4];    // DC Huffman tables
+   stbi__huffman huff_ac[4];    // AC Huffman tables
+   stbi_uc dequant[4][64];      // dequantization tables, 64 entries each
+   stbi__int16 fast_ac[4][1 << FAST_BITS];  // per AC table: combined lookup built by stbi__build_fast_ac
+
+// sizes for components, interleaved MCUs
+   int img_h_max, img_v_max;
+   int img_mcu_x, img_mcu_y;
+   int img_mcu_w, img_mcu_h;
+
+// definition of jpeg image component
+   struct
+   {
+      int id;
+      int h,v;
+      int tq;
+      int hd,ha;
+      int dc_pred;      // running DC predictor, updated by the block decoders
+
+      int x,y,w2,h2;
+      stbi_uc *data;
+      void *raw_data, *raw_coeff;
+      stbi_uc *linebuf;
+      short   *coeff;   // progressive only
+      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
+   } img_comp[4];
+
+   stbi__uint32   code_buffer; // jpeg entropy-coded buffer
+   int            code_bits;   // number of valid bits
+   unsigned char  marker;      // marker seen while filling entropy buffer
+   int            nomore;      // flag if we saw a marker so must stop
+
+   // progressive-scan parameters read by the stbi__jpeg_decode_block_prog_* functions
+   int            progressive;
+   int            spec_start;
+   int            spec_end;
+   int            succ_high;
+   int            succ_low;
+   int            eob_run;
+   int            rgb;
+
+   int scan_n, order[4];
+   int restart_interval, todo;
+
+// kernels
+   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
+   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
+   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
+} stbi__jpeg;
+
+// Build the decode tables of `h` from a JPEG code-length histogram.
+//   h      table to fill (size, code, maxcode, delta and fast are written)
+//   count  16 entries; count[i] is the number of codes of length i+1
+// Returns 1 on success, or 0 with a failure reason recorded when the
+// histogram describes more symbols than the table holds or the lengths do
+// not form a valid code.
+static int stbi__build_huffman(stbi__huffman *h, int *count)
+{
+   int i,j,k=0,code;
+   // build size list for each symbol (from JPEG spec)
+   for (i=0; i < 16; ++i) {
+      for (j=0; j < count[i]; ++j) {
+         h->size[k++] = (stbi_uc) (i+1);
+         // size[] has 257 slots and the last one is needed for the
+         // terminator below; a corrupt histogram must not write past it
+         if (k >= 257) return stbi__err("bad size list","Corrupt JPEG");
+      }
+   }
+   h->size[k] = 0;
+
+   // compute actual symbols (from jpeg spec)
+   code = 0;
+   k = 0;
+   for(j=1; j <= 16; ++j) {
+      // compute delta to add to code to compute symbol id
+      h->delta[j] = k - code;
+      if (h->size[k] == j) {
+         while (h->size[k] == j)
+            h->code[k++] = (stbi__uint16) (code++);
+         if (code-1 >= (1 << j)) return stbi__err("bad code lengths","Corrupt JPEG");
+      }
+      // compute largest code + 1 for this size, preshifted as needed later
+      h->maxcode[j] = code << (16-j);
+      code <<= 1;
+   }
+   // sentinel at index 17 so the slow-path search always terminates
+   h->maxcode[j] = 0xffffffff;
+
+   // build non-spec acceleration table; 255 is flag for not-accelerated
+   memset(h->fast, 255, 1 << FAST_BITS);
+   for (i=0; i < k; ++i) {
+      int s = h->size[i];
+      if (s <= FAST_BITS) {
+         // every FAST_BITS-wide pattern that starts with this code maps to i
+         int c = h->code[i] << (FAST_BITS-s);
+         int m = 1 << (FAST_BITS-s);
+         for (j=0; j < m; ++j) {
+            h->fast[c+j] = (stbi_uc) i;
+         }
+      }
+   }
+   return 1;
+}
+
+// build a table that decodes both magnitude and value of small ACs in
+// one go.
+// Fill `fast_ac` (1 << FAST_BITS entries) from Huffman table `h`. Entry i
+// is 0 when the bit pattern i cannot be fully decoded from FAST_BITS bits;
+// otherwise it packs the decoded coefficient (bits 8 and up), the zero run
+// (bits 4..7) and the total number of bits consumed (bits 0..3).
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
+{
+   int i;
+   for (i=0; i < (1 << FAST_BITS); ++i) {
+      stbi_uc fast = h->fast[i];
+      fast_ac[i] = 0;
+      if (fast < 255) {
+         int rs = h->values[fast];
+         int run = (rs >> 4) & 15;
+         int magbits = rs & 15;
+         int len = h->size[fast];
+
+         if (magbits && len + magbits <= FAST_BITS) {
+            // magnitude code followed by receive_extend code
+            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+            int m = 1 << (magbits - 1);
+            // same value as adding (-1 << magbits) + 1, written without
+            // left-shifting a negative number
+            if (k < m) k -= (1 << magbits) - 1;
+            // if the result is small enough, we can fit it in fast_ac table
+            if (k >= -128 && k <= 127)
+               fast_ac[i] = (stbi__int16) ((k << 8) + (run << 4) + (len + magbits));
+         }
+      }
+   }
+}
+
+// Top up the entropy-decoder bit buffer one byte at a time until it holds
+// more than 24 valid bits. Once a marker has been seen (nomore set), zero
+// bytes are fed in instead of reading further input.
+static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
+{
+   do {
+      int b = j->nomore ? 0 : stbi__get8(j->s);
+      if (b == 0xff) {
+         // 0xff followed by 0 is a literal 0xff data byte; 0xff followed by
+         // anything else is a marker, which is recorded and ends the refill
+         int c = stbi__get8(j->s);
+         if (c != 0) {
+            j->marker = (unsigned char) c;
+            j->nomore = 1;
+            return;
+         }
+      }
+      // place the new byte directly below the bits that are already valid
+      j->code_buffer |= b << (24 - j->code_bits);
+      j->code_bits += 8;
+   } while (j->code_bits <= 24);
+}
+
+// stbi__bmask[n] = (1 << n) - 1, i.e. a mask of the low n bits, for n = 0..16
+static stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+
+// decode a jpeg huffman value from the bitstream
+// decode a jpeg huffman value from the bitstream
+//   j  decoder state (bit buffer is consumed)
+//   h  Huffman table to decode with
+// Returns the decoded value (0..255), or -1 when no valid code is found or
+// the bit buffer does not hold enough bits.
+stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
+{
+   unsigned int temp;
+   int c,k;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   // look at the top FAST_BITS and determine what symbol ID it is,
+   // if the code is <= FAST_BITS
+   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+   k = h->fast[c];
+   if (k < 255) {
+      int s = h->size[k];
+      if (s > j->code_bits)
+         return -1;
+      j->code_buffer <<= s;
+      j->code_bits -= s;
+      return h->values[k];
+   }
+
+   // naive test is to shift the code_buffer down so k bits are
+   // valid, then test against maxcode. To speed this up, we've
+   // preshifted maxcode left so that it has (16-k) 0s at the
+   // end; in other words, regardless of the number of bits, it
+   // wants to be compared against something shifted to have 16;
+   // that way we don't need to shift inside the loop.
+   temp = j->code_buffer >> 16;
+   for (k=FAST_BITS+1 ; ; ++k)
+      if (temp < h->maxcode[k])
+         break;
+   if (k == 17) {
+      // error! code not found
+      j->code_bits -= 16;
+      return -1;
+   }
+
+   if (k > j->code_bits)
+      return -1;
+
+   // convert the huffman code to the symbol id
+   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+   // a corrupt table can yield an id outside the 256-entry symbol arrays;
+   // reject it before it is used as an index
+   if (c < 0 || c >= 256)
+      return -1;
+   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
+
+   // convert the id to a symbol
+   j->code_bits -= k;
+   j->code_buffer <<= k;
+   return h->values[c];
+}
+
+// bias[n] = (-1<<n) + 1, for n = 0..15; stbi__extend_receive adds it to an
+// n-bit field whose leading bit is 0
+static int const stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+
+// combined JPEG 'receive' and JPEG 'extend', since baseline
+// always extends everything it receives.
+// combined JPEG 'receive' and JPEG 'extend', since baseline
+// always extends everything it receives.
+// Consumes n bits from the bit buffer and returns them as a signed value:
+// the raw n-bit field, plus stbi__jbias[n] when the field's leading bit is 0.
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   int sgn;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+
+   // NOTE(review): relies on >> of a negative stbi__int32 being an arithmetic
+   // shift, so sgn is all ones when the top bit is set and 0 otherwise
+   sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
+   // rotate the top n bits down into the low n bits
+   k = stbi_lrot(j->code_buffer, n);
+   STBI_ASSERT(n >= 0 && n < (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask)));
+   // the bits above the low n are the remaining buffer contents...
+   j->code_buffer = k & ~stbi__bmask[n];
+   // ...and the low n bits are the field just received
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   // the bias is masked out when the leading bit was 1
+   return k + (stbi__jbias[n] & ~sgn);
+}
+
+// get some unsigned bits
+// Consume the next n bits of the bit buffer and return them as an unsigned
+// value.
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int rotated;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   // rotate the top n bits down into the low n bits
+   rotated = stbi_lrot(j->code_buffer, n);
+   // everything above the low n bits stays in the buffer
+   j->code_buffer = rotated & ~stbi__bmask[n];
+   j->code_bits -= n;
+   // the low n bits are the requested field
+   rotated &= stbi__bmask[n];
+   return rotated;
+}
+
+// Consume one bit from the bit buffer. The result is nonzero (the 0x80000000
+// bit itself, not 1) when the consumed bit was set.
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{
+   unsigned int before;
+   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   before = j->code_buffer;
+   // drop the consumed bit from the buffer
+   --j->code_bits;
+   j->code_buffer <<= 1;
+   return before & 0x80000000;
+}
+
+// given a value that's at position X in the zigzag stream,
+// where does it appear in the 8x8 matrix coded as row-major?
+// The table has 15 extra entries, all 63, so that an index pushed past 63
+// by a run in corrupt input still reads inside the table.
+static stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
+
+// decode one 64-entry block--
+//   j        decoder state
+//   data     out: 64 dequantized coefficients in row-major order
+//   hdc/hac  DC and AC Huffman tables for this component
+//   fac      fast AC lookup built by stbi__build_fast_ac for hac
+//   b        component index (selects the DC predictor)
+//   dequant  64-entry dequantization table
+// Returns 1 on success, 0 (with a failure reason recorded) on a bad code.
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi_uc *dequant)
+{
+   int diff,dc,k;
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   // t is the bit count of the DC difference and is used by
+   // stbi__extend_receive to index its mask and 16-entry bias tables, so a
+   // corrupt table value above 15 has to be rejected here
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
+
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
+
+   diff = t ? stbi__extend_receive(j, t) : 0;
+   dc = j->img_comp[b].dc_pred + diff;
+   j->img_comp[b].dc_pred = dc;
+   data[0] = (short) (dc * dequant[0]);
+
+   // decode AC components, see JPEG spec
+   k = 1;
+   do {
+      unsigned int zig;
+      int c,r,s;
+      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+      r = fac[c];
+      if (r) { // fast-AC path
+         k += (r >> 4) & 15; // run
+         s = r & 15; // combined length
+         j->code_buffer <<= s;
+         j->code_bits -= s;
+         // decode into unzigzag'd location
+         zig = stbi__jpeg_dezigzag[k++];
+         data[zig] = (short) ((r >> 8) * dequant[zig]);
+      } else {
+         int rs = stbi__jpeg_huff_decode(j, hac);
+         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+         s = rs & 15;
+         r = rs >> 4;
+         if (s == 0) {
+            if (rs != 0xf0) break; // end block
+            k += 16;
+         } else {
+            k += r;
+            // decode into unzigzag'd location
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
+         }
+      }
+   } while (k < 64);
+   return 1;
+}
+
+// Decode the DC coefficient of one block in a progressive scan.
+//   j     decoder state (spec_end, succ_high and succ_low describe the scan)
+//   data  the block's 64 coefficients; data[0] is written or refined
+//   hdc   DC Huffman table for this component
+//   b     component index (selects the DC predictor)
+// Returns 1 on success, 0 (with a failure reason recorded) on error.
+static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
+{
+   int diff,dc;
+   int t;
+   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   if (j->succ_high == 0) {
+      // first scan for DC coefficient, must be first
+      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
+      t = stbi__jpeg_huff_decode(j, hdc);
+      // -1 means the decode failed, and anything above 15 cannot be a valid
+      // bit count; either would make stbi__extend_receive index outside its
+      // lookup tables
+      if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
+      diff = t ? stbi__extend_receive(j, t) : 0;
+
+      dc = j->img_comp[b].dc_pred + diff;
+      j->img_comp[b].dc_pred = dc;
+      data[0] = (short) (dc << j->succ_low);
+   } else {
+      // refinement scan for DC coefficient
+      if (stbi__jpeg_get_bit(j))
+         data[0] += (short) (1 << j->succ_low);
+   }
+   return 1;
+}
+
+// Decodes the AC coefficients of one block for one scan of a progressive JPEG.
+//
+//   j    - decoder state: reads spec_start/spec_end/succ_high/succ_low,
+//          updates eob_run and consumes bits from code_buffer/code_bits
+//   data - the block's 64 coefficients, stored de-zigzagged
+//   hac  - AC huffman table handed to stbi__jpeg_huff_decode
+//   fac  - fast-AC table, indexed by the top FAST_BITS bits of code_buffer
+//
+// returns 1 on success; corrupt input returns the result of stbi__err.
+//
+// @OPTIMIZE: store non-zigzagged during the decode passes,
+// and only de-zigzag when dequantizing
+static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
+{
+   int k;
+   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->succ_high == 0) {
+      // first scan for this band: coefficients arrive scaled down by succ_low bits
+      int shift = j->succ_low;
+
+      // still inside an end-of-band run: this block has nothing in this band
+      if (j->eob_run) {
+         --j->eob_run;
+         return 1;
+      }
+
+      k = j->spec_start;
+      do {
+         unsigned int zig;
+         int c,r,s;
+         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+         r = fac[c];
+         if (r) { // fast-AC path
+            k += (r >> 4) & 15; // run
+            s = r & 15; // combined length
+            j->code_buffer <<= s;
+            j->code_bits -= s;
+            zig = stbi__jpeg_dezigzag[k++];
+            // scale by multiplying: the value in the top bits of r may be
+            // negative, and left-shifting a negative int is undefined
+            data[zig] = (short) ((r >> 8) * (1 << shift));
+         } else {
+            int rs = stbi__jpeg_huff_decode(j, hac);
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  // end of band: the run length is (1 << r) plus r extra bits,
+                  // and the current block uses up the first entry of the run
+                  j->eob_run = (1 << r);
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  --j->eob_run;
+                  break;
+               }
+               k += 16;
+            } else {
+               k += r;
+               zig = stbi__jpeg_dezigzag[k++];
+               // multiply instead of shifting, same reason as the fast path
+               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
+            }
+         }
+      } while (k <= j->spec_end);
+   } else {
+      // refinement scan for these AC coefficients
+
+      short bit = (short) (1 << j->succ_low);
+
+      if (j->eob_run) {
+         // inside an end-of-band run: only coefficients that are already
+         // nonzero can change, each by one correction bit
+         --j->eob_run;
+         for (k = j->spec_start; k <= j->spec_end; ++k) {
+            short *p = &data[stbi__jpeg_dezigzag[k]];
+            if (*p != 0)
+               if (stbi__jpeg_get_bit(j))
+                  if ((*p & bit)==0) {
+                     if (*p > 0)
+                        *p += bit;
+                     else
+                        *p -= bit;
+                  }
+         }
+      } else {
+         k = j->spec_start;
+         do {
+            int r,s;
+            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r) - 1;
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  r = 64; // force end of block
+               } else {
+                  // r=15 s=0 should write 16 0s, so we just do
+                  // a run of 15 0s and then write s (which is 0),
+                  // so we don't have to do anything special here
+               }
+            } else {
+               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
+               // sign bit
+               if (stbi__jpeg_get_bit(j))
+                  s = bit;
+               else
+                  s = -bit;
+            }
+
+            // advance by r: skip r zero-valued coefficients, refining every
+            // nonzero one passed on the way, then store s in the next zero slot
+            while (k <= j->spec_end) {
+               short *p = &data[stbi__jpeg_dezigzag[k++]];
+               if (*p != 0) {
+                  if (stbi__jpeg_get_bit(j))
+                     if ((*p & bit)==0) {
+                        if (*p > 0)
+                           *p += bit;
+                        else
+                           *p -= bit;
+                     }
+               } else {
+                  if (r == 0) {
+                     *p = (short) s;
+                     break;
+                  }
+                  --r;
+               }
+            }
+         } while (k <= j->spec_end);
+      }
+   }
+   return 1;
+}
+
+// saturates an int to the 0..255 range of a stbi_uc: negative inputs
+// give 0, inputs above 255 give 255, everything else passes through
+stbi_inline static stbi_uc stbi__clamp(int x)
+{
+   if (x < 0)
+      return 0;
+   if (x > 255)
+      return 255;
+   return (stbi_uc) x;
+}
+
+// fixed-point helpers for the integer IDCT, 12 fractional bits:
+//   stbi__f2f(x) - floating-point constant to fixed point (adds 0.5, then truncates)
+//   stbi__fsh(x) - integer to fixed point; written as a multiply because the
+//                  argument can be negative and left-shifting a negative
+//                  value is undefined
+#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
+#define stbi__fsh(x)  ((x) * 4096)
+
+// derived from jidctint -- DCT_ISLOW
+// one 1-D pass of the 8-point IDCT over the inputs s0..s7. declares
+// t0..t3, p1..p5 and x0..x3 in the enclosing scope, so it can be used only
+// once per scope. on exit x0..x3 hold the even part and t0..t3 the odd
+// part, both still scaled up by 1<<12; the caller forms the outputs as
+// x0+-t3, x1+-t2, x2+-t1 and x3+-t0.
+#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
+   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
+   p2 = s2;                                    \
+   p3 = s6;                                    \
+   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
+   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
+   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
+   p2 = s0;                                    \
+   p3 = s4;                                    \
+   t0 = stbi__fsh(p2+p3);                      \
+   t1 = stbi__fsh(p2-p3);                      \
+   x0 = t0+t3;                                 \
+   x3 = t0-t3;                                 \
+   x1 = t1+t2;                                 \
+   x2 = t1-t2;                                 \
+   t0 = s7;                                    \
+   t1 = s5;                                    \
+   t2 = s3;                                    \
+   t3 = s1;                                    \
+   p3 = t0+t2;                                 \
+   p4 = t1+t3;                                 \
+   p1 = t0+t3;                                 \
+   p2 = t1+t2;                                 \
+   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
+   t0 = t0*stbi__f2f( 0.298631336f);           \
+   t1 = t1*stbi__f2f( 2.053119869f);           \
+   t2 = t2*stbi__f2f( 3.072711026f);           \
+   t3 = t3*stbi__f2f( 1.501321110f);           \
+   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
+   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
+   p3 = p3*stbi__f2f(-1.961570560f);           \
+   p4 = p4*stbi__f2f(-0.390180644f);           \
+   t3 += p1+p4;                                \
+   t2 += p2+p3;                                \
+   t1 += p2+p4;                                \
+   t0 += p1+p3;
+
+// generic (non-SIMD) integer IDCT of one 8x8 block.
+//
+//   out        - where the first output row goes; 8 bytes are written per row
+//   out_stride - distance in bytes from one output row to the next
+//   data       - the block's 64 coefficients, row-major
+//
+// a column pass writes intermediate values (with 2 extra bits of precision)
+// into val[], then a row pass produces the clamped 0..255 output bytes.
+static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
+{
+   int i,val[64],*v=val;
+   stbi_uc *o;
+   short *d = data;
+
+   // columns
+   for (i=0; i < 8; ++i,++d, ++v) {
+      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
+           && d[40]==0 && d[48]==0 && d[56]==0) {
+         //    no shortcut                 0     seconds
+         //    (1|2|3|4|5|6|7)==0          0     seconds
+         //    all separate               -0.047 seconds
+         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+         // multiply rather than shift: d[0] can be negative, and
+         // left-shifting a negative value is undefined
+         int dcterm = d[0] * 4;
+         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+      } else {
+         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
+         // constants scaled things up by 1<<12; let's bring them back
+         // down, but keep 2 extra bits of precision
+         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
+         v[ 0] = (x0+t3) >> 10;
+         v[56] = (x0-t3) >> 10;
+         v[ 8] = (x1+t2) >> 10;
+         v[48] = (x1-t2) >> 10;
+         v[16] = (x2+t1) >> 10;
+         v[40] = (x2-t1) >> 10;
+         v[24] = (x3+t0) >> 10;
+         v[32] = (x3-t0) >> 10;
+      }
+   }
+
+   // rows
+   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {
+      // no fast case since the first 1D IDCT spread components out
+      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
+      // constants scaled things up by 1<<12, plus we had 1<<2 from first
+      // loop, plus horizontal and vertical each scale by sqrt(8) so together
+      // we've got an extra 1<<3, so 1<<17 total we need to remove.
+      // so we want to round that, which means adding 0.5 * 1<<17,
+      // aka 65536. Also, we'll end up with -128 to 127 that we want
+      // to encode as 0..255 by adding 128, so we'll add that before the shift
+      x0 += 65536 + (128<<17);
+      x1 += 65536 + (128<<17);
+      x2 += 65536 + (128<<17);
+      x3 += 65536 + (128<<17);
+      // tried computing the shifts into temps, or'ing the temps to see
+      // if any were out of range, but that was slower
+      o[0] = stbi__clamp((x0+t3) >> 17);
+      o[7] = stbi__clamp((x0-t3) >> 17);
+      o[1] = stbi__clamp((x1+t2) >> 17);
+      o[6] = stbi__clamp((x1-t2) >> 17);
+      o[2] = stbi__clamp((x2+t1) >> 17);
+      o[5] = stbi__clamp((x2-t1) >> 17);
+      o[3] = stbi__clamp((x3+t0) >> 17);
+      o[4] = stbi__clamp((x3-t0) >> 17);
+   }
+}
+
+#ifdef STBI_SSE2
+// sse2 integer IDCT of one 8x8 block. not the fastest possible implementation but it
+// produces bit-identical results to the generic C version so it's
+// fully "transparent". reads the 64 coefficients in data with _mm_load_si128 and writes eight 8-byte rows to out, out_stride bytes apart.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   // This is constructed to match our regular (generic) integer IDCT exactly.
+   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+   __m128i tmp;
+
+   // dot product constant: even elems=x, odd elems=y
+   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
+
+   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+   // out(1) = c1[even]*x + c1[odd]*y
+   #define dct_rot(out0,out1, x,y,c0,c1) \
+      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
+      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
+      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
+      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
+      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
+      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+
+   // out = in << 12  (in 16-bit, out 32-bit)
+   #define dct_widen(out, in) \
+      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
+      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
+
+   // wide add
+   #define dct_wadd(out, a, b) \
+      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
+
+   // wide sub
+   #define dct_wsub(out, a, b) \
+      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+
+   // butterfly a/b, add bias, then shift by "s" and pack
+   #define dct_bfly32o(out0, out1, a,b,bias,s) \
+      { \
+         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
+         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
+         dct_wadd(sum, abiased, b); \
+         dct_wsub(dif, abiased, b); \
+         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
+         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
+      }
+
+   // 8-bit interleave step (for transposes)
+   #define dct_interleave8(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi8(a, b); \
+      b = _mm_unpackhi_epi8(tmp, b)
+
+   // 16-bit interleave step (for transposes)
+   #define dct_interleave16(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi16(a, b); \
+      b = _mm_unpackhi_epi16(tmp, b)
+
+   #define dct_pass(bias,shift) \
+      { \
+         /* even part */ \
+         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
+         __m128i sum04 = _mm_add_epi16(row0, row4); \
+         __m128i dif04 = _mm_sub_epi16(row0, row4); \
+         dct_widen(t0e, sum04); \
+         dct_widen(t1e, dif04); \
+         dct_wadd(x0, t0e, t3e); \
+         dct_wsub(x3, t0e, t3e); \
+         dct_wadd(x1, t1e, t2e); \
+         dct_wsub(x2, t1e, t2e); \
+         /* odd part */ \
+         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
+         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
+         __m128i sum17 = _mm_add_epi16(row1, row7); \
+         __m128i sum35 = _mm_add_epi16(row3, row5); \
+         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
+         dct_wadd(x4, y0o, y4o); \
+         dct_wadd(x5, y1o, y5o); \
+         dct_wadd(x6, y2o, y5o); \
+         dct_wadd(x7, y3o, y4o); \
+         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
+         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
+         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
+         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
+      }
+
+   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
+   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
+   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
+   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
+   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
+   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
+   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
+   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
+
+   // rounding biases in column/row passes, see stbi__idct_block for explanation.
+   __m128i bias_0 = _mm_set1_epi32(512);
+   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
+
+   // load the eight rows of 16-bit coefficients (_mm_load_si128 is the aligned load, so data must be 16-byte aligned)
+   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
+   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
+   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
+   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
+   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
+   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
+   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
+   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
+
+   // column pass: adds bias_0 (512) and shifts right by 10, leaving the results in row0..row7
+   dct_pass(bias_0, 10);
+
+   {
+      // 16bit 8x8 transpose pass 1
+      dct_interleave16(row0, row4);
+      dct_interleave16(row1, row5);
+      dct_interleave16(row2, row6);
+      dct_interleave16(row3, row7);
+
+      // transpose pass 2
+      dct_interleave16(row0, row2);
+      dct_interleave16(row1, row3);
+      dct_interleave16(row4, row6);
+      dct_interleave16(row5, row7);
+
+      // transpose pass 3
+      dct_interleave16(row0, row1);
+      dct_interleave16(row2, row3);
+      dct_interleave16(row4, row5);
+      dct_interleave16(row6, row7);
+   }
+
+   // row pass: adds bias_1 (rounding plus the 128 offset) and shifts right by 17
+   dct_pass(bias_1, 17);
+
+   {
+      // pack pairs of rows down to 8 bits with unsigned saturation
+      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+      __m128i p1 = _mm_packus_epi16(row2, row3);
+      __m128i p2 = _mm_packus_epi16(row4, row5);
+      __m128i p3 = _mm_packus_epi16(row6, row7);
+
+      // 8bit 8x8 transpose pass 1
+      dct_interleave8(p0, p2); // a0e0a1e1...
+      dct_interleave8(p1, p3); // c0g0c1g1...
+
+      // transpose pass 2
+      dct_interleave8(p0, p1); // a0c0e0g0...
+      dct_interleave8(p2, p3); // b0d0f0h0...
+
+      // transpose pass 3
+      dct_interleave8(p0, p2); // a0b0c0d0...
+      dct_interleave8(p1, p3); // a4b4c4d4...
+
+      // store: each register holds two output rows; the low 8 bytes are written directly, and the 0x4e shuffle swaps the 64-bit halves to reach the other row
+      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
+   }
+
+#undef dct_const
+#undef dct_rot
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_interleave8
+#undef dct_interleave16
+#undef dct_pass
+}
+
+#endif // STBI_SSE2
+
+#ifdef STBI_NEON
+
+// NEON integer IDCT of one 8x8 block. should produce bit-identical
+// results to the generic C version. reads the 64 coefficients in data and writes eight 8-byte rows to out, out_stride bytes apart.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
+
+   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
+   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
+   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
+   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
+   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
+   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
+   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
+   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
+   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
+   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
+   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
+   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
+
+#define dct_long_mul(out, inq, coeff) \
+   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
+
+#define dct_long_mac(out, acc, inq, coeff) \
+   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
+
+#define dct_widen(out, inq) \
+   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
+   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
+
+// wide add
+#define dct_wadd(out, a, b) \
+   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
+
+// wide sub
+#define dct_wsub(out, a, b) \
+   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
+
+// butterfly a/b, then shift using "shiftop" by "s" and pack
+#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
+   { \
+      dct_wadd(sum, a, b); \
+      dct_wsub(dif, a, b); \
+      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
+      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
+   }
+
+#define dct_pass(shiftop, shift) \
+   { \
+      /* even part */ \
+      int16x8_t sum26 = vaddq_s16(row2, row6); \
+      dct_long_mul(p1e, sum26, rot0_0); \
+      dct_long_mac(t2e, p1e, row6, rot0_1); \
+      dct_long_mac(t3e, p1e, row2, rot0_2); \
+      int16x8_t sum04 = vaddq_s16(row0, row4); \
+      int16x8_t dif04 = vsubq_s16(row0, row4); \
+      dct_widen(t0e, sum04); \
+      dct_widen(t1e, dif04); \
+      dct_wadd(x0, t0e, t3e); \
+      dct_wsub(x3, t0e, t3e); \
+      dct_wadd(x1, t1e, t2e); \
+      dct_wsub(x2, t1e, t2e); \
+      /* odd part */ \
+      int16x8_t sum15 = vaddq_s16(row1, row5); \
+      int16x8_t sum17 = vaddq_s16(row1, row7); \
+      int16x8_t sum35 = vaddq_s16(row3, row5); \
+      int16x8_t sum37 = vaddq_s16(row3, row7); \
+      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
+      dct_long_mul(p5o, sumodd, rot1_0); \
+      dct_long_mac(p1o, p5o, sum17, rot1_1); \
+      dct_long_mac(p2o, p5o, sum35, rot1_2); \
+      dct_long_mul(p3o, sum37, rot2_0); \
+      dct_long_mul(p4o, sum15, rot2_1); \
+      dct_wadd(sump13o, p1o, p3o); \
+      dct_wadd(sump24o, p2o, p4o); \
+      dct_wadd(sump23o, p2o, p3o); \
+      dct_wadd(sump14o, p1o, p4o); \
+      dct_long_mac(x4, sump13o, row7, rot3_0); \
+      dct_long_mac(x5, sump24o, row5, rot3_1); \
+      dct_long_mac(x6, sump23o, row3, rot3_2); \
+      dct_long_mac(x7, sump14o, row1, rot3_3); \
+      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
+      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
+      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
+      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
+   }
+
+   // load the eight rows of 16-bit coefficients
+   row0 = vld1q_s16(data + 0*8);
+   row1 = vld1q_s16(data + 1*8);
+   row2 = vld1q_s16(data + 2*8);
+   row3 = vld1q_s16(data + 3*8);
+   row4 = vld1q_s16(data + 4*8);
+   row5 = vld1q_s16(data + 5*8);
+   row6 = vld1q_s16(data + 6*8);
+   row7 = vld1q_s16(data + 7*8);
+
+   // add DC bias: 1024 goes into lane 0 of row0 only, i.e. onto data[0] (presumably the +128 output offset applied up front -- TODO confirm)
+   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
+
+   // column pass: narrows with a rounding shift right by 10 (vrshrn_n_s32)
+   dct_pass(vrshrn_n_s32, 10);
+
+   // 16bit 8x8 transpose
+   {
+// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
+// whether compilers actually get this is another story, sadly.
+#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
+#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
+
+      // pass 1
+      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
+      dct_trn16(row2, row3);
+      dct_trn16(row4, row5);
+      dct_trn16(row6, row7);
+
+      // pass 2
+      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
+      dct_trn32(row1, row3);
+      dct_trn32(row4, row6);
+      dct_trn32(row5, row7);
+
+      // pass 3
+      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
+      dct_trn64(row1, row5);
+      dct_trn64(row2, row6);
+      dct_trn64(row3, row7);
+
+#undef dct_trn16
+#undef dct_trn32
+#undef dct_trn64
+   }
+
+   // row pass
+   // vrshrn_n_s32 only supports shifts up to 16, we need
+   // 17. so do a non-rounding shift of 16 first then follow
+   // up with a rounding shift by 1.
+   dct_pass(vshrn_n_s32, 16);
+
+   {
+      // pack and round: the remaining rounding shift by 1, narrowing each row to 8 unsigned bytes (vqrshrun_n_s16)
+      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
+      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
+      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
+      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
+      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
+      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
+      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
+      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
+
+      // again, these can translate into one instruction, but often don't.
+#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
+#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
+
+      // sadly can't use interleaved stores here since we only write
+      // 8 bytes to each scan line!
+
+      // 8x8 8-bit transpose pass 1
+      dct_trn8_8(p0, p1);
+      dct_trn8_8(p2, p3);
+      dct_trn8_8(p4, p5);
+      dct_trn8_8(p6, p7);
+
+      // pass 2
+      dct_trn8_16(p0, p2);
+      dct_trn8_16(p1, p3);
+      dct_trn8_16(p4, p6);
+      dct_trn8_16(p5, p7);
+
+      // pass 3
+      dct_trn8_32(p0, p4);
+      dct_trn8_32(p1, p5);
+      dct_trn8_32(p2, p6);
+      dct_trn8_32(p3, p7);
+
+      // store one 8-byte output row per register, stepping out by out_stride between rows
+      vst1_u8(out, p0); out += out_stride;
+      vst1_u8(out, p1); out += out_stride;
+      vst1_u8(out, p2); out += out_stride;
+      vst1_u8(out, p3); out += out_stride;
+      vst1_u8(out, p4); out += out_stride;
+      vst1_u8(out, p5); out += out_stride;
+      vst1_u8(out, p6); out += out_stride;
+      vst1_u8(out, p7);
+
+#undef dct_trn8_8
+#undef dct_trn8_16
+#undef dct_trn8_32
+   }
+
+#undef dct_long_mul
+#undef dct_long_mac
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_pass
+}
+
+#endif // STBI_NEON
+
+#define STBI__MARKER_none  0xff
+// returns the next marker code. a marker the entropy decoder already
+// ran into (j->marker) is handed back first and cleared; otherwise one
+// is read from the stream. when the next byte does not start a marker
+// the result is STBI__MARKER_none (0xff), which is never a valid marker value
+static stbi_uc stbi__get_marker(stbi__jpeg *j)
+{
+   stbi_uc byte;
+
+   if (j->marker != STBI__MARKER_none) {
+      stbi_uc pending = j->marker;
+      j->marker = STBI__MARKER_none;
+      return pending;
+   }
+
+   // a marker always begins with 0xff
+   byte = stbi__get8(j->s);
+   if (byte != 0xff)
+      return STBI__MARKER_none;
+
+   // step over any further 0xff bytes; the first other byte is the code
+   do {
+      byte = stbi__get8(j->s);
+   } while (byte == 0xff);
+   return byte;
+}
+
+// in each scan, we'll have scan_n components, and the order
+// of the components is specified by order[]. STBI__RESTART(x) is true when x is one of the marker values 0xd0..0xd7.
+#define STBI__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
+
+// puts the entropy decoder back into its initial state: empty bit
+// buffer, no pending marker, no end-of-band run, zeroed dc prediction,
+// and a freshly loaded restart countdown
+static void stbi__jpeg_reset(stbi__jpeg *j)
+{
+   int c;
+
+   j->code_buffer = 0;
+   j->code_bits = 0;
+   j->nomore = 0;
+   j->marker = STBI__MARKER_none;
+   j->eob_run = 0;
+
+   for (c = 0; c < 3; ++c)
+      j->img_comp[c].dc_pred = 0;
+
+   // no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
+   // since we don't even allow 1<<30 pixels
+   if (j->restart_interval)
+      j->todo = j->restart_interval;
+   else
+      j->todo = 0x7fffffff;
+}
+
+// counts down the restart interval after one MCU has been decoded.
+// returns 1 when decoding of the scan should go on, 0 when the interval
+// ran out and the pending marker is not a restart marker
+static int stbi__jpeg_restart_countdown(stbi__jpeg *z)
+{
+   if (--z->todo <= 0) {
+      if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+      // if it's NOT a restart, then just bail, so we get corrupt data
+      // rather than no data
+      if (!STBI__RESTART(z->marker)) return 0;
+      stbi__jpeg_reset(z);
+   }
+   return 1;
+}
+
+// decodes the entropy-coded data of one scan.
+// baseline images are decoded and IDCT'd straight into each component's
+// data; progressive images only accumulate coefficients into each
+// component's coeff array.
+// returns 0 when a block fails to decode, 1 otherwise (including when the
+// restart interval runs out and no restart marker follows)
+static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
+{
+   stbi__jpeg_reset(z);
+   if (!z->progressive) {
+      if (z->scan_n == 1) {
+         int i,j;
+         STBI_SIMD_ALIGN(short, data[64]);
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               int ha = z->img_comp[n].ha;
+               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
+               // every data block is an MCU, so countdown the restart interval
+               if (!stbi__jpeg_restart_countdown(z)) return 1;
+            }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         STBI_SIMD_ALIGN(short, data[64]);
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x)*8;
+                        int y2 = (j*z->img_comp[n].v + y)*8;
+                        int ha = z->img_comp[n].ha;
+                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (!stbi__jpeg_restart_countdown(z)) return 1;
+            }
+         }
+         return 1;
+      }
+   } else {
+      if (z->scan_n == 1) {
+         int i,j;
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+               if (z->spec_start == 0) {
+                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                     return 0;
+               } else {
+                  int ha = z->img_comp[n].ha;
+                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                     return 0;
+               }
+               // every data block is an MCU, so countdown the restart interval
+               if (!stbi__jpeg_restart_countdown(z)) return 1;
+            }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x);
+                        int y2 = (j*z->img_comp[n].v + y);
+                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
+                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                           return 0;
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (!stbi__jpeg_restart_countdown(z)) return 1;
+            }
+         }
+         return 1;
+      }
+   }
+}
+
+// multiplies each of a block's 64 coefficients, in place, by the
+// entry of dequant at the same index
+static void stbi__jpeg_dequantize(short *data, stbi_uc *dequant)
+{
+   short *coef = data;
+   short *end = data + 64;
+   for (; coef != end; ++coef, ++dequant)
+      *coef *= *dequant;
+}
+
+// for progressive images, turns the accumulated coefficients of every
+// component into pixels: each block is dequantized in place and then
+// run through the IDCT kernel. does nothing for non-progressive images
+static void stbi__jpeg_finish(stbi__jpeg *z)
+{
+   int comp;
+
+   if (!z->progressive)
+      return;
+
+   for (comp=0; comp < z->s->img_n; ++comp) {
+      // size of the component in 8x8 blocks, rounded up
+      int blocks_w = (z->img_comp[comp].x+7) >> 3;
+      int blocks_h = (z->img_comp[comp].y+7) >> 3;
+      int bx, by;
+      for (by=0; by < blocks_h; ++by) {
+         for (bx=0; bx < blocks_w; ++bx) {
+            short *coeffs = z->img_comp[comp].coeff + 64 * (bx + by * z->img_comp[comp].coeff_w);
+            stbi__jpeg_dequantize(coeffs, z->dequant[z->img_comp[comp].tq]);
+            z->idct_block_kernel(z->img_comp[comp].data+z->img_comp[comp].w2*by*8+bx*8, z->img_comp[comp].w2, coeffs);
+         }
+      }
+   }
+}
+
+// handles one marker segment that may appear between the frame and scan
+// headers: DRI (restart interval), DQT (quantization tables), DHT
+// (huffman tables), and APPn/COM segments, which are skipped.
+//
+//   z - decoder state; the segment body is read from z->s
+//   m - the marker code, or STBI__MARKER_none when no marker was found
+//
+// returns 1 when the segment was consumed, 0 on error or unknown marker
+static int stbi__process_marker(stbi__jpeg *z, int m)
+{
+   int L;
+   switch (m) {
+      case STBI__MARKER_none: // no marker found
+         return stbi__err("expected marker","Corrupt JPEG");
+
+      case 0xDD: // DRI - specify restart interval
+         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
+         z->restart_interval = stbi__get16be(z->s);
+         return 1;
+
+      case 0xDB: // DQT - define quantization table
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
+            int q = stbi__get8(z->s);
+            int p = q >> 4;
+            int t = q & 15,i;
+            if (p != 0) return stbi__err("bad DQT type","Corrupt JPEG");
+            if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
+            // entries arrive in zigzag order; store them de-zigzagged
+            for (i=0; i < 64; ++i)
+               z->dequant[t][stbi__jpeg_dezigzag[i]] = stbi__get8(z->s);
+            L -= 65;
+         }
+         return L==0;
+
+      case 0xC4: // DHT - define huffman table
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
+            stbi_uc *v;
+            int sizes[16],i,n=0;
+            int q = stbi__get8(z->s);
+            int tc = q >> 4;
+            int th = q & 15;
+            if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
+            for (i=0; i < 16; ++i) {
+               sizes[i] = stbi__get8(z->s);
+               n += sizes[i];
+            }
+            // symbols are single bytes, so a valid table lists at most 256
+            // of them; a larger total would make the copy loop below
+            // write past the end of the table's values
+            if (n > 256) return stbi__err("bad DHT header","Corrupt JPEG");
+            L -= 17;
+            if (tc == 0) {
+               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
+               v = z->huff_dc[th].values;
+            } else {
+               if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
+               v = z->huff_ac[th].values;
+            }
+            for (i=0; i < n; ++i)
+               v[i] = stbi__get8(z->s);
+            if (tc != 0)
+               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+            L -= n;
+         }
+         return L==0;
+   }
+   // check for comment block or APP blocks
+   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
+      L = stbi__get16be(z->s);
+      // the length counts its own two bytes, so anything smaller is corrupt
+      // and would otherwise turn into a negative skip
+      if (L < 2) {
+         if (m == 0xFE)
+            return stbi__err("bad COM len","Corrupt JPEG");
+         else
+            return stbi__err("bad APP len","Corrupt JPEG");
+      }
+      stbi__skip(z->s, L-2);
+      return 1;
+   }
+   return stbi__err("unknown marker","Corrupt JPEG");
+}
+
+// parses the scan header that follows an SOS marker: the components in
+// the scan with their huffman table selectors, then the spectral
+// selection and successive approximation parameters.
+// returns 1 on success, 0 on failure
+static int stbi__process_scan_header(stbi__jpeg *z)
+{
+   int idx, approx;
+   int seg_len = stbi__get16be(z->s);
+
+   z->scan_n = stbi__get8(z->s);
+   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
+   if (seg_len != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
+
+   for (idx=0; idx < z->scan_n; ++idx) {
+      int comp_id = stbi__get8(z->s);
+      int tables = stbi__get8(z->s);
+      int which = 0;
+      // look up the frame component carrying this id
+      while (which < z->s->img_n && z->img_comp[which].id != comp_id)
+         ++which;
+      if (which == z->s->img_n) return 0; // no match
+      // high nibble selects the DC table, low nibble the AC table
+      z->img_comp[which].hd = tables >> 4;
+      if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
+      z->img_comp[which].ha = tables & 15;
+      if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
+      z->order[idx] = which;
+   }
+
+   z->spec_start = stbi__get8(z->s);
+   z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
+   approx = stbi__get8(z->s);
+   z->succ_high = (approx >> 4);
+   z->succ_low  = (approx & 15);
+
+   if (!z->progressive) {
+      // baseline scans always cover the whole block at full precision
+      if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
+      if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
+      z->spec_end = 63;
+      return 1;
+   }
+
+   if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
+      return stbi__err("bad SOS", "Corrupt JPEG");
+   return 1;
+}
+
+// Parse a SOF (start of frame) segment: sample precision, image dimensions and
+// each component's id, sampling factors and quantization table selector.
+//
+//   z    - decoder state; z->s supplies the input and receives img_x/img_y/img_n
+//   scan - when it is not STBI__SCAN_load only the header fields are parsed and
+//          nothing is allocated
+//
+// In STBI__SCAN_load mode the MCU geometry is computed and a 16-byte aligned
+// sample plane (plus a coefficient plane for progressive files) is allocated
+// for every component. If any of those allocations fails, every buffer
+// allocated here is released again before the error is reported.
+// Returns 1 on success; failures return the value of stbi__err().
+static int stbi__process_frame_header(stbi__jpeg *z, int scan)
+{
+   stbi__context *s = z->s;
+   int Lf,p,i,q, h_max=1,v_max=1,c;
+   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
+   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
+   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
+   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
+   c = stbi__get8(s);
+   if (c != 3 && c != 1) return stbi__err("bad component count","Corrupt JPEG");    // JFIF requires
+   s->img_n = c;
+   for (i=0; i < c; ++i) {
+      z->img_comp[i].data = NULL;
+      z->img_comp[i].linebuf = NULL;
+   }
+
+   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
+
+   z->rgb = 0;
+   for (i=0; i < s->img_n; ++i) {
+      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
+      z->img_comp[i].id = stbi__get8(s);
+      if (z->img_comp[i].id != i+1)   // JFIF requires
+         if (z->img_comp[i].id != i) {  // some version of jpegtran outputs non-JFIF-compliant files!
+            // some writers use the letters 'R','G','B' as ids (see http://fileformats.archiveteam.org/wiki/JPEG#Color_format)
+            if (z->img_comp[i].id != rgb[i])
+               return stbi__err("bad component ID","Corrupt JPEG");
+            ++z->rgb;
+         }
+      q = stbi__get8(s);
+      z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
+      z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
+      z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
+   }
+
+   if (scan != STBI__SCAN_load) return 1;
+
+   if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+
+   for (i=0; i < s->img_n; ++i) {
+      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
+      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
+   }
+
+   // compute interleaved mcu info
+   z->img_h_max = h_max;
+   z->img_v_max = v_max;
+   z->img_mcu_w = h_max * 8;
+   z->img_mcu_h = v_max * 8;
+   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
+   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
+
+   for (i=0; i < s->img_n; ++i) {
+      int oom = 0;
+
+      // number of effective pixels (e.g. for non-interleaved MCU)
+      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
+      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
+      // to simplify generation, we'll allocate enough memory to decode
+      // the bogus oversized data from using interleaved MCUs and their
+      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
+      // discard the extra data until colorspace conversion
+      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
+      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
+      z->img_comp[i].linebuf = NULL;
+      z->img_comp[i].coeff = 0;
+      z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].raw_data = stbi__malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
+
+      if (z->img_comp[i].raw_data == NULL) {
+         oom = 1;
+      } else {
+         // align blocks for idct using mmx/sse
+         z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
+         if (z->progressive) {
+            z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
+            z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
+            z->img_comp[i].raw_coeff = STBI_MALLOC(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
+            // a NULL coefficient plane must not reach the decoder: it would
+            // be written through while the scans are parsed
+            if (z->img_comp[i].raw_coeff == NULL)
+               oom = 1;
+            else
+               z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
+         }
+      }
+
+      if (oom) {
+         // undo this component's partial allocation and all earlier ones, and
+         // leave no stale aligned pointers behind
+         for (; i >= 0; --i) {
+            if (z->img_comp[i].raw_coeff) STBI_FREE(z->img_comp[i].raw_coeff);
+            z->img_comp[i].raw_coeff = 0;
+            z->img_comp[i].coeff = 0;
+            if (z->img_comp[i].raw_data) STBI_FREE(z->img_comp[i].raw_data);
+            z->img_comp[i].raw_data = NULL;
+            z->img_comp[i].data = NULL;
+         }
+         return stbi__err("outofmem", "Out of memory");
+      }
+   }
+
+   return 1;
+}
+
+/* Predicates that classify a marker code as returned by stbi__get_marker().
+ * They use comparisons since in some cases we handle more than one case
+ * (e.g. SOF).
+ */
+#define stbi__DNL(x)         ((x) == 0xdc)
+#define stbi__SOI(x)         ((x) == 0xd8) // must be the first marker (see stbi__decode_jpeg_header)
+#define stbi__EOI(x)         ((x) == 0xd9) // ends the marker loop in stbi__decode_jpeg_image
+#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) // any frame header accepted here
+#define stbi__SOS(x)         ((x) == 0xda) // followed by a scan header and entropy-coded data
+
+#define stbi__SOF_progressive(x)   ((x) == 0xc2) // the SOF value that sets z->progressive
+
+// Read a JPEG stream up to and including its frame header.
+//   scan == STBI__SCAN_type : only verify that the stream starts with SOI
+//   otherwise               : process every marker up to the first SOF, then
+//                             hand the SOF to stbi__process_frame_header()
+// Returns 1 on success; failures return 0 or the value of stbi__err().
+static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
+{
+   int marker;
+
+   z->marker = STBI__MARKER_none; // initialize cached marker to empty
+   marker = stbi__get_marker(z);
+   if (!stbi__SOI(marker)) return stbi__err("no SOI","Corrupt JPEG");
+   if (scan == STBI__SCAN_type) return 1;
+
+   for (marker = stbi__get_marker(z); !stbi__SOF(marker); ) {
+      if (!stbi__process_marker(z,marker)) return 0;
+      // some files have extra padding after their blocks, so keep asking for
+      // a marker until one turns up or the input runs out
+      for (;;) {
+         marker = stbi__get_marker(z);
+         if (marker != STBI__MARKER_none) break;
+         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
+      }
+   }
+
+   z->progressive = stbi__SOF_progressive(marker);
+   return stbi__process_frame_header(z, scan) ? 1 : 0;
+}
+
+// Decode the whole image into per-component planes (still YCbCr, no
+// resampling): parse the header, then handle markers until EOI, decoding the
+// entropy-coded data after every SOS. Returns 1 on success, 0 on failure.
+static int stbi__decode_jpeg_image(stbi__jpeg *j)
+{
+   int m, c;
+
+   // start with no buffers so that cleanup after an early failure is safe
+   for (c = 0; c < 4; c++) {
+      j->img_comp[c].raw_data = NULL;
+      j->img_comp[c].raw_coeff = NULL;
+   }
+   j->restart_interval = 0;
+   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
+
+   for (m = stbi__get_marker(j); !stbi__EOI(m); m = stbi__get_marker(j)) {
+      if (!stbi__SOS(m)) {
+         if (!stbi__process_marker(j, m)) return 0;
+         continue;
+      }
+
+      if (!stbi__process_scan_header(j)) return 0;
+      if (!stbi__parse_entropy_coded_data(j)) return 0;
+      if (j->marker != STBI__MARKER_none) continue;
+
+      // handle 0s at the end of image data from IP Kamera 9060
+      while (!stbi__at_eof(j->s)) {
+         int b = stbi__get8(j->s);
+         if (b == 255) {
+            j->marker = stbi__get8(j->s);
+            break;
+         }
+         if (b != 0)
+            return stbi__err("junk before marker", "Corrupt JPEG");
+      }
+      // if we reach eof without hitting a marker, the stbi__get_marker() in the
+      // loop header will fail and we'll eventually return 0
+   }
+
+   if (j->progressive)
+      stbi__jpeg_finish(j);
+   return 1;
+}
+
+/* static jfif-centered resampling (across block boundaries)
+ *
+ * resample_row_func is the common signature of the row upsamplers below:
+ * `out` is a caller-provided scratch row, `in0`/`in1` are two source rows
+ * (called in_near/in_far by the implementations), `w` is the number of source
+ * samples and `hs` the horizontal expansion factor. The returned pointer is
+ * the row to read from; it is not necessarily `out`.
+ */
+
+typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1,
+                                    int w, int hs);
+
+#define stbi__div4(x) ((stbi_uc) ((x) >> 2)) // x/4 as a byte; callers add 2 first to round
+
+// "Resampler" for components that need no expansion in either direction:
+// the near source row is handed back untouched and nothing is written to out.
+static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // every argument except in_near is deliberately ignored
+   STBI_NOTUSED(hs);
+   STBI_NOTUSED(w);
+   STBI_NOTUSED(in_far);
+   STBI_NOTUSED(out);
+   return in_near;
+}
+
+// Vertical 2x upsampling of one row: each output sample is the rounded
+// blend (3*near + far) / 4 of the two source rows. Writes w samples to out
+// and returns out.
+static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   int x = 0;
+   STBI_NOTUSED(hs);
+   while (x < w) {
+      int blended = 3*in_near[x] + in_far[x] + 2;
+      out[x] = stbi__div4(blended);
+      ++x;
+   }
+   return out;
+}
+
+// Horizontal 2x upsampling of one row: every source sample yields two output
+// samples, each a rounded (3*self + neighbour) / 4 blend; the two outermost
+// outputs copy the edge samples. Writes 2*w samples to out and returns out.
+static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   int k;
+   stbi_uc *src = in_near;
+
+   STBI_NOTUSED(in_far);
+   STBI_NOTUSED(hs);
+
+   if (w == 1) {
+      // a single sample has no neighbour to interpolate with
+      out[1] = src[0];
+      out[0] = out[1];
+      return out;
+   }
+
+   // left edge
+   out[0] = src[0];
+   out[1] = stbi__div4(src[0]*3 + src[1] + 2);
+
+   // interior samples blend towards both neighbours
+   k = 1;
+   while (k < w-1) {
+      int center = 3*src[k]+2;
+      out[2*k+0] = stbi__div4(center+src[k-1]);
+      out[2*k+1] = stbi__div4(center+src[k+1]);
+      ++k;
+   }
+
+   // right edge (k is the index of the last sample here)
+   out[2*k+0] = stbi__div4(src[w-2]*3 + src[w-1] + 2);
+   out[2*k+1] = src[w-1];
+
+   return out;
+}
+
+#define stbi__div16(x) ((stbi_uc) ((x) >> 4))
+
+// 2x upsampling in both directions at once: the two source rows are first
+// blended vertically as 3*near + far, then neighbouring blended values are
+// mixed 3:1 horizontally. Writes 2*w samples to out and returns out.
+static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   int x, prev, cur;
+
+   STBI_NOTUSED(hs);
+
+   // vertical blend of the first column
+   cur = 3*in_near[0] + in_far[0];
+
+   if (w == 1) {
+      stbi_uc only = stbi__div4(cur + 2);
+      out[0] = only;
+      out[1] = only;
+      return out;
+   }
+
+   out[0] = stbi__div4(cur+2);
+   for (x=1; x < w; ++x) {
+      prev = cur;
+      cur = 3*in_near[x]+in_far[x];
+      out[2*x-1] = stbi__div16(3*prev + cur + 8);
+      out[2*x  ] = stbi__div16(3*cur + prev + 8);
+   }
+   out[2*w-1] = stbi__div4(cur+2);
+
+   return out;
+}
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   /* need to generate 2x2 samples for every one in input
+    *
+    * SIMD variant of stbi__resample_row_hv_2: groups of 8 input samples are
+    * filtered with SSE2 or NEON, and the remaining samples (always at least
+    * the last one) are handled by the scalar code after the loop.
+    */
+   int i=0,t0,t1;
+
+   if (w == 1) {
+      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
+      return out;
+   }
+
+   t1 = 3*in_near[0] + in_far[0];
+   // process groups of 8 pixels for as long as we can.
+   // note we can't handle the last pixel in a row in this loop
+   // because we need to handle the filter boundary conditions.
+   for (; i < ((w-1) & ~7); i += 8) { // bound keeps in_near[i+8]/in_far[i+8] inside the row
+#if defined(STBI_SSE2)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      __m128i zero  = _mm_setzero_si128();
+      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
+      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
+      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
+      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
+      __m128i diff  = _mm_sub_epi16(farw, nearw);
+      __m128i nears = _mm_slli_epi16(nearw, 2);
+      __m128i curr  = _mm_add_epi16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      __m128i prv0 = _mm_slli_si128(curr, 2);
+      __m128i nxt0 = _mm_srli_si128(curr, 2);
+      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
+      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      __m128i bias  = _mm_set1_epi16(8);
+      __m128i curs = _mm_slli_epi16(curr, 2);
+      __m128i prvd = _mm_sub_epi16(prev, curr);
+      __m128i nxtd = _mm_sub_epi16(next, curr);
+      __m128i curb = _mm_add_epi16(curs, bias);
+      __m128i even = _mm_add_epi16(prvd, curb);
+      __m128i odd  = _mm_add_epi16(nxtd, curb);
+
+      // interleave even and odd pixels, then undo scaling.
+      __m128i int0 = _mm_unpacklo_epi16(even, odd);
+      __m128i int1 = _mm_unpackhi_epi16(even, odd);
+      __m128i de0  = _mm_srli_epi16(int0, 4);
+      __m128i de1  = _mm_srli_epi16(int1, 4);
+
+      // pack and write output
+      __m128i outv = _mm_packus_epi16(de0, de1);
+      _mm_storeu_si128((__m128i *) (out + i*2), outv); // 16 output bytes for 8 input samples
+#elif defined(STBI_NEON)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      uint8x8_t farb  = vld1_u8(in_far + i);
+      uint8x8_t nearb = vld1_u8(in_near + i);
+      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
+      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
+      int16x8_t curr  = vaddq_s16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      int16x8_t prv0 = vextq_s16(curr, curr, 7);
+      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
+      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
+      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      int16x8_t curs = vshlq_n_s16(curr, 2);
+      int16x8_t prvd = vsubq_s16(prev, curr);
+      int16x8_t nxtd = vsubq_s16(next, curr);
+      int16x8_t even = vaddq_s16(curs, prvd);
+      int16x8_t odd  = vaddq_s16(curs, nxtd);
+
+      // undo scaling and round, then store with even/odd phases interleaved
+      uint8x8x2_t o;
+      o.val[0] = vqrshrun_n_s16(even, 4);
+      o.val[1] = vqrshrun_n_s16(odd,  4);
+      vst2_u8(out + i*2, o);
+#endif
+
+      // "previous" value for next iter
+      t1 = 3*in_near[i+7] + in_far[i+7];
+   }
+
+   t0 = t1; // blended value of sample i-1 (of sample 0 itself when the loop above never ran)
+   t1 = 3*in_near[i] + in_far[i];
+   out[i*2] = stbi__div16(3*t1 + t0 + 8); // out[i*2-1] was already produced by the SIMD loop; for i == 0 this equals stbi__div4(t1+2)
+
+   for (++i; i < w; ++i) { // scalar tail, same filter as stbi__resample_row_hv_2
+      t0 = t1;
+      t1 = 3*in_near[i]+in_far[i];
+      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
+      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
+   }
+   out[w*2-1] = stbi__div4(t1+2);
+
+   STBI_NOTUSED(hs);
+
+   return out;
+}
+#endif
+
+// Fallback upsampler for expansion factors without a dedicated routine:
+// nearest-neighbour, i.e. each of the w source samples is repeated hs times.
+// Writes w*hs samples to out and returns out.
+static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   int x, rep;
+   stbi_uc *dst = out;
+   STBI_NOTUSED(in_far);
+   for (x=0; x < w; ++x) {
+      for (rep=0; rep < hs; ++rep)
+         *dst++ = in_near[x];
+   }
+   return out;
+}
+
+#ifdef STBI_JPEG_OLD
+// Legacy conversion selected by STBI_JPEG_OLD: the same YCbCr-to-RGB
+// calculation that stb_image used historically before the algorithm
+// changes in 1.49.
+#define float2fixed(x)  ((int) ((x) * 65536 + 0.5))
+static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
+{
+   int px;
+   for (px=0; px < count; ++px, out += step) {
+      int luma = (y[px] << 16) + 32768; // rounding
+      int cr = pcr[px] - 128;
+      int cb = pcb[px] - 128;
+      int r = luma + cr*float2fixed(1.40200f);
+      int g = luma - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f);
+      int b = luma                            + cb*float2fixed(1.77200f);
+      r >>= 16;
+      g >>= 16;
+      b >>= 16;
+      // clamp each channel to 0..255
+      if (r < 0) r = 0; else if (r > 255) r = 255;
+      if (g < 0) g = 0; else if (g > 255) g = 255;
+      if (b < 0) b = 0; else if (b > 255) b = 255;
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255;
+   }
+}
+#else
+// Default conversion: a reduced-precision YCbCr-to-RGB calculation, introduced
+// so that the scalar code produces the same results as the SIMD code.
+#define float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
+{
+   int px;
+   for (px=0; px < count; ++px, out += step) {
+      int luma = (y[px] << 20) + (1<<19); // rounding
+      int cr = pcr[px] - 128;
+      int cb = pcb[px] - 128;
+      // each sum is stored in an int before it is shifted, exactly as before
+      int r = luma +  cr* float2fixed(1.40200f);
+      int g = luma + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
+      int b = luma                               +   cb* float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      // clamp each channel to 0..255
+      if (r < 0) r = 0; else if (r > 255) r = 255;
+      if (g < 0) g = 0; else if (g > 255) g = 255;
+      if (b < 0) b = 0; else if (b > 255) b = 255;
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255;
+   }
+}
+#endif
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
+{
+   int i = 0; // index of the next pixel to convert; shared by the SIMD and scalar loops
+   /* Converts `count` pixels from the y/pcb/pcr rows into `out`, advancing
+    * `step` bytes per pixel. The SIMD loops below run only when step == 4 and
+    * convert 8 pixels per iteration; the scalar loop at the end converts
+    * whatever is left (every pixel when step != 4).
+    */
+#ifdef STBI_SSE2
+   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
+   // it's useful in practice (you wouldn't use it for textures, for example).
+   // so just accelerate step == 4 case.
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      __m128i signflip  = _mm_set1_epi8(-0x80);
+      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
+      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
+      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
+      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
+      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
+      __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+      for (; i+7 < count; i += 8) {
+         // load
+         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
+         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
+         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
+         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
+
+         // unpack to short (and left-shift cr, cb by 8)
+         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
+         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
+
+         // color transform
+         __m128i yws = _mm_srli_epi16(yw, 4);
+         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
+         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
+         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
+         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
+         __m128i rws = _mm_add_epi16(cr0, yws);
+         __m128i gwt = _mm_add_epi16(cb0, yws);
+         __m128i bws = _mm_add_epi16(yws, cb1);
+         __m128i gws = _mm_add_epi16(gwt, cr1);
+
+         // descale
+         __m128i rw = _mm_srai_epi16(rws, 4);
+         __m128i bw = _mm_srai_epi16(bws, 4);
+         __m128i gw = _mm_srai_epi16(gws, 4);
+
+         // back to byte, set up for transpose
+         __m128i brb = _mm_packus_epi16(rw, bw);
+         __m128i gxb = _mm_packus_epi16(gw, xw);
+
+         // transpose to interleave channels
+         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
+         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
+         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
+         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
+
+         // store
+         _mm_storeu_si128((__m128i *) (out + 0), o0);
+         _mm_storeu_si128((__m128i *) (out + 16), o1);
+         out += 32; // 8 pixels * 4 bytes
+      }
+   }
+#endif
+
+#ifdef STBI_NEON
+   // in this version, step=3 support would be easy to add. but is there demand?
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      uint8x8_t signflip = vdup_n_u8(0x80);
+      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
+      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
+      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
+      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
+
+      for (; i+7 < count; i += 8) {
+         // load
+         uint8x8_t y_bytes  = vld1_u8(y + i);
+         uint8x8_t cr_bytes = vld1_u8(pcr + i);
+         uint8x8_t cb_bytes = vld1_u8(pcb + i);
+         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+         // expand to s16
+         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+         int16x8_t crw = vshll_n_s8(cr_biased, 7);
+         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+         // color transform
+         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+         int16x8_t rws = vaddq_s16(yws, cr0);
+         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+         int16x8_t bws = vaddq_s16(yws, cb1);
+
+         // undo scaling, round, convert to byte
+         uint8x8x4_t o;
+         o.val[0] = vqrshrun_n_s16(rws, 4);
+         o.val[1] = vqrshrun_n_s16(gws, 4);
+         o.val[2] = vqrshrun_n_s16(bws, 4);
+         o.val[3] = vdup_n_u8(255);
+
+         // store, interleaving r/g/b/a
+         vst4_u8(out, o);
+         out += 8*4;
+      }
+   }
+#endif
+
+   for (; i < count; ++i) { // scalar remainder; same arithmetic as the non-STBI_JPEG_OLD stbi__YCbCr_to_RGB_row
+      int y_fixed = (y[i] << 20) + (1<<19); // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128;
+      int cb = pcb[i] - 128;
+      r = y_fixed + cr* float2fixed(1.40200f);
+      g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                             +   cb* float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } // clamp to 0..255
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255; // stored for every step value, so with step == 3 it lands on the next pixel's first byte
+      out += step;
+   }
+}
+#endif
+
+// Install the function pointers used for the IDCT, the YCbCr-to-RGB conversion
+// and 2x2 upsampling: portable C versions first, then SIMD overrides where
+// they were compiled in (and, for SSE2, reported available at run time).
+static void stbi__setup_jpeg(stbi__jpeg *j)
+{
+   // portable defaults
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
+   j->idct_block_kernel = stbi__idct_block;
+
+#ifdef STBI_SSE2
+   if (stbi__sse2_available()) {
+      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+      #ifndef STBI_JPEG_OLD
+      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+      #endif
+      j->idct_block_kernel = stbi__idct_simd;
+   }
+#endif
+
+#ifdef STBI_NEON
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+   #ifndef STBI_JPEG_OLD
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+   #endif
+   j->idct_block_kernel = stbi__idct_simd;
+#endif
+}
+
+// Release the temporary per-component buffers (sample plane, coefficient
+// plane and line buffer) of the first img_n components and clear the pointers
+// that referred to them, so the function can safely be called again.
+static void stbi__cleanup_jpeg(stbi__jpeg *j)
+{
+   int c = 0;
+   while (c < j->s->img_n) {
+      if (j->img_comp[c].raw_data != NULL) {
+         STBI_FREE(j->img_comp[c].raw_data);
+         j->img_comp[c].data = NULL;
+         j->img_comp[c].raw_data = NULL;
+      }
+      if (j->img_comp[c].raw_coeff != NULL) {
+         STBI_FREE(j->img_comp[c].raw_coeff);
+         j->img_comp[c].coeff = 0;
+         j->img_comp[c].raw_coeff = 0;
+      }
+      if (j->img_comp[c].linebuf != NULL) {
+         STBI_FREE(j->img_comp[c].linebuf);
+         j->img_comp[c].linebuf = NULL;
+      }
+      ++c;
+   }
+}
+
+typedef struct
+{
+   resample_row_func resample; // row upsampler picked from this component's hs/vs
+   stbi_uc *line0,*line1; // the two source rows being blended; both start at the component's first row
+   int hs,vs;   // expansion factor in each axis
+   int w_lores; // horizontal pixels pre-expansion
+   int ystep;   // how far through vertical expansion we are
+   int ypos;    // which pre-expansion row we're on
+} stbi__resample; // per-component upsampling state used by load_jpeg_image
+
+// Decode the JPEG attached to z->s and return it as interleaved 8-bit pixels.
+//
+//   out_x, out_y - receive the image width and height (must not be NULL)
+//   comp         - if non-NULL, receives the number of components in the file
+//                  (not the number of components returned)
+//   req_comp     - 0 to keep the file's component count, or 1..4 to force it
+//
+// Returns a buffer obtained from stbi__malloc() holding img_y rows of
+// img_x * n bytes (n being the effective component count), or NULL / the
+// value of stbi__errpuc() on failure. The per-component scratch buffers are
+// released before returning once decoding has been attempted.
+static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
+{
+   int n, decode_n;
+   z->s->img_n = 0; // make stbi__cleanup_jpeg safe
+
+   // validate req_comp
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+
+   // load a jpeg image from whichever source, but leave in YCbCr format
+   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
+
+   // determine actual number of components to generate
+   n = req_comp ? req_comp : z->s->img_n;
+
+   if (z->s->img_n == 3 && n < 3)
+      decode_n = 1;
+   else
+      decode_n = z->s->img_n;
+
+   // n may exceed img_n (e.g. a grayscale file expanded to 4 components), so
+   // the size limit applied while parsing the frame header does not bound
+   // n * img_x * img_y. Refuse the image if the 32-bit unsigned arithmetic
+   // used for the output size and row offsets below would wrap around.
+   if ((0xffffffffu - 1) / (unsigned int) n / z->s->img_x < z->s->img_y) {
+      stbi__cleanup_jpeg(z);
+      return stbi__errpuc("too large", "Image too large to decode");
+   }
+
+   // resample and color-convert
+   {
+      int k;
+      unsigned int i,j;
+      stbi_uc *output;
+      stbi_uc *coutput[4];
+
+      stbi__resample res_comp[4];
+
+      for (k=0; k < decode_n; ++k) {
+         stbi__resample *r = &res_comp[k];
+
+         // allocate line buffer big enough for upsampling off the edges
+         // with upsample factor of 4
+         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
+         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+         r->hs      = z->img_h_max / z->img_comp[k].h;
+         r->vs      = z->img_v_max / z->img_comp[k].v;
+         r->ystep   = r->vs >> 1;
+         r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
+         r->ypos    = 0;
+         r->line0   = r->line1 = z->img_comp[k].data;
+
+         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
+         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
+         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
+         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
+         else                               r->resample = stbi__resample_row_generic;
+      }
+
+      // one spare byte is allocated because the conversion code below stores
+      // out[3] for every pixel, even when n == 3
+      output = (stbi_uc *) stbi__malloc(n * z->s->img_x * z->s->img_y + 1);
+      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+      // now go ahead and resample
+      for (j=0; j < z->s->img_y; ++j) {
+         stbi_uc *out = output + n * z->s->img_x * j;
+         for (k=0; k < decode_n; ++k) {
+            stbi__resample *r = &res_comp[k];
+            // in the second half of a vertical step the lower source row is
+            // the nearer one
+            int y_bot = r->ystep >= (r->vs >> 1);
+            coutput[k] = r->resample(z->img_comp[k].linebuf,
+                                     y_bot ? r->line1 : r->line0,
+                                     y_bot ? r->line0 : r->line1,
+                                     r->w_lores, r->hs);
+            if (++r->ystep >= r->vs) {
+               r->ystep = 0;
+               r->line0 = r->line1;
+               // stay on the last source row once the component runs out of rows
+               if (++r->ypos < z->img_comp[k].y)
+                  r->line1 += z->img_comp[k].w2;
+            }
+         }
+         if (n >= 3) {
+            stbi_uc *y = coutput[0];
+            if (z->s->img_n == 3) {
+               if (z->rgb == 3) {
+                  // all three component ids were 'R','G','B': copy without conversion
+                  for (i=0; i < z->s->img_x; ++i) {
+                     out[0] = y[i];
+                     out[1] = coutput[1][i];
+                     out[2] = coutput[2][i];
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else {
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = out[1] = out[2] = y[i];
+                  out[3] = 255; // not used if n==3
+                  out += n;
+               }
+         } else {
+            stbi_uc *y = coutput[0];
+            if (n == 1)
+               for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+            else
+               for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
+         }
+      }
+      stbi__cleanup_jpeg(z);
+      *out_x = z->s->img_x;
+      *out_y = z->s->img_y;
+      if (comp) *comp  = z->s->img_n; // report original components, not output
+      return output;
+   }
+}
+
+// Decode the JPEG available through `s`.
+//   x, y, comp - forwarded to load_jpeg_image(), which stores the image width,
+//                height and the file's component count through them
+//   req_comp   - requested number of output components (0 = as in the file)
+// Returns the pixel buffer produced by load_jpeg_image(), or the value of
+// stbi__errpuc() if the decoder state itself cannot be allocated.
+static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char* result;
+   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   // the decoder state is heap-allocated; do not touch it if that failed
+   if (j == NULL) return stbi__errpuc("outofmem", "Out of memory");
+   j->s = s;
+   stbi__setup_jpeg(j);
+   result = load_jpeg_image(j, x,y,comp,req_comp);
+   STBI_FREE(j);
+   return result;
+}
+
+// Report whether `s` looks like a JPEG: only the leading SOI marker is
+// examined (STBI__SCAN_type) and the stream is rewound afterwards.
+static int stbi__jpeg_test(stbi__context *s)
+{
+   stbi__jpeg probe;
+   int has_soi;
+
+   probe.s = s;
+   stbi__setup_jpeg(&probe);
+   has_soi = stbi__decode_jpeg_header(&probe, STBI__SCAN_type);
+   stbi__rewind(s);
+   return has_soi;
+}
+
+// Parse just the header of the JPEG behind j->s and report its dimensions and
+// component count through whichever of x, y and comp are non-NULL.
+// Returns 1 on success; on failure the stream is rewound and 0 is returned.
+static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
+{
+   if (stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
+      if (x != NULL)    *x = j->s->img_x;
+      if (y != NULL)    *y = j->s->img_y;
+      if (comp != NULL) *comp = j->s->img_n;
+      return 1;
+   }
+
+   stbi__rewind( j->s );
+   return 0;
+}
+
+// Report the width, height and component count of the JPEG available through
+// `s` without decoding pixel data; x, y and comp may each be NULL.
+// Returns the result of stbi__jpeg_info_raw(), or the value of stbi__err() if
+// the decoder state cannot be allocated.
+static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int result;
+   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   // nothing has been read from the stream yet, so no rewind is needed here
+   if (j == NULL) return stbi__err("outofmem", "Out of memory");
+   j->s = s;
+   result = stbi__jpeg_info_raw(j, x, y, comp);
+   STBI_FREE(j);
+   return result;
+}
+#endif
+
+// public domain zlib decode    v0.2  Sean Barrett 2006-11-18
+//    simple implementation
+//      - all input must be provided in an upfront buffer
+//      - all output is written to a single output buffer (can malloc/realloc)
+//    performance
+//      - fast huffman
+
+#ifndef STBI_NO_ZLIB
+
+// fast-way is faster to check than jpeg huffman, but slow way is slower
+#define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
+#define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1) // low STBI__ZFAST_BITS bits set
+
+// zlib-style huffman encoding
+// (jpegs packs from left, zlib from right, so can't share code)
+typedef struct
+{
+   stbi__uint16 fast[1 << STBI__ZFAST_BITS]; // indexed by bit-reversed code bits; entry is (length << 9) | symbol, 0 when unset
+   stbi__uint16 firstcode[16]; // first code of each length
+   int maxcode[17]; // per length: one past its last code, shifted left by 16-length; [16] is a 0x10000 sentinel
+   stbi__uint16 firstsymbol[16]; // index into size[]/value[] of the first symbol of each length
+   stbi_uc  size[288]; // code length of each symbol, in the order set up by stbi__zbuild_huffman
+   stbi__uint16 value[288]; // symbol stored at the same index as size[]
+} stbi__zhuffman;
+
+// Reverse the order of the low 16 bits of n by swapping progressively larger
+// groups: single bits, bit pairs, nibbles and finally the two bytes.
+stbi_inline static int stbi__bitreverse16(int n)
+{
+  int bits_swapped    = ((n & 0xAAAA) >>  1) | ((n & 0x5555) << 1);
+  int pairs_swapped   = ((bits_swapped & 0xCCCC) >>  2) | ((bits_swapped & 0x3333) << 2);
+  int nibbles_swapped = ((pairs_swapped & 0xF0F0) >>  4) | ((pairs_swapped & 0x0F0F) << 4);
+  return ((nibbles_swapped & 0xFF00) >>  8) | ((nibbles_swapped & 0x00FF) << 8);
+}
+
+// Reverse the low `bits` bits of v (bits must not exceed 16).
+stbi_inline static int stbi__bit_reverse(int v, int bits)
+{
+   int reversed16;
+   STBI_ASSERT(bits <= 16);
+   // reverse all 16 bits, then shift away the ones that are not wanted
+   // (e.g. for 11 bits, reverse and drop the low 5)
+   reversed16 = stbi__bitreverse16(v);
+   return reversed16 >> (16-bits);
+}
+
+static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num)
+{
+   int i,k=0; // k: number of symbols placed so far
+   int code, next_code[16], sizes[17];
+
+   /* DEFLATE spec for generating codes.
+    *
+    * Fills *z from `num` code lengths (sizelist[i] is the length of symbol i,
+    * 0 meaning the symbol is unused). Returns 1 on success; inconsistent
+    * lengths return the value of stbi__err().
+    */
+   memset(sizes, 0, sizeof(sizes));
+   memset(z->fast, 0, sizeof(z->fast));
+   for (i=0; i < num; ++i)
+      ++sizes[sizelist[i]]; // histogram of code lengths
+   sizes[0] = 0; // unused symbols take no part in code assignment
+   for (i=1; i < 16; ++i)
+      if (sizes[i] > (1 << i)) // more symbols than i bits can distinguish
+         return stbi__err("bad sizes", "Corrupt PNG");
+   code = 0;
+   for (i=1; i < 16; ++i) {
+      next_code[i] = code; // next code to hand out for length i
+      z->firstcode[i] = (stbi__uint16) code;
+      z->firstsymbol[i] = (stbi__uint16) k;
+      code = (code + sizes[i]);
+      if (sizes[i])
+         if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG");
+      z->maxcode[i] = code << (16-i); // preshift for inner loop
+      code <<= 1;
+      k += sizes[i];
+   }
+   z->maxcode[16] = 0x10000; // sentinel
+   for (i=0; i < num; ++i) {
+      int s = sizelist[i];
+      if (s) {
+         int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; // slot of this symbol in size[]/value[]
+         stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i); // length above bit 9, symbol in the low 9 bits
+         z->size [c] = (stbi_uc     ) s;
+         z->value[c] = (stbi__uint16) i;
+         if (s <= STBI__ZFAST_BITS) {
+            int j = stbi__bit_reverse(next_code[s],s); // the fast table is indexed by the bit-reversed code
+            while (j < (1 << STBI__ZFAST_BITS)) { // fill every index whose low s bits equal that code
+               z->fast[j] = fastv;
+               j += (1 << s);
+            }
+         }
+         ++next_code[s];
+      }
+   }
+   return 1;
+}
+
+// zlib-from-memory implementation for PNG reading
+//    because PNG allows splitting the zlib stream arbitrarily,
+//    and it's annoying structurally to have PNG call ZLIB call PNG,
+//    we require PNG read all the IDATs and combine them into a single
+//    memory buffer
+
+// Complete state of one in-memory inflate run.
+typedef struct
+{
+   stbi_uc *zbuffer, *zbuffer_end; // next unread input byte, and one past the last input byte
+   int num_bits;                   // number of valid bits currently held in code_buffer
+   stbi__uint32 code_buffer;       // bit accumulator; the next bit to consume is the lowest one
+
+   char *zout;                     // next output byte to write
+   char *zout_start;               // start of the output buffer
+   char *zout_end;                 // one past the end of the output buffer
+   int   z_expandable;             // nonzero if stbi__zexpand may reallocate the output buffer
+
+   stbi__zhuffman z_length, z_distance; // tables for the literal/length and the distance codes
+} stbi__zbuf;
+
+// Fetch the next input byte; once the input is exhausted this keeps
+// returning 0 instead of reading past the end.
+stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
+{
+   if (z->zbuffer < z->zbuffer_end) {
+      stbi_uc byte = *z->zbuffer;
+      ++z->zbuffer;
+      return byte;
+   }
+   return 0;
+}
+
+// Top up the bit accumulator one input byte at a time until it holds
+// more than 24 bits.
+static void stbi__fill_bits(stbi__zbuf *z)
+{
+   for (;;) {
+      unsigned int incoming;
+      STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
+      incoming = (unsigned int) stbi__zget8(z);
+      z->code_buffer |= incoming << z->num_bits; // new bits go above the ones already held
+      z->num_bits += 8;
+      if (z->num_bits > 24) break;
+   }
+}
+
+// Consume n bits from the stream and return them as an unsigned value
+// (the first bit read is the least significant).
+stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
+{
+   unsigned int value;
+   if (z->num_bits < n)
+      stbi__fill_bits(z);
+   value = z->code_buffer & ((1 << n) - 1);
+   z->num_bits -= n;
+   z->code_buffer >>= n;
+   return value;
+}
+
+// Decode one symbol whose code is too long for the fast table.
+// Returns the symbol, or -1 if the bits in the buffer do not form a valid
+// code (corrupt stream). The consumed bits are removed from the bit buffer.
+static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int b,s,k;
+   // not resolved by fast table, so compute it the slow way
+   // use jpeg approach, which requires MSbits at top
+   k = stbi__bit_reverse(a->code_buffer, 16);
+   // maxcode[16] is a sentinel larger than any 16-bit k, so this terminates
+   for (s=STBI__ZFAST_BITS+1; ; ++s)
+      if (k < z->maxcode[s])
+         break;
+   if (s >= 16) return -1; // invalid code!
+   // code size is s, so:
+   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
+   // corrupt data can yield a slot outside the table or one that belongs to
+   // a different code length; report failure instead of only asserting
+   if (b < 0 || b >= (int) sizeof(z->size)) return -1;
+   if (z->size[b] != s) return -1;
+   a->code_buffer >>= s;
+   a->num_bits -= s;
+   return z->value[b];
+}
+
+// Decode the next symbol using table z: try the fast lookup first and fall
+// back to the slow path for codes it does not cover.
+stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int entry, nbits;
+   if (a->num_bits < 16)
+      stbi__fill_bits(a);
+   entry = z->fast[a->code_buffer & STBI__ZFAST_MASK];
+   if (!entry)
+      return stbi__zhuffman_decode_slowpath(a, z);
+   // entry packs the code length above bit 9 and the symbol in the low 9 bits
+   nbits = entry >> 9;
+   a->code_buffer >>= nbits;
+   a->num_bits -= nbits;
+   return entry & 511;
+}
+
+// Grow the output buffer so that at least n more bytes fit after zout.
+//   zout is the caller's current write position; it is stored into z->zout
+//   first, and z->zout is updated again if the buffer moves.
+// Returns 1 on success. Returns 0 (through stbi__err) if the buffer is not
+// expandable, the required size does not fit in an int, or realloc fails.
+static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
+{
+   char *q;
+   // sizes are kept unsigned so the doubling below cannot overflow a signed int
+   unsigned int cur, limit, old_limit;
+   const unsigned int max_size = ~0u >> 1; // largest size still representable as int
+   z->zout = zout;
+   if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
+   cur   = (unsigned int) (z->zout     - z->zout_start);
+   limit = old_limit = (unsigned int) (z->zout_end - z->zout_start);
+   if (n < 0 || cur > max_size || (unsigned int) n > max_size - cur)
+      return stbi__err("outofmem", "Out of memory");
+   if (limit == 0) limit = 1; // doubling zero would never make progress
+   while (cur + (unsigned int) n > limit) {
+      if (limit > max_size / 2) return stbi__err("outofmem", "Out of memory");
+      limit *= 2;
+   }
+   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+   STBI_NOTUSED(old_limit);
+   if (q == NULL) return stbi__err("outofmem", "Out of memory");
+   z->zout_start = q;
+   z->zout       = q + cur;
+   z->zout_end   = q + limit;
+   return 1;
+}
+
+// Match-length decoding tables, indexed by (length symbol - 257):
+// the base length and the number of extra bits that follow the symbol.
+// The trailing zero entries pad the tables out to 31 slots.
+static int stbi__zlength_base[31] = {
+   3,4,5,6,7,8,9,10,11,13,
+   15,17,19,23,27,31,35,43,51,59,
+   67,83,99,115,131,163,195,227,258,0,0 };
+
+static int stbi__zlength_extra[31]=
+{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+
+// Match-distance decoding tables, indexed by distance symbol:
+// the base distance and the number of extra bits that follow the symbol.
+// Only 30 symbols carry values; the last two slots are zero.
+static int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
+
+static int stbi__zdist_extra[32] =
+{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+
+// Decode one Huffman-compressed block using the tables already stored in
+// a->z_length / a->z_distance, appending the output at a->zout.
+// Returns 1 when the end-of-block symbol is reached, 0 (through stbi__err
+// or a failed stbi__zexpand) on corrupt data or when the output cannot grow.
+static int stbi__parse_huffman_block(stbi__zbuf *a)
+{
+   char *zout = a->zout; // local copy of the write cursor; synced back on exit/expand
+   for(;;) {
+      int z = stbi__zhuffman_decode(a, &a->z_length);
+      if (z < 256) {
+         // literal byte
+         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
+         if (zout >= a->zout_end) {
+            if (!stbi__zexpand(a, zout, 1)) return 0;
+            zout = a->zout;
+         }
+         *zout++ = (char) z;
+      } else {
+         stbi_uc *p;
+         int len,dist;
+         if (z == 256) {
+            // end of block
+            a->zout = zout;
+            return 1;
+         }
+         // length symbols 286 and 287 are reserved and have no length assigned
+         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG");
+         z -= 257;
+         len = stbi__zlength_base[z];
+         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
+         z = stbi__zhuffman_decode(a, &a->z_distance);
+         // distance symbols 30 and 31 are reserved; their base of 0 would make
+         // the copy below read bytes that were never written
+         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG");
+         dist = stbi__zdist_base[z];
+         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
+         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
+         if (zout + len > a->zout_end) {
+            if (!stbi__zexpand(a, zout, len)) return 0;
+            zout = a->zout;
+         }
+         p = (stbi_uc *) (zout - dist);
+         if (dist == 1) { // run of one byte; common in images.
+            stbi_uc v = *p;
+            if (len) { do *zout++ = v; while (--len); }
+         } else {
+            // byte-by-byte on purpose: source and destination may overlap
+            if (len) { do *zout++ = *p++; while (--len); }
+         }
+      }
+   }
+}
+
+// Read the header of a dynamic-Huffman block and build a->z_length and
+// a->z_distance from the code lengths it describes.
+// Returns 1 on success, 0 (through stbi__err) if the description is invalid.
+static int stbi__compute_huffman_codes(stbi__zbuf *a)
+{
+   // order in which the code-length code lengths are stored in the stream
+   static stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+   stbi__zhuffman z_codelength;
+   stbi_uc lencodes[286+32+137];//padding for maximum single op
+   stbi_uc codelength_sizes[19];
+   int i,n;
+
+   int hlit  = stbi__zreceive(a,5) + 257; // number of literal/length code lengths
+   int hdist = stbi__zreceive(a,5) + 1;   // number of distance code lengths
+   int hclen = stbi__zreceive(a,4) + 4;   // number of code-length code lengths
+   int ntot  = hlit + hdist;
+
+   memset(codelength_sizes, 0, sizeof(codelength_sizes));
+   for (i=0; i < hclen; ++i) {
+      int s = stbi__zreceive(a,3);
+      codelength_sizes[length_dezigzag[i]] = (stbi_uc) s;
+   }
+   if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
+
+   n = 0;
+   while (n < ntot) {
+      int c = stbi__zhuffman_decode(a, &z_codelength);
+      if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
+      if (c < 16)
+         lencodes[n++] = (stbi_uc) c;
+      else {
+         // symbols 16..18 are run-length instructions
+         stbi_uc fill = 0;
+         if (c == 16) {
+            // repeat the previous length; there must be one to repeat
+            c = stbi__zreceive(a,2)+3;
+            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
+            fill = lencodes[n-1];
+         } else if (c == 17) {
+            c = stbi__zreceive(a,3)+3;
+         } else {
+            STBI_ASSERT(c == 18);
+            c = stbi__zreceive(a,7)+11;
+         }
+         // a run may not extend past the declared number of lengths; checking
+         // before the memset also keeps the write inside lencodes
+         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
+         memset(lencodes+n, fill, c);
+         n += c;
+      }
+   }
+   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
+   if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
+   if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
+   return 1;
+}
+
+// Copy one stored (uncompressed) block from the input to the output.
+// Returns 1 on success, 0 on a corrupt header, truncated input, or when
+// the output buffer cannot be grown.
+static int stbi__parse_uncompressed_block(stbi__zbuf *a)
+{
+   stbi_uc header[4];
+   int len, nlen;
+   int filled = 0;
+   int partial = a->num_bits & 7;
+
+   // the block header starts on a byte boundary: drop the leftover bits
+   if (partial)
+      stbi__zreceive(a, partial);
+   // whole bytes already pulled into the bit buffer belong to the header
+   for (; a->num_bits > 0; a->num_bits -= 8) {
+      header[filled++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
+      a->code_buffer >>= 8;
+   }
+   STBI_ASSERT(a->num_bits == 0);
+   // the rest of the header comes straight from the input
+   for (; filled < 4; ++filled)
+      header[filled] = stbi__zget8(a);
+
+   // LEN and its one's complement NLEN, both little-endian
+   len  = header[0] + header[1] * 256;
+   nlen = header[2] + header[3] * 256;
+   if ((len ^ 0xffff) != nlen) return stbi__err("zlib corrupt","Corrupt PNG");
+   if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG");
+   if (a->zout + len > a->zout_end && !stbi__zexpand(a, a->zout, len))
+      return 0;
+   memcpy(a->zout, a->zbuffer, len);
+   a->zout += len;
+   a->zbuffer += len;
+   return 1;
+}
+
+// Read and validate the two-byte zlib stream header.
+// Returns 1 if it is acceptable, 0 (through stbi__err) otherwise.
+static int stbi__parse_zlib_header(stbi__zbuf *a)
+{
+   int cmf = stbi__zget8(a);
+   int flg = stbi__zget8(a);
+   int method = cmf & 15;
+   int check_value = cmf*256 + flg;
+   /* the high nibble of cmf (cinfo) gives the window size, 1 << (8 + cinfo);
+      it is ignored because the output is fully buffered */
+   if (check_value % 31 != 0)
+      return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
+   if (flg & 32)
+      return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
+   if (method != 8)
+      return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
+   return 1;
+}
+
+// @TODO: should statically initialize these for optimal thread safety
+static stbi_uc stbi__zdefault_length[288], stbi__zdefault_distance[32];
+
+// Fill in the code lengths of the fixed Huffman tables:
+// literal/length symbols 0..143 -> 8 bits, 144..255 -> 9, 256..279 -> 7,
+// 280..287 -> 8; every distance symbol -> 5 bits.
+static void stbi__init_zdefaults(void)
+{
+   int sym;
+   for (sym = 0; sym < 288; ++sym) {
+      stbi_uc bits;
+      if      (sym <= 143) bits = 8;
+      else if (sym <= 255) bits = 9;
+      else if (sym <= 279) bits = 7;
+      else                 bits = 8;
+      stbi__zdefault_length[sym] = bits;
+   }
+   // stbi__zdefault_distance[31] is written last
+   for (sym = 0; sym < 32; ++sym)
+      stbi__zdefault_distance[sym] = 5;
+}
+
+// Inflate a complete stream from a's input into a's output.
+//   parse_header: nonzero if the data starts with a zlib header.
+// Returns 1 on success, 0 on any error.
+static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
+{
+   int is_last, type;
+   if (parse_header && !stbi__parse_zlib_header(a))
+      return 0;
+   a->num_bits = 0;
+   a->code_buffer = 0;
+   do {
+      is_last = stbi__zreceive(a,1);
+      type    = stbi__zreceive(a,2);
+      switch (type) {
+         case 0: // stored block
+            if (!stbi__parse_uncompressed_block(a)) return 0;
+            break;
+         case 3: // reserved block type
+            return 0;
+         case 1: // fixed code lengths, built lazily on first use
+            if (!stbi__zdefault_distance[31]) stbi__init_zdefaults();
+            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
+            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
+            if (!stbi__parse_huffman_block(a)) return 0;
+            break;
+         default: // type 2: code lengths are described in the stream
+            if (!stbi__compute_huffman_codes(a)) return 0;
+            if (!stbi__parse_huffman_block(a)) return 0;
+            break;
+      }
+   } while (!is_last);
+   return 1;
+}
+
+// Point a's output at obuf (olen bytes), record whether the buffer may be
+// grown (exp), and run the decoder. The caller must already have set the
+// input pointers. Returns the result of stbi__parse_zlib.
+static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
+{
+   a->z_expandable = exp;
+   a->zout_start   = obuf;
+   a->zout_end     = obuf + olen;
+   a->zout         = a->zout_start;
+   return stbi__parse_zlib(a, parse_header);
+}
+
+// Inflate a zlib stream (with header) into a newly allocated buffer that
+// starts at initial_size bytes and grows as needed.
+// Returns the buffer, or NULL on failure; if outlen is non-NULL it receives
+// the number of decoded bytes.
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
+{
+   stbi__zbuf a;
+   char *p = (char *) stbi__malloc(initial_size);
+   if (p == NULL) return NULL;
+   a.zbuffer     = (stbi_uc *) buffer;
+   a.zbuffer_end = (stbi_uc *) buffer + len;
+   if (!stbi__do_zlib(&a, p, initial_size, 1, 1)) {
+      // the buffer may have been reallocated, so free the decoder's pointer
+      STBI_FREE(a.zout_start);
+      return NULL;
+   }
+   if (outlen)
+      *outlen = (int) (a.zout - a.zout_start);
+   return a.zout_start;
+}
+
+// Inflate a zlib stream into a newly allocated buffer, starting from a
+// 16384-byte size guess. See stbi_zlib_decode_malloc_guesssize.
+STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
+{
+   int first_guess = 16384;
+   return stbi_zlib_decode_malloc_guesssize(buffer, len, first_guess, outlen);
+}
+
+// Same as stbi_zlib_decode_malloc_guesssize, except that parse_header
+// selects whether the input begins with a zlib header (nonzero) or is a
+// bare deflate stream (zero).
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
+{
+   stbi__zbuf a;
+   char *p = (char *) stbi__malloc(initial_size);
+   if (p == NULL) return NULL;
+   a.zbuffer     = (stbi_uc *) buffer;
+   a.zbuffer_end = (stbi_uc *) buffer + len;
+   if (!stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
+      // the buffer may have been reallocated, so free the decoder's pointer
+      STBI_FREE(a.zout_start);
+      return NULL;
+   }
+   if (outlen)
+      *outlen = (int) (a.zout - a.zout_start);
+   return a.zout_start;
+}
+
+// Inflate a zlib stream (with header) into the caller's fixed-size buffer.
+// Returns the number of bytes written, or -1 on failure (including output
+// that does not fit in olen bytes).
+STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
+{
+   stbi__zbuf a;
+   a.zbuffer     = (stbi_uc *) ibuffer;
+   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
+   if (!stbi__do_zlib(&a, obuffer, olen, 0, 1))
+      return -1;
+   return (int) (a.zout - a.zout_start);
+}
+
+// Inflate a bare deflate stream (no zlib header) into a newly allocated
+// buffer. Returns the buffer, or NULL on failure; if outlen is non-NULL it
+// receives the number of decoded bytes.
+STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
+{
+   stbi__zbuf a;
+   char *p = (char *) stbi__malloc(16384);
+   if (p == NULL) return NULL;
+   a.zbuffer     = (stbi_uc *) buffer;
+   a.zbuffer_end = (stbi_uc *) buffer+len;
+   if (!stbi__do_zlib(&a, p, 16384, 1, 0)) {
+      // the buffer may have been reallocated, so free the decoder's pointer
+      STBI_FREE(a.zout_start);
+      return NULL;
+   }
+   if (outlen)
+      *outlen = (int) (a.zout - a.zout_start);
+   return a.zout_start;
+}
+
+// Inflate a bare deflate stream (no zlib header) into the caller's
+// fixed-size buffer. Returns the number of bytes written, or -1 on failure.
+STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
+{
+   stbi__zbuf a;
+   a.zbuffer     = (stbi_uc *) ibuffer;
+   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
+   if (!stbi__do_zlib(&a, obuffer, olen, 0, 0))
+      return -1;
+   return (int) (a.zout - a.zout_start);
+}
+#endif
+
+// public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
+//    simple implementation
+//      - 1/2/4/8/16-bit samples (samples below 8 bits are expanded to 8 bits)
+//      - no CRC checking
+//      - allocates lots of intermediate memory
+//        - avoids problem of streaming data between subsystems
+//        - avoids explicit window management
+//    performance
+//      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
+
+#ifndef STBI_NO_PNG
+// Header of one PNG chunk, as read by stbi__get_chunk_header.
+typedef struct
+{
+   stbi__uint32 length; // first big-endian 32-bit field of the chunk header
+   stbi__uint32 type;   // second big-endian 32-bit field; compared against STBI__PNG_TYPE(...) tags
+} stbi__pngchunk;
+
+// Read the next chunk header from s: a big-endian length followed by a
+// big-endian type tag.
+static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
+{
+   stbi__pngchunk header;
+   // two separate statements keep the reads in stream order
+   stbi__uint32 chunk_length = stbi__get32be(s);
+   stbi__uint32 chunk_type   = stbi__get32be(s);
+   header.length = chunk_length;
+   header.type   = chunk_type;
+   return header;
+}
+
+// Consume the 8-byte PNG signature from s.
+// Returns 1 if it matches; otherwise returns 0 (through stbi__err),
+// stopping at the first byte that differs.
+static int stbi__check_png_header(stbi__context *s)
+{
+   static stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+   int matched = 0;
+   while (matched < 8) {
+      if (stbi__get8(s) != png_sig[matched])
+         return stbi__err("bad png sig","Not a PNG");
+      ++matched;
+   }
+   return 1;
+}
+
+// Working state of the PNG decoder.
+typedef struct
+{
+   stbi__context *s;                // input stream and image dimensions/component counts
+   stbi_uc *idata, *expanded, *out; // out holds the decoded pixels; idata/expanded are cleared at parse start (presumably the raw and inflated IDAT data -- confirm in stbi__parse_png_file)
+   int depth;                       // bit depth read from IHDR (1, 2, 4, 8 or 16)
+} stbi__png;
+
+
+// Scanline filter types. Values 0..4 are the filter bytes stored in the
+// file; the two extra values are used internally for the first row.
+enum {
+   STBI__F_none=0,
+   STBI__F_sub=1,
+   STBI__F_up=2,
+   STBI__F_avg=3,
+   STBI__F_paeth=4,
+   // synthetic filters used for first scanline to avoid needing a dummy row of 0s
+   STBI__F_avg_first,
+   STBI__F_paeth_first
+};
+
+// Maps a file filter type (0..4) to the filter actually applied on row 0,
+// where there is no previous row: "up" becomes "none", and "avg"/"paeth"
+// become their first-row variants.
+static stbi_uc first_row_filter[5] =
+{
+   STBI__F_none,
+   STBI__F_sub,
+   STBI__F_none,
+   STBI__F_avg_first,
+   STBI__F_paeth_first
+};
+
+// Paeth predictor: of a (left), b (above) and c (upper-left), return the
+// one closest to a + b - c. Ties are resolved in the order a, b, c.
+static int stbi__paeth(int a, int b, int c)
+{
+   int estimate = a + b - c;
+   int dist_a = abs(estimate - a);
+   int dist_b = abs(estimate - b);
+   int dist_c = abs(estimate - c);
+   if (dist_a <= dist_b && dist_a <= dist_c)
+      return a;
+   return (dist_b <= dist_c) ? b : c;
+}
+
+// Indexed by bit depth: multiplier that stretches a 1/2/4/8-bit grayscale
+// sample to the 0..255 range (unused depths are 0).
+static stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+
+// create the png data from post-deflated data
+//
+// Un-filters the scanlines in `raw` into a freshly allocated a->out.
+//   raw, raw_len : inflated data; each scanline is preceded by its filter byte
+//   out_n        : output components per pixel; either s->img_n or s->img_n+1
+//                  (in the latter case an opaque alpha component is added)
+//   x, y         : dimensions of the image (or of one interlace pass)
+//   depth        : bits per sample (1, 2, 4, 8 or 16)
+//   color        : color type; 0 (grayscale) makes sub-8-bit samples get
+//                  scaled to the 0..255 range
+// Returns 1 on success, 0 (through stbi__err) on failure. a->out is left
+// allocated when the failure happens after the allocation.
+static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
+{
+   int bytes = (depth == 16? 2 : 1);
+   stbi__context *s = a->s;
+   stbi__uint32 i,j,stride = x*out_n*bytes;
+   stbi__uint32 img_len, img_width_bytes;
+   int k;
+   int img_n = s->img_n; // copy it into a local for later
+
+   int output_bytes = out_n*bytes;
+   int filter_bytes = img_n*bytes;
+   int width = x;
+
+   STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
+   // NOTE(review): the size below is exactly x*y*output_bytes; no padding is added
+   a->out = (stbi_uc *) stbi__malloc(x * y * output_bytes); // extra bytes to write off the end into
+   if (!a->out) return stbi__err("outofmem", "Out of memory");
+
+   // bytes per packed scanline, and total expected input (one filter byte per row)
+   img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+   img_len = (img_width_bytes + 1) * y;
+   if (s->img_x == x && s->img_y == y) {
+      if (raw_len != img_len) return stbi__err("not enough pixels","Corrupt PNG");
+   } else { // interlaced:
+      if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
+   }
+
+   for (j=0; j < y; ++j) {
+      stbi_uc *cur = a->out + stride*j;
+      stbi_uc *prior = cur - stride;
+      int filter = *raw++;
+
+      if (filter > 4)
+         return stbi__err("invalid filter","Corrupt PNG");
+
+      if (depth < 8) {
+         STBI_ASSERT(img_width_bytes <= x);
+         cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
+         filter_bytes = 1;
+         width = img_width_bytes;
+      }
+
+      // if first row, use special filter that doesn't sample previous row
+      if (j == 0) filter = first_row_filter[filter];
+
+      // handle first byte explicitly
+      for (k=0; k < filter_bytes; ++k) {
+         switch (filter) {
+            case STBI__F_none       : cur[k] = raw[k]; break;
+            case STBI__F_sub        : cur[k] = raw[k]; break;
+            case STBI__F_up         : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
+            case STBI__F_avg        : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break;
+            case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break;
+            case STBI__F_avg_first  : cur[k] = raw[k]; break;
+            case STBI__F_paeth_first: cur[k] = raw[k]; break;
+         }
+      }
+
+      // step past the first pixel (or first byte, for sub-8-bit data)
+      if (depth == 8) {
+         if (img_n != out_n)
+            cur[img_n] = 255; // first pixel
+         raw += img_n;
+         cur += out_n;
+         prior += out_n;
+      } else if (depth == 16) {
+         if (img_n != out_n) {
+            cur[filter_bytes]   = 255; // first pixel top byte
+            cur[filter_bytes+1] = 255; // first pixel bottom byte
+         }
+         raw += filter_bytes;
+         cur += output_bytes;
+         prior += output_bytes;
+      } else {
+         raw += 1;
+         cur += 1;
+         prior += 1;
+      }
+
+      // this is a little gross, so that we don't switch per-pixel or per-component
+      if (depth < 8 || img_n == out_n) {
+         // input and output pixels have the same size: filter the row as a flat byte run
+         int nk = (width - 1)*filter_bytes;
+         #define CASE(f) \
+             case f:     \
+                for (k=0; k < nk; ++k)
+         switch (filter) {
+            // "none" filter turns into a memcpy here; make that explicit.
+            case STBI__F_none:         memcpy(cur, raw, nk); break;
+            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); break;
+            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
+            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); break;
+            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); break;
+            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); break;
+            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); break;
+         }
+         #undef CASE
+         raw += nk;
+      } else {
+         // output pixels carry one extra (alpha) component: filter pixel by
+         // pixel and write 255 into the extra slot after each one
+         STBI_ASSERT(img_n+1 == out_n);
+         #define CASE(f) \
+             case f:     \
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
+         switch (filter) {
+            CASE(STBI__F_none)         cur[k] = raw[k]; break;
+            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); break;
+            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
+            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); break;
+            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); break;
+            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); break;
+            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); break;
+         }
+         #undef CASE
+
+         // the loop above sets the high byte of the pixels' alpha, but for
+         // 16 bit png files we also need the low byte set. we'll do that here.
+         if (depth == 16) {
+            cur = a->out + stride*j; // start at the beginning of the row again
+            for (i=0; i < x; ++i,cur+=output_bytes) {
+               cur[filter_bytes+1] = 255;
+            }
+         }
+      }
+   }
+
+   // we make a separate pass to expand bits to pixels; for performance,
+   // this could run two scanlines behind the above code, so it won't
+   // interfere with filtering but will still be in the cache.
+   if (depth < 8) {
+      for (j=0; j < y; ++j) {
+         stbi_uc *cur = a->out + stride*j;
+         stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
+         // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+         // PNG guarantees byte alignment; if width is not a multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+         // note that the final byte might overshoot and write more data than desired.
+         // we can allocate enough data that this never writes out of memory, but it
+         // could also overwrite the next scanline. can it overwrite non-empty data
+         // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+         // so we need to explicitly clamp the final ones
+
+         if (depth == 4) {
+            for (k=x*img_n; k >= 2; k-=2, ++in) {
+               *cur++ = scale * ((*in >> 4)       );
+               *cur++ = scale * ((*in     ) & 0x0f);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 4)       );
+         } else if (depth == 2) {
+            for (k=x*img_n; k >= 4; k-=4, ++in) {
+               *cur++ = scale * ((*in >> 6)       );
+               *cur++ = scale * ((*in >> 4) & 0x03);
+               *cur++ = scale * ((*in >> 2) & 0x03);
+               *cur++ = scale * ((*in     ) & 0x03);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 6)       );
+            if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
+            if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
+         } else if (depth == 1) {
+            for (k=x*img_n; k >= 8; k-=8, ++in) {
+               *cur++ = scale * ((*in >> 7)       );
+               *cur++ = scale * ((*in >> 6) & 0x01);
+               *cur++ = scale * ((*in >> 5) & 0x01);
+               *cur++ = scale * ((*in >> 4) & 0x01);
+               *cur++ = scale * ((*in >> 3) & 0x01);
+               *cur++ = scale * ((*in >> 2) & 0x01);
+               *cur++ = scale * ((*in >> 1) & 0x01);
+               *cur++ = scale * ((*in     ) & 0x01);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 7)       );
+            if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
+            if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
+            if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
+            if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
+            if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
+            if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
+         }
+         if (img_n != out_n) {
+            int q;
+            // insert alpha = 255
+            // (walk backwards so pixels are not overwritten before they are moved)
+            cur = a->out + stride*j;
+            if (img_n == 1) {
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*2+1] = 255;
+                  cur[q*2+0] = cur[q];
+               }
+            } else {
+               STBI_ASSERT(img_n == 3);
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*4+3] = 255;
+                  cur[q*4+2] = cur[q*3+2];
+                  cur[q*4+1] = cur[q*3+1];
+                  cur[q*4+0] = cur[q*3+0];
+               }
+            }
+         }
+      }
+   } else if (depth == 16) {
+      // force the image data from big-endian to platform-native.
+      // this is done in a separate pass due to the decoding relying
+      // on the data being untouched, but could probably be done
+      // per-line during decode if care is taken.
+      stbi_uc *cur = a->out;
+      stbi__uint16 *cur16 = (stbi__uint16*)cur;
+
+      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
+         *cur16 = (cur[0] << 8) | cur[1];
+      }
+   }
+
+   return 1;
+}
+
+// Turn the inflated scanline data into the final pixel buffer a->out.
+//   image_data, image_data_len : inflated IDAT data
+//   out_n      : components per output pixel
+//   depth      : bits per sample; 16-bit images use two bytes per component
+//   color      : color type, passed through to the raw decoder
+//   interlaced : nonzero if the data consists of seven interlace passes
+// Non-interlaced data is handed to stbi__create_png_image_raw directly.
+// Interlaced data is decoded one pass at a time and scattered into a
+// full-size buffer. Returns 1 on success, 0 on failure.
+static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
+{
+   int bytes = (depth == 16 ? 2 : 1);
+   int out_bytes = out_n * bytes; // size of one decoded pixel, matching the raw decoder's output
+   stbi_uc *deinterlaced;
+   int p;
+   if (!interlaced)
+      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
+
+   // de-interlacing
+   deinterlaced = (stbi_uc *) stbi__malloc(a->s->img_x * a->s->img_y * out_bytes);
+   if (deinterlaced == NULL) return stbi__err("outofmem", "Out of memory");
+   for (p=0; p < 7; ++p) {
+      // origin and spacing of the pixels covered by each of the seven passes
+      int xorig[] = { 0,4,0,2,0,1,0 };
+      int yorig[] = { 0,0,4,0,2,0,1 };
+      int xspc[]  = { 8,8,4,4,2,2,1 };
+      int yspc[]  = { 8,8,8,4,4,2,2 };
+      int i,j,x,y;
+      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
+      x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p];
+      y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p];
+      if (x && y) {
+         // input bytes consumed by this pass: packed rows plus one filter byte each
+         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
+         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
+            STBI_FREE(deinterlaced);
+            return 0;
+         }
+         // scatter the pass's pixels to their positions in the full image
+         for (j=0; j < y; ++j) {
+            for (i=0; i < x; ++i) {
+               int out_y = j*yspc[p]+yorig[p];
+               int out_x = i*xspc[p]+xorig[p];
+               memcpy(deinterlaced + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
+                      a->out + (j*x+i)*out_bytes, out_bytes);
+            }
+         }
+         STBI_FREE(a->out);
+         image_data += img_len;
+         image_data_len -= img_len;
+      }
+   }
+   a->out = deinterlaced;
+
+   return 1;
+}
+
+// Apply color-key transparency to 8-bit output that already has an alpha
+// component: pixels whose color equals tc become fully transparent.
+// out_n must be 2 (gray+alpha) or 4 (RGBA). Always returns 1.
+static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 remaining = s->img_x * s->img_y;
+   stbi_uc *px = z->out;
+
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n == 2) {
+      // gray+alpha: alpha is rewritten for every pixel
+      for (; remaining > 0; --remaining, px += 2)
+         px[1] = (px[0] == tc[0] ? 0 : 255);
+   } else {
+      // RGBA: only matching pixels are touched; the rest keep the alpha
+      // they already have (assumed to be 255)
+      for (; remaining > 0; --remaining, px += 4) {
+         if (px[0] == tc[0] && px[1] == tc[1] && px[2] == tc[2])
+            px[3] = 0;
+      }
+   }
+   return 1;
+}
+
+// 16-bit counterpart of stbi__compute_transparency: pixels whose color
+// equals tc become fully transparent. out_n must be 2 or 4. Always returns 1.
+static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 remaining = s->img_x * s->img_y;
+   stbi__uint16 *px = (stbi__uint16*) z->out;
+
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n == 2) {
+      // gray+alpha: alpha is rewritten for every pixel
+      for (; remaining > 0; --remaining, px += 2)
+         px[1] = (px[0] == tc[0] ? 0 : 65535);
+   } else {
+      // RGBA: only matching pixels are touched; the rest keep the alpha
+      // they already have (assumed to be 65535)
+      for (; remaining > 0; --remaining, px += 4) {
+         if (px[0] == tc[0] && px[1] == tc[1] && px[2] == tc[2])
+            px[3] = 0;
+      }
+   }
+   return 1;
+}
+
+// Replace the one-byte-per-pixel palette indices in a->out with the palette
+// colors they refer to.
+//   palette   : entries of 4 bytes each (R, G, B, A)
+//   len       : unused
+//   pal_img_n : 3 to emit RGB; any other value emits RGBA
+// Returns 1 on success, 0 (through stbi__err) if the new buffer cannot be
+// allocated; on success the old a->out is freed.
+static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
+{
+   stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
+   stbi_uc *indices = a->out;
+   stbi_uc *expanded, *dst;
+
+   expanded = (stbi_uc *) stbi__malloc(pixel_count * pal_img_n);
+   if (expanded == NULL) return stbi__err("outofmem", "Out of memory");
+
+   // from here until the old buffer is freed, an early return would leak
+   dst = expanded;
+
+   if (pal_img_n == 3) {
+      for (i=0; i < pixel_count; ++i, dst += 3) {
+         int n = indices[i]*4;
+         dst[0] = palette[n  ];
+         dst[1] = palette[n+1];
+         dst[2] = palette[n+2];
+      }
+   } else {
+      for (i=0; i < pixel_count; ++i, dst += 4) {
+         int n = indices[i]*4;
+         dst[0] = palette[n  ];
+         dst[1] = palette[n+1];
+         dst[2] = palette[n+2];
+         dst[3] = palette[n+3];
+      }
+   }
+   STBI_FREE(a->out);
+   a->out = expanded;
+
+   STBI_NOTUSED(len);
+
+   return 1;
+}
+
+// Convert 16-bit-per-sample output in p->out to 8 bits per sample by
+// keeping the top byte of every sample. Does nothing for other depths.
+// Returns 1 on success (or when no conversion is needed), 0 (through
+// stbi__err) if the new buffer cannot be allocated, in which case p->out
+// is left untouched.
+static int stbi__reduce_png(stbi__png *p)
+{
+   int i;
+   int img_len = p->s->img_x * p->s->img_y * p->s->img_out_n;
+   stbi_uc *reduced;
+   stbi__uint16 *orig = (stbi__uint16*)p->out;
+
+   if (p->depth != 16) return 1; // don't need to do anything if not 16-bit data
+
+   reduced = (stbi_uc *)stbi__malloc(img_len);
+   // check the new allocation (not p, which is never NULL here)
+   if (reduced == NULL) return stbi__err("outofmem", "Out of memory");
+
+   for (i = 0; i < img_len; ++i) reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top byte of each 16-bit sample is a decent approx of 16->8 bit scaling
+
+   p->out = reduced;
+   STBI_FREE(orig);
+
+   return 1;
+}
+
+// When nonzero, stbi__de_iphone also divides the color components by alpha
+// (un-premultiplies) for 4-component output.
+static int stbi__unpremultiply_on_load = 0;
+// Set through stbi_convert_iphone_png_to_rgb(); presumably enables the
+// BGR->RGB conversion for CgBI ("iPhone") PNGs -- confirm at its point of use.
+static int stbi__de_iphone_flag = 0;
+
+// Set the un-premultiply flag; any nonzero value turns it on.
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
+}
+
+// Set the iPhone-PNG conversion flag; any nonzero value turns it on.
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag = flag_true_if_should_convert;
+}
+
+// Swap the first and third component of every pixel in z->out (BGR -> RGB).
+// For 4-component output with stbi__unpremultiply_on_load set, the color
+// components are additionally scaled by 255/alpha where alpha is nonzero.
+static void stbi__de_iphone(stbi__png *z)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 remaining = s->img_x * s->img_y;
+   stbi_uc *px = z->out;
+
+   if (s->img_out_n == 3) {
+      // convert bgr to rgb
+      for (; remaining > 0; --remaining, px += 3) {
+         stbi_uc first = px[0];
+         px[0] = px[2];
+         px[2] = first;
+      }
+      return;
+   }
+
+   STBI_ASSERT(s->img_out_n == 4);
+   if (!stbi__unpremultiply_on_load) {
+      // convert bgr to rgb
+      for (; remaining > 0; --remaining, px += 4) {
+         stbi_uc first = px[0];
+         px[0] = px[2];
+         px[2] = first;
+      }
+      return;
+   }
+
+   // convert bgr to rgb and unpremultiply
+   for (; remaining > 0; --remaining, px += 4) {
+      stbi_uc alpha = px[3];
+      stbi_uc first = px[0];
+      stbi_uc third = px[2];
+      if (alpha) {
+         px[0] = (stbi_uc) (third * 255 / alpha);
+         px[1] = (stbi_uc) (px[1] * 255 / alpha);
+         px[2] = (stbi_uc) (first * 255 / alpha);
+      } else {
+         // nothing to divide by: just swap
+         px[0] = third;
+         px[2] = first;
+      }
+   }
+}
+
+// Pack a four-character chunk name into one 32-bit value, first character
+// in the top byte, so it can be compared with a chunk type read big-endian.
+#define STBI__PNG_TYPE(a,b,c,d)  (((a) << 24) + ((b) << 16) + ((c) << 8) + (d))
+
+// Walks the chunk stream of the PNG in z->s.
+//
+//   scan == STBI__SCAN_type    only the file signature is checked
+//   scan == STBI__SCAN_header  stops as soon as the dimensions and component
+//                              count are known (paletted images have to scan
+//                              on to tRNS/IDAT to learn whether there is alpha)
+//   scan == STBI__SCAN_load    decodes the whole image into z->out
+//
+// req_comp is the caller's requested component count (0 = keep the file's).
+// Returns 1 on success; on failure returns 0 or the value of stbi__err().
+// On a failed load z->idata / z->expanded / z->out may still hold allocations;
+// stbi__do_png releases them after calling this function.
+static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
+{
+   // palette holds up to 256 entries of 4 bytes (r,g,b,a); pal_img_n is 0 for
+   // non-paletted images, otherwise 3 (no tRNS seen) or 4 (tRNS seen).
+   stbi_uc palette[1024], pal_img_n=0;
+   // tc / tc16 hold the tRNS colour key for 8-bit-or-less and 16-bit images.
+   stbi_uc has_trans=0, tc[3];
+   stbi__uint16 tc16[3];
+   // ioff = bytes of IDAT data collected so far, idata_limit = capacity of z->idata.
+   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
+   int first=1,k,interlace=0, color=0, is_iphone=0;
+   stbi__context *s = z->s;
+
+   z->expanded = NULL;
+   z->idata = NULL;
+   z->out = NULL;
+
+   if (!stbi__check_png_header(s)) return 0;
+
+   if (scan == STBI__SCAN_type) return 1;
+
+   for (;;) {
+      stbi__pngchunk c = stbi__get_chunk_header(s);
+      switch (c.type) {
+         // CgBI marks an iPhone-style PNG; remembered so the zlib header flag
+         // and the later stbi__de_iphone pass can be adjusted.
+         case STBI__PNG_TYPE('C','g','B','I'):
+            is_iphone = 1;
+            stbi__skip(s, c.length);
+            break;
+         case STBI__PNG_TYPE('I','H','D','R'): {
+            int comp,filter;
+            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
+            first = 0;
+            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
+            // Fields are read in file order: width, height, bit depth, colour
+            // type, compression method, filter method, interlace method.
+            s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
+            s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
+            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
+            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
+			if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
+            // colour type 3 is paletted; any other odd colour type is rejected.
+            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
+            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
+            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
+            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
+            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
+            if (!pal_img_n) {
+               // colour bit 2 => RGB (3) else grey (1); colour bit 4 => +1 for alpha.
+               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
+               // Division form of "img_x * img_n * img_y > 1<<30" that cannot overflow.
+               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+               if (scan == STBI__SCAN_header) return 1;
+            } else {
+               // if paletted, then pal_n is our final components, and
+               // img_n is # components to decompress/filter.
+               s->img_n = 1;
+               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
+               // if SCAN_header, have to scan to see if we have a tRNS
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('P','L','T','E'):  {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
+            pal_len = c.length / 3;
+            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
+            // Entries are stored 4 bytes apart with alpha defaulting to opaque,
+            // so a later tRNS chunk can overwrite just the alpha bytes.
+            for (i=0; i < pal_len; ++i) {
+               palette[i*4+0] = stbi__get8(s);
+               palette[i*4+1] = stbi__get8(s);
+               palette[i*4+2] = stbi__get8(s);
+               palette[i*4+3] = 255;
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('t','R','N','S'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
+            if (pal_img_n) {
+               // Paletted image: tRNS supplies per-entry alpha, so the image
+               // is reported as 4 components.
+               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
+               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
+               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
+               pal_img_n = 4;
+               for (i=0; i < c.length; ++i)
+                  palette[i*4+3] = stbi__get8(s);
+            } else {
+               // Non-paletted image: tRNS is a single colour key, only valid
+               // when the image has an odd component count (no alpha channel).
+               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
+               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
+               has_trans = 1;
+               if (z->depth == 16) {
+                  for (k = 0; k < s->img_n; ++k) tc16[k] = stbi__get16be(s); // copy the values as-is
+               } else {
+                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+               }
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('I','D','A','T'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
+            // For a header scan the first IDAT ends the search: no tRNS was
+            // seen, so pal_img_n (0 for non-paletted, 3 for paletted) is stored.
+            if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
+            // Reject a chunk length that would wrap the running offset.
+            if ((int)(ioff + c.length) < (int)ioff) return 0;
+            if (ioff + c.length > idata_limit) {
+               stbi__uint32 idata_limit_old = idata_limit;
+               stbi_uc *p;
+               // Grow the buffer geometrically: start at max(4096, chunk
+               // length), then double until the new chunk fits.
+               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
+               while (ioff + c.length > idata_limit)
+                  idata_limit *= 2;
+               STBI_NOTUSED(idata_limit_old);
+               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
+               z->idata = p;
+            }
+            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
+            ioff += c.length;
+            break;
+         }
+
+         case STBI__PNG_TYPE('I','E','N','D'): {
+            stbi__uint32 raw_len, bpl;
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (scan != STBI__SCAN_load) return 1;
+            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
+            // initial guess for decoded data size to avoid unnecessary reallocs
+            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
+            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
+            // raw_len is both the size hint passed in and the out-parameter
+            // that receives the actual decoded length.
+            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
+            if (z->expanded == NULL) return 0; // zlib should set error
+            STBI_FREE(z->idata); z->idata = NULL;
+            // Decode with one extra (alpha) component when the caller asked
+            // for exactly that on a non-paletted image, or when a tRNS colour
+            // key needs somewhere to write alpha.
+            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
+               s->img_out_n = s->img_n+1;
+            else
+               s->img_out_n = s->img_n;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
+            if (has_trans) {
+               if (z->depth == 16) {
+                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+               } else {
+                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+               }
+            }
+            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
+               stbi__de_iphone(z);
+            if (pal_img_n) {
+               // pal_img_n == 3 or 4
+               s->img_n = pal_img_n; // record the actual colors we had
+               s->img_out_n = pal_img_n;
+               if (req_comp >= 3) s->img_out_n = req_comp;
+               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                  return 0;
+            }
+            STBI_FREE(z->expanded); z->expanded = NULL;
+            return 1;
+         }
+
+         default:
+            // if critical, fail
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            // Bit 29 of the packed type is bit 5 of the first name character;
+            // when it is clear the chunk is treated as critical and rejected.
+            if ((c.type & (1 << 29)) == 0) {
+               #ifndef STBI_NO_FAILURE_STRINGS
+               // not threadsafe
+               static char invalid_chunk[] = "XXXX PNG chunk not known";
+               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
+               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
+               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
+               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
+               #endif
+               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
+            }
+            stbi__skip(s, c.length);
+            break;
+      }
+      // end of PNG chunk, read and skip CRC
+      stbi__get32be(s);
+   }
+}
+
+// Decodes the PNG attached to p->s and returns the pixel buffer, or NULL on
+// failure.
+//
+//   x, y      receive the image dimensions (must be non-NULL)
+//   n         if non-NULL, receives the component count recorded in img_n
+//   req_comp  0 to keep the decoded component count, otherwise 1..4
+//
+// Ownership of the returned buffer passes to the caller. Whatever is still
+// attached to p (out / expanded / idata) is released before returning.
+static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp)
+{
+   unsigned char *result=NULL;
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
+      // 16-bit images are reduced to 8 bits first. If the reduction fails,
+      // result stays NULL and control falls through to the shared cleanup so
+      // the decoded buffer still held in p->out is freed instead of leaked.
+      // NOTE(review): assumes stbi__reduce_png leaves p->out owned by p when
+      // it fails -- confirm against its definition.
+      if (p->depth != 16 || stbi__reduce_png(p)) {
+         // Detach the buffer from p so the cleanup below does not free it.
+         result = p->out;
+         p->out = NULL;
+         if (req_comp && req_comp != p->s->img_out_n) {
+            result = stbi__convert_format(result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+            p->s->img_out_n = req_comp;
+            if (result == NULL) return result;
+         }
+         *x = p->s->img_x;
+         *y = p->s->img_y;
+         if (n) *n = p->s->img_n;
+      }
+   }
+   STBI_FREE(p->out);      p->out      = NULL;
+   STBI_FREE(p->expanded); p->expanded = NULL;
+   STBI_FREE(p->idata);    p->idata    = NULL;
+
+   return result;
+}
+
+// PNG load entry point: binds the input stream to a local stbi__png and
+// delegates the actual decode to stbi__do_png.
+static unsigned char *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__png png;
+   png.s = s;
+   return stbi__do_png(&png, x, y, comp, req_comp);
+}
+
+// Reports whether the stream passes stbi__check_png_header. The stream is
+// rewound afterwards whether or not the check succeeded.
+static int stbi__png_test(stbi__context *s)
+{
+   int header_ok = stbi__check_png_header(s);
+   stbi__rewind(s);
+   return header_ok;
+}
+
+// Runs a header-only scan of the PNG in p->s. On success stores width, height
+// and component count through whichever of x / y / comp are non-NULL and
+// returns 1; on failure rewinds the stream and returns 0.
+static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
+{
+   if (stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
+      if (x) *x = p->s->img_x;
+      if (y) *y = p->s->img_y;
+      if (comp) *comp = p->s->img_n;
+      return 1;
+   }
+   stbi__rewind( p->s );
+   return 0;
+}
+
+// PNG info entry point: binds the stream to a local stbi__png and forwards to
+// stbi__png_info_raw.
+static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__png png;
+   png.s = s;
+   return stbi__png_info_raw(&png, x, y, comp);
+}
+#endif
+
+// Microsoft/Windows BMP image
+
+#ifndef STBI_NO_BMP
+// Checks for the 'B','M' signature followed by a recognised info-header size.
+// Does not rewind the stream; stbi__bmp_test does that.
+static int stbi__bmp_test_raw(stbi__context *s)
+{
+   int header_size;
+   // || short-circuits, so 'M' is only read when 'B' matched.
+   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return 0;
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   stbi__get32le(s); // discard data offset
+   header_size = stbi__get32le(s);
+   switch (header_size) {
+      case 12:
+      case 40:
+      case 56:
+      case 108:
+      case 124:
+         return 1;
+      default:
+         return 0;
+   }
+}
+
+// Runs stbi__bmp_test_raw and rewinds the stream afterwards, whatever the result.
+static int stbi__bmp_test(stbi__context *s)
+{
+   int is_bmp = stbi__bmp_test_raw(s);
+   stbi__rewind(s);
+   return is_bmp;
+}
+
+
+// Index (0..31) of the most significant set bit of z, or -1 when z is zero.
+static int stbi__high_bit(unsigned int z)
+{
+   int index = -1;
+   // Each shift that still leaves a non-zero value moves the answer up by one.
+   while (z != 0) {
+      ++index;
+      z >>= 1;
+   }
+   return index;
+}
+
+// Number of set bits in a.
+static int stbi__bitcount(unsigned int a)
+{
+   int count = 0;
+   // a & (a - 1) clears the lowest set bit, so the loop runs once per set bit.
+   while (a != 0) {
+      a &= a - 1;
+      ++count;
+   }
+   return count;
+}
+
+// Moves a masked channel value so that its top bit lands in bit 7, then
+// replicates the `bits`-wide pattern downwards to fill the low bits (for
+// example a 5-bit 0x1F becomes 0xFF).
+//
+//   v      the masked pixel value (pixel & channel mask)
+//   shift  right-shift amount; a negative value means shift left by -shift
+//   bits   number of significant bits in the channel
+//
+// The arithmetic is done on an unsigned copy of v. With a signed value, a
+// channel mask that includes bit 31 (e.g. 0xC0000000) made v negative, the
+// right shift sign-extended it, and the replicated sum came out wrong; a left
+// shift of a negative value is undefined as well.
+static int stbi__shiftsigned(int v, int shift, int bits)
+{
+   unsigned int u = (unsigned int) v;
+   unsigned int result;
+   int z;
+
+   if (shift < 0) u <<= -shift;
+   else u >>= shift;
+   result = u;
+
+   // A non-positive width has nothing to replicate, and would never advance
+   // the loop below.
+   if (bits <= 0) return (int) result;
+
+   z = bits;
+   while (z < 8) {
+      result += u >> z;
+      z += bits;
+   }
+   return (int) result;
+}
+
+// Header fields collected by stbi__bmp_parse_header for stbi__bmp_load.
+typedef struct
+{
+   // bpp: bits per pixel; offset: the data-offset field of the file header;
+   // hsz: size of the info header (12, 40, 56, 108 or 124).
+   int bpp, offset, hsz;
+   // mr/mg/mb/ma: red, green, blue and alpha channel masks (0 when unset).
+   // all_a: OR of every alpha value decoded; stays 0 only if the whole alpha
+   // channel was 0.
+   unsigned int mr,mg,mb,ma, all_a;
+} stbi__bmp_data;
+
+// Reads the BMP file header and info header from s.
+// Fills s->img_x / s->img_y and info->offset, hsz, bpp and the channel masks.
+// info->all_a is only written for uncompressed 32-bit images, so the caller
+// is expected to initialise it first.
+// Returns (void *) 1 on success, or the result of stbi__errpuc() on failure
+// (the caller compares against NULL).
+static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
+{
+   int hsz;
+   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   info->offset = stbi__get32le(s);
+   info->hsz = hsz = stbi__get32le(s);
+   info->mr = info->mg = info->mb = info->ma = 0;
+   
+   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
+   // The 12-byte header stores 16-bit dimensions; all others store 32-bit ones.
+   if (hsz == 12) {
+      s->img_x = stbi__get16le(s);
+      s->img_y = stbi__get16le(s);
+   } else {
+      s->img_x = stbi__get32le(s);
+      s->img_y = stbi__get32le(s);
+   }
+   // The next 16-bit field must be 1.
+   if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
+   info->bpp = stbi__get16le(s);
+   if (info->bpp == 1) return stbi__errpuc("monochrome", "BMP type not supported: 1-bit");
+   if (hsz != 12) {
+      int compress = stbi__get32le(s);
+      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+      stbi__get32le(s); // discard sizeof
+      stbi__get32le(s); // discard hres
+      stbi__get32le(s); // discard vres
+      stbi__get32le(s); // discard colorsused
+      stbi__get32le(s); // discard max important
+      if (hsz == 40 || hsz == 56) {
+         // The 56-byte header carries four extra 32-bit fields here; they are
+         // read and discarded.
+         if (hsz == 56) {
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+         }
+         if (info->bpp == 16 || info->bpp == 32) {
+            if (compress == 0) {
+               // No masks in the file: use the default 8-8-8-8 layout for
+               // 32 bpp and 5-5-5 for 16 bpp.
+               if (info->bpp == 32) {
+                  info->mr = 0xffu << 16;
+                  info->mg = 0xffu <<  8;
+                  info->mb = 0xffu <<  0;
+                  info->ma = 0xffu << 24;
+                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+               } else {
+                  info->mr = 31u << 10;
+                  info->mg = 31u <<  5;
+                  info->mb = 31u <<  0;
+               }
+            } else if (compress == 3) {
+               // compress == 3: the r/g/b masks follow in the file.
+               info->mr = stbi__get32le(s);
+               info->mg = stbi__get32le(s);
+               info->mb = stbi__get32le(s);
+               // not documented, but generated by photoshop and handled by mspaint
+               if (info->mr == info->mg && info->mg == info->mb) {
+                  // ?!?!?
+                  return stbi__errpuc("bad BMP", "bad BMP");
+               }
+            } else
+               return stbi__errpuc("bad BMP", "bad BMP");
+         }
+      } else {
+         int i;
+         // 108- and 124-byte headers always carry r/g/b/a masks, followed by
+         // colour-space data that is skipped.
+         if (hsz != 108 && hsz != 124)
+            return stbi__errpuc("bad BMP", "bad BMP");
+         info->mr = stbi__get32le(s);
+         info->mg = stbi__get32le(s);
+         info->mb = stbi__get32le(s);
+         info->ma = stbi__get32le(s);
+         stbi__get32le(s); // discard color space
+         for (i=0; i < 12; ++i)
+            stbi__get32le(s); // discard color space parameters
+         if (hsz == 124) {
+            stbi__get32le(s); // discard rendering intent
+            stbi__get32le(s); // discard offset of profile data
+            stbi__get32le(s); // discard size of profile data
+            stbi__get32le(s); // discard reserved
+         }
+      }
+   }
+   return (void *) 1;
+}
+
+
+// Decodes a BMP from s into a newly allocated pixel buffer.
+//
+//   x, y      receive the image dimensions
+//   comp      if non-NULL, receives the file's component count (3, or 4 when
+//             an alpha mask is present)
+//   req_comp  0 to keep the file's component count, otherwise the count the
+//             caller wants
+//
+// Returns the pixel buffer, or NULL (via stbi__errpuc) on failure.
+// Handles 4- and 8-bit paletted data and 16/24/32-bit direct colour; 1-bit
+// and RLE files are rejected by stbi__bmp_parse_header.
+static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi_uc *out;
+   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
+   stbi_uc pal[256][4];
+   int psize=0,i,j,width;
+   int flip_vertically, pad, target;
+   stbi__bmp_data info;
+
+   // Non-zero default so the "alpha was all zero" fix-up below only triggers
+   // when the header parser explicitly resets all_a to 0.
+   info.all_a = 255;
+   if (stbi__bmp_parse_header(s, &info) == NULL)
+      return NULL; // error code already set
+
+   // A positive height means the output rows must be flipped at the end
+   // (see the swap loop below); a negative height is stored as-is.
+   flip_vertically = ((int) s->img_y) > 0;
+   s->img_y = abs((int) s->img_y);
+
+   mr = info.mr;
+   mg = info.mg;
+   mb = info.mb;
+   ma = info.ma;
+   all_a = info.all_a;
+
+   // Palette entry count, derived from the gap between headers and pixel data.
+   if (info.hsz == 12) {
+      if (info.bpp < 24)
+         psize = (info.offset - 14 - 24) / 3;
+   } else {
+      if (info.bpp < 16)
+         psize = (info.offset - 14 - info.hsz) >> 2;
+   }
+
+   s->img_n = ma ? 4 : 3;
+   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
+      target = req_comp;
+   else
+      target = s->img_n; // if they want monochrome, we'll post-convert
+
+   // The dimensions come straight from the file. Refuse sizes whose decoded
+   // byte count would exceed 1<<30, so the allocation size and the int index
+   // arithmetic below cannot overflow. This is the same ceiling, in the same
+   // division form, that the PNG loader applies to its header.
+   if (s->img_x != 0 && (1 << 30) / s->img_x / target < s->img_y)
+      return stbi__errpuc("too large", "Corrupt BMP");
+
+   out = (stbi_uc *) stbi__malloc(target * s->img_x * s->img_y);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   if (info.bpp < 16) {
+      int z=0;
+      if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
+      // Palette entries are stored blue, green, red (plus a pad byte unless
+      // this is the 12-byte header); they are kept here as r,g,b,255.
+      for (i=0; i < psize; ++i) {
+         pal[i][2] = stbi__get8(s);
+         pal[i][1] = stbi__get8(s);
+         pal[i][0] = stbi__get8(s);
+         if (info.hsz != 12) stbi__get8(s);
+         pal[i][3] = 255;
+      }
+      stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+      if (info.bpp == 4) width = (s->img_x + 1) >> 1;
+      else if (info.bpp == 8) width = s->img_x;
+      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
+      // Rows are padded to a multiple of 4 bytes.
+      pad = (-width)&3;
+      for (j=0; j < (int) s->img_y; ++j) {
+         // Two pixels per iteration: at 4 bpp both come from one byte (high
+         // nibble first), at 8 bpp a second byte is read.
+         for (i=0; i < (int) s->img_x; i += 2) {
+            int v=stbi__get8(s),v2=0;
+            if (info.bpp == 4) {
+               v2 = v & 15;
+               v >>= 4;
+            }
+            out[z++] = pal[v][0];
+            out[z++] = pal[v][1];
+            out[z++] = pal[v][2];
+            if (target == 4) out[z++] = 255;
+            if (i+1 == (int) s->img_x) break;
+            v = (info.bpp == 8) ? stbi__get8(s) : v2;
+            out[z++] = pal[v][0];
+            out[z++] = pal[v][1];
+            out[z++] = pal[v][2];
+            if (target == 4) out[z++] = 255;
+         }
+         stbi__skip(s, pad);
+      }
+   } else {
+      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
+      int z = 0;
+      // easy == 1: 24-bit BGR; easy == 2: 32-bit with the standard 8-8-8-8
+      // masks; easy == 0: decode through the channel masks.
+      int easy=0;
+      stbi__skip(s, info.offset - 14 - info.hsz);
+      if (info.bpp == 24) width = 3 * s->img_x;
+      else if (info.bpp == 16) width = 2*s->img_x;
+      else /* bpp = 32 and pad = 0 */ width=0;
+      pad = (-width) & 3;
+      if (info.bpp == 24) {
+         easy = 1;
+      } else if (info.bpp == 32) {
+         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
+            easy = 2;
+      }
+      if (!easy) {
+         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
+         // right shift amt to put high bit in position #7
+         rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
+         gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
+         bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
+         ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
+      }
+      for (j=0; j < (int) s->img_y; ++j) {
+         if (easy) {
+            for (i=0; i < (int) s->img_x; ++i) {
+               unsigned char a;
+               // File order is B,G,R; store as R,G,B.
+               out[z+2] = stbi__get8(s);
+               out[z+1] = stbi__get8(s);
+               out[z+0] = stbi__get8(s);
+               z += 3;
+               a = (easy == 2 ? stbi__get8(s) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = a;
+            }
+         } else {
+            int bpp = info.bpp;
+            for (i=0; i < (int) s->img_x; ++i) {
+               stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
+               int a;
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
+               a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = STBI__BYTECAST(a);
+            }
+         }
+         stbi__skip(s, pad);
+      }
+   }
+
+   // if alpha channel is all 0s, replace with all 255s
+   if (target == 4 && all_a == 0)
+      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
+         out[i] = 255;
+
+   if (flip_vertically) {
+      stbi_uc t;
+      // Swap row j with row (height-1-j), byte by byte.
+      for (j=0; j < (int) s->img_y>>1; ++j) {
+         stbi_uc *p1 = out +      j     *s->img_x*target;
+         stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
+         for (i=0; i < (int) s->img_x*target; ++i) {
+            t = p1[i], p1[i] = p2[i], p2[i] = t;
+         }
+      }
+   }
+
+   if (req_comp && req_comp != target) {
+      out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp) *comp = s->img_n;
+   return out;
+}
+#endif
+
+// Targa Truevision - TGA
+// by Jonathan Dummer
+#ifndef STBI_NO_TGA
+// Maps a TGA bit depth to a component count (STBI_grey, STBI_grey_alpha,
+// STBI_rgb, or bits/8 for 24/32-bit); returns 0 for an unsupported depth.
+// If is_rgb16 is non-NULL it is set to 1 when the pixels are packed 15/16-bit
+// RGB and to 0 otherwise.
+static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
+{
+   // only RGB or RGBA (incl. 16bit) or grey allowed
+   if (is_rgb16) *is_rgb16 = 0;
+
+   if (bits_per_pixel == 8)
+      return STBI_grey;
+
+   // 16 bits is grey+alpha for grey images, otherwise packed RGB like 15 bits.
+   if (bits_per_pixel == 16 && is_grey)
+      return STBI_grey_alpha;
+
+   if (bits_per_pixel == 15 || bits_per_pixel == 16) {
+      if (is_rgb16) *is_rgb16 = 1;
+      return STBI_rgb;
+   }
+
+   if (bits_per_pixel == 24 || bits_per_pixel == 32)
+      return bits_per_pixel/8;
+
+   return 0;
+}
+
+// Reads just enough of a TGA header to report width, height and component
+// count through whichever of x / y / comp are non-NULL.
+// Returns 1 on success. On any rejected field the stream is rewound and 0 is
+// returned.
+static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
+{
+    int width, height, components, image_type, pixel_bits, colormap_bpp;
+    int entry_bits, colormap_type;
+
+    stbi__get8(s);                     // discard Offset
+    colormap_type = stbi__get8(s);     // colormap type
+    if (colormap_type > 1) goto reject;   // only RGB or indexed allowed
+
+    image_type = stbi__get8(s);        // image type
+    if (colormap_type == 1) {
+        // colormapped (paletted) image
+        if (image_type != 1 && image_type != 9) goto reject;
+        stbi__skip(s,4);               // skip index of first colormap entry and number of entries
+        entry_bits = stbi__get8(s);    // check bits per palette color entry
+        if (entry_bits != 8 && entry_bits != 15 && entry_bits != 16 && entry_bits != 24 && entry_bits != 32)
+            goto reject;
+        stbi__skip(s,4);               // skip image x and y origin
+        colormap_bpp = entry_bits;
+    } else {
+        // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+        if (image_type != 2 && image_type != 3 && image_type != 10 && image_type != 11)
+            goto reject;
+        stbi__skip(s,9);               // skip colormap specification and image x/y origin
+        colormap_bpp = 0;
+    }
+
+    width = stbi__get16le(s);
+    if (width < 1) goto reject;        // test width
+    height = stbi__get16le(s);
+    if (height < 1) goto reject;       // test height
+
+    pixel_bits = stbi__get8(s);        // bits per pixel
+    stbi__get8(s);                     // ignore alpha bits
+    if (colormap_bpp != 0) {
+        // when using a colormap, pixel_bits is the size of the indexes
+        // I don't think anything but 8 or 16bit indexes makes sense
+        if (pixel_bits != 8 && pixel_bits != 16) goto reject;
+        components = stbi__tga_get_comp(colormap_bpp, 0, NULL);
+    } else {
+        components = stbi__tga_get_comp(pixel_bits, (image_type == 3) || (image_type == 11), NULL);
+    }
+    if (!components) goto reject;
+
+    if (x) *x = width;
+    if (y) *y = height;
+    if (comp) *comp = components;
+    return 1;                          // seems to have passed everything
+
+reject:
+    stbi__rewind(s);
+    return 0;
+}
+
+// Checks whether the header fields of s are consistent with a supported TGA.
+// The stream is rewound before returning, whatever the outcome.
+// Returns 1 if every check passed, 0 otherwise.
+static int stbi__tga_test(stbi__context *s)
+{
+   int plausible = 0;
+   int field, colormap_type;
+
+   // Single-pass block: `break` leaves on the first failed check.
+   do {
+      stbi__get8(s);                    // discard Offset
+      colormap_type = stbi__get8(s);    // color type
+      if (colormap_type > 1) break;     // only RGB or indexed allowed
+      field = stbi__get8(s);            // image type
+      if (colormap_type == 1) {
+         // colormapped (paletted) image: colortype 1 demands image type 1 or 9
+         if (field != 1 && field != 9) break;
+         stbi__skip(s,4);               // skip index of first colormap entry and number of entries
+         field = stbi__get8(s);         // check bits per palette color entry
+         if (field != 8 && field != 15 && field != 16 && field != 24 && field != 32) break;
+         stbi__skip(s,4);               // skip image x and y origin
+      } else {
+         // "normal" image w/o colormap: only RGB or grey allowed, +/- RLE
+         if (field != 2 && field != 3 && field != 10 && field != 11) break;
+         stbi__skip(s,9);               // skip colormap specification and image x/y origin
+      }
+      if (stbi__get16le(s) < 1) break;  // test width
+      if (stbi__get16le(s) < 1) break;  // test height
+      field = stbi__get8(s);            // bits per pixel
+      // for colormapped images, bpp is size of an index
+      if (colormap_type == 1 && field != 8 && field != 16) break;
+      if (field != 8 && field != 15 && field != 16 && field != 24 && field != 32) break;
+
+      plausible = 1;                    // every check passed
+   } while (0);
+
+   stbi__rewind(s);
+   return plausible;
+}
+
+// Reads one little-endian 16-bit pixel from s and writes it to out[0..2] as
+// 8-bit R, G, B. Each 5-bit channel is scaled from 0..31 to 0..255.
+void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
+{
+   stbi__uint16 packed = stbi__get16le(s);
+   int channel;
+   int shift = 10;   // red sits in bits 10..14, green in 5..9, blue in 0..4
+
+   // Written in R, G, B order, so no swap is needed afterwards.
+   for (channel = 0; channel < 3; ++channel) {
+      int five_bit = (packed >> shift) & 31;
+      out[channel] = (five_bit * 255)/31;
+      shift -= 5;
+   }
+
+   // some people claim that the most significant bit might be used for alpha
+   // (possibly if an alpha-bit is set in the "image descriptor byte")
+   // but that only made 16bit test images completely translucent..
+   // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
+}
+
+// Decodes a TGA image from s into a newly allocated pixel buffer.
+//
+//   x, y      receive the image dimensions
+//   comp      if non-NULL, receives the file's component count
+//   req_comp  0 to keep the file's component count, otherwise the count the
+//             caller wants (handled by stbi__convert_format at the end)
+//
+// Returns the pixel buffer, or the result of stbi__errpuc() on failure.
+// Covers raw and RLE data, paletted images, and packed 15/16-bit RGB.
+static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   //   read in the TGA header stuff
+   int tga_offset = stbi__get8(s);
+   int tga_indexed = stbi__get8(s);
+   int tga_image_type = stbi__get8(s);
+   int tga_is_RLE = 0;
+   int tga_palette_start = stbi__get16le(s);
+   int tga_palette_len = stbi__get16le(s);
+   int tga_palette_bits = stbi__get8(s);
+   int tga_x_origin = stbi__get16le(s);
+   int tga_y_origin = stbi__get16le(s);
+   int tga_width = stbi__get16le(s);
+   int tga_height = stbi__get16le(s);
+   int tga_bits_per_pixel = stbi__get8(s);
+   int tga_comp, tga_rgb16=0;
+   int tga_inverted = stbi__get8(s);
+   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
+   //   image data
+   unsigned char *tga_data;
+   unsigned char *tga_palette = NULL;
+   int i, j;
+   // raw_data holds the most recently read pixel; in an RLE repeat run it is
+   // reused for every pixel of the run.
+   unsigned char raw_data[4];
+   int RLE_count = 0;
+   int RLE_repeating = 0;
+   int read_next_pixel = 1;
+
+   //   do a tiny bit of processing
+   // Image types 8 and above are the RLE variants of (type - 8).
+   if ( tga_image_type >= 8 )
+   {
+      tga_image_type -= 8;
+      tga_is_RLE = 1;
+   }
+   // Bit 5 of the descriptor byte: when it is clear, rows are flipped after decoding.
+   tga_inverted = 1 - ((tga_inverted >> 5) & 1);
+
+   //   If I'm paletted, then I'll use the number of bits from the palette
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
+
+   //   tga info
+   *x = tga_width;
+   *y = tga_height;
+   if (comp) *comp = tga_comp;
+
+   tga_data = (unsigned char*)stbi__malloc( (size_t)tga_width * tga_height * tga_comp );
+   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
+
+   // skip to the data's starting position (offset usually = 0)
+   stbi__skip(s, tga_offset );
+
+   // Fast path: uncompressed, non-paletted, not packed 16-bit. Each row is
+   // read straight into its final (possibly flipped) position.
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
+      for (i=0; i < tga_height; ++i) {
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
+         stbi__getn(s, tga_row, tga_width * tga_comp);
+      }
+   } else  {
+      //   do I need to load a palette?
+      if ( tga_indexed)
+      {
+         //   any data to skip? (offset usually = 0)
+         stbi__skip(s, tga_palette_start );
+         //   load the palette
+         tga_palette = (unsigned char*)stbi__malloc( tga_palette_len * tga_comp );
+         if (!tga_palette) {
+            STBI_FREE(tga_data);
+            return stbi__errpuc("outofmem", "Out of memory");
+         }
+         if (tga_rgb16) {
+            // Packed 16-bit palette entries are expanded to 3 bytes each.
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
+            }
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+               STBI_FREE(tga_data);
+               STBI_FREE(tga_palette);
+               return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+      }
+      //   load the data
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         //   if I'm in RLE mode, do I need to get a RLE chunk?
+         if ( tga_is_RLE )
+         {
+            if ( RLE_count == 0 )
+            {
+               //   yep, get the next byte as a RLE command
+               // Low 7 bits = run length - 1; top bit set = repeat one pixel,
+               // clear = that many literal pixels follow.
+               int RLE_cmd = stbi__get8(s);
+               RLE_count = 1 + (RLE_cmd & 127);
+               RLE_repeating = RLE_cmd >> 7;
+               read_next_pixel = 1;
+            } else if ( !RLE_repeating )
+            {
+               read_next_pixel = 1;
+            }
+         } else
+         {
+            read_next_pixel = 1;
+         }
+         //   OK, if I need to read a pixel, do it now
+         if ( read_next_pixel )
+         {
+            //   load however much data we did have
+            if ( tga_indexed )
+            {
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
+               if ( pal_idx >= tga_palette_len ) {
+                  // invalid index
+                  pal_idx = 0;
+               }
+               pal_idx *= tga_comp;
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = tga_palette[pal_idx+j];
+               }
+            } else if(tga_rgb16) {
+               STBI_ASSERT(tga_comp == STBI_rgb);
+               stbi__tga_read_rgb16(s, raw_data);
+            } else {
+               //   read in the data raw
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = stbi__get8(s);
+               }
+            }
+            //   clear the reading flag for the next pixel
+            read_next_pixel = 0;
+         } // end of reading a pixel
+
+         // copy data
+         for (j = 0; j < tga_comp; ++j)
+           tga_data[i*tga_comp+j] = raw_data[j];
+
+         //   in case we're in RLE mode, keep counting down
+         --RLE_count;
+      }
+      //   do I need to invert the image?
+      if ( tga_inverted )
+      {
+         // Swap row j with row (height-1-j), one byte at a time.
+         for (j = 0; j*2 < tga_height; ++j)
+         {
+            int index1 = j * tga_width * tga_comp;
+            int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
+            for (i = tga_width * tga_comp; i > 0; --i)
+            {
+               unsigned char temp = tga_data[index1];
+               tga_data[index1] = tga_data[index2];
+               tga_data[index2] = temp;
+               ++index1;
+               ++index2;
+            }
+         }
+      }
+      //   clear my palette, if I had one
+      if ( tga_palette != NULL )
+      {
+         STBI_FREE( tga_palette );
+      }
+   }
+
+   // swap RGB - if the source data was RGB16, it already is in the right order
+   if (tga_comp >= 3 && !tga_rgb16)
+   {
+      unsigned char* tga_pixel = tga_data;
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         unsigned char temp = tga_pixel[0];
+         tga_pixel[0] = tga_pixel[2];
+         tga_pixel[2] = temp;
+         tga_pixel += tga_comp;
+      }
+   }
+
+   // convert to target component count
+   if (req_comp && req_comp != tga_comp)
+      tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
+
+   //   the things I do to get rid of an error message, and yet keep
+   //   Microsoft's C compilers happy... [8^(
+   // (these header fields are otherwise unused; assigning them silences the warning)
+   tga_palette_start = tga_palette_len = tga_palette_bits =
+         tga_x_origin = tga_y_origin = 0;
+   //   OK, done
+   return tga_data;
+}
+#endif
+
+// *************************************************************************************************
+// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
+
+#ifndef STBI_NO_PSD
+// Reports whether the stream starts with the 32-bit big-endian PSD magic
+// value 0x38425053. The stream is rewound before returning in either case.
+static int stbi__psd_test(stbi__context *s)
+{
+   int is_psd = 0;
+   if (stbi__get32be(s) == 0x38425053)
+      is_psd = 1;
+   stbi__rewind(s);
+   return is_psd;
+}
+
+// Decodes one RLE-compressed PSD channel. Samples are written to every 4th
+// byte starting at p until exactly pixelCount samples have been produced.
+// Returns 1 on success, or 0 when a run would extend past the end of the
+// channel (corrupt data); nothing is written for the offending run.
+static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
+{
+   int count, nleft, len;
+
+   count = 0;
+   while ((nleft = pixelCount - count) > 0) {
+      len = stbi__get8(s);
+      if (len == 128) {
+         // No-op.
+      } else if (len < 128) {
+         // Copy next len+1 bytes literally.
+         len++;
+         if (len > nleft) return 0; // run overflows the channel: corrupt data
+         count += len;
+         while (len) {
+            *p = stbi__get8(s);
+            p += 4;
+            len--;
+         }
+      } else {
+         stbi_uc   val;
+         // Next -len+1 bytes in the dest are replicated from next source byte.
+         // (Interpret len as a negative 8-bit int: 257 - len == (len ^ 0xFF) + 2.)
+         len = 257 - len;
+         if (len > nleft) return 0; // run overflows the channel: corrupt data
+         val = stbi__get8(s);
+         count += len;
+         while (len) {
+            *p = val;
+            p += 4;
+            len--;
+         }
+      }
+   }
+
+   return 1;
+}
+
+// Loads an RGB-mode PSD image into a newly allocated buffer.
+//
+//   s         input stream positioned at the start of the file
+//   x, y      receive the image width and height
+//   comp      if non-NULL, receives 4 (the decoder always produces 4 channels)
+//   req_comp  requested channel count, or 0 to keep the decoded 4 channels
+//
+// Returns the pixel buffer, or the result of stbi__errpuc() on failure.
+// Channels missing from the file are filled with 0, except the fourth
+// (alpha) channel, which is filled with 255.
+static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   int   pixelCount;
+   int channelCount, compression;
+   int channel, i;
+   int bitdepth;
+   int w,h;
+   stbi_uc *out;
+
+   // Check identifier
+   if (stbi__get32be(s) != 0x38425053)   // "8BPS"
+      return stbi__errpuc("not PSD", "Corrupt PSD image");
+
+   // Check file type version.
+   if (stbi__get16be(s) != 1)
+      return stbi__errpuc("wrong version", "Unsupported version of PSD image");
+
+   // Skip 6 reserved bytes.
+   stbi__skip(s, 6 );
+
+   // Read the number of channels (R, G, B, A, etc).
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16)
+      return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
+
+   // Read the rows and columns of the image.
+   h = stbi__get32be(s);
+   w = stbi__get32be(s);
+
+   // The dimensions come straight from the file. Reject values that are
+   // negative once stored in an int, or whose product would overflow the
+   // 4*w*h size computation below.
+   if (w < 0 || h < 0 || (w > 0 && (1 << 28) / w < h))
+      return stbi__errpuc("too large", "Image too large to decode");
+
+   // Make sure the depth is 8 or 16 bits.
+   bitdepth = stbi__get16be(s);
+   if (bitdepth != 8 && bitdepth != 16)
+      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
+
+   // Make sure the color mode is RGB.
+   // Valid options are:
+   //   0: Bitmap
+   //   1: Grayscale
+   //   2: Indexed color
+   //   3: RGB color
+   //   4: CMYK color
+   //   7: Multichannel
+   //   8: Duotone
+   //   9: Lab color
+   if (stbi__get16be(s) != 3)
+      return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
+
+   // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
+   stbi__skip(s,stbi__get32be(s) );
+
+   // Skip the image resources.  (resolution, pen tool paths, etc)
+   stbi__skip(s, stbi__get32be(s) );
+
+   // Skip the reserved data.
+   stbi__skip(s, stbi__get32be(s) );
+
+   // Find out if the data is compressed.
+   // Known values:
+   //   0: no compression
+   //   1: RLE compressed
+   compression = stbi__get16be(s);
+   if (compression > 1)
+      return stbi__errpuc("bad compression", "PSD has an unknown compression format");
+
+   // Create the destination image.
+   out = (stbi_uc *) stbi__malloc(4 * w*h);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   pixelCount = w*h;
+
+   // Initialize the data to zero.
+   //memset( out, 0, pixelCount * 4 );
+
+   // Finally, the image data.
+   if (compression) {
+      // RLE as used by .PSD and .TIFF
+      // Loop until you get the number of unpacked bytes you are expecting:
+      //     Read the next source byte into n.
+      //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
+      //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
+      //     Else if n is 128, noop.
+      // Endloop
+      //
+      // NOTE(review): the RLE path decodes one byte per sample regardless of
+      // bitdepth -- confirm whether 16-bit RLE files need separate handling.
+
+      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
+      // which we're going to just skip.
+      stbi__skip(s, h * channelCount * 2 );
+
+      // Read the RLE data by channel.
+      for (channel = 0; channel < 4; channel++) {
+         stbi_uc *p;
+
+         p = out+channel;
+         if (channel >= channelCount) {
+            // Fill this channel with default data.
+            for (i = 0; i < pixelCount; i++, p += 4)
+               *p = (channel == 3 ? 255 : 0);
+         } else {
+            // Read the RLE data; a run that overflows the channel means the
+            // file is corrupt, so give up instead of writing past `out`.
+            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
+               STBI_FREE(out);
+               return stbi__errpuc("corrupt", "bad RLE data");
+            }
+         }
+      }
+
+   } else {
+      // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
+      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
+
+      // Read the data by channel.
+      for (channel = 0; channel < 4; channel++) {
+         stbi_uc *p;
+
+         p = out + channel;
+         if (channel >= channelCount) {
+            // Fill this channel with default data.
+            stbi_uc val = channel == 3 ? 255 : 0;
+            for (i = 0; i < pixelCount; i++, p += 4)
+               *p = val;
+         } else {
+            // Read the data.
+            if (bitdepth == 16) {
+               // keep only the high byte of each 16-bit sample
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = (stbi_uc) (stbi__get16be(s) >> 8);
+            } else {
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = stbi__get8(s);
+            }
+         }
+      }
+   }
+
+   if (channelCount >= 4) {
+      for (i=0; i < w*h; ++i) {
+         unsigned char *pixel = out + 4*i;
+         if (pixel[3] != 0 && pixel[3] != 255) {
+            // remove weird white matte from PSD
+            float a = pixel[3] / 255.0f;
+            float ra = 1.0f / a;
+            float inv_a = 255.0f * (1 - ra);
+            pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
+            pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
+            pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
+         }
+      }
+   }
+
+   if (req_comp && req_comp != 4) {
+      out = stbi__convert_format(out, 4, req_comp, w, h);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   if (comp) *comp = 4;
+   *y = h;
+   *x = w;
+
+   return out;
+}
+#endif
+
+// *************************************************************************************************
+// Softimage PIC loader
+// by Tom Seddon
+//
+// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format
+// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
+
+#ifndef STBI_NO_PIC
+// Reads bytes from the stream and compares them, in order, with the first
+// four characters of str. Returns 1 when all four match; returns 0 at the
+// first mismatch without reading the remaining bytes.
+static int stbi__pic_is4(stbi__context *s,const char *str)
+{
+   const char *stop = str + 4;
+   while (str != stop) {
+      if (stbi__get8(s) != (stbi_uc)*str)
+         return 0;
+      ++str;
+   }
+
+   return 1;
+}
+
+// Checks the PIC signature: a 4-byte magic number, 84 bytes that are read
+// and ignored, then the tag "PICT". Returns 1 on a match, 0 otherwise.
+// The stream is not rewound here.
+static int stbi__pic_test_core(stbi__context *s)
+{
+   int skipped;
+
+   if (stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      for (skipped = 0; skipped != 84; ++skipped)
+         stbi__get8(s);
+
+      if (stbi__pic_is4(s,"PICT"))
+         return 1;
+   }
+
+   return 0;
+}
+
+// Descriptor of one PIC channel packet, as read from the packet header.
+typedef struct
+{
+   // bits per channel; the loaders in this file reject anything but 8
+   stbi_uc size;
+   // compression type: 0 = uncompressed, 1 = pure RLE, 2 = mixed RLE
+   stbi_uc type;
+   // channel bit mask: 0x80 selects byte 0 of a pixel, 0x10 selects byte 3
+   stbi_uc channel;
+} stbi__pic_packet;
+
+// Reads one byte from the stream for each of the four top bits set in
+// `channel` (0x80, 0x40, 0x20, 0x10) and stores it in dest[0..3]
+// respectively. Slots whose bit is clear are left untouched. Returns dest,
+// or the result of stbi__errpuc() if the stream ends first.
+static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
+{
+   int slot = 0;
+   int bit;
+
+   for (bit = 0x80; slot < 4; bit >>= 1, ++slot) {
+      if (!(channel & bit))
+         continue;
+      if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
+      dest[slot]=stbi__get8(s);
+   }
+
+   return dest;
+}
+
+// Copies src[i] to dest[i] for each of the four slots whose bit is set in
+// `channel` (0x80 -> slot 0, 0x40 -> slot 1, 0x20 -> slot 2, 0x10 -> slot 3).
+static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
+{
+   if (channel & 0x80) dest[0] = src[0];
+   if (channel & 0x40) dest[1] = src[1];
+   if (channel & 0x20) dest[2] = src[2];
+   if (channel & 0x10) dest[3] = src[3];
+}
+
+// Reads the PIC packet list and then the pixel data for a width x height
+// image into `result`, which is addressed as 4 bytes per pixel
+// (result + y*width*4). Only the bytes selected by each packet's channel
+// mask are written. *comp is set to 4 if any packet's mask has bit 0x10
+// set, else 3. Returns `result` on success; on failure returns the result
+// of stbi__errpuc() or 0 (callers test it with `!`) and does not free
+// `result`.
+static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
+{
+   int act_comp=0,num_packets=0,y,chained;
+   stbi__pic_packet packets[10];
+
+   // this will (should...) cater for even some bizarre stuff like having data
+   // for the same channel in multiple packets.
+   do {
+      stbi__pic_packet *packet;
+
+      // refuse to read more packet headers than the fixed-size array can hold
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
+
+      packet = &packets[num_packets++];
+
+      // each packet header is 4 bytes; a nonzero first byte means another
+      // packet header follows
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+
+      // accumulate the union of all channel masks
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+
+   // for every scanline, every packet contributes its channels in turn
+   for(y=0; y<height; ++y) {
+      int packet_idx;
+
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         // each packet starts again at the beginning of the current row
+         stbi_uc *dest = result+y*width*4;
+
+         switch (packet->type) {
+            default:
+               return stbi__errpuc("bad format","packet has bad compression type");
+
+            case 0: {//uncompressed
+               int x;
+
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
+            }
+
+            case 1://Pure RLE
+               {
+                  int left=width, i;
+
+                  while (left>0) {
+                     stbi_uc count,value[4];
+
+                     count=stbi__get8(s);
+                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
+
+                     // a run longer than the rest of the row is clamped, not rejected
+                     if (count > left)
+                        count = (stbi_uc) left;
+
+                     if (!stbi__readval(s,packet->channel,value))  return 0;
+
+                     for(i=0; i<count; ++i,dest+=4)
+                        stbi__copyval(packet->channel,dest,value);
+                     left -= count;
+                  }
+               }
+               break;
+
+            case 2: {//Mixed RLE
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     // 128 means the real run length follows as a 16-bit
+                     // big-endian value; 129..255 encode lengths 2..128
+                     if (count==128)
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
+                        return 0;
+
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     // 0..127 encode 1..128 literal pixels
+                     ++count;
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
+
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+// Loads a Softimage PIC image.
+//
+//   s         input stream positioned at the start of the file
+//   px, py    receive the image width and height
+//   comp      receives the channel count found in the file (3 or 4)
+//   req_comp  requested channel count, or 0 to use the file's own count
+//
+// Returns a newly allocated pixel buffer, or a null result (via
+// stbi__errpuc() or 0) on failure.
+static stbi_uc *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp)
+{
+   stbi_uc *result;
+   int i, x,y;
+
+   // skip the 92 header bytes that precede the dimensions
+   for (i=0; i<92; ++i)
+      stbi__get8(s);
+
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
+   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
+   // x == 0 must be excluded before dividing by it (stbi__pic_info does the same)
+   if (x != 0 && (1 << 28) / x < y) return stbi__errpuc("too large", "Image too large to decode");
+
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
+
+   // intermediate buffer is RGBA
+   result = (stbi_uc *) stbi__malloc(x*y*4);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   memset(result, 0xff, x*y*4);
+
+   *px = x;
+   *py = y;
+
+   // On failure stop here: *comp may not have been written, and
+   // stbi__convert_format must not be handed a null buffer.
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      STBI_FREE(result);
+      return 0;
+   }
+
+   if (req_comp == 0) req_comp = *comp;
+   result=stbi__convert_format(result,4,req_comp,x,y);
+
+   return result;
+}
+
+// Runs the PIC signature check and rewinds the stream afterwards.
+// Returns whatever stbi__pic_test_core() returned.
+static int stbi__pic_test(stbi__context *s)
+{
+   int looks_like_pic = stbi__pic_test_core(s);
+   stbi__rewind(s);
+   return looks_like_pic;
+}
+#endif
+
+// *************************************************************************************************
+// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
+
+#ifndef STBI_NO_GIF
+// One entry of the GIF LZW code table (see stbi__gif.codes).
+typedef struct
+{
+   // index of the code this entry extends; -1 for the initial root codes
+   stbi__int16 prefix;
+   // copied from the prefix entry's `first`; for root codes, the code itself
+   stbi_uc first;
+   // used as the color-table index when this code is output
+   stbi_uc suffix;
+} stbi__gif_lzw;
+
+// Decoder state for one GIF stream.
+typedef struct
+{
+   // image size read from the GIF header
+   int w;
+   int h;
+   // output buffer (always 4 components)
+   stbi_uc *out;
+   // earlier frame buffer, recorded by the "do not dispose" case
+   stbi_uc *old_out;
+   // header bytes: flags, background color index, aspect ratio
+   int flags;
+   int bgindex;
+   int ratio;
+   // transparent color index; -1 until a graphic control extension sets it
+   int transparent;
+   // flags and delay from the most recent graphic control extension
+   int eflags;
+   int delay;
+   // global color table (read when flags & 0x80)
+   stbi_uc  pal[256][4];
+   // local color table (read when lflags & 0x80)
+   stbi_uc lpal[256][4];
+   // LZW code table
+   stbi__gif_lzw codes[4096];
+   // color table in use for the current image: pal or lpal
+   stbi_uc *color_table;
+   // interlace state: passes remaining and current row step in bytes
+   int parse;
+   int step;
+   // flags byte of the current image descriptor
+   int lflags;
+   // image rectangle and write cursor, all as byte offsets into `out`
+   // (x values are column*4, y values are row*line_size)
+   int start_x;
+   int start_y;
+   int max_x;
+   int max_y;
+   int cur_x;
+   int cur_y;
+   // bytes per output row (w * 4)
+   int line_size;
+} stbi__gif;
+
+// Checks for the 6-byte signature "GIF87a" or "GIF89a", reading one byte at
+// a time and stopping at the first mismatch. Returns 1 on a match, else 0.
+// The stream is not rewound here.
+static int stbi__gif_test_raw(stbi__context *s)
+{
+   int version_digit;
+   if (stbi__get8(s) != 'G') return 0;
+   if (stbi__get8(s) != 'I') return 0;
+   if (stbi__get8(s) != 'F') return 0;
+   if (stbi__get8(s) != '8') return 0;
+   version_digit = stbi__get8(s);
+   if (version_digit != '9' && version_digit != '7') return 0;
+   return stbi__get8(s) == 'a';
+}
+
+// Runs the GIF signature check and rewinds the stream afterwards.
+// Returns whatever stbi__gif_test_raw() returned.
+static int stbi__gif_test(stbi__context *s)
+{
+   int looks_like_gif = stbi__gif_test_raw(s);
+   stbi__rewind(s);
+   return looks_like_gif;
+}
+
+// Reads num_entries 3-byte colors from the stream into pal. The three bytes
+// of each color are stored in reverse order (into slots 2, 1, 0). Slot 3 is
+// set to 0 for the entry whose index equals transp and to 255 for all others.
+static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
+{
+   int idx = 0;
+   while (idx < num_entries) {
+      stbi_uc *entry = pal[idx];
+      entry[2] = stbi__get8(s);
+      entry[1] = stbi__get8(s);
+      entry[0] = stbi__get8(s);
+      if (idx == transp)
+         entry[3] = 0;
+      else
+         entry[3] = 255;
+      ++idx;
+   }
+}
+
+// Parses the GIF signature and the fixed-size header fields into g.
+//
+//   comp     if non-NULL, receives 4
+//   is_info  when nonzero, stop after the fixed fields; the global color
+//            table is not read
+//
+// Returns 1 on success, or the result of stbi__err() on a bad signature.
+static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
+{
+   stbi_uc version;
+
+   // signature bytes are consumed one at a time, stopping at the first mismatch
+   if (stbi__get8(s) != 'G') return stbi__err("not GIF", "Corrupt GIF");
+   if (stbi__get8(s) != 'I') return stbi__err("not GIF", "Corrupt GIF");
+   if (stbi__get8(s) != 'F') return stbi__err("not GIF", "Corrupt GIF");
+   if (stbi__get8(s) != '8') return stbi__err("not GIF", "Corrupt GIF");
+
+   version = stbi__get8(s);
+   if (version != '7' && version != '9')    return stbi__err("not GIF", "Corrupt GIF");
+   if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
+
+   stbi__g_failure_reason = "";
+   g->w = stbi__get16le(s);
+   g->h = stbi__get16le(s);
+   g->flags = stbi__get8(s);
+   g->bgindex = stbi__get8(s);
+   g->ratio = stbi__get8(s);
+   g->transparent = -1;
+
+   if (comp) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
+
+   // bit 0x80 of flags marks a global color table of 2 << (flags & 7) entries
+   if (!is_info && (g->flags & 0x80))
+      stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
+
+   return 1;
+}
+
+// Reads only the GIF header to report image dimensions.
+//
+//   x, y   if non-NULL, receive the width and height
+//   comp   forwarded to stbi__gif_header()
+//
+// Returns 1 on success. Returns 0 (after rewinding the stream) when the
+// header is not a valid GIF header, and the result of stbi__err() when the
+// temporary decoder state cannot be allocated.
+static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   // stbi__gif_header() writes through g, so a failed allocation must be caught first
+   if (!g) return stbi__err("outofmem", "Out of memory");
+   if (!stbi__gif_header(s, g, comp, 1)) {
+      STBI_FREE(g);
+      stbi__rewind( s );
+      return 0;
+   }
+   if (x) *x = g->w;
+   if (y) *y = g->h;
+   STBI_FREE(g);
+   return 1;
+}
+
+// Emits the pixel string for one LZW code into g->out at the current write
+// cursor, then advances the cursor (wrapping to the next row, and to the next
+// interlace pass when the rows of the current pass are exhausted).
+static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
+{
+   stbi_uc *dst, *color;
+   stbi__gif_lzw *entry = &g->codes[code];
+
+   // recurse to decode the prefixes, since the linked-list is backwards,
+   // and working backwards through an interleaved image would be nasty
+   if (entry->prefix >= 0)
+      stbi__out_gif_code(g, entry->prefix);
+
+   // nothing more to draw once the cursor has left the image rectangle
+   if (g->cur_y >= g->max_y) return;
+
+   dst = &g->out[g->cur_x + g->cur_y];
+   color = &g->color_table[entry->suffix * 4];
+
+   // entries whose 4th byte is below 128 are skipped, leaving the
+   // destination pixel as it was
+   if (color[3] >= 128) {
+      dst[0] = color[2];
+      dst[1] = color[1];
+      dst[2] = color[0];
+      dst[3] = color[3];
+   }
+   g->cur_x += 4;
+
+   if (g->cur_x < g->max_x)
+      return;
+
+   // end of row: back to the left edge, down by the current step
+   g->cur_x = g->start_x;
+   g->cur_y += g->step;
+
+   while (g->cur_y >= g->max_y && g->parse > 0) {
+      g->step = (1 << g->parse) * g->line_size;
+      g->cur_y = g->start_y + (g->step >> 1);
+      --g->parse;
+   }
+}
+
+// Decodes the LZW-compressed raster data of one GIF image from the stream,
+// emitting pixels through stbi__out_gif_code(). Returns g->out when the data
+// ends (a zero-length sub-block or the end-of-stream code), NULL when the
+// initial code size byte exceeds 12, or the result of stbi__errpuc() when
+// the code stream is invalid.
+static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
+{
+   stbi_uc lzw_cs;
+   stbi__int32 len, init_code;
+   stbi__uint32 first;
+   stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
+   stbi__gif_lzw *p;
+
+   // first byte of the raster data: the initial LZW code size
+   lzw_cs = stbi__get8(s);
+   if (lzw_cs > 12) return NULL;
+   // the clear code is 1 << lzw_cs; the end-of-stream code is clear + 1
+   clear = 1 << lzw_cs;
+   first = 1;
+   codesize = lzw_cs + 1;
+   codemask = (1 << codesize) - 1;
+   bits = 0;
+   valid_bits = 0;
+   // root codes below `clear` each expand to a single byte equal to the code
+   for (init_code = 0; init_code < clear; init_code++) {
+      g->codes[init_code].prefix = -1;
+      g->codes[init_code].first = (stbi_uc) init_code;
+      g->codes[init_code].suffix = (stbi_uc) init_code;
+   }
+
+   // support no starting clear code
+   avail = clear+2;
+   oldcode = -1;
+
+   // len counts the bytes left in the current data sub-block
+   len = 0;
+   for(;;) {
+      if (valid_bits < codesize) {
+         // not enough buffered bits for a code: pull in one more byte
+         if (len == 0) {
+            len = stbi__get8(s); // start new block
+            if (len == 0)
+               return g->out;
+         }
+         --len;
+         bits |= (stbi__int32) stbi__get8(s) << valid_bits;
+         valid_bits += 8;
+      } else {
+         // extract the next code from the low end of the bit buffer
+         stbi__int32 code = bits & codemask;
+         bits >>= codesize;
+         valid_bits -= codesize;
+         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
+         if (code == clear) {  // clear code
+            // reset the table to just the root codes
+            codesize = lzw_cs + 1;
+            codemask = (1 << codesize) - 1;
+            avail = clear + 2;
+            oldcode = -1;
+            first = 0;
+         } else if (code == clear + 1) { // end of stream code
+            // skip the rest of this sub-block and any following ones up to
+            // the zero-length terminator
+            stbi__skip(s, len);
+            while ((len = stbi__get8(s)) > 0)
+               stbi__skip(s,len);
+            return g->out;
+         } else if (code <= avail) {
+            // a data code before any clear code has been seen is rejected
+            if (first) return stbi__errpuc("no clear code", "Corrupt GIF");
+
+            if (oldcode >= 0) {
+               // add a table entry: the previous string extended by the
+               // first byte of the current one
+               p = &g->codes[avail++];
+               if (avail > 4096)        return stbi__errpuc("too many codes", "Corrupt GIF");
+               p->prefix = (stbi__int16) oldcode;
+               p->first = g->codes[oldcode].first;
+               p->suffix = (code == avail) ? p->first : g->codes[code].first;
+            } else if (code == avail)
+               return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+
+            stbi__out_gif_code(g, (stbi__uint16) code);
+
+            // grow the code width once the table fills the current width
+            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
+               codesize++;
+               codemask = (1 << codesize) - 1;
+            }
+
+            oldcode = code;
+         } else {
+            return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+         }
+      }
+   }
+}
+
+// Fills a rectangle of g->out with the background palette color and a 4th
+// byte of 0. All four bounds are byte offsets: x0/x1 are column*4 and y0/y1
+// are multiples of the row stride 4*g->w; x1 and y1 are exclusive.
+static void stbi__fill_gif_background(stbi__gif *g, int x0, int y0, int x1, int y1)
+{
+   const int stride = 4 * g->w;
+   stbi_uc *bg = g->pal[g->bgindex];
+   int row, col;
+   for (row = y0; row < y1; row += stride) {
+      for (col = x0; col < x1; col += 4) {
+         stbi_uc *px = &g->out[row + col];
+         px[0] = bg[2];
+         px[1] = bg[1];
+         px[2] = bg[0];
+         px[3] = 0;
+      }
+   }
+}
+
+// this function is designed to support animated gifs, although stb_image doesn't support it
+//
+// Decodes the next image of a GIF stream into a freshly allocated g->out.
+// On the first call (g->out == 0) the file header is parsed first.
+// Returns the decoded buffer, the stream pointer `s` cast to stbi_uc* as an
+// end-of-stream marker when the 0x3B trailer is reached, or a null/error
+// result on failure. req_comp is not used.
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp)
+{
+   int i;
+   stbi_uc *prev_out = 0;
+
+   if (g->out == 0 && !stbi__gif_header(s, g, comp,0))
+      return 0; // stbi__g_failure_reason set by stbi__gif_header
+
+   // NOTE(review): prev_out is never freed in this function -- confirm who
+   // owns earlier frame buffers when this is called more than once.
+   prev_out = g->out;
+   g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
+   if (g->out == 0) return stbi__errpuc("outofmem", "Out of memory");
+
+   // bits 2..4 of eflags select how the new buffer is initialized
+   // NOTE(review): values 4..7 match no case and leave g->out as allocated;
+   // case 3 does the same when old_out is null -- confirm this is intended.
+   switch ((g->eflags & 0x1C) >> 2) {
+      case 0: // unspecified (also always used on 1st frame)
+         stbi__fill_gif_background(g, 0, 0, 4 * g->w, 4 * g->w * g->h);
+         break;
+      case 1: // do not dispose
+         if (prev_out) memcpy(g->out, prev_out, 4 * g->w * g->h);
+         g->old_out = prev_out;
+         break;
+      case 2: // dispose to background
+         if (prev_out) memcpy(g->out, prev_out, 4 * g->w * g->h);
+         stbi__fill_gif_background(g, g->start_x, g->start_y, g->max_x, g->max_y);
+         break;
+      case 3: // dispose to previous
+         if (g->old_out) {
+            // copy only the previous image rectangle, one row at a time
+            for (i = g->start_y; i < g->max_y; i += 4 * g->w)
+               memcpy(&g->out[i + g->start_x], &g->old_out[i + g->start_x], g->max_x - g->start_x);
+         }
+         break;
+   }
+
+   // walk the stream's blocks until an image is decoded or the stream ends
+   for (;;) {
+      switch (stbi__get8(s)) {
+         case 0x2C: /* Image Descriptor */
+         {
+            int prev_trans = -1;
+            stbi__int32 x, y, w, h;
+            stbi_uc *o;
+
+            x = stbi__get16le(s);
+            y = stbi__get16le(s);
+            w = stbi__get16le(s);
+            h = stbi__get16le(s);
+            // the image rectangle must lie inside the canvas
+            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
+               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+
+            // all positions below are byte offsets into g->out
+            g->line_size = g->w * 4;
+            g->start_x = x * 4;
+            g->start_y = y * g->line_size;
+            g->max_x   = g->start_x + w * 4;
+            g->max_y   = g->start_y + h * g->line_size;
+            g->cur_x   = g->start_x;
+            g->cur_y   = g->start_y;
+
+            g->lflags = stbi__get8(s);
+
+            // bit 0x40 of lflags: rows are stored interlaced
+            if (g->lflags & 0x40) {
+               g->step = 8 * g->line_size; // first interlaced spacing
+               g->parse = 3;
+            } else {
+               g->step = g->line_size;
+               g->parse = 0;
+            }
+
+            // bit 0x80 of lflags: a local color table follows; otherwise
+            // fall back to the global table if the header provided one
+            if (g->lflags & 0x80) {
+               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
+               g->color_table = (stbi_uc *) g->lpal;
+            } else if (g->flags & 0x80) {
+               // temporarily mark the transparent entry of the global table;
+               // its previous 4th byte is restored after decoding
+               if (g->transparent >= 0 && (g->eflags & 0x01)) {
+                  prev_trans = g->pal[g->transparent][3];
+                  g->pal[g->transparent][3] = 0;
+               }
+               g->color_table = (stbi_uc *) g->pal;
+            } else
+               return stbi__errpuc("missing color table", "Corrupt GIF");
+
+            o = stbi__process_gif_raster(s, g);
+            if (o == NULL) return NULL;
+
+            if (prev_trans != -1)
+               g->pal[g->transparent][3] = (stbi_uc) prev_trans;
+
+            return o;
+         }
+
+         case 0x21: // Comment Extension.
+         {
+            int len;
+            if (stbi__get8(s) == 0xF9) { // Graphic Control Extension.
+               len = stbi__get8(s);
+               if (len == 4) {
+                  g->eflags = stbi__get8(s);
+                  g->delay = stbi__get16le(s);
+                  g->transparent = stbi__get8(s);
+               } else {
+                  stbi__skip(s, len);
+                  break;
+               }
+            }
+            // skip the remaining sub-blocks up to the zero-length terminator
+            while ((len = stbi__get8(s)) != 0)
+               stbi__skip(s, len);
+            break;
+         }
+
+         case 0x3B: // gif stream termination code
+            return (stbi_uc *) s; // using '1' causes warning on some compilers
+
+         default:
+            return stbi__errpuc("unknown code", "Corrupt GIF");
+      }
+   }
+
+   STBI_NOTUSED(req_comp);
+}
+
+// Loads the first image of a GIF file.
+//
+//   s         input stream positioned at the start of the file
+//   x, y      receive the image width and height on success
+//   comp      forwarded to the header parser
+//   req_comp  requested channel count, or 0 to keep the decoded 4 channels
+//
+// Returns a newly allocated pixel buffer, or a null result on failure
+// (including a stream that ends before any image is found).
+static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi_uc *u = 0;
+   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   // the state is zeroed and written through below, so a failed allocation
+   // must be caught before it is used
+   if (!g) return stbi__errpuc("outofmem", "Out of memory");
+   memset(g, 0, sizeof(*g));
+
+   u = stbi__gif_load_next(s, g, comp, req_comp);
+   if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
+   if (u) {
+      *x = g->w;
+      *y = g->h;
+      if (req_comp && req_comp != 4)
+         u = stbi__convert_format(u, 4, req_comp, g->w, g->h);
+   }
+   else if (g->out)
+      STBI_FREE(g->out);
+   STBI_FREE(g);
+   return u;
+}
+
+// Thin wrapper: forwards to stbi__gif_info_raw() and returns its result.
+static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int ok = stbi__gif_info_raw(s,x,y,comp);
+   return ok;
+}
+#endif
+
+// *************************************************************************************************
+// Radiance RGBE HDR loader
+// originally by Nicolas Schulz
+#ifndef STBI_NO_HDR
+// Compares the start of the stream, byte by byte, with "#?RADIANCE\n".
+// Returns 1 when the whole signature matches, 0 at the first mismatch.
+// The stream is not rewound here.
+static int stbi__hdr_test_core(stbi__context *s)
+{
+   const char *expected = "#?RADIANCE\n";
+   while (*expected) {
+      if (stbi__get8(s) != *expected)
+         return 0;
+      ++expected;
+   }
+   return 1;
+}
+
+// Runs the HDR signature check and rewinds the stream afterwards.
+// Returns whatever stbi__hdr_test_core() returned.
+static int stbi__hdr_test(stbi__context* s)
+{
+   int looks_like_hdr = stbi__hdr_test_core(s);
+   stbi__rewind(s);
+   return looks_like_hdr;
+}
+
+#define STBI__HDR_BUFLEN  1024
+// Reads one line of text from the stream into buffer (without the '\n') and
+// NUL-terminates it. At most STBI__HDR_BUFLEN-1 characters are stored; the
+// rest of an over-long line is read and discarded. Returns buffer.
+static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
+{
+   int used = 0;
+   char ch = (char) stbi__get8(z);
+
+   for (;;) {
+      if (stbi__at_eof(z) || ch == '\n')
+         break;
+      buffer[used++] = ch;
+      if (used == STBI__HDR_BUFLEN-1) {
+         // flush to end of line
+         while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
+            ;
+         break;
+      }
+      ch = (char) stbi__get8(z);
+   }
+
+   buffer[used] = 0;
+   return buffer;
+}
+
+// Converts one 4-byte RGBE pixel (input[0..2] mantissas, input[3] exponent)
+// to req_comp floats in output. For req_comp of 1 or 2 the three color
+// values are averaged into output[0]; for 2 or 4 the last component is set
+// to 1. An exponent byte of 0 yields zero color values.
+static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
+{
+   float scale;
+
+   if (input[3] == 0) {
+      // zero exponent: color components are 0, extra component (if any) is 1
+      if (req_comp == 4) {
+         output[3] = 1;
+         output[0] = output[1] = output[2] = 0;
+      } else if (req_comp == 3) {
+         output[0] = output[1] = output[2] = 0;
+      } else if (req_comp == 2) {
+         output[1] = 1;
+         output[0] = 0;
+      } else if (req_comp == 1) {
+         output[0] = 0;
+      }
+      return;
+   }
+
+   // Exponent
+   scale = (float) ldexp(1.0f, input[3] - (int)(128 + 8));
+   if (req_comp > 2) {
+      output[0] = input[0] * scale;
+      output[1] = input[1] * scale;
+      output[2] = input[2] * scale;
+   } else {
+      output[0] = (input[0] + input[1] + input[2]) * scale / 3;
+   }
+   if (req_comp == 2) output[1] = 1;
+   if (req_comp == 4) output[3] = 1;
+}
+
+// Loads a Radiance RGBE (.hdr) image as floats.
+//
+//   s         input stream positioned at the start of the file
+//   x, y      receive the image width and height
+//   comp      if non-NULL, receives 3
+//   req_comp  requested channel count, or 0 for 3
+//
+// Returns a newly allocated array of width*height*req_comp floats, or the
+// result of stbi__errpf() on failure. Only the "32-bit_rle_rgbe" format with
+// a "-Y <h> +X <w>" resolution line is accepted.
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int width, height;
+   stbi_uc *scanline;
+   float *hdr_data;
+   int len;
+   unsigned char count, value;
+   int i, j, k, c1,c2, z;
+
+
+   // Check identifier
+   if (strcmp(stbi__hdr_gettoken(s,buffer), "#?RADIANCE") != 0)
+      return stbi__errpf("not HDR", "Corrupt HDR image");
+
+   // Parse header
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break;
+      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+   }
+
+   if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
+
+   // Parse width and height
+   // can't use sscanf() if we're not using stdio!
+   token = stbi__hdr_gettoken(s,buffer);
+   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   height = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   width = (int) strtol(token, NULL, 10);
+
+   // The dimensions are untrusted text. Reject negative values and sizes
+   // whose byte count would overflow the allocation arithmetic below.
+   if (width < 0 || height < 0)
+      return stbi__errpf("bad dimensions", "Corrupt HDR image");
+   if (width > 0 && height > 0 && (1 << 28) / width <= height)
+      return stbi__errpf("too large", "Image too large to decode");
+
+   *x = width;
+   *y = height;
+
+   if (comp) *comp = 3;
+   if (req_comp == 0) req_comp = 3;
+
+   // Read data
+   hdr_data = (float *) stbi__malloc(height * width * req_comp * sizeof(float));
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
+
+   // Load image data
+   // image data is stored either as flat RGBE pixels or as run-length
+   // encoded scanlines; RLE is only attempted for widths in [8, 32768)
+   if ( width < 8 || width >= 32768) {
+      // Read flat data
+      for (j=0; j < height; ++j) {
+         for (i=0; i < width; ++i) {
+            stbi_uc rgbe[4];
+           main_decode_loop:
+            stbi__getn(s, rgbe, 4);
+            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
+         }
+      }
+   } else {
+      // Read RLE-encoded data
+      scanline = NULL;
+
+      for (j = 0; j < height; ++j) {
+         c1 = stbi__get8(s);
+         c2 = stbi__get8(s);
+         len = stbi__get8(s);
+         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+            // not run-length encoded, so we have to actually use THIS data as a decoded
+            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+            stbi_uc rgbe[4];
+            rgbe[0] = (stbi_uc) c1;
+            rgbe[1] = (stbi_uc) c2;
+            rgbe[2] = (stbi_uc) len;
+            rgbe[3] = (stbi_uc) stbi__get8(s);
+            stbi__hdr_convert(hdr_data, rgbe, req_comp);
+            // continue in the flat loop at pixel 1 of row 0
+            i = 1;
+            j = 0;
+            STBI_FREE(scanline);
+            goto main_decode_loop; // yes, this makes no sense
+         }
+         len <<= 8;
+         len |= stbi__get8(s);
+         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc(width * 4);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
+
+         // the four components of a scanline are stored one after another,
+         // each separately run-length encoded
+         for (k = 0; k < 4; ++k) {
+            int nleft;
+            i = 0;
+            while ((nleft = width - i) > 0) {
+               count = stbi__get8(s);
+               if (count > 128) {
+                  // Run
+                  value = stbi__get8(s);
+                  count -= 128;
+                  // a run longer than the rest of the scanline would write
+                  // past the end of `scanline`
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = value;
+               } else {
+                  // Dump
+                  // a zero-length dump makes no progress (and is what a
+                  // truncated file produces), so treat it as corrupt too
+                  if (count == 0 || count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = stbi__get8(s);
+               }
+            }
+         }
+         for (i=0; i < width; ++i)
+            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
+      }
+      STBI_FREE(scanline);
+   }
+
+   return hdr_data;
+}
+
+// Reads an HDR header to report the image dimensions without decoding.
+// On success stores the height in *y, the width in *x and 3 in *comp, and
+// returns 1. On any failure the stream is rewound and 0 is returned; *y may
+// already have been written if the failure is in the "+X" part.
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   char line_buf[STBI__HDR_BUFLEN];
+   char *cursor;
+   int has_rgbe_format = 0;
+
+   if (!stbi__hdr_test(s)) {
+       stbi__rewind( s );
+       return 0;
+   }
+
+   // scan header lines up to the first empty one
+   while (*(cursor = stbi__hdr_gettoken(s,line_buf)) != 0) {
+      if (strcmp(cursor, "FORMAT=32-bit_rle_rgbe") == 0)
+         has_rgbe_format = 1;
+   }
+
+   if (has_rgbe_format) {
+      // resolution line: "-Y <height> +X <width>"
+      cursor = stbi__hdr_gettoken(s,line_buf);
+      if (strncmp(cursor, "-Y ", 3) == 0) {
+         cursor += 3;
+         *y = (int) strtol(cursor, &cursor, 10);
+         while (*cursor == ' ') ++cursor;
+         if (strncmp(cursor, "+X ", 3) == 0) {
+            cursor += 3;
+            *x = (int) strtol(cursor, NULL, 10);
+            *comp = 3;
+            return 1;
+         }
+      }
+   }
+
+   stbi__rewind( s );
+   return 0;
+}
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_BMP
+// Parses a BMP header to report the image dimensions without decoding.
+// The stream is rewound whether or not parsing succeeds. On success stores
+// s->img_x / s->img_y in *x / *y, sets *comp to 4 when the parsed header's
+// `ma` field is nonzero and 3 otherwise, and returns 1. Returns 0 if the
+// header parser fails.
+static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__bmp_data header;
+   void *parsed;
+
+   header.all_a = 255;
+   parsed = stbi__bmp_parse_header(s, &header);
+   stbi__rewind( s );
+   if (!parsed)
+      return 0;
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (header.ma)
+      *comp = 4;
+   else
+      *comp = 3;
+   return 1;
+}
+#endif
+
+#ifndef STBI_NO_PSD
+// Reads a PSD header to report the image dimensions without decoding.
+// On success stores the height in *y, the width in *x and 4 in *comp, and
+// returns 1 (the stream is left positioned after the color-mode field).
+// On failure the stream is rewound and 0 is returned. Files are accepted
+// under the same header conditions as stbi__psd_load(): signature, version 1,
+// at most 16 channels, 8 or 16 bits per channel, RGB color mode.
+static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int channelCount, depth;
+   if (stbi__get32be(s) != 0x38425053) {
+       stbi__rewind( s );
+       return 0;
+   }
+   if (stbi__get16be(s) != 1) {
+       stbi__rewind( s );
+       return 0;
+   }
+   stbi__skip(s, 6);
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   *y = stbi__get32be(s);
+   *x = stbi__get32be(s);
+   // the loader accepts both 8- and 16-bit files, so the info query must too
+   depth = stbi__get16be(s);
+   if (depth != 8 && depth != 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   // color mode 3 is the one the loader accepts as RGB
+   if (stbi__get16be(s) != 3) {
+       stbi__rewind( s );
+       return 0;
+   }
+   *comp = 4;
+   return 1;
+}
+#endif
+
+#ifndef STBI_NO_PIC
+// Reads a PIC header and packet list to report dimensions and channel count
+// without decoding pixels. On success stores the width in *x, the height in
+// *y and 3 or 4 in *comp (4 when any packet's channel mask has bit 0x10
+// set), and returns 1. On every failure the stream is rewound and 0 is
+// returned; *x and *y may already have been written.
+static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int act_comp=0,num_packets=0,chained;
+   stbi__pic_packet packets[10];
+
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      stbi__rewind(s);
+      return 0;
+   }
+
+   stbi__skip(s, 88);
+
+   *x = stbi__get16be(s);
+   *y = stbi__get16be(s);
+   if (stbi__at_eof(s)) {
+      stbi__rewind( s);
+      return 0;
+   }
+   if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
+      stbi__rewind( s );
+      return 0;
+   }
+
+   stbi__skip(s, 8);
+
+   do {
+      stbi__pic_packet *packet;
+
+      // too many packets for the fixed-size array: fail like every other
+      // error path here, i.e. rewind so the next format probe starts clean
+      if (num_packets==sizeof(packets)/sizeof(packets[0])) {
+         stbi__rewind( s );
+         return 0;
+      }
+
+      packet = &packets[num_packets++];
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s)) {
+          stbi__rewind( s );
+          return 0;
+      }
+      if (packet->size != 8) {
+          stbi__rewind( s );
+          return 0;
+      }
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3);
+
+   return 1;
+}
+#endif
+
+// *************************************************************************************************
+// Portable Gray Map and Portable Pixel Map loader
+// by Ken Miller
+//
+// PGM: http://netpbm.sourceforge.net/doc/pgm.html
+// PPM: http://netpbm.sourceforge.net/doc/ppm.html
+//
+// Known limitations:
+//    Header comments ('#' to end of line) are only recognized between header fields
+//    Does not support ASCII image data (formats P2 and P3)
+//    Does not support 16-bit-per-channel
+
+#ifndef STBI_NO_PNM
+
+static int      stbi__pnm_test(stbi__context *s)
+{
+   // Accepts the stream when its first two bytes are "P5" or "P6".  The
+   // stream is rewound only when the check fails.
+   char magic0 = (char) stbi__get8(s);
+   char magic1 = (char) stbi__get8(s);
+
+   if (magic0 == 'P' && (magic1 == '5' || magic1 == '6'))
+      return 1;
+
+   stbi__rewind( s );
+   return 0;
+}
+
+static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   // Loads a binary PGM/PPM image.  The header is parsed by stbi__pnm_info
+   // (which fills s->img_x, s->img_y and s->img_n), then the raw samples are
+   // read in one piece and optionally converted to req_comp components.
+   // Returns the pixel buffer, or NULL/0 on failure.
+   const unsigned int max_bytes = 0x7fffffff; // the byte count is also passed to stbi__getn as an int
+   unsigned int pixels, total;
+   stbi_uc *out;
+   if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
+      return 0;
+   *x = s->img_x;
+   *y = s->img_y;
+   *comp = s->img_n;
+
+   // The dimensions come straight from the file header; make sure
+   // img_n * img_x * img_y cannot wrap before it is used as a size.
+   if (s->img_y != 0 && s->img_x > max_bytes / s->img_y)
+      return stbi__errpuc("too large", "PNM too large");
+   pixels = s->img_x * s->img_y;
+   if (pixels != 0 && (unsigned int) s->img_n > max_bytes / pixels)
+      return stbi__errpuc("too large", "PNM too large");
+   total = (unsigned int) s->img_n * pixels;
+
+   out = (stbi_uc *) stbi__malloc(total);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   // a short read would otherwise hand back partly uninitialized pixels
+   if (!stbi__getn(s, out, (int) total)) {
+      STBI_FREE(out);
+      return stbi__errpuc("bad PNM", "PNM file truncated");
+   }
+
+   if (req_comp && req_comp != s->img_n) {
+      out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+   return out;
+}
+
+static int      stbi__pnm_isspace(char c)
+{
+   // Returns 1 for space, tab, newline, vertical tab, form feed and carriage
+   // return; 0 for every other character.
+   switch (c) {
+      case ' ':
+      case '\t':
+      case '\n':
+      case '\v':
+      case '\f':
+      case '\r':
+         return 1;
+      default:
+         return 0;
+   }
+}
+
+static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
+{
+   // Advances past any mix of whitespace and '#' comments.  *c is the
+   // current lookahead character on entry and is left holding the first
+   // character that is neither (or the last byte read if EOF is reached).
+   int in_comment;
+
+   do {
+      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+         *c = (char) stbi__get8(s);
+
+      in_comment = !stbi__at_eof(s) && *c == '#';
+      if (in_comment) {
+         // a comment runs up to the next line break
+         while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
+            *c = (char) stbi__get8(s);
+      }
+   } while (in_comment);
+}
+
+static int      stbi__pnm_isdigit(char c)
+{
+   // Returns 1 when c is one of the ASCII digits '0'..'9', otherwise 0.
+   if (c < '0')
+      return 0;
+   if (c > '9')
+      return 0;
+   return 1;
+}
+
+static int      stbi__pnm_getinteger(stbi__context *s, char *c)
+{
+   // Parses a run of decimal digits starting at the lookahead character *c.
+   // On return *c holds the first character that was not consumed.  Returns
+   // the parsed value (0 if *c was not a digit).  A digit run too long to fit
+   // in an int saturates at the largest int instead of overflowing, which
+   // would be undefined behavior on this untrusted header data.
+   const int int_max = (int) (~0u >> 1);   // largest int, without needing <limits.h>
+   int value = 0;
+
+   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
+      int digit = *c - '0';
+      if (value > (int_max - digit) / 10)
+         value = int_max;              // value*10 + digit would not fit
+      else
+         value = value*10 + digit;
+      *c = (char) stbi__get8(s);
+   }
+
+   return value;
+}
+
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   // Parses a binary PGM ("P5") or PPM ("P6") header from the start of the
+   // stream and reports width, height and component count.  Returns 1 on
+   // success, leaving the stream positioned after the header; on failure the
+   // stream is rewound and 0 is returned.
+   int maxv;
+   char c, p, t;
+
+   stbi__rewind( s );
+
+   // Get identifier
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind( s );
+       return 0;
+   }
+
+   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
+
+   // c is the one-character lookahead shared by the helpers below
+   c = (char) stbi__get8(s);
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *x = stbi__pnm_getinteger(s, &c); // read width
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *y = stbi__pnm_getinteger(s, &c); // read height
+   stbi__pnm_skip_whitespace(s, &c);
+
+   maxv = stbi__pnm_getinteger(s, &c);  // read max value
+
+   if (maxv > 255) {
+      // rewind like the other failure paths so that a later format probe
+      // does not start reading from the middle of this header
+      stbi__rewind( s );
+      return stbi__err("max value > 255", "PPM image not 8-bit");
+   }
+   return 1;
+}
+#endif
+
+static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
+{
+   // Probes each compiled-in format's info routine in a fixed order and stops
+   // at the first one that accepts the stream.  Returns 1 if some format was
+   // recognized, otherwise records an error via stbi__err and returns its
+   // result.
+   int recognized = 0;
+
+   #ifndef STBI_NO_JPEG
+   if (!recognized && stbi__jpeg_info(s, x, y, comp)) recognized = 1;
+   #endif
+
+   #ifndef STBI_NO_PNG
+   if (!recognized && stbi__png_info(s, x, y, comp))  recognized = 1;
+   #endif
+
+   #ifndef STBI_NO_GIF
+   if (!recognized && stbi__gif_info(s, x, y, comp))  recognized = 1;
+   #endif
+
+   #ifndef STBI_NO_BMP
+   if (!recognized && stbi__bmp_info(s, x, y, comp))  recognized = 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (!recognized && stbi__psd_info(s, x, y, comp))  recognized = 1;
+   #endif
+
+   #ifndef STBI_NO_PIC
+   if (!recognized && stbi__pic_info(s, x, y, comp))  recognized = 1;
+   #endif
+
+   #ifndef STBI_NO_PNM
+   if (!recognized && stbi__pnm_info(s, x, y, comp))  recognized = 1;
+   #endif
+
+   #ifndef STBI_NO_HDR
+   if (!recognized && stbi__hdr_info(s, x, y, comp))  recognized = 1;
+   #endif
+
+   // TGA goes last because its test is the least reliable one
+   #ifndef STBI_NO_TGA
+   if (!recognized && stbi__tga_info(s, x, y, comp))  recognized = 1;
+   #endif
+
+   if (recognized)
+      return 1;
+   return stbi__err("unknown image type", "Image not of any known type, or corrupt");
+}
+
+#ifndef STBI_NO_STDIO
+STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
+{
+    // Opens the named file for binary reading, queries its image info through
+    // stbi_info_from_file, closes it again and returns that result.  Reports
+    // an error via stbi__err if the file cannot be opened.
+    int found;
+    FILE *fp = stbi__fopen(filename, "rb");
+    if (fp == NULL)
+        return stbi__err("can't fopen", "Unable to open file");
+    found = stbi_info_from_file(fp, x, y, comp);
+    fclose(fp);
+    return found;
+}
+
+STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
+{
+   // Queries image info from an already-open FILE.  The file position is
+   // sampled before probing and seeked back to afterwards.
+   stbi__context ctx;
+   long start = ftell(f);
+   int found;
+
+   stbi__start_file(&ctx, f);
+   found = stbi__info_main(&ctx,x,y,comp);
+   fseek(f,start,SEEK_SET);
+   return found;
+}
+#endif // !STBI_NO_STDIO
+
+STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
+{
+   // Queries image info from an in-memory buffer of len bytes.
+   stbi__context ctx;
+   int found;
+
+   stbi__start_mem(&ctx,buffer,len);
+   found = stbi__info_main(&ctx,x,y,comp);
+   return found;
+}
+
+STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
+{
+   // Queries image info from a stream driven by user-supplied I/O callbacks;
+   // 'user' is handed to stbi__start_callbacks together with the callbacks.
+   stbi__context ctx;
+   int found;
+
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) c, user);
+   found = stbi__info_main(&ctx,x,y,comp);
+   return found;
+}
+
+#endif // STB_IMAGE_IMPLEMENTATION
+
+/*
+   revision history:
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
+                         support RGB-formatted JPEG
+                         read 16-bit PNGs (only as 8-bit)
+      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
+      2.09  (2016-01-16) allow comments in PNM files
+                         16-bit-per-pixel TGA (not bit-per-component)
+                         info() for TGA could break due to .hdr handling
+                         info() for BMP now shares code instead of sloppy parse
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
+                         limited 16-bpc PSD support
+                         #ifdef unused functions
+                         bug with < 92 byte PIC,PNM,HDR,TGA
+      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
+      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
+      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
+      2.03  (2015-04-12) extra corruption checking (mmozeiko)
+                         stbi_set_flip_vertically_on_load (nguillemot)
+                         fix NEON support; fix mingw support
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
+                         progressive JPEG (stb)
+                         PGM/PPM support (Ken Miller)
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         GIF bugfix -- seemingly never worked
+                         STBI_NO_*, STBI_ONLY_*
+      1.48  (2014-12-14) fix incorrectly-named assert()
+      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
+                         optimize PNG (ryg)
+                         fix bug in interlaced PNG with user-specified channel count (stb)
+      1.46  (2014-08-26)
+              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
+      1.45  (2014-08-16)
+              fix MSVC-ARM internal compiler error by wrapping malloc
+      1.44  (2014-08-07)
+              various warning fixes from Ronny Chevalier
+      1.43  (2014-07-15)
+              fix MSVC-only compiler problem in code changed in 1.42
+      1.42  (2014-07-09)
+              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
+              fixes to stbi__cleanup_jpeg path
+              added STBI_ASSERT to avoid requiring assert.h
+      1.41  (2014-06-25)
+              fix search&replace from 1.36 that messed up comments/error messages
+      1.40  (2014-06-22)
+              fix gcc struct-initialization warning
+      1.39  (2014-06-15)
+              fix to TGA optimization when req_comp != number of components in TGA;
+              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
+              add support for BMP version 5 (more ignored fields)
+      1.38  (2014-06-06)
+              suppress MSVC warnings on integer casts truncating values
+              fix accidental rename of 'skip' field of I/O
+      1.37  (2014-06-04)
+              remove duplicate typedef
+      1.36  (2014-06-03)
+              convert to header file single-file library
+              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
+      1.35  (2014-05-27)
+              various warnings
+              fix broken STBI_SIMD path
+              fix bug where stbi_load_from_file no longer left file pointer in correct place
+              fix broken non-easy path for 32-bit BMP (possibly never used)
+              TGA optimization by Arseny Kapoulkine
+      1.34  (unknown)
+              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
+      1.33  (2011-07-14)
+              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
+      1.32  (2011-07-13)
+              support for "info" function for all supported filetypes (SpartanJ)
+      1.31  (2011-06-20)
+              a few more leak fixes, bug in PNG handling (SpartanJ)
+      1.30  (2011-06-11)
+              added ability to load files via callbacks to accommodate custom input streams (Ben Wenger)
+              removed deprecated format-specific test/load functions
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
+              fix bug in GIF palette transparency (SpartanJ)
+      1.27  (2010-08-01)
+              cast-to-stbi_uc to fix warnings
+      1.26  (2010-07-24)
+              fix bug in file buffering for PNG reported by SpartanJ
+      1.25  (2010-07-17)
+              refix trans_data warning (Won Chun)
+      1.24  (2010-07-12)
+              perf improvements reading from files on platforms with lock-heavy fgetc()
+              minor perf improvements for jpeg
+              deprecated type-specific functions so we'll get feedback if they're needed
+              attempt to fix trans_data warning (Won Chun)
+      1.23    fixed bug in iPhone support
+      1.22  (2010-07-10)
+              removed image *writing* support
+              stbi_info support from Jetro Lauha
+              GIF support from Jean-Marc Lienher
+              iPhone PNG-extensions from James Brown
+              warning-fixes from Nicolas Schulz and Janez Zemva (i.e. Janez (U+017D)emva)
+      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
+      1.20    added support for Softimage PIC, by Tom Seddon
+      1.19    bug in interlaced PNG corruption check (found by ryg)
+      1.18  (2008-08-02)
+              fix a threading bug (local mutable static)
+      1.17    support interlaced PNG
+      1.16    major bugfix - stbi__convert_format converted one too many pixels
+      1.15    initialize some fields for thread safety
+      1.14    fix threadsafe conversion bug
+              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
+      1.13    threadsafe
+      1.12    const qualifiers in the API
+      1.11    Support installable IDCT, colorspace conversion routines
+      1.10    Fixes for 64-bit (don't use "unsigned long")
+              optimized upsampling by Fabian "ryg" Giesen
+      1.09    Fix format-conversion for PSD code (bad global variables!)
+      1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
+      1.07    attempt to fix C++ warning/errors again
+      1.06    attempt to fix C++ warning/errors again
+      1.05    fix TGA loading to return correct *comp and use good luminance calc
+      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
+      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
+      1.02    support for (subset of) HDR files, float interface for preferred access to them
+      1.01    fix bug: possible bug in handling right-side up bmps... not sure
+              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
+      1.00    interface to zlib that skips zlib header
+      0.99    correct handling of alpha in palette
+      0.98    TGA loader by lonesock; dynamically add loaders (untested)
+      0.97    jpeg errors on too large a file; also catch another malloc failure
+      0.96    fix detection of invalid v value - particleman@mollyrocket forum
+      0.95    during header scan, seek to markers in case of padding
+      0.94    STBI_NO_STDIO to disable stdio usage; rename all #defines the same
+      0.93    handle jpegtran output; verbose errors
+      0.92    read 4,8,16,24,32-bit BMP files of several formats
+      0.91    output 24-bit Windows 3.0 BMP files
+      0.90    fix a few more warnings; bump version number to approach 1.0
+      0.61    bugfixes due to Marc LeBlanc, Christopher Lloyd
+      0.60    fix compiling as c++
+      0.59    fix warnings: merge Dave Moore's -Wall fixes
+      0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
+      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
+      0.56    fix bug: zlib uncompressed mode len vs. nlen
+      0.55    fix bug: restart_interval not initialized to 0
+      0.54    allow NULL for 'int *comp'
+      0.53    fix bug in png 3->4; speedup png decoding
+      0.52    png handles req_comp=3,4 directly; minor cleanup; jpeg comments
+      0.51    obey req_comp requests, 1-component jpegs return as 1-component,
+              on 'test' only check type, not whether we support this variant
+      0.50  (2006-11-19)
+              first released version
+*/

+ 2 - 1
panda/src/putil/bam.h

@@ -32,7 +32,7 @@ static const unsigned short _bam_major_ver = 6;
 // Bumped to major version 6 on 2006-02-11 to factor out PandaNode::CData.
 
 static const unsigned short _bam_first_minor_ver = 14;
-static const unsigned short _bam_minor_ver = 41;
+static const unsigned short _bam_minor_ver = 42;
 // Bumped to minor version 14 on 2007-12-19 to change default ColorAttrib.
 // Bumped to minor version 15 on 2008-04-09 to add TextureAttrib::_implicit_sort.
 // Bumped to minor version 16 on 2008-05-13 to add Texture::_quality_level.
@@ -61,5 +61,6 @@ static const unsigned short _bam_minor_ver = 41;
 // Bumped to minor version 39 on 2016-01-09 to change lights and materials.
 // Bumped to minor version 40 on 2016-01-11 to make NodePaths writable.
 // Bumped to minor version 41 on 2016-03-02 to change LensNode, Lens, and Camera.
+// Bumped to minor version 42 on 2016-04-08 to expand ColorBlendAttrib.
 
 #endif

+ 11 - 0
panda/src/putil/bamReader.I

@@ -159,6 +159,17 @@ get_file_pos() {
   return _source->get_file_pos();
 }
 
+/**
+ * Registers a factory function that is called when an object of the given
+ * type is encountered within the .bam stream.  This forwards all three
+ * arguments to register_factory() on the object returned by get_factory().
+ *
+ * @param handle the type whose objects should be created by func.
+ * @param func the factory function to register for that type.
+ * @param user_data an optional pointer to be passed along to the function.
+ */
+void BamReader::
+register_factory(TypeHandle handle, WritableFactory::CreateFunc *func, void *user_data) {
+  get_factory()->register_factory(handle, func, user_data);
+}
+
 /**
  * Returns the global WritableFactory for generating TypedWritable objects
  */

Some files were not shown because too many files changed in this diff