Browse Source

Merge branch 'master' into shaderpipeline

rdb 1 week ago
parent
commit
13e7def7a7
93 changed files with 13019 additions and 2547 deletions
  1. 4 4
      .github/workflows/ci.yml
  2. 2 2
      BACKERS.md
  3. 3 3
      README.md
  4. 2 2
      direct/src/directtools/DirectGeometry.py
  5. 1 1
      direct/src/directtools/DirectGrid.py
  6. 13 13
      direct/src/interval/LerpInterval.py
  7. 2 0
      direct/src/showbase/EventManager.py
  8. 1 0
      doc/ReleaseNotes
  9. 1 1
      dtool/Config.cmake
  10. 6 5
      dtool/src/dtoolbase/pvector.h
  11. 2 0
      dtool/src/dtoolbase/register_type.cxx
  12. 14 0
      dtool/src/dtoolbase/register_type.h
  13. 1 1
      dtool/src/dtoolutil/filename_ext.cxx
  14. 6 3
      dtool/src/dtoolutil/iostream_ext.cxx
  15. 22 0
      dtool/src/dtoolutil/vector_uchar.cxx
  16. 3 0
      dtool/src/dtoolutil/vector_uchar.h
  17. 12 0
      dtool/src/interrogatedb/py_compat.h
  18. 1 1
      makepanda/makepanda.py
  19. 13 3
      makepanda/makepandacore.py
  20. 1 0
      panda/src/android/site.py
  21. 58 0
      panda/src/audio/audioSound.cxx
  22. 11 0
      panda/src/audio/audioSound.h
  23. 1 1
      panda/src/audio/config_audio.cxx
  24. 1 0
      panda/src/audiotraits/openalAudioManager.cxx
  25. 3 0
      panda/src/audiotraits/openalAudioManager.h
  26. 11 0
      panda/src/audiotraits/openalAudioSound.cxx
  27. 5 0
      panda/src/audiotraits/openalAudioSound.h
  28. 26 3
      panda/src/display/graphicsEngine.cxx
  29. 5 1
      panda/src/display/graphicsEngine.h
  30. 25 1
      panda/src/display/graphicsStateGuardian.cxx
  31. 7 1
      panda/src/display/graphicsStateGuardian.h
  32. 1 2
      panda/src/event/asyncFuture_ext.cxx
  33. 27 0
      panda/src/event/eventParameter.I
  34. 3 0
      panda/src/event/eventParameter.h
  35. 56 38
      panda/src/event/pythonTask.cxx
  36. 1 1
      panda/src/event/pythonTask.h
  37. 5 1
      panda/src/express/pointerToArray_ext.I
  38. 17 0
      panda/src/ffmpeg/ffmpegAudioCursor.cxx
  39. 2 0
      panda/src/ffmpeg/ffmpegAudioCursor.h
  40. 5 0
      panda/src/gles2gsg/gles2gsg.h
  41. 10 0
      panda/src/glstuff/glBufferContext_src.cxx
  42. 2 0
      panda/src/glstuff/glBufferContext_src.h
  43. 161 18
      panda/src/glstuff/glGraphicsStateGuardian_src.cxx
  44. 17 4
      panda/src/glstuff/glGraphicsStateGuardian_src.h
  45. 10 0
      panda/src/glstuff/glIndexBufferContext_src.cxx
  46. 2 0
      panda/src/glstuff/glIndexBufferContext_src.h
  47. 11 1
      panda/src/glstuff/glSamplerContext_src.cxx
  48. 4 2
      panda/src/glstuff/glSamplerContext_src.h
  49. 10 0
      panda/src/glstuff/glVertexBufferContext_src.cxx
  50. 2 0
      panda/src/glstuff/glVertexBufferContext_src.h
  51. 4 1
      panda/src/gobj/adaptiveLru.I
  52. 1 1
      panda/src/gobj/geom.cxx
  53. 15 0
      panda/src/gobj/preparedGraphicsObjects.cxx
  54. 11 0
      panda/src/gobj/preparedGraphicsObjects.h
  55. 18 16
      panda/src/gobj/pythonTexturePoolFilter.cxx
  56. 10 0
      panda/src/gobj/savedContext.cxx
  57. 3 1
      panda/src/gobj/savedContext.h
  58. 20 0
      panda/src/gobj/shaderBuffer.cxx
  59. 4 0
      panda/src/gobj/shaderBuffer.h
  60. 5 5
      panda/src/gobj/texture.cxx
  61. 0 10
      panda/src/gobj/textureContext.cxx
  62. 2 1
      panda/src/gobj/textureContext.h
  63. 5 0
      panda/src/gsgbase/graphicsStateGuardianBase.h
  64. 12015 2339
      panda/src/movies/dr_flac.h
  65. 1 1
      panda/src/movies/flacAudio.h
  66. 62 15
      panda/src/movies/flacAudioCursor.cxx
  67. 8 0
      panda/src/movies/flacAudioCursor.h
  68. 9 0
      panda/src/movies/movieAudioCursor.cxx
  69. 2 0
      panda/src/movies/movieAudioCursor.h
  70. 26 0
      panda/src/movies/opusAudioCursor.cxx
  71. 3 0
      panda/src/movies/opusAudioCursor.h
  72. 25 0
      panda/src/movies/vorbisAudioCursor.cxx
  73. 3 0
      panda/src/movies/vorbisAudioCursor.h
  74. 18 13
      panda/src/pgraph/pythonLoaderFileType.cxx
  75. 7 5
      panda/src/pipeline/pythonThread.cxx
  76. 2 1
      panda/src/pipeline/pythonThread.h
  77. 6 7
      panda/src/putil/bamReader_ext.cxx
  78. 39 0
      panda/src/putil/bitArray_ext.cxx
  79. 1 0
      panda/src/putil/callbackObject.h
  80. 1 0
      panda/src/putil/config_putil.cxx
  81. 1 0
      panda/src/putil/paramValue.cxx
  82. 3 0
      panda/src/putil/paramValue.h
  83. 7 10
      panda/src/putil/pythonCallbackObject.cxx
  84. 4 0
      panda/src/text/textProperties.cxx
  85. 1 0
      setup.cfg
  86. BIN
      tests/audio/openclose_with_comments.flac
  87. BIN
      tests/audio/openclose_with_comments.mp3
  88. BIN
      tests/audio/openclose_with_comments.ogg
  89. BIN
      tests/audio/openclose_with_comments.opus
  90. 33 2
      tests/audio/test_loading.py
  91. 18 6
      tests/display/test_glsl_shader.py
  92. 3 0
      tests/event/test_futures.py
  93. 14 0
      tests/putil/test_bitarray.py

+ 4 - 4
.github/workflows/ci.yml

@@ -392,10 +392,10 @@ jobs:
     - name: Get thirdparty packages (macOS)
     - name: Get thirdparty packages (macOS)
       if: runner.os == 'macOS'
       if: runner.os == 'macOS'
       run: |
       run: |
-        curl -O https://www.panda3d.org/download/panda3d-1.10.15/panda3d-1.10.15-tools-mac.tar.gz
-        tar -xf panda3d-1.10.15-tools-mac.tar.gz
-        mv panda3d-1.10.15/thirdparty thirdparty
-        rmdir panda3d-1.10.15
+        curl -O https://www.panda3d.org/download/panda3d-1.10.16/panda3d-1.10.16-tools-mac.tar.gz
+        tar -xf panda3d-1.10.16-tools-mac.tar.gz
+        mv panda3d-1.10.16/thirdparty thirdparty
+        rmdir panda3d-1.10.16
         (cd thirdparty/darwin-libs-a && rm -rf rocket)
         (cd thirdparty/darwin-libs-a && rm -rf rocket)
 
 
     - name: Set up XCode (macOS)
     - name: Set up XCode (macOS)

+ 2 - 2
BACKERS.md

@@ -5,12 +5,12 @@ This is a list of all the people who are contributing financially to Panda3D.  I
 ## Bronze Sponsors
 ## Bronze Sponsors
 
 
 [<img src="https://www.panda3d.org/wp-content/uploads/2024/08/Route4MeLogo1185x300-2-1-1024x259.png" alt="Route4Me" height="48">](https://route4me.com/)
 [<img src="https://www.panda3d.org/wp-content/uploads/2024/08/Route4MeLogo1185x300-2-1-1024x259.png" alt="Route4Me" height="48">](https://route4me.com/)
-[<img src="https://www.lambdatest.com/blue-logo.png" alt="LambdaTest" height="48">](https://www.lambdatest.com/?utm_source=panda3d&utm_medium=sponsor)
+[<img src="https://www.panda3d.org/wp-content/uploads/2026/01/black-logo-1-e1768931551108.png" alt="TestMu AI" height="48">](https://www.testmu.ai/?utm_source=panda3d&utm_medium=sponsor)
 
 
 * [Daniel Stokes](https://opencollective.com/daniel-stokes)
 * [Daniel Stokes](https://opencollective.com/daniel-stokes)
 * [David Rose](https://opencollective.com/david-rose)
 * [David Rose](https://opencollective.com/david-rose)
 * [Route4Me](https://route4me.com/)
 * [Route4Me](https://route4me.com/)
-* [LambdaTest](https://www.lambdatest.com/?utm_source=panda3d&utm_medium=sponsor)
+* [TestMu AI](https://www.testmu.ai/?utm_source=panda3d&utm_medium=sponsor)
 
 
 ## Benefactors
 ## Benefactors
 
 

+ 3 - 3
README.md

@@ -24,7 +24,7 @@ Installing Panda3D
 ==================
 ==================
 
 
 The latest Panda3D SDK can be downloaded from
 The latest Panda3D SDK can be downloaded from
-[this page](https://www.panda3d.org/download/sdk-1-10-15/).
+[this page](https://www.panda3d.org/download/sdk-1-10-16/).
 If you are familiar with installing Python packages, you can use
 If you are familiar with installing Python packages, you can use
 the following command:
 the following command:
 
 
@@ -136,7 +136,7 @@ macOS
 -----
 -----
 
 
 On macOS, you will need to download a set of precompiled thirdparty packages in order to
 On macOS, you will need to download a set of precompiled thirdparty packages in order to
-compile Panda3D, which can be acquired from [here](https://www.panda3d.org/download/panda3d-1.10.15/panda3d-1.10.15-tools-mac.tar.gz).
+compile Panda3D, which can be acquired from [here](https://www.panda3d.org/download/panda3d-1.10.16/panda3d-1.10.16-tools-mac.tar.gz).
 
 
 After placing the thirdparty directory inside the panda3d source directory,
 After placing the thirdparty directory inside the panda3d source directory,
 you may build Panda3D using a command like the following:
 you may build Panda3D using a command like the following:
@@ -243,7 +243,7 @@ visit the [Sponsors page](https://www.panda3d.org/sponsors) on our web site.
 Thank you to everyone who has donated!
 Thank you to everyone who has donated!
 
 
 [<img src="https://www.panda3d.org/wp-content/uploads/2024/08/Route4MeLogo1185x300-2-1-1024x259.png" alt="Route4Me" height="48">](https://route4me.com/)
 [<img src="https://www.panda3d.org/wp-content/uploads/2024/08/Route4MeLogo1185x300-2-1-1024x259.png" alt="Route4Me" height="48">](https://route4me.com/)
-[<img src="https://www.lambdatest.com/blue-logo.png" alt="LambdaTest" height="48">](https://www.lambdatest.com/?utm_source=panda3d&utm_medium=sponsor)
+[<img src="https://www.panda3d.org/wp-content/uploads/2026/01/black-logo-1-e1768931551108.png" alt="TestMu AI" height="48">](https://www.testmu.ai/?utm_source=panda3d&utm_medium=sponsor)
 
 
 <a href="https://opencollective.com/panda3d" target="_blank">
 <a href="https://opencollective.com/panda3d" target="_blank">
   <img src="https://opencollective.com/panda3d/contribute/[email protected]?color=blue" width=300 />
   <img src="https://opencollective.com/panda3d/contribute/[email protected]?color=blue" width=300 />

+ 2 - 2
direct/src/directtools/DirectGeometry.py

@@ -79,8 +79,8 @@ class LineNodePath(NodePath):
     def getVertex(self, index):
     def getVertex(self, index):
         return self.lineSegs.getVertex(index)
         return self.lineSegs.getVertex(index)
 
 
-    def getVertexColor(self):
-        return self.lineSegs.getVertexColor()
+    def getVertexColor(self, index):
+        return self.lineSegs.getVertexColor(index)
 
 
     def drawArrow(self, sv, ev, arrowAngle, arrowLength):
     def drawArrow(self, sv, ev, arrowAngle, arrowLength):
         """
         """

+ 1 - 1
direct/src/directtools/DirectGrid.py

@@ -33,7 +33,7 @@ class DirectGrid(NodePath, DirectObject):
 
 
         self.centerLines = LineNodePath(self.lines)
         self.centerLines = LineNodePath(self.lines)
         self.centerLines.lineNode.setName('centerLines')
         self.centerLines.lineNode.setName('centerLines')
-        self.centerLines.setColor(VBase4(1, 0, 0, 0))
+        self.centerLines.setColor(VBase4(1, 0, 0, 1))
         self.centerLines.setThickness(3)
         self.centerLines.setThickness(3)
 
 
         # Small marker to hilight snap-to-grid point
         # Small marker to hilight snap-to-grid point

+ 13 - 13
direct/src/interval/LerpInterval.py

@@ -122,7 +122,7 @@ class LerpPosInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setStartPos, self.startPos)
         LerpNodePathInterval.privDoEvent(self, t, event)
         LerpNodePathInterval.privDoEvent(self, t, event)
@@ -153,7 +153,7 @@ class LerpHprInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndHpr, self.endHpr)
             self.setupParam(self.setEndHpr, self.endHpr)
             self.setupParam(self.setStartHpr, self.startHpr)
             self.setupParam(self.setStartHpr, self.startHpr)
             self.setupParam(self.setStartQuat, self.startQuat)
             self.setupParam(self.setStartQuat, self.startQuat)
@@ -192,7 +192,7 @@ class LerpQuatInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndQuat, self.endQuat)
             self.setupParam(self.setEndQuat, self.endQuat)
             self.setupParam(self.setStartHpr, self.startHpr)
             self.setupParam(self.setStartHpr, self.startHpr)
             self.setupParam(self.setStartQuat, self.startQuat)
             self.setupParam(self.setStartQuat, self.startQuat)
@@ -219,7 +219,7 @@ class LerpScaleInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndScale, self.endScale)
             self.setupParam(self.setEndScale, self.endScale)
             self.setupParam(self.setStartScale, self.startScale)
             self.setupParam(self.setStartScale, self.startScale)
         LerpNodePathInterval.privDoEvent(self, t, event)
         LerpNodePathInterval.privDoEvent(self, t, event)
@@ -245,7 +245,7 @@ class LerpShearInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndShear, self.endShear)
             self.setupParam(self.setEndShear, self.endShear)
             self.setupParam(self.setStartShear, self.startShear)
             self.setupParam(self.setStartShear, self.startShear)
         LerpNodePathInterval.privDoEvent(self, t, event)
         LerpNodePathInterval.privDoEvent(self, t, event)
@@ -280,7 +280,7 @@ class LerpPosHprInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setEndHpr, self.endHpr)
             self.setupParam(self.setEndHpr, self.endHpr)
@@ -326,7 +326,7 @@ class LerpPosQuatInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setEndQuat, self.endQuat)
             self.setupParam(self.setEndQuat, self.endQuat)
@@ -365,7 +365,7 @@ class LerpHprScaleInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndHpr, self.endHpr)
             self.setupParam(self.setEndHpr, self.endHpr)
             self.setupParam(self.setStartHpr, self.startHpr)
             self.setupParam(self.setStartHpr, self.startHpr)
             self.setupParam(self.setStartQuat, self.startQuat)
             self.setupParam(self.setStartQuat, self.startQuat)
@@ -414,7 +414,7 @@ class LerpQuatScaleInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndQuat, self.endQuat)
             self.setupParam(self.setEndQuat, self.endQuat)
             self.setupParam(self.setStartHpr, self.startHpr)
             self.setupParam(self.setStartHpr, self.startHpr)
             self.setupParam(self.setStartQuat, self.startQuat)
             self.setupParam(self.setStartQuat, self.startQuat)
@@ -459,7 +459,7 @@ class LerpPosHprScaleInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setEndHpr, self.endHpr)
             self.setupParam(self.setEndHpr, self.endHpr)
@@ -516,7 +516,7 @@ class LerpPosQuatScaleInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setEndQuat, self.endQuat)
             self.setupParam(self.setEndQuat, self.endQuat)
@@ -569,7 +569,7 @@ class LerpPosHprScaleShearInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setEndHpr, self.endHpr)
             self.setupParam(self.setEndHpr, self.endHpr)
@@ -636,7 +636,7 @@ class LerpPosQuatScaleShearInterval(LerpNodePathInterval):
     def privDoEvent(self, t, event):
     def privDoEvent(self, t, event):
         # This function is only used if Python functors were passed in
         # This function is only used if Python functors were passed in
         # for some of the input parameters.
         # for some of the input parameters.
-        if self.paramSetup and event == CInterval.ETInitialize:
+        if self.paramSetup and (event == CInterval.ETInitialize or event == CInterval.ETInstant):
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setEndPos, self.endPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setStartPos, self.startPos)
             self.setupParam(self.setEndQuat, self.endQuat)
             self.setupParam(self.setEndQuat, self.endQuat)

+ 2 - 0
direct/src/showbase/EventManager.py

@@ -76,6 +76,8 @@ class EventManager:
             return eventParameter.getStringValue()
             return eventParameter.getStringValue()
         elif eventParameter.isWstring():
         elif eventParameter.isWstring():
             return eventParameter.getWstringValue()
             return eventParameter.getWstringValue()
+        elif hasattr(eventParameter, 'isBytes') and eventParameter.isBytes():
+            return eventParameter.getBytesValue()
         elif eventParameter.isTypedRefCount():
         elif eventParameter.isTypedRefCount():
             return eventParameter.getTypedRefCountValue()
             return eventParameter.getTypedRefCountValue()
         elif eventParameter.isEmpty():
         elif eventParameter.isEmpty():

+ 1 - 0
doc/ReleaseNotes

@@ -12,6 +12,7 @@ This maintenance release fixes some minor defects and stability issues.
 * Fix use-after-free in collision system with transform cache disabled (#1733)
 * Fix use-after-free in collision system with transform cache disabled (#1733)
 * Egg: Add limited forward compatibility for metallic-roughness textures
 * Egg: Add limited forward compatibility for metallic-roughness textures
 * OpenGL: fix error blitting depth texture on macOS (#1719)
 * OpenGL: fix error blitting depth texture on macOS (#1719)
+* OpenGL: fix SSBO not being rebound if deleted and recreated immediately
 * OpenGL: fix SSBO support not being detected in certain situations
 * OpenGL: fix SSBO support not being detected in certain situations
 * GLSL: Add p3d_MetallicRoughnessTexture input mapped to M_metallic_roughness
 * GLSL: Add p3d_MetallicRoughnessTexture input mapped to M_metallic_roughness
 * X11: Add config variable to enable detection of autorepeat key events (#1735)
 * X11: Add config variable to enable detection of autorepeat key events (#1735)

+ 1 - 1
dtool/Config.cmake

@@ -265,7 +265,7 @@ if(BUILD_INTERROGATE)
     panda3d-interrogate
     panda3d-interrogate
 
 
     GIT_REPOSITORY https://github.com/panda3d/interrogate.git
     GIT_REPOSITORY https://github.com/panda3d/interrogate.git
-    GIT_TAG 9f8faebdfcea03870107db60dcf54df2f9331384
+    GIT_TAG 7cf2550d2c8d95b8c268aa4bb0b5602e85a086dc
 
 
     PREFIX ${_interrogate_dir}
     PREFIX ${_interrogate_dir}
     CMAKE_ARGS
     CMAKE_ARGS

+ 6 - 5
dtool/src/dtoolbase/pvector.h

@@ -46,15 +46,16 @@ public:
   typedef std::vector<Type, allocator> base_class;
   typedef std::vector<Type, allocator> base_class;
   typedef typename base_class::size_type size_type;
   typedef typename base_class::size_type size_type;
 
 
-  explicit pvector(TypeHandle type_handle = pvector_type_handle) : base_class(allocator(type_handle)) { }
+  pvector() : base_class(allocator(get_type_handle(pvector<Type>))) { }
+  explicit pvector(TypeHandle type_handle) : base_class(allocator(type_handle)) { }
   pvector(const pvector<Type> &copy) : base_class(copy) { }
   pvector(const pvector<Type> &copy) : base_class(copy) { }
   pvector(pvector<Type> &&from) noexcept : base_class(std::move(from)) {};
   pvector(pvector<Type> &&from) noexcept : base_class(std::move(from)) {};
-  explicit pvector(size_type n, TypeHandle type_handle = pvector_type_handle) : base_class(n, Type(), allocator(type_handle)) { }
-  explicit pvector(size_type n, const Type &value, TypeHandle type_handle = pvector_type_handle) : base_class(n, value, allocator(type_handle)) { }
-  pvector(const Type *begin, const Type *end, TypeHandle type_handle = pvector_type_handle) : base_class(allocator(type_handle)) {
+  explicit pvector(size_type n, TypeHandle type_handle = get_type_handle(pvector<Type>)) : base_class(n, Type(), allocator(type_handle)) { }
+  explicit pvector(size_type n, const Type &value, TypeHandle type_handle = get_type_handle(pvector<Type>)) : base_class(n, value, allocator(type_handle)) { }
+  pvector(const Type *begin, const Type *end, TypeHandle type_handle = get_type_handle(pvector<Type>)) : base_class(allocator(type_handle)) {
     this->insert(this->end(), begin, end);
     this->insert(this->end(), begin, end);
   }
   }
-  pvector(std::initializer_list<Type> init, TypeHandle type_handle = pvector_type_handle) : base_class(std::move(init), allocator(type_handle)) { }
+  pvector(std::initializer_list<Type> init, TypeHandle type_handle = get_type_handle(pvector<Type>)) : base_class(std::move(init), allocator(type_handle)) { }
 
 
   pvector<Type> &operator =(const pvector<Type> &copy) {
   pvector<Type> &operator =(const pvector<Type> &copy) {
     base_class::operator =(copy);
     base_class::operator =(copy);

+ 2 - 0
dtool/src/dtoolbase/register_type.cxx

@@ -26,6 +26,7 @@ TypeHandle double_type_handle;
 TypeHandle float_type_handle;
 TypeHandle float_type_handle;
 TypeHandle string_type_handle;
 TypeHandle string_type_handle;
 TypeHandle wstring_type_handle;
 TypeHandle wstring_type_handle;
+TypeHandle vector_uchar_type_handle;
 
 
 TypeHandle long_p_type_handle;
 TypeHandle long_p_type_handle;
 TypeHandle int_p_type_handle;
 TypeHandle int_p_type_handle;
@@ -59,6 +60,7 @@ void init_system_type_handles() {
     register_type(float_type_handle, "float");
     register_type(float_type_handle, "float");
     register_type(string_type_handle, "string");
     register_type(string_type_handle, "string");
     register_type(wstring_type_handle, "wstring");
     register_type(wstring_type_handle, "wstring");
+    register_type(vector_uchar_type_handle, "vector_uchar");
 
 
     register_type(int_p_type_handle, "int*");
     register_type(int_p_type_handle, "int*");
     register_type(short_p_type_handle, "short*");
     register_type(short_p_type_handle, "short*");

+ 14 - 0
dtool/src/dtoolbase/register_type.h

@@ -19,6 +19,9 @@
 #include "typeHandle.h"
 #include "typeHandle.h"
 #include "typeRegistry.h"
 #include "typeRegistry.h"
 
 
+template<class T>
+class pvector;
+
 /**
 /**
  * This inline function is just a convenient way to call
  * This inline function is just a convenient way to call
  * TypeRegistry::register_type(), along with zero to four
  * TypeRegistry::register_type(), along with zero to four
@@ -88,6 +91,7 @@ extern TypeHandle EXPCL_DTOOL_DTOOLBASE double_type_handle;
 extern TypeHandle EXPCL_DTOOL_DTOOLBASE float_type_handle;
 extern TypeHandle EXPCL_DTOOL_DTOOLBASE float_type_handle;
 extern TypeHandle EXPCL_DTOOL_DTOOLBASE string_type_handle;
 extern TypeHandle EXPCL_DTOOL_DTOOLBASE string_type_handle;
 extern TypeHandle EXPCL_DTOOL_DTOOLBASE wstring_type_handle;
 extern TypeHandle EXPCL_DTOOL_DTOOLBASE wstring_type_handle;
+extern TypeHandle EXPCL_DTOOL_DTOOLBASE vector_uchar_type_handle;
 
 
 extern TypeHandle long_p_type_handle;
 extern TypeHandle long_p_type_handle;
 extern TypeHandle int_p_type_handle;
 extern TypeHandle int_p_type_handle;
@@ -175,6 +179,16 @@ INLINE TypeHandle _get_type_handle(const std::wstring *) {
   return wstring_type_handle;
   return wstring_type_handle;
 }
 }
 
 
+template<class T>
+INLINE TypeHandle _get_type_handle(const pvector<T> *) {
+  return pvector_type_handle;
+}
+
+template<>
+INLINE TypeHandle _get_type_handle(const pvector<unsigned char> *) {
+  return vector_uchar_type_handle;
+}
+
 template<>
 template<>
 INLINE TypeHandle _get_type_handle(const long * const *) {
 INLINE TypeHandle _get_type_handle(const long * const *) {
   return long_p_type_handle;
   return long_p_type_handle;

+ 1 - 1
dtool/src/dtoolutil/filename_ext.cxx

@@ -67,7 +67,7 @@ __init__(PyObject *path) {
     return;
     return;
   }
   }
 
 
-  PyObject *path_str = PyObject_CallFunctionObjArgs(fspath, path, nullptr);
+  PyObject *path_str = PyObject_CallOneArg(fspath, path);
   Py_DECREF(fspath);
   Py_DECREF(fspath);
   if (path_str == nullptr) {
   if (path_str == nullptr) {
     return;
     return;

+ 6 - 3
dtool/src/dtoolutil/iostream_ext.cxx

@@ -134,8 +134,9 @@ readinto(PyObject *b) {
 
 
   Py_buffer view;
   Py_buffer view;
   if (PyObject_GetBuffer(b, &view, PyBUF_CONTIG) == -1) {
   if (PyObject_GetBuffer(b, &view, PyBUF_CONTIG) == -1) {
-    PyErr_SetString(PyExc_TypeError,
-      "write() requires a contiguous, read-write bytes-like object");
+    PyErr_Format(PyExc_TypeError,
+                 "readinto() requires a contiguous, read-write bytes-like object, not '%.100s'",
+                 Py_TYPE(b)->tp_name);
     return 0;
     return 0;
   }
   }
 
 
@@ -260,7 +261,9 @@ write(PyObject *b) {
 
 
   Py_buffer view;
   Py_buffer view;
   if (PyObject_GetBuffer(b, &view, PyBUF_CONTIG_RO) == -1) {
   if (PyObject_GetBuffer(b, &view, PyBUF_CONTIG_RO) == -1) {
-    PyErr_SetString(PyExc_TypeError, "write() requires a contiguous buffer");
+    PyErr_Format(PyExc_TypeError,
+                 "write() requires a bytes-like object, not '%.100s'",
+                 Py_TYPE(b)->tp_name);
     return;
     return;
   }
   }
 
 

+ 22 - 0
dtool/src/dtoolutil/vector_uchar.cxx

@@ -19,3 +19,25 @@
 #define NAME vector_uchar
 #define NAME vector_uchar
 
 
 #include "vector_src.cxx"
 #include "vector_src.cxx"
+
+/**
+ * Writes a hex representation of the bytes to the output stream.
+ */
+std::ostream &
+operator << (std::ostream &out, const vector_uchar &data) {
+  static const char nibble_to_hex[] = "0123456789abcdef";
+
+  if (data.empty()) {
+    return out;
+  }
+
+  out << nibble_to_hex[(data[0] & 0xf0) >> 4];
+  out << nibble_to_hex[data[0] & 0xf];
+
+  for (size_t i = 1; i < data.size(); ++i) {
+    out << ' ';
+    out << nibble_to_hex[(data[i] & 0xf0) >> 4];
+    out << nibble_to_hex[data[i] & 0xf];
+  }
+  return out;
+}

+ 3 - 0
dtool/src/dtoolutil/vector_uchar.h

@@ -30,4 +30,7 @@
 
 
 #include "vector_src.h"
 #include "vector_src.h"
 
 
+EXPCL_DTOOL_DTOOLUTIL std::ostream &
+operator << (std::ostream &out, const vector_uchar &data);
+
 #endif
 #endif

+ 12 - 0
dtool/src/interrogatedb/py_compat.h

@@ -219,6 +219,14 @@ INLINE PyObject *_PyLong_Lshift(PyObject *a, size_t shiftby) {
 }
 }
 #endif
 #endif
 
 
+#ifndef PY_VECTORCALL_ARGUMENTS_OFFSET
+#  define PY_VECTORCALL_ARGUMENTS_OFFSET (((size_t)1) << (8 * sizeof(size_t) - 1))
+#endif
+
+#if PY_VERSION_HEX < 0x030800B1
+#  define PyVectorcall_NARGS(n) ((n) & ~PY_VECTORCALL_ARGUMENTS_OFFSET)
+#endif
+
 /* Python 3.9 */
 /* Python 3.9 */
 
 
 #if PY_VERSION_HEX < 0x030900A4 && !defined(Py_IS_TYPE)
 #if PY_VERSION_HEX < 0x030900A4 && !defined(Py_IS_TYPE)
@@ -229,6 +237,10 @@ INLINE PyObject *_PyLong_Lshift(PyObject *a, size_t shiftby) {
 #  define PyCFunction_CheckExact(op) (Py_IS_TYPE(op, &PyCFunction_Type))
 #  define PyCFunction_CheckExact(op) (Py_IS_TYPE(op, &PyCFunction_Type))
 #endif
 #endif
 
 
+#if PY_VERSION_HEX < 0x030900A4
+#  define PyObject_Vectorcall(callable, args, nargsf, kwnames) (_PyObject_Vectorcall((callable), (args), (nargsf), (kwnames)))
+#endif
+
 #if PY_VERSION_HEX < 0x03090000
 #if PY_VERSION_HEX < 0x03090000
 INLINE PyObject *PyObject_CallNoArgs(PyObject *func) {
 INLINE PyObject *PyObject_CallNoArgs(PyObject *func) {
 #if PY_VERSION_HEX >= 0x03080000
 #if PY_VERSION_HEX >= 0x03080000

+ 1 - 1
makepanda/makepanda.py

@@ -323,7 +323,7 @@ def parseopts(args):
     if GetTarget() == 'windows':
     if GetTarget() == 'windows':
         if not MSVC_VERSION:
         if not MSVC_VERSION:
             print("No MSVC version specified. Defaulting to 14.1 (Visual Studio 2017).")
             print("No MSVC version specified. Defaulting to 14.1 (Visual Studio 2017).")
-            MSVC_VERSION = (14, 1)
+            MSVC_VERSION = (14, 3)
         else:
         else:
             try:
             try:
                 MSVC_VERSION = tuple(int(d) for d in MSVC_VERSION.split('.'))[:2]
                 MSVC_VERSION = tuple(int(d) for d in MSVC_VERSION.split('.'))[:2]

+ 13 - 3
makepanda/makepandacore.py

@@ -284,7 +284,7 @@ def GetHost():
         return 'darwin'
         return 'darwin'
     elif sys.platform.startswith('linux'):
     elif sys.platform.startswith('linux'):
         try:
         try:
-            # Python seems to offer no built-in way to check this.
+            # Python versions before 3.13 reported android as "linux"
             osname = subprocess.check_output(["uname", "-o"])
             osname = subprocess.check_output(["uname", "-o"])
             if osname.strip().lower() == b'android':
             if osname.strip().lower() == b'android':
                 return 'android'
                 return 'android'
@@ -294,6 +294,9 @@ def GetHost():
             return 'linux'
             return 'linux'
     elif sys.platform.startswith('freebsd'):
     elif sys.platform.startswith('freebsd'):
         return 'freebsd'
         return 'freebsd'
+    # Handle for Python >= 3.13
+    elif sys.platform == 'android':
+        return 'android'
     else:
     else:
         exit('Unrecognized sys.platform: %s' % (sys.platform))
         exit('Unrecognized sys.platform: %s' % (sys.platform))
 
 
@@ -589,8 +592,8 @@ def GetInterrogateDir():
             return INTERROGATE_DIR
             return INTERROGATE_DIR
 
 
         dir = os.path.join(GetOutputDir(), "tmp", "interrogate")
         dir = os.path.join(GetOutputDir(), "tmp", "interrogate")
-        if not os.path.isdir(os.path.join(dir, "panda3d_interrogate-0.8.0.dist-info")):
-            oscmd("\"%s\" -m pip install --force-reinstall --upgrade -t \"%s\" panda3d-interrogate==0.8.0" % (sys.executable, dir))
+        if not os.path.isdir(os.path.join(dir, "panda3d_interrogate-0.8.1.dist-info")):
+            oscmd("\"%s\" -m pip install --force-reinstall --upgrade -t \"%s\" panda3d-interrogate==0.8.1" % (sys.executable, dir))
 
 
         INTERROGATE_DIR = dir
         INTERROGATE_DIR = dir
 
 
@@ -3052,6 +3055,13 @@ def SetupBuildEnvironment(compiler):
                 libdir += '64'
                 libdir += '64'
             SYS_LIB_DIRS += [libdir]
             SYS_LIB_DIRS += [libdir]
 
 
+        if sys.platform != "darwin":
+            # Some Linux distributions (eg. Fedora) don't add /usr/local/lib64
+            if ("/usr/lib64" in SYS_LIB_DIRS and
+                "/usr/local/lib64" not in SYS_LIB_DIRS and
+                os.path.isdir("/usr/local/lib64")):
+                SYS_LIB_DIRS.append("/usr/local/lib64")
+
         # Now extract the preprocessor's include directories.
         # Now extract the preprocessor's include directories.
         cmd = GetCXX() + " -x c++ -v -E " + os.devnull
         cmd = GetCXX() + " -x c++ -v -E " + os.devnull
         cmd += sysroot_flag
         cmd += sysroot_flag

+ 1 - 0
panda/src/android/site.py

@@ -6,6 +6,7 @@ from importlib.machinery import ModuleSpec
 
 
 from importlib import _bootstrap_external
 from importlib import _bootstrap_external
 
 
+# This should not be needed once we bump the minimum python version requirement to 3.13
 sys.platform = "android"
 sys.platform = "android"
 
 
 class AndroidExtensionFinder(MetaPathFinder):
 class AndroidExtensionFinder(MetaPathFinder):

+ 58 - 0
panda/src/audio/audioSound.cxx

@@ -13,6 +13,7 @@
  */
  */
 
 
 #include "audioSound.h"
 #include "audioSound.h"
+#include "vector_string.h"
 
 
 using std::ostream;
 using std::ostream;
 
 
@@ -196,3 +197,60 @@ operator << (ostream &out, AudioSound::SoundStatus status) {
 
 
   return out << "**invalid AudioSound::SoundStatus(" << (int)status << ")**";
   return out << "**invalid AudioSound::SoundStatus(" << (int)status << ")**";
 }
 }
+
+static const vector_string empty;
+
+/**
+ * Get the comment attached to this AudioSound as a list of strings.
+ */
+const vector_string& AudioSound::
+get_raw_comment() const {
+  return empty;
+}
+
+/**
+ * Returns true if this AudioSound has a comment with the given key,
+ * i.e. "author". Case-sensitive.
+ */
+bool AudioSound::
+has_comment(const std::string &key) const {
+  for (const std::string &st : get_raw_comment()) {
+    if (st.rfind(key + "=", 0) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Returns the value for a given key in the comment. If the key is not present,
+ * returns an empty string.
+ */
+std::string AudioSound::
+get_comment(const std::string &key) const {
+  for (const std::string &st : get_raw_comment()) {
+    if (st.rfind(key + "=", 0) == 0) {
+      return st.substr(key.length() + 1);
+    }
+  }
+  return "";
+}
+
+/**
+ * Returns the number of comments this sound has.
+ */
+int AudioSound::
+get_num_raw_comments() const {
+  return get_raw_comment().size();
+}
+
+/**
+ * Returns the comment at a given index.
+ */
+std::string AudioSound::
+get_raw_comment(int index) const {
+  if (index >= get_num_raw_comments() || index < 0) {
+    return "";
+  }
+  return get_raw_comment()[index];
+}

+ 11 - 0
panda/src/audio/audioSound.h

@@ -20,6 +20,7 @@
 #include "pointerTo.h"
 #include "pointerTo.h"
 #include "filterProperties.h"
 #include "filterProperties.h"
 #include "luse.h"
 #include "luse.h"
+#include "vector_string.h"
 
 
 class AudioManager;
 class AudioManager;
 
 
@@ -141,6 +142,16 @@ PUBLISHED:
   enum SoundStatus { BAD, READY, PLAYING };
   enum SoundStatus { BAD, READY, PLAYING };
   virtual SoundStatus status() const = 0;
   virtual SoundStatus status() const = 0;
 
 
+  bool has_comment(const std::string &key) const;
+  std::string get_comment(const std::string &key) const;
+  MAKE_MAP_PROPERTY(comments, has_comment, get_comment);
+
+  virtual const vector_string& get_raw_comment() const;
+
+  int get_num_raw_comments() const;
+  std::string get_raw_comment(int index) const;
+  MAKE_SEQ_PROPERTY(raw_comments, get_num_raw_comments, get_raw_comment);
+
   virtual void output(std::ostream &out) const;
   virtual void output(std::ostream &out) const;
   virtual void write(std::ostream &out) const;
   virtual void write(std::ostream &out) const;
 
 

+ 1 - 1
panda/src/audio/config_audio.cxx

@@ -76,7 +76,7 @@ ConfigVariableInt audio_preload_threshold
           "time - the hard drive seek time makes it stutter."));
           "time - the hard drive seek time makes it stutter."));
 
 
 ConfigVariableBool audio_want_hrtf
 ConfigVariableBool audio_want_hrtf
-("audio-want-hrtf", true,
+("audio-want-hrtf", false,
  PRC_DESC("This determines whether OpenAL-Soft should activate HRTF in "
  PRC_DESC("This determines whether OpenAL-Soft should activate HRTF in "
           "certain hardware configurations. Set it to true to cause "
           "certain hardware configurations. Set it to true to cause "
           "OpenAL to automatically apply HRTF processing to all output "
           "OpenAL to automatically apply HRTF processing to all output "

+ 1 - 0
panda/src/audiotraits/openalAudioManager.cxx

@@ -440,6 +440,7 @@ get_sound_data(MovieAudio *movie, int mode) {
   sd->_rate     = stream->audio_rate();
   sd->_rate     = stream->audio_rate();
   sd->_channels = stream->audio_channels();
   sd->_channels = stream->audio_channels();
   sd->_length   = stream->length();
   sd->_length   = stream->length();
+  sd->_comment  = stream->get_raw_comment();
   audio_debug("Creating: " << sd->_movie->get_filename().get_basename());
   audio_debug("Creating: " << sd->_movie->get_filename().get_basename());
   audio_debug("  - Rate: " << sd->_rate);
   audio_debug("  - Rate: " << sd->_rate);
   audio_debug("  - Channels: " << sd->_channels);
   audio_debug("  - Channels: " << sd->_channels);

+ 3 - 0
panda/src/audiotraits/openalAudioManager.h

@@ -21,6 +21,7 @@
 #include "pset.h"
 #include "pset.h"
 #include "movieAudioCursor.h"
 #include "movieAudioCursor.h"
 #include "reMutex.h"
 #include "reMutex.h"
+#include "vector_string.h"
 
 
 // OSX uses the OpenAL framework
 // OSX uses the OpenAL framework
 #ifdef HAVE_OPENAL_FRAMEWORK
 #ifdef HAVE_OPENAL_FRAMEWORK
@@ -173,6 +174,8 @@ private:
     int                  _channels;
     int                  _channels;
     int                  _client_count;
     int                  _client_count;
     ExpirationQueue::iterator _expire;
     ExpirationQueue::iterator _expire;
+
+    vector_string _comment;
   };
   };
 
 
 
 

+ 11 - 0
panda/src/audiotraits/openalAudioSound.cxx

@@ -17,6 +17,7 @@
 #include "coordinateSystem.h"
 #include "coordinateSystem.h"
 #include "openalAudioSound.h"
 #include "openalAudioSound.h"
 #include "openalAudioManager.h"
 #include "openalAudioManager.h"
+#include "vector_string.h"
 
 
 TypeHandle OpenALAudioSound::_type_handle;
 TypeHandle OpenALAudioSound::_type_handle;
 
 
@@ -88,6 +89,8 @@ OpenALAudioSound(OpenALAudioManager *manager,
       audio_warning("stereo sound " << movie->get_filename() << " will not be spatialized");
       audio_warning("stereo sound " << movie->get_filename() << " will not be spatialized");
     }
     }
   }
   }
+
+  _comment = std::move(_sd->_comment);
   release_sound_data(false);
   release_sound_data(false);
 }
 }
 
 
@@ -1092,3 +1095,11 @@ status() const {
     return AudioSound::PLAYING;
     return AudioSound::PLAYING;
   }
   }
 }
 }
+
+/**
+ * Returns the comments attached to this audio file.
+ */
+const vector_string& OpenALAudioSound::
+get_raw_comment() const {
+  return _comment;
+}

+ 5 - 0
panda/src/audiotraits/openalAudioSound.h

@@ -19,6 +19,7 @@
 #include "movieAudioCursor.h"
 #include "movieAudioCursor.h"
 #include "trueClock.h"
 #include "trueClock.h"
 #include "openalAudioManager.h"
 #include "openalAudioManager.h"
+#include "vector_string.h"
 
 
 // OSX uses the OpenAL framework
 // OSX uses the OpenAL framework
 #ifdef HAVE_OPENAL_FRAMEWORK
 #ifdef HAVE_OPENAL_FRAMEWORK
@@ -117,6 +118,8 @@ public:
 
 
   void finished();
   void finished();
 
 
+  const vector_string& get_raw_comment() const;
+
 private:
 private:
   OpenALAudioSound(OpenALAudioManager* manager,
   OpenALAudioSound(OpenALAudioManager* manager,
                    MovieAudio *movie,
                    MovieAudio *movie,
@@ -214,6 +217,8 @@ private:
   PN_stdfloat _cone_outer_angle;
   PN_stdfloat _cone_outer_angle;
   PN_stdfloat _cone_outer_gain;
   PN_stdfloat _cone_outer_gain;
 
 
+  vector_string _comment;
+
 public:
 public:
   static TypeHandle get_class_type() {
   static TypeHandle get_class_type() {
     return _type_handle;
     return _type_handle;

+ 26 - 3
panda/src/display/graphicsEngine.cxx

@@ -1145,21 +1145,44 @@ extract_texture_data(Texture *tex, GraphicsStateGuardian *gsg) {
  *
  *
  * This is mainly useful for debugging.  It is a very slow call because it
  * This is mainly useful for debugging.  It is a very slow call because it
  * introduces a pipeline stall both of Panda's pipeline and the graphics
  * introduces a pipeline stall both of Panda's pipeline and the graphics
- * pipeline.
+ * pipeline.  This is even when only a small amount of data is downloaded.
  *
  *
  * The return value is empty if some kind of error occurred.
  * The return value is empty if some kind of error occurred.
  */
  */
 vector_uchar GraphicsEngine::
 vector_uchar GraphicsEngine::
-extract_shader_buffer_data(ShaderBuffer *buffer, GraphicsStateGuardian *gsg) {
+extract_shader_buffer_data(ShaderBuffer *buffer, GraphicsStateGuardian *gsg,
+                           size_t start, size_t size) {
   return run_on_draw_thread([=] () {
   return run_on_draw_thread([=] () {
     vector_uchar data;
     vector_uchar data;
-    if (!gsg->extract_shader_buffer_data(buffer, data)) {
+    if (!gsg->extract_shader_buffer_data(buffer, data, start, size)) {
       data.clear();
       data.clear();
     }
     }
     return data;
     return data;
   });
   });
 }
 }
 
 
+/**
+ * Updates shader buffer data from the CPU.  This may be a subsection of the
+ * buffer.  Returns true on success.
+ *
+ * Updating is only allowed for buffers with usage hint set to UH_dynamic.
+ */
+bool GraphicsEngine::
+update_shader_buffer_data(ShaderBuffer *buffer, GraphicsStateGuardian *gsg,
+                          const vector_uchar &data, size_t offset) {
+  if (buffer->get_usage_hint() != GeomEnums::UH_dynamic) {
+    display_cat.error()
+      << "Cannot update ShaderBuffer with usage hint " << buffer->get_usage_hint() << "\n";
+    return false;
+  }
+
+  const unsigned char *data_ptr = data.data();
+  const size_t data_size = data.size();
+  return run_on_draw_thread([=] () {
+    return gsg->update_shader_buffer_data(buffer, offset, data_size, data_ptr);
+  });
+}
+
 /**
 /**
  * Asks the indicated GraphicsStateGuardian to dispatch the compute shader in
  * Asks the indicated GraphicsStateGuardian to dispatch the compute shader in
  * the given ShaderAttrib using the given work group counts.  This can act as
  * the given ShaderAttrib using the given work group counts.  This can act as

+ 5 - 1
panda/src/display/graphicsEngine.h

@@ -115,7 +115,11 @@ PUBLISHED:
   BLOCKING void flip_frame();
   BLOCKING void flip_frame();
 
 
   bool extract_texture_data(Texture *tex, GraphicsStateGuardian *gsg);
   bool extract_texture_data(Texture *tex, GraphicsStateGuardian *gsg);
-  vector_uchar extract_shader_buffer_data(ShaderBuffer *buffer, GraphicsStateGuardian *gsg);
+  vector_uchar extract_shader_buffer_data(ShaderBuffer *buffer, GraphicsStateGuardian *gsg,
+                                          size_t start = 0, size_t size = (size_t)-1);
+  bool update_shader_buffer_data(ShaderBuffer *buffer, GraphicsStateGuardian *gsg,
+                                 const vector_uchar &data, size_t offset = 0);
+
   void dispatch_compute(const LVecBase3i &work_groups,
   void dispatch_compute(const LVecBase3i &work_groups,
                         const RenderState *state,
                         const RenderState *state,
                         GraphicsStateGuardian *gsg);
                         GraphicsStateGuardian *gsg);

+ 25 - 1
panda/src/display/graphicsStateGuardian.cxx

@@ -763,6 +763,19 @@ release_shader_buffers(const pvector<BufferContext *> &contexts) {
   }
   }
 }
 }
 
 
+/**
+ * This method should only be called by the GraphicsEngine.  Do not call it
+ * directly; call GraphicsEngine::update_shader_buffer_data() instead.
+ *
+ * This method will be called in the draw thread to upload data to (a part of)
+ * the shader buffer from the CPU.  If data is null, clears the buffer instead.
+ */
+bool GraphicsStateGuardian::
+update_shader_buffer_data(ShaderBuffer *buffer, size_t start, size_t size,
+                          const unsigned char *data) {
+  return false;
+}
+
 /**
 /**
  * This method should only be called by the GraphicsEngine.  Do not call it
  * This method should only be called by the GraphicsEngine.  Do not call it
  * directly; call GraphicsEngine::extract_texture_data() instead.
  * directly; call GraphicsEngine::extract_texture_data() instead.
@@ -771,10 +784,21 @@ release_shader_buffers(const pvector<BufferContext *> &contexts) {
  * current contents synchronously.
  * current contents synchronously.
  */
  */
 bool GraphicsStateGuardian::
 bool GraphicsStateGuardian::
-extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data) {
+extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data,
+                           size_t start, size_t size) {
   return false;
   return false;
 }
 }
 
 
+/**
+ * Asynchronous version of extract_shader_buffer_data.  It is the caller's
+ * responsibility that the data argument outlasts the token.
+ */
+void GraphicsStateGuardian::
+async_extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data,
+                                 size_t start, size_t size, CompletionToken token) {
+  token.complete(extract_shader_buffer_data(buffer, data, start, size));
+}
+
 /**
 /**
  * Begins a new occlusion query.  After this call, you may call
  * Begins a new occlusion query.  After this call, you may call
  * begin_draw_primitives() and draw_triangles()/draw_whatever() repeatedly.
  * begin_draw_primitives() and draw_triangles()/draw_whatever() repeatedly.

+ 7 - 1
panda/src/display/graphicsStateGuardian.h

@@ -329,7 +329,13 @@ public:
   virtual BufferContext *prepare_shader_buffer(ShaderBuffer *data);
   virtual BufferContext *prepare_shader_buffer(ShaderBuffer *data);
   virtual void release_shader_buffer(BufferContext *ibc);
   virtual void release_shader_buffer(BufferContext *ibc);
   virtual void release_shader_buffers(const pvector<BufferContext *> &contexts);
   virtual void release_shader_buffers(const pvector<BufferContext *> &contexts);
-  virtual bool extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data);
+  virtual bool update_shader_buffer_data(ShaderBuffer *buffer, size_t start,
+                                         size_t size, const unsigned char *data);
+  virtual bool extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data,
+                                          size_t start = 0, size_t size = (size_t)-1);
+  virtual void async_extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data,
+                                                size_t start = 0, size_t size = (size_t)-1,
+                                                CompletionToken token = CompletionToken());
 
 
   virtual void begin_occlusion_query();
   virtual void begin_occlusion_query();
   virtual PT(OcclusionQueryContext) end_occlusion_query();
   virtual PT(OcclusionQueryContext) end_occlusion_query();

+ 1 - 2
panda/src/event/asyncFuture_ext.cxx

@@ -332,8 +332,7 @@ add_done_callback(PyObject *self, PyObject *fn) {
   }
   }
 
 
   PythonTask *task = new PythonTask(fn);
   PythonTask *task = new PythonTask(fn);
-  Py_DECREF(task->_args);
-  task->_args = PyTuple_Pack(1, self);
+  task->_args.assign(1, Py_NewRef(self));
   task->_append_task = false;
   task->_append_task = false;
   task->_ignore_return = true;
   task->_ignore_return = true;
 
 

+ 27 - 0
panda/src/event/eventParameter.I

@@ -63,6 +63,11 @@ EventParameter(const std::string &value) : _ptr(new EventStoreString(value)) { }
 INLINE EventParameter::
 INLINE EventParameter::
 EventParameter(const std::wstring &value) : _ptr(new EventStoreWstring(value)) { }
 EventParameter(const std::wstring &value) : _ptr(new EventStoreWstring(value)) { }
 
 
+/**
+ * Defines an EventParameter that stores a bytes value.
+ */
+INLINE EventParameter::
+EventParameter(const vector_uchar &value) : _ptr(new ParamBytes(value)) { }
 
 
 /**
 /**
  *
  *
@@ -185,6 +190,28 @@ get_wstring_value() const {
   return ((const EventStoreWstring *)_ptr.p())->get_value();
   return ((const EventStoreWstring *)_ptr.p())->get_value();
 }
 }
 
 
+/**
+ * Returns true if the EventParameter stores a binary bytes value, false
+ * otherwise.
+ */
+INLINE bool EventParameter::
+is_bytes() const {
+  if (is_empty()) {
+    return false;
+  }
+  return _ptr->is_of_type(ParamBytes::get_class_type());
+}
+
+/**
+ * Retrieves the value stored in the EventParameter.  It is only valid to call
+ * this if is_bytes() has already returned true.
+ */
+INLINE vector_uchar EventParameter::
+get_bytes_value() const {
+  nassertr(is_bytes(), vector_uchar());
+  return ((const ParamBytes *)_ptr.p())->get_value();
+}
+
 /**
 /**
  * Returns true if the EventParameter stores a TypedReferenceCount pointer,
  * Returns true if the EventParameter stores a TypedReferenceCount pointer,
  * false otherwise.  Note that a TypedReferenceCount is not exactly the same
  * false otherwise.  Note that a TypedReferenceCount is not exactly the same

+ 3 - 0
panda/src/event/eventParameter.h

@@ -41,6 +41,7 @@ PUBLISHED:
   INLINE EventParameter(double value);
   INLINE EventParameter(double value);
   INLINE EventParameter(const std::string &value);
   INLINE EventParameter(const std::string &value);
   INLINE EventParameter(const std::wstring &value);
   INLINE EventParameter(const std::wstring &value);
+  INLINE EventParameter(const vector_uchar &value);
 
 
   INLINE EventParameter(const EventParameter &copy);
   INLINE EventParameter(const EventParameter &copy);
   INLINE EventParameter &operator = (const EventParameter &copy);
   INLINE EventParameter &operator = (const EventParameter &copy);
@@ -59,6 +60,8 @@ PUBLISHED:
   INLINE std::string get_string_value() const;
   INLINE std::string get_string_value() const;
   INLINE bool is_wstring() const;
   INLINE bool is_wstring() const;
   INLINE std::wstring get_wstring_value() const;
   INLINE std::wstring get_wstring_value() const;
+  INLINE bool is_bytes() const;
+  INLINE vector_uchar get_bytes_value() const;
 
 
   INLINE bool is_typed_ref_count() const;
   INLINE bool is_typed_ref_count() const;
   INLINE TypedReferenceCount *get_typed_ref_count_value() const;
   INLINE TypedReferenceCount *get_typed_ref_count_value() const;

+ 56 - 38
panda/src/event/pythonTask.cxx

@@ -37,7 +37,6 @@ PythonTask::
 PythonTask(PyObject *func_or_coro, const std::string &name) :
 PythonTask(PyObject *func_or_coro, const std::string &name) :
   AsyncTask(name),
   AsyncTask(name),
   _function(nullptr),
   _function(nullptr),
-  _args(nullptr),
   _upon_death(nullptr),
   _upon_death(nullptr),
   _owner(nullptr),
   _owner(nullptr),
   _exception(nullptr),
   _exception(nullptr),
@@ -45,6 +44,7 @@ PythonTask(PyObject *func_or_coro, const std::string &name) :
   _exc_traceback(nullptr),
   _exc_traceback(nullptr),
   _generator(nullptr),
   _generator(nullptr),
   _fut_waiter(nullptr),
   _fut_waiter(nullptr),
+  _append_task(true),
   _ignore_return(false),
   _ignore_return(false),
   _registered_to_owner(false),
   _registered_to_owner(false),
   _retrieved_exception(false) {
   _retrieved_exception(false) {
@@ -63,7 +63,6 @@ PythonTask(PyObject *func_or_coro, const std::string &name) :
     nassert_raise("Invalid function passed to PythonTask");
     nassert_raise("Invalid function passed to PythonTask");
   }
   }
 
 
-  set_args(Py_None, true);
   set_upon_death(Py_None);
   set_upon_death(Py_None);
   set_owner(Py_None);
   set_owner(Py_None);
 
 
@@ -104,7 +103,9 @@ PythonTask::
 
 
   // All of these may have already been cleared by __clear__.
   // All of these may have already been cleared by __clear__.
   Py_XDECREF(_function);
   Py_XDECREF(_function);
-  Py_XDECREF(_args);
+  for (PyObject *arg : _args) {
+    Py_DECREF(arg);
+  }
   Py_XDECREF(__dict__);
   Py_XDECREF(__dict__);
   Py_XDECREF(_exception);
   Py_XDECREF(_exception);
   Py_XDECREF(_exc_value);
   Py_XDECREF(_exc_value);
@@ -134,21 +135,23 @@ set_function(PyObject *function) {
  */
  */
 void PythonTask::
 void PythonTask::
 set_args(PyObject *args, bool append_task) {
 set_args(PyObject *args, bool append_task) {
-  Py_XDECREF(_args);
-  _args = nullptr;
-
-  if (args == Py_None) {
-    // None means no arguments; create an empty tuple.
-    _args = PyTuple_New(0);
-  } else {
-    if (PySequence_Check(args)) {
-      _args = PySequence_Tuple(args);
+  pvector<PyObject *> old_args = std::exchange(_args, {});
+
+  if (PySequence_Check(args)) {
+    Py_ssize_t len = PySequence_Size(args);
+    _args.resize(len);
+
+    for (Py_ssize_t i = 0; i < len; ++i) {
+      _args[i] = PySequence_GetItem(args, i);
     }
     }
   }
   }
-
-  if (_args == nullptr) {
+  else if (args != Py_None) {
     nassert_raise("Invalid args passed to PythonTask");
     nassert_raise("Invalid args passed to PythonTask");
-    _args = PyTuple_New(0);
+    return;
+  }
+
+  for (PyObject *old_arg : old_args) {
+    Py_DECREF(old_arg);
   }
   }
 
 
   _append_task = append_task;
   _append_task = append_task;
@@ -159,19 +162,18 @@ set_args(PyObject *args, bool append_task) {
  */
  */
 PyObject *PythonTask::
 PyObject *PythonTask::
 get_args() {
 get_args() {
-  if (_append_task) {
-    // If we want to append the task, we have to create a new tuple with space
-    // for one more at the end.  We have to do this dynamically each time, to
-    // avoid storing the task itself in its own arguments list, and thereby
-    // creating a cyclical reference.
-
-    int num_args = PyTuple_GET_SIZE(_args);
-    PyObject *with_task = PyTuple_New(num_args + 1);
-    for (int i = 0; i < num_args; ++i) {
-      PyObject *item = PyTuple_GET_ITEM(_args, i);
-      PyTuple_SET_ITEM(with_task, i, Py_NewRef(item));
-    }
+  // If we want to append the task, we have to create a new tuple with space
+  // for one more at the end.  We have to do this dynamically each time, to
+  // avoid storing the task itself in its own arguments list, and thereby
+  // creating a cyclical reference.
+
+  size_t num_args = _args.size();
+  PyObject *result = PyTuple_New(num_args + _append_task);
+  for (size_t i = 0; i < num_args; ++i) {
+    PyTuple_SET_ITEM(result, i, Py_NewRef(_args[i]));
+  }
 
 
+  if (_append_task) {
     // Check whether we have a Python wrapper.  This is not the case if the
     // Check whether we have a Python wrapper.  This is not the case if the
     // object has been created by C++ and never been exposed to Python code.
     // object has been created by C++ and never been exposed to Python code.
     if (__self__ == nullptr) {
     if (__self__ == nullptr) {
@@ -180,12 +182,9 @@ get_args() {
       __self__ = DTool_CreatePyInstance(this, Dtool_PythonTask, true, false);
       __self__ = DTool_CreatePyInstance(this, Dtool_PythonTask, true, false);
     }
     }
 
 
-    PyTuple_SET_ITEM(with_task, num_args, Py_NewRef(__self__));
-    return with_task;
-  }
-  else {
-    return Py_NewRef(_args);
+    PyTuple_SET_ITEM(result, num_args, Py_NewRef(__self__));
   }
   }
+  return result;
 }
 }
 
 
 /**
 /**
@@ -376,7 +375,9 @@ int PythonTask::
 __traverse__(visitproc visit, void *arg) {
 __traverse__(visitproc visit, void *arg) {
   Py_VISIT(__self__);
   Py_VISIT(__self__);
   Py_VISIT(_function);
   Py_VISIT(_function);
-  Py_VISIT(_args);
+  for (PyObject *arg : _args) {
+    Py_VISIT(arg);
+  }
   Py_VISIT(_upon_death);
   Py_VISIT(_upon_death);
   Py_VISIT(_owner);
   Py_VISIT(_owner);
   Py_VISIT(__dict__);
   Py_VISIT(__dict__);
@@ -390,7 +391,12 @@ __traverse__(visitproc visit, void *arg) {
 int PythonTask::
 int PythonTask::
 __clear__() {
 __clear__() {
   Py_CLEAR(_function);
   Py_CLEAR(_function);
-  Py_CLEAR(_args);
+  {
+    pvector<PyObject *> old_args = std::exchange(_args, {});
+    for (PyObject *arg : old_args) {
+      Py_DECREF(arg);
+    }
+  }
   Py_CLEAR(_upon_death);
   Py_CLEAR(_upon_death);
   Py_CLEAR(_owner);
   Py_CLEAR(_owner);
   Py_CLEAR(__dict__);
   Py_CLEAR(__dict__);
@@ -590,9 +596,21 @@ do_python_task() {
     // We are calling the function directly.
     // We are calling the function directly.
     nassertr(_function != nullptr, DS_interrupt);
     nassertr(_function != nullptr, DS_interrupt);
 
 
-    PyObject *args = get_args();
-    result = PythonThread::call_python_func(_function, args);
-    Py_DECREF(args);
+    size_t nargs = _args.size();
+    PyObject **args = (PyObject **)alloca(sizeof(PyObject *) * (nargs + 2)) + 1;
+    std::copy(_args.begin(), _args.end(), args);
+
+    if (_append_task) {
+      // Check whether we have a Python wrapper.  This is not the case if the
+      // object has been created by C++ and never been exposed to Python code.
+      if (__self__ == nullptr) {
+        // A __self__ instance does not exist, let's create one now.
+        ref();
+        __self__ = DTool_CreatePyInstance(this, Dtool_PythonTask, true, false);
+      }
+      args[nargs++] = __self__;
+    }
+    result = PythonThread::call_python_func(_function, args, nargs | PY_VECTORCALL_ARGUMENTS_OFFSET);
 
 
     if (result != nullptr && PyGen_Check(result)) {
     if (result != nullptr && PyGen_Check(result)) {
       // The function has yielded a generator.  We will call into that
       // The function has yielded a generator.  We will call into that
@@ -652,7 +670,7 @@ do_python_task() {
       PyObject *func = PyObject_GetAttrString(_generator, "throw");
       PyObject *func = PyObject_GetAttrString(_generator, "throw");
       if (func != nullptr) {
       if (func != nullptr) {
         PyObject *exc = PyObject_CallNoArgs(exc_type);
         PyObject *exc = PyObject_CallNoArgs(exc_type);
-        result = PyObject_CallFunctionObjArgs(func, exc, nullptr);
+        result = PyObject_CallOneArg(func, exc);
         Py_DECREF(exc);
         Py_DECREF(exc);
         Py_DECREF(func);
         Py_DECREF(func);
       } else {
       } else {

+ 1 - 1
panda/src/event/pythonTask.h

@@ -114,7 +114,7 @@ private:
 
 
 private:
 private:
   PyObject *_function;
   PyObject *_function;
-  PyObject *_args;
+  pvector<PyObject *> _args;
   PyObject *_upon_death;
   PyObject *_upon_death;
   PyObject *_owner;
   PyObject *_owner;
 
 

+ 5 - 1
panda/src/express/pointerToArray_ext.I

@@ -106,6 +106,9 @@ __init__(PyObject *self, PyObject *source) {
   // We need to initialize the this pointer before we can call push_back.
   // We need to initialize the this pointer before we can call push_back.
   DtoolInstance_INIT_PTR(self, this->_this);
   DtoolInstance_INIT_PTR(self, this->_this);
 
 
+  PyObject *args[3];
+  args[1] = self;
+
   Py_BEGIN_CRITICAL_SECTION(source);
   Py_BEGIN_CRITICAL_SECTION(source);
   Py_ssize_t size = PySequence_Size(source);
   Py_ssize_t size = PySequence_Size(source);
   this->_this->reserve(size);
   this->_this->reserve(size);
@@ -114,7 +117,8 @@ __init__(PyObject *self, PyObject *source) {
     if (item == nullptr) {
     if (item == nullptr) {
       break;
       break;
     }
     }
-    PyObject *result = PyObject_CallFunctionObjArgs(push_back, self, item, nullptr);
+    args[2] = item;
+    PyObject *result = _PyObject_Vectorcall(push_back, args + 1, 2 | PY_VECTORCALL_ARGUMENTS_OFFSET, nullptr);
     Py_DECREF(item);
     Py_DECREF(item);
     if (result == nullptr) {
     if (result == nullptr) {
       // Unable to add item--probably it wasn't of the appropriate type.
       // Unable to add item--probably it wasn't of the appropriate type.

+ 17 - 0
panda/src/ffmpeg/ffmpegAudioCursor.cxx

@@ -15,6 +15,7 @@
 #include "ffmpegAudioCursor.h"
 #include "ffmpegAudioCursor.h"
 
 
 #include "ffmpegAudio.h"
 #include "ffmpegAudio.h"
+#include "vector_string.h"
 extern "C" {
 extern "C" {
   #include <libavutil/dict.h>
   #include <libavutil/dict.h>
   #include <libavutil/opt.h>
   #include <libavutil/opt.h>
@@ -508,3 +509,19 @@ read_samples(int n, int16_t *data) {
   _samples_read += n;
   _samples_read += n;
   return n;
   return n;
 }
 }
+
+/**
+ *
+ */
+vector_string FfmpegAudioCursor::
+get_raw_comment() const {
+  AVDictionaryEntry *tag = nullptr;
+  vector_string comment_strings;
+  while ((tag = av_dict_get(_format_ctx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
+    std::string comment(tag->key);
+    comment.append("=");
+    comment.append(tag->value);
+    comment_strings.push_back(std::move(comment));
+  }
+  return comment_strings;
+}

+ 2 - 0
panda/src/ffmpeg/ffmpegAudioCursor.h

@@ -21,6 +21,7 @@
 #include "texture.h"
 #include "texture.h"
 #include "pointerTo.h"
 #include "pointerTo.h"
 #include "ffmpegVirtualFile.h"
 #include "ffmpegVirtualFile.h"
+#include "vector_string.h"
 
 
 extern "C" {
 extern "C" {
   #include <libavcodec/avcodec.h>
   #include <libavcodec/avcodec.h>
@@ -43,6 +44,7 @@ PUBLISHED:
   FfmpegAudioCursor(FfmpegAudio *src);
   FfmpegAudioCursor(FfmpegAudio *src);
   virtual ~FfmpegAudioCursor();
   virtual ~FfmpegAudioCursor();
   virtual void seek(double offset);
   virtual void seek(double offset);
+  vector_string get_raw_comment() const;
 
 
 public:
 public:
   virtual int read_samples(int n, int16_t *data);
   virtual int read_samples(int n, int16_t *data);

+ 5 - 0
panda/src/gles2gsg/gles2gsg.h

@@ -179,6 +179,7 @@ typedef char GLchar;
 #define GL_RG32UI 0x823C
 #define GL_RG32UI 0x823C
 #define GL_PROGRAM_BINARY_RETRIEVABLE_HINT 0x8257
 #define GL_PROGRAM_BINARY_RETRIEVABLE_HINT 0x8257
 #define GL_MAX_VERTEX_ATTRIB_STRIDE 0x82E5
 #define GL_MAX_VERTEX_ATTRIB_STRIDE 0x82E5
+#define GL_TEXTURE_COMPRESSED 0x86A1
 #define GL_PROGRAM_BINARY_LENGTH 0x8741
 #define GL_PROGRAM_BINARY_LENGTH 0x8741
 #define GL_NUM_PROGRAM_BINARY_FORMATS 0x87FE
 #define GL_NUM_PROGRAM_BINARY_FORMATS 0x87FE
 #define GL_PROGRAM_BINARY_FORMATS 0x87FF
 #define GL_PROGRAM_BINARY_FORMATS 0x87FF
@@ -253,6 +254,10 @@ typedef char GLchar;
 #define GL_UNSIGNED_INT_SAMPLER_CUBE 0x8DD4
 #define GL_UNSIGNED_INT_SAMPLER_CUBE 0x8DD4
 #define GL_UNSIGNED_INT_SAMPLER_2D_ARRAY 0x8DD7
 #define GL_UNSIGNED_INT_SAMPLER_2D_ARRAY 0x8DD7
 #define GL_UNSIGNED_INT_SAMPLER_BUFFER 0x8DD8
 #define GL_UNSIGNED_INT_SAMPLER_BUFFER 0x8DD8
+#define GL_COPY_READ_BUFFER 0x8F36
+#define GL_COPY_READ_BUFFER_BINDING 0x8F36
+#define GL_COPY_WRITE_BUFFER 0x8F36
+#define GL_COPY_WRITE_BUFFER_BINDING 0x8F36
 #define GL_MAX_IMAGE_UNITS 0x8F38
 #define GL_MAX_IMAGE_UNITS 0x8F38
 #define GL_TEXTURE_CUBE_MAP_ARRAY 0x9009
 #define GL_TEXTURE_CUBE_MAP_ARRAY 0x9009
 #define GL_TEXTURE_BINDING_CUBE_MAP_ARRAY 0x900A
 #define GL_TEXTURE_BINDING_CUBE_MAP_ARRAY 0x900A

+ 10 - 0
panda/src/glstuff/glBufferContext_src.cxx

@@ -47,3 +47,13 @@ evict_lru() {
   update_data_size_bytes(0);
   update_data_size_bytes(0);
   set_resident(false);
   set_resident(false);
 }
 }
+
+/**
+ * Returns an implementation-defined handle or pointer that can be used
+ * to interface directly with the underlying API.
+ * Returns 0 if the underlying implementation does not support this.
+ */
+uint64_t CLP(BufferContext)::
+get_native_id() const {
+  return _index;
+}

+ 2 - 0
panda/src/glstuff/glBufferContext_src.h

@@ -27,6 +27,8 @@ public:
 
 
   virtual void evict_lru();
   virtual void evict_lru();
 
 
+  virtual uint64_t get_native_id() const;
+
   CLP(GraphicsStateGuardian) *_glgsg;
   CLP(GraphicsStateGuardian) *_glgsg;
 
 
   // This is the GL "name" of the data object.
   // This is the GL "name" of the data object.

+ 161 - 18
panda/src/glstuff/glGraphicsStateGuardian_src.cxx

@@ -1262,10 +1262,12 @@ reset() {
   if (is_at_least_gl_version(4, 3) || has_extension("GL_ARB_clear_buffer_object")) {
   if (is_at_least_gl_version(4, 3) || has_extension("GL_ARB_clear_buffer_object")) {
     _glClearBufferData = (PFNGLCLEARBUFFERDATAPROC)
     _glClearBufferData = (PFNGLCLEARBUFFERDATAPROC)
       get_extension_func("glClearBufferData");
       get_extension_func("glClearBufferData");
+    _glClearBufferSubData = (PFNGLCLEARBUFFERSUBDATAPROC)
+      get_extension_func("glClearBufferSubData");
 
 
-    if (_glClearBufferData == nullptr) {
+    if (_glClearBufferData == nullptr || _glClearBufferSubData == nullptr) {
       GLCAT.warning()
       GLCAT.warning()
-        << "GL_ARB_clear_buffer_object advertised as supported by OpenGL runtime, but could not get pointers to extension function.\n";
+        << "GL_ARB_clear_buffer_object advertised as supported by OpenGL runtime, but could not get pointers to extension functions.\n";
     } else {
     } else {
       _supports_clear_buffer = true;
       _supports_clear_buffer = true;
     }
     }
@@ -3058,6 +3060,15 @@ reset() {
         get_extension_func("glMapNamedBufferRange");
         get_extension_func("glMapNamedBufferRange");
     }
     }
 
 
+    _glNamedBufferSubData = (PFNGLNAMEDBUFFERSUBDATAPROC)
+      get_extension_func("glNamedBufferSubData");
+    _glClearNamedBufferSubData = (PFNGLCLEARNAMEDBUFFERSUBDATAPROC)
+      get_extension_func("glClearNamedBufferSubData");
+    _glCopyNamedBufferSubData = (PFNGLCOPYNAMEDBUFFERSUBDATAPROC)
+      get_extension_func("glCopyNamedBufferSubData");
+    _glGetNamedBufferSubData = (PFNGLGETNAMEDBUFFERSUBDATAPROC)
+      get_extension_func("glGetNamedBufferSubData");
+
     _supports_dsa = true;
     _supports_dsa = true;
   } else {
   } else {
     _supports_dsa = false;
     _supports_dsa = false;
@@ -7822,6 +7833,9 @@ prepare_shader_buffer(ShaderBuffer *data) {
       if (data->get_usage_hint() == GeomEnums::UH_client) {
       if (data->get_usage_hint() == GeomEnums::UH_client) {
         flags |= GL_CLIENT_STORAGE_BIT;
         flags |= GL_CLIENT_STORAGE_BIT;
       }
       }
+      if (data->get_usage_hint() == GeomEnums::UH_dynamic) {
+        flags |= GL_DYNAMIC_STORAGE_BIT;
+      }
       _glBufferStorage(GL_SHADER_STORAGE_BUFFER, num_bytes, data->get_initial_data(), flags);
       _glBufferStorage(GL_SHADER_STORAGE_BUFFER, num_bytes, data->get_initial_data(), flags);
     } else
     } else
 #endif
 #endif
@@ -7972,6 +7986,66 @@ release_shader_buffers(const pvector<BufferContext *> &contexts) {
   report_my_gl_errors();
   report_my_gl_errors();
 }
 }
 
 
+/**
+ * This method should only be called by the GraphicsEngine.  Do not call it
+ * directly; call GraphicsEngine::update_shader_buffer_data() instead.
+ *
+ * This method will be called in the draw thread to upload data to (a part of)
+ * the shader buffer from the CPU.  If data is null, clears the buffer instead.
+ */
+bool CLP(GraphicsStateGuardian)::
+update_shader_buffer_data(ShaderBuffer *buffer, size_t start, size_t size,
+                          const unsigned char *data) {
+  nassertr(buffer->get_usage_hint() == GeomEnums::UH_dynamic, false);
+
+  BufferContext *bc = buffer->prepare_now(get_prepared_objects(), this);
+  if (bc == nullptr || !bc->is_of_type(CLP(BufferContext)::get_class_type())) {
+    return false;
+  }
+  CLP(BufferContext) *gbc = DCAST(CLP(BufferContext), bc);
+
+  if (_glMemoryBarrier != nullptr) {
+    _glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+  }
+
+#ifndef OPENGLES
+  if (_supports_dsa) {
+    if (data != nullptr) {
+      _glNamedBufferSubData(gbc->_index, start, size, data);
+    }
+    else if (_supports_clear_buffer) {
+      _glClearNamedBufferSubData(gbc->_index, GL_R8, start, size, GL_RED, GL_UNSIGNED_BYTE, nullptr);
+    }
+    else {
+      void *null_data = calloc(1, size);
+      _glNamedBufferSubData(gbc->_index, start, size, null_data);
+      free(null_data);
+    }
+  } else
+#endif
+  {
+    _glBindBuffer(GL_SHADER_STORAGE_BUFFER, gbc->_index);
+    if (data != nullptr) {
+      _glBufferSubData(GL_SHADER_STORAGE_BUFFER, start, size, data);
+    }
+#ifndef OPENGLES
+    else if (_supports_clear_buffer) {
+      _glClearBufferSubData(GL_SHADER_STORAGE_BUFFER, GL_R8, start, size, GL_RED, GL_UNSIGNED_BYTE, nullptr);
+    }
+#endif
+    else {
+      void *null_data = calloc(1, size);
+      _glBufferSubData(GL_SHADER_STORAGE_BUFFER, start, size, null_data);
+      free(null_data);
+    }
+    _glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+    _current_sbuffer_index = 0;
+  }
+  report_my_gl_errors();
+
+  return false;
+}
+
 /**
 /**
  * This method should only be called by the GraphicsEngine.  Do not call it
  * This method should only be called by the GraphicsEngine.  Do not call it
  * directly; call GraphicsEngine::extract_texture_data() instead.
  * directly; call GraphicsEngine::extract_texture_data() instead.
@@ -7980,14 +8054,21 @@ release_shader_buffers(const pvector<BufferContext *> &contexts) {
  * current contents synchronously.
  * current contents synchronously.
  */
  */
 bool CLP(GraphicsStateGuardian)::
 bool CLP(GraphicsStateGuardian)::
-extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data) {
+extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data,
+                           size_t start, size_t size) {
   BufferContext *bc = buffer->prepare_now(get_prepared_objects(), this);
   BufferContext *bc = buffer->prepare_now(get_prepared_objects(), this);
   if (bc == nullptr || !bc->is_of_type(CLP(BufferContext)::get_class_type())) {
   if (bc == nullptr || !bc->is_of_type(CLP(BufferContext)::get_class_type())) {
     return false;
     return false;
   }
   }
   CLP(BufferContext) *gbc = DCAST(CLP(BufferContext), bc);
   CLP(BufferContext) *gbc = DCAST(CLP(BufferContext), bc);
 
 
-  data.resize(buffer->get_data_size_bytes());
+  size_t total_size = buffer->get_data_size_bytes();
+  if (start >= total_size) {
+    data.clear();
+    return true;
+  }
+
+  data.resize(std::min(total_size - start, size));
 
 
   if (_glMemoryBarrier != nullptr) {
   if (_glMemoryBarrier != nullptr) {
     _glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
     _glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
@@ -8010,19 +8091,79 @@ extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data) {
   }
   }
   memcpy(&data[0], ptr, data.size());
   memcpy(&data[0], ptr, data.size());
   _glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
   _glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
+  _glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+  _current_sbuffer_index = 0;
 #else
 #else
-  _glBindBuffer(GL_SHADER_STORAGE_BUFFER, gbc->_index);
-  _glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, data.size(), &data[0]);
+  if (_supports_dsa) {
+    _glGetNamedBufferSubData(gbc->_index, start, data.size(), &data[0]);
+  } else {
+    _glBindBuffer(GL_SHADER_STORAGE_BUFFER, gbc->_index);
+    _glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, start, data.size(), &data[0]);
+    _glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+    _current_sbuffer_index = 0;
+  }
 #endif
 #endif
 
 
-  _glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
-  _current_sbuffer_index = 0;
   report_my_gl_errors();
   report_my_gl_errors();
 
 
   return true;
   return true;
 }
 }
 #endif  // !OPENGLES_1
 #endif  // !OPENGLES_1
 
 
+#ifndef OPENGLES
+/**
+ * Asynchronous version of extract_shader_buffer_data.  It is the caller's
+ * responsibility that the data argument outlasts the token.
+ */
+void CLP(GraphicsStateGuardian)::
+async_extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data,
+                                 size_t start, size_t size, CompletionToken token) {
+  BufferContext *bc = buffer->prepare_now(get_prepared_objects(), this);
+  if (bc == nullptr || !bc->is_of_type(CLP(BufferContext)::get_class_type())) {
+    token.complete(false);
+    return;
+  }
+
+  CLP(BufferContext) *gbc = DCAST(CLP(BufferContext), bc);
+
+  size_t total_size = buffer->get_data_size_bytes();
+  if (start >= total_size) {
+    data.clear();
+    token.complete(true);
+    return;
+  }
+
+  size = std::min(total_size - start, size);
+
+  GLuint staging_buffer = 0;
+  void *mapped_ptr = nullptr;
+
+  bind_new_client_buffer(staging_buffer, mapped_ptr, GL_COPY_WRITE_BUFFER, size);
+  if (_glMemoryBarrier != nullptr) {
+    _glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+  }
+
+  _glBindBuffer(GL_SHADER_STORAGE_BUFFER, gbc->_index);
+  _glCopyBufferSubData(GL_SHADER_STORAGE_BUFFER, GL_COPY_WRITE_BUFFER, start, 0, size);
+  _glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+  _glBindBuffer(GL_COPY_WRITE_BUFFER, 0);
+  _current_sbuffer_index = 0;
+
+  if (_supports_buffer_storage && _glMemoryBarrier != nullptr) {
+    _glMemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
+  }
+
+  insert_fence([=, &data, token = std::move(token)] (bool success) mutable {
+    if (success) {
+      data = vector_uchar((const unsigned char *)mapped_ptr, (const unsigned char *)mapped_ptr + size);
+      token.complete(true);
+    } else {
+      token.complete(false);
+    }
+  });
+}
+#endif
+
 #ifndef OPENGLES
 #ifndef OPENGLES
 /**
 /**
  * Begins a new occlusion query.  After this call, you may call
  * Begins a new occlusion query.  After this call, you may call
@@ -15949,14 +16090,15 @@ do_extract_texture_data(CLP(TextureContext) *gtc, int view) {
 #endif
 #endif
   }
   }
 
 
+  GLint width = gtc->_width, height = gtc->_height, depth = gtc->_depth;
+
+#ifndef OPENGLES
   GLenum page_target = target;
   GLenum page_target = target;
   if (target == GL_TEXTURE_CUBE_MAP) {
   if (target == GL_TEXTURE_CUBE_MAP) {
     // We need a particular page to get the level parameter from.
     // We need a particular page to get the level parameter from.
     page_target = GL_TEXTURE_CUBE_MAP_POSITIVE_X;
     page_target = GL_TEXTURE_CUBE_MAP_POSITIVE_X;
   }
   }
 
 
-  GLint width = gtc->_width, height = gtc->_height, depth = gtc->_depth;
-#ifndef OPENGLES
   if (target != GL_TEXTURE_BUFFER) {
   if (target != GL_TEXTURE_BUFFER) {
     glGetTexLevelParameteriv(page_target, 0, GL_TEXTURE_WIDTH, &width);
     glGetTexLevelParameteriv(page_target, 0, GL_TEXTURE_WIDTH, &width);
     if (target != GL_TEXTURE_1D) {
     if (target != GL_TEXTURE_1D) {
@@ -16494,8 +16636,7 @@ do_extract_texture_data(CLP(TextureContext) *gtc, int view) {
   PTA_uchar image;
   PTA_uchar image;
   size_t page_size = 0;
   size_t page_size = 0;
 
 
-  if (!extract_texture_image(image, page_size, tex, target, page_target,
-                             type, compression, 0)) {
+  if (!extract_texture_image(image, page_size, tex, target, type, compression, 0)) {
     return false;
     return false;
   }
   }
 
 
@@ -16513,8 +16654,7 @@ do_extract_texture_data(CLP(TextureContext) *gtc, int view) {
 
 
   // Also get the mipmap levels.
   // Also get the mipmap levels.
   for (int n = 1; n < gtc->_num_levels; ++n) {
   for (int n = 1; n < gtc->_num_levels; ++n) {
-    if (!extract_texture_image(image, page_size, tex, target, page_target,
-                               type, compression, n)) {
+    if (!extract_texture_image(image, page_size, tex, target, type, compression, n)) {
       return false;
       return false;
     }
     }
     if (num_views == 1) {
     if (num_views == 1) {
@@ -16534,7 +16674,7 @@ do_extract_texture_data(CLP(TextureContext) *gtc, int view) {
  */
  */
 bool CLP(GraphicsStateGuardian)::
 bool CLP(GraphicsStateGuardian)::
 extract_texture_image(PTA_uchar &image, size_t &page_size,
 extract_texture_image(PTA_uchar &image, size_t &page_size,
-                      Texture *tex, GLenum target, GLenum page_target,
+                      Texture *tex, GLenum target,
                       Texture::ComponentType type,
                       Texture::ComponentType type,
                       Texture::CompressionMode compression, int n) {
                       Texture::CompressionMode compression, int n) {
 #ifdef OPENGLES  // Extracting texture data unsupported in OpenGL ES.
 #ifdef OPENGLES  // Extracting texture data unsupported in OpenGL ES.
@@ -16578,8 +16718,8 @@ extract_texture_image(PTA_uchar &image, size_t &page_size,
 
 
     if (compression != Texture::CM_off) {
     if (compression != Texture::CM_off) {
       GLint image_size;
       GLint image_size;
-      glGetTexLevelParameteriv(page_target, n,
-                                  GL_TEXTURE_COMPRESSED_IMAGE_SIZE, &image_size);
+      glGetTexLevelParameteriv(GL_TEXTURE_CUBE_MAP_POSITIVE_X, n,
+                               GL_TEXTURE_COMPRESSED_IMAGE_SIZE, &image_size);
       nassertr(image_size <= (int)page_size, false);
       nassertr(image_size <= (int)page_size, false);
       page_size = image_size;
       page_size = image_size;
     }
     }
@@ -16587,7 +16727,7 @@ extract_texture_image(PTA_uchar &image, size_t &page_size,
     image = PTA_uchar::empty_array(page_size * 6);
     image = PTA_uchar::empty_array(page_size * 6);
 
 
     for (int z = 0; z < 6; ++z) {
     for (int z = 0; z < 6; ++z) {
-      page_target = GL_TEXTURE_CUBE_MAP_POSITIVE_X + z;
+      GLenum page_target = GL_TEXTURE_CUBE_MAP_POSITIVE_X + z;
 
 
       if (compression == Texture::CM_off) {
       if (compression == Texture::CM_off) {
         glGetTexImage(page_target, n, external_format, pixel_type,
         glGetTexImage(page_target, n, external_format, pixel_type,
@@ -16940,6 +17080,9 @@ map_read_buffer(GLenum target, GLuint buffer, size_t size) {
 #endif
 #endif
 
 
   _glBindBuffer(target, 0);
   _glBindBuffer(target, 0);
+  if (target == GL_SHADER_STORAGE_BUFFER) {
+    _current_sbuffer_index = 0;
+  }
   return mapped_ptr;
   return mapped_ptr;
 }
 }
 
 

+ 17 - 4
panda/src/glstuff/glGraphicsStateGuardian_src.h

@@ -419,7 +419,13 @@ public:
   CLP(BufferContext) *apply_shader_buffer(GLuint base, ShaderBuffer *buffer);
   CLP(BufferContext) *apply_shader_buffer(GLuint base, ShaderBuffer *buffer);
   virtual void release_shader_buffer(BufferContext *bc);
   virtual void release_shader_buffer(BufferContext *bc);
   virtual void release_shader_buffers(const pvector<BufferContext *> &contexts);
   virtual void release_shader_buffers(const pvector<BufferContext *> &contexts);
-  virtual bool extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data);
+  virtual bool update_shader_buffer_data(ShaderBuffer *buffer, size_t start,
+                                         size_t size, const unsigned char *data);
+  virtual bool extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data,
+                                          size_t start, size_t size);
+  virtual void async_extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data,
+                                                size_t start = 0, size_t size = (size_t)-1,
+                                                CompletionToken token = CompletionToken());
 #endif
 #endif
 
 
 #ifndef OPENGLES
 #ifndef OPENGLES
@@ -676,9 +682,9 @@ protected:
   void check_nonresident_texture(BufferContextChain &chain);
   void check_nonresident_texture(BufferContextChain &chain);
   bool do_extract_texture_data(CLP(TextureContext) *gtc, int view);
   bool do_extract_texture_data(CLP(TextureContext) *gtc, int view);
   bool extract_texture_image(PTA_uchar &image, size_t &page_size,
   bool extract_texture_image(PTA_uchar &image, size_t &page_size,
-           Texture *tex, GLenum target, GLenum page_target,
-           Texture::ComponentType type,
-           Texture::CompressionMode compression, int n);
+                             Texture *tex, GLenum target,
+                             Texture::ComponentType type,
+                             Texture::CompressionMode compression, int n);
 
 
 #ifdef SUPPORT_FIXED_FUNCTION
 #ifdef SUPPORT_FIXED_FUNCTION
   void do_point_size();
   void do_point_size();
@@ -888,6 +894,7 @@ public:
   bool _supports_clear_buffer;
   bool _supports_clear_buffer;
 #ifndef OPENGLES
 #ifndef OPENGLES
   PFNGLCLEARBUFFERDATAPROC _glClearBufferData;
   PFNGLCLEARBUFFERDATAPROC _glClearBufferData;
+  PFNGLCLEARBUFFERSUBDATAPROC _glClearBufferSubData;
 #endif
 #endif
 
 
   PFNGLCOMPRESSEDTEXIMAGE1DPROC _glCompressedTexImage1D;
   PFNGLCOMPRESSEDTEXIMAGE1DPROC _glCompressedTexImage1D;
@@ -1024,6 +1031,12 @@ public:
   PFNGLGENERATETEXTUREMIPMAPPROC _glGenerateTextureMipmap;
   PFNGLGENERATETEXTUREMIPMAPPROC _glGenerateTextureMipmap;
   PFNGLBINDTEXTUREUNITPROC _glBindTextureUnit;
   PFNGLBINDTEXTUREUNITPROC _glBindTextureUnit;
   PFNGLMAPNAMEDBUFFERRANGEPROC _glMapNamedBufferRange;
   PFNGLMAPNAMEDBUFFERRANGEPROC _glMapNamedBufferRange;
+  PFNGLNAMEDBUFFERSUBDATAPROC _glNamedBufferSubData;
+  PFNGLCLEARNAMEDBUFFERSUBDATAPROC _glClearNamedBufferSubData;
+  PFNGLCOPYNAMEDBUFFERSUBDATAPROC _glCopyNamedBufferSubData;
+  PFNGLGETNAMEDBUFFERSUBDATAPROC _glGetNamedBufferSubData;
+#else
+  static const bool _supports_dsa = false;
 #endif
 #endif
 
 
 #ifndef OPENGLES_1
 #ifndef OPENGLES_1

+ 10 - 0
panda/src/glstuff/glIndexBufferContext_src.cxx

@@ -47,3 +47,13 @@ evict_lru() {
   update_data_size_bytes(0);
   update_data_size_bytes(0);
   mark_unloaded();
   mark_unloaded();
 }
 }
+
+/**
+ * Returns an implementation-defined handle or pointer that can be used
+ * to interface directly with the underlying API.
+ * Returns 0 if the underlying implementation does not support this.
+ */
+uint64_t CLP(IndexBufferContext)::
+get_native_id() const {
+  return _index;
+}

+ 2 - 0
panda/src/glstuff/glIndexBufferContext_src.h

@@ -27,6 +27,8 @@ public:
 
 
   virtual void evict_lru();
   virtual void evict_lru();
 
 
+  virtual uint64_t get_native_id() const;
+
   CLP(GraphicsStateGuardian) *_glgsg;
   CLP(GraphicsStateGuardian) *_glgsg;
 
 
   // This is the GL "name" of the data object.
   // This is the GL "name" of the data object.

+ 11 - 1
panda/src/glstuff/glSamplerContext_src.cxx

@@ -20,7 +20,7 @@ TypeHandle CLP(SamplerContext)::_type_handle;
 /**
 /**
  *
  *
  */
  */
-INLINE CLP(SamplerContext)::
+CLP(SamplerContext)::
 CLP(SamplerContext)(CLP(GraphicsStateGuardian) *glgsg,
 CLP(SamplerContext)(CLP(GraphicsStateGuardian) *glgsg,
   const SamplerState &sampler) :
   const SamplerState &sampler) :
   SamplerContext(sampler)
   SamplerContext(sampler)
@@ -68,4 +68,14 @@ reset_data() {
   // the sampler later.  glGenSamplers(1, &_index);
   // the sampler later.  glGenSamplers(1, &_index);
 }
 }
 
 
+/**
+ * Returns an implementation-defined handle or pointer that can be used
+ * to interface directly with the underlying API.
+ * Returns 0 if the underlying implementation does not support this.
+ */
+uint64_t CLP(SamplerContext)::
+get_native_id() const {
+  return _index;
+}
+
 #endif  // OPENGLES_1
 #endif  // OPENGLES_1

+ 4 - 2
panda/src/glstuff/glSamplerContext_src.h

@@ -25,14 +25,16 @@ class CLP(GraphicsStateGuardian);
  */
  */
 class EXPCL_GL CLP(SamplerContext) : public SamplerContext {
 class EXPCL_GL CLP(SamplerContext) : public SamplerContext {
 public:
 public:
-  INLINE CLP(SamplerContext)(CLP(GraphicsStateGuardian) *glgsg,
-                             const SamplerState &sampler);
+  CLP(SamplerContext)(CLP(GraphicsStateGuardian) *glgsg,
+                      const SamplerState &sampler);
   ALLOC_DELETED_CHAIN(CLP(SamplerContext));
   ALLOC_DELETED_CHAIN(CLP(SamplerContext));
 
 
   virtual ~CLP(SamplerContext)();
   virtual ~CLP(SamplerContext)();
   virtual void evict_lru();
   virtual void evict_lru();
   void reset_data();
   void reset_data();
 
 
+  virtual uint64_t get_native_id() const;
+
   // This is the GL "name" of the sampler object.
   // This is the GL "name" of the sampler object.
   GLuint _index;
   GLuint _index;
 
 

+ 10 - 0
panda/src/glstuff/glVertexBufferContext_src.cxx

@@ -47,3 +47,13 @@ evict_lru() {
   update_data_size_bytes(0);
   update_data_size_bytes(0);
   mark_unloaded();
   mark_unloaded();
 }
 }
+
+/**
+ * Returns an implementation-defined handle or pointer that can be used
+ * to interface directly with the underlying API.
+ * Returns 0 if the underlying implementation does not support this.
+ */
+uint64_t CLP(VertexBufferContext)::
+get_native_id() const {
+  return _index;
+}

+ 2 - 0
panda/src/glstuff/glVertexBufferContext_src.h

@@ -29,6 +29,8 @@ public:
 
 
   virtual void evict_lru();
   virtual void evict_lru();
 
 
+  virtual uint64_t get_native_id() const;
+
   CLP(GraphicsStateGuardian) *_glgsg;
   CLP(GraphicsStateGuardian) *_glgsg;
 
 
   // This is the GL "name" of the data object.
   // This is the GL "name" of the data object.

+ 4 - 1
panda/src/gobj/adaptiveLru.I

@@ -134,7 +134,10 @@ get_lru() const {
  */
  */
 INLINE void AdaptiveLruPage::
 INLINE void AdaptiveLruPage::
 dequeue_lru() {
 dequeue_lru() {
-  enqueue_lru(nullptr);
+  if (_lru != nullptr) {
+    _lru->do_remove_page(this);
+    _lru = nullptr;
+  }
 }
 }
 
 
 /**
 /**

+ 1 - 1
panda/src/gobj/geom.cxx

@@ -1373,7 +1373,7 @@ compute_internal_bounds(Geom::CData *cdata, Thread *current_thread) const {
     nassertv(!pmax.is_nan());
     nassertv(!pmax.is_nan());
 
 
     // Then we put the bounding volume around both of those points.
     // Then we put the bounding volume around both of those points.
-    PN_stdfloat avg_box_area;
+    PN_stdfloat avg_box_area {};
     switch (btype) {
     switch (btype) {
     case BoundingVolume::BT_best:
     case BoundingVolume::BT_best:
     case BoundingVolume::BT_fastest:
     case BoundingVolume::BT_fastest:

+ 15 - 0
panda/src/gobj/preparedGraphicsObjects.cxx

@@ -1380,6 +1380,15 @@ prepare_shader_buffer_now(ShaderBuffer *data, GraphicsStateGuardianBase *gsg) {
   return bc;
   return bc;
 }
 }
 
 
+/**
+ * Schedules the given function to be called during the next begin_frame().
+ */
+void PreparedGraphicsObjects::
+enqueue_call(EnqueuedCall &&call) {
+  ReMutexHolder holder(_lock);
+  _enqueued_calls.push_back(std::move(call));
+}
+
 /**
 /**
  * Creates a new future for the given object.
  * Creates a new future for the given object.
  */
  */
@@ -1601,6 +1610,12 @@ begin_frame(GraphicsStateGuardianBase *gsg, Thread *current_thread) {
   }
   }
 
 
   _enqueued_shader_buffers.clear();
   _enqueued_shader_buffers.clear();
+
+  for (EnqueuedCall &call : _enqueued_calls) {
+    std::move(call)(gsg);
+  }
+
+  _enqueued_calls.clear();
 }
 }
 
 
 /**
 /**

+ 11 - 0
panda/src/gobj/preparedGraphicsObjects.h

@@ -31,6 +31,10 @@
 #include "adaptiveLru.h"
 #include "adaptiveLru.h"
 #include "asyncFuture.h"
 #include "asyncFuture.h"
 
 
+#ifndef CPPPARSER
+#include <functional>
+#endif
+
 class TextureContext;
 class TextureContext;
 class SamplerContext;
 class SamplerContext;
 class GeomContext;
 class GeomContext;
@@ -160,6 +164,12 @@ PUBLISHED:
                             GraphicsStateGuardianBase *gsg);
                             GraphicsStateGuardianBase *gsg);
 
 
 public:
 public:
+#ifndef CPPPARSER
+  typedef std::function<void (GraphicsStateGuardianBase *gsg)> EnqueuedCall;
+
+  void enqueue_call(EnqueuedCall &&call);
+#endif
+
   /**
   /**
    * This is a handle to an enqueued object, from which the result can be
    * This is a handle to an enqueued object, from which the result can be
    * obtained upon completion.
    * obtained upon completion.
@@ -281,6 +291,7 @@ private:
   Buffers _prepared_shader_buffers;
   Buffers _prepared_shader_buffers;
   pvector<BufferContext *> _released_shader_buffers;
   pvector<BufferContext *> _released_shader_buffers;
   EnqueuedShaderBuffers _enqueued_shader_buffers;
   EnqueuedShaderBuffers _enqueued_shader_buffers;
+  pvector<EnqueuedCall> _enqueued_calls;
 
 
   BufferCache _vertex_buffer_cache;
   BufferCache _vertex_buffer_cache;
   BufferCacheLRU _vertex_buffer_cache_lru;
   BufferCacheLRU _vertex_buffer_cache_lru;

+ 18 - 16
panda/src/gobj/pythonTexturePoolFilter.cxx

@@ -101,17 +101,19 @@ pre_load(const Filename &orig_filename, const Filename &orig_alpha_filename,
 #endif
 #endif
 
 
   // Wrap the arguments.
   // Wrap the arguments.
-  PyObject *args = Py_BuildValue("(OOiiNO)",
-    DTool_CreatePyInstance((void *)&orig_filename, Dtool_Filename, false, true),
-    DTool_CreatePyInstance((void *)&orig_alpha_filename, Dtool_Filename, false, true),
-    primary_file_num_channels,
-    alpha_file_channel,
-    PyBool_FromLong(read_mipmaps),
-    DTool_CreatePyInstance((void *)&options, Dtool_LoaderOptions, false, true)
-  );
-
-  PyObject *result = PythonThread::call_python_func(_pre_load_func, args);
-  Py_DECREF(args);
+  PyObject *args[7];
+  args[1] = DTool_CreatePyInstance((void *)&orig_filename, Dtool_Filename, false, true);
+  args[2] = DTool_CreatePyInstance((void *)&orig_alpha_filename, Dtool_Filename, false, true);
+  args[3] = PyLong_FromLong(primary_file_num_channels);
+  args[4] = PyLong_FromLong(alpha_file_channel);
+  args[5] = PyBool_FromLong(read_mipmaps);
+  args[6] = DTool_CreatePyInstance((void *)&options, Dtool_LoaderOptions, false, true);
+
+  PyObject *result = PythonThread::call_python_func(_pre_load_func, args + 1, 6 | PY_VECTORCALL_ARGUMENTS_OFFSET);
+
+  for (size_t i = 1; i < 7; ++i) {
+    Py_DECREF(args[i]);
+  }
 
 
   PT(Texture) tex;
   PT(Texture) tex;
   if (result != nullptr) {
   if (result != nullptr) {
@@ -162,11 +164,11 @@ post_load(Texture *tex) {
 #endif
 #endif
 
 
   // Wrap the arguments.
   // Wrap the arguments.
-  PyObject *args = PyTuple_Pack(1,
-    DTool_CreatePyInstance(tex, Dtool_Texture, true, false)
-  );
-  PyObject *result = PythonThread::call_python_func(_post_load_func, args);
-  Py_DECREF(args);
+  PyObject *args[2];
+  args[1] = DTool_CreatePyInstance(tex, Dtool_Texture, true, false);
+
+  PyObject *result = PythonThread::call_python_func(_post_load_func, args + 1, 1 | PY_VECTORCALL_ARGUMENTS_OFFSET);
+  Py_DECREF(args[1]);
 
 
   PT(Texture) result_tex;
   PT(Texture) result_tex;
   if (result != nullptr) {
   if (result != nullptr) {

+ 10 - 0
panda/src/gobj/savedContext.cxx

@@ -31,3 +31,13 @@ void SavedContext::
 write(std::ostream &out, int indent_level) const {
 write(std::ostream &out, int indent_level) const {
   indent(out, indent_level) << *this << "\n";
   indent(out, indent_level) << *this << "\n";
 }
 }
+
+/**
+ * Returns an implementation-defined handle or pointer that can be used
+ * to interface directly with the underlying API.
+ * Returns 0 if the underlying implementation does not support this.
+ */
+uint64_t SavedContext::
+get_native_id() const {
+  return 0;
+}

+ 3 - 1
panda/src/gobj/savedContext.h

@@ -31,10 +31,12 @@ public:
   virtual void write(std::ostream &out, int indent_level) const;
   virtual void write(std::ostream &out, int indent_level) const;
 
 
 PUBLISHED:
 PUBLISHED:
+  virtual uint64_t get_native_id() const;
+
+public:
   static TypeHandle get_class_type() {
   static TypeHandle get_class_type() {
     return _type_handle;
     return _type_handle;
   }
   }
-public:
   static void init_type() {
   static void init_type() {
     TypedObject::init_type();
     TypedObject::init_type();
     register_type(_type_handle, "SavedContext",
     register_type(_type_handle, "SavedContext",

+ 20 - 0
panda/src/gobj/shaderBuffer.cxx

@@ -32,6 +32,26 @@ output(std::ostream &out) const {
   out << "buffer " << get_name() << ", " << _data_size_bytes << "B, " << _usage_hint;
   out << "buffer " << get_name() << ", " << _data_size_bytes << "B, " << _usage_hint;
 }
 }
 
 
+/**
+ * Asynchronously requests to read the given subset of the data on the CPU.
+ */
+PT(AsyncFuture) ShaderBuffer::
+extract_data(PreparedGraphicsObjects *prepared_objects, size_t start, size_t size) {
+  PT(AsyncFuture) future = new AsyncFuture;
+  PT(ParamBytes) bytes = new ParamBytes(vector_uchar());
+
+  prepared_objects->enqueue_call([=, bytes = std::move(bytes)] (GraphicsStateGuardianBase *gsg) {
+    gsg->async_extract_shader_buffer_data(this, (vector_uchar &)bytes->get_value(), start, size, [future = std::move(future), bytes = std::move(bytes)] (bool success) {
+      if (success) {
+        future->set_result(bytes);
+      } else {
+        future->set_result(nullptr);
+      }
+    });
+  });
+  return future;
+}
+
 /**
 /**
  * Indicates that the data should be enqueued to be prepared in the indicated
  * Indicates that the data should be enqueued to be prepared in the indicated
  * prepared_objects at the beginning of the next frame.  This will ensure the
  * prepared_objects at the beginning of the next frame.  This will ensure the

+ 4 - 0
panda/src/gobj/shaderBuffer.h

@@ -21,6 +21,7 @@
 #include "graphicsStateGuardianBase.h"
 #include "graphicsStateGuardianBase.h"
 #include "factoryParams.h"
 #include "factoryParams.h"
 #include "vector_uchar.h"
 #include "vector_uchar.h"
+#include "asyncFuture.h"
 
 
 class BufferContext;
 class BufferContext;
 class PreparedGraphicsObjects;
 class PreparedGraphicsObjects;
@@ -51,6 +52,9 @@ PUBLISHED:
   MAKE_PROPERTY(data_size_bytes, get_data_size_bytes);
   MAKE_PROPERTY(data_size_bytes, get_data_size_bytes);
   MAKE_PROPERTY(usage_hint, get_usage_hint);
   MAKE_PROPERTY(usage_hint, get_usage_hint);
 
 
+  PT(AsyncFuture) extract_data(PreparedGraphicsObjects *prepared_objects,
+                               size_t start = 0, size_t size = (size_t)-1);
+
   void prepare(PreparedGraphicsObjects *prepared_objects);
   void prepare(PreparedGraphicsObjects *prepared_objects);
   bool is_prepared(PreparedGraphicsObjects *prepared_objects) const;
   bool is_prepared(PreparedGraphicsObjects *prepared_objects) const;
 
 

+ 5 - 5
panda/src/gobj/texture.cxx

@@ -7613,7 +7613,7 @@ get_ram_image_as(const string &requested_format) {
       // If the image size wasn't a multiple of 4, we may have a handful of
       // If the image size wasn't a multiple of 4, we may have a handful of
       // pixels left over.  Convert those the slower way.
       // pixels left over.  Convert those the slower way.
       uint8_t *tail = (uint8_t *)dst;
       uint8_t *tail = (uint8_t *)dst;
-      for (int i = (imgsize & ~0x3); i < imgsize; ++i) {
+      for (size_t i = (imgsize & ~0x3); i < imgsize; ++i) {
         uint32_t v = *src++;
         uint32_t v = *src++;
         *tail++ = (v & 0x00ff0000u) >> 16;
         *tail++ = (v & 0x00ff0000u) >> 16;
         *tail++ = (v & 0x0000ff00u) >> 8;
         *tail++ = (v & 0x0000ff00u) >> 8;
@@ -7627,8 +7627,8 @@ get_ram_image_as(const string &requested_format) {
 
 
       // Convert blocks of 4 pixels at a time, so that we can treat both the
       // Convert blocks of 4 pixels at a time, so that we can treat both the
       // source and destination as 32-bit integers.
       // source and destination as 32-bit integers.
-      int blocks = imgsize >> 2;
-      for (int i = 0; i < blocks; ++i) {
+      size_t blocks = imgsize >> 2;
+      for (size_t i = 0; i < blocks; ++i) {
         uint32_t v0 = *src++;
         uint32_t v0 = *src++;
         uint32_t v1 = *src++;
         uint32_t v1 = *src++;
         uint32_t v2 = *src++;
         uint32_t v2 = *src++;
@@ -7641,7 +7641,7 @@ get_ram_image_as(const string &requested_format) {
       // If the image size wasn't a multiple of 4, we may have a handful of
       // If the image size wasn't a multiple of 4, we may have a handful of
       // pixels left over.  Convert those the slower way.
       // pixels left over.  Convert those the slower way.
       uint8_t *tail = (uint8_t *)dst;
       uint8_t *tail = (uint8_t *)dst;
-      for (int i = (imgsize & ~0x3); i < imgsize; ++i) {
+      for (size_t i = (imgsize & ~0x3); i < imgsize; ++i) {
         uint32_t v = *src++;
         uint32_t v = *src++;
         *tail++ = (v & 0x000000ffu);
         *tail++ = (v & 0x000000ffu);
         *tail++ = (v & 0x0000ff00u) >> 8;
         *tail++ = (v & 0x0000ff00u) >> 8;
@@ -7653,7 +7653,7 @@ get_ram_image_as(const string &requested_format) {
     uint8_t *dst = (uint8_t *)newdata.p();
     uint8_t *dst = (uint8_t *)newdata.p();
 
 
     if (format == "RGB" && cdata->_num_components == 3) {
     if (format == "RGB" && cdata->_num_components == 3) {
-      for (int i = 0; i < imgsize; ++i) {
+      for (size_t i = 0; i < imgsize; ++i) {
         *dst++ = src[2];
         *dst++ = src[2];
         *dst++ = src[1];
         *dst++ = src[1];
         *dst++ = src[0];
         *dst++ = src[0];

+ 0 - 10
panda/src/gobj/textureContext.cxx

@@ -15,16 +15,6 @@
 
 
 TypeHandle TextureContext::_type_handle;
 TypeHandle TextureContext::_type_handle;
 
 
-/**
- * Returns an implementation-defined handle or pointer that can be used
- * to interface directly with the underlying API.
- * Returns 0 if the underlying implementation does not support this.
- */
-uint64_t TextureContext::
-get_native_id() const {
-  return 0;
-}
-
 /**
 /**
  * Similar to get_native_id, but some implementations use a separate
  * Similar to get_native_id, but some implementations use a separate
  * identifier for the buffer object associated with buffer textures.
  * identifier for the buffer object associated with buffer textures.

+ 2 - 1
panda/src/gobj/textureContext.h

@@ -37,7 +37,8 @@ public:
 PUBLISHED:
 PUBLISHED:
   INLINE Texture *get_texture() const;
   INLINE Texture *get_texture() const;
   INLINE int get_view() const;
   INLINE int get_view() const;
-  virtual uint64_t get_native_id() const;
+
+  //virtual uint64_t get_native_id() const;
   virtual uint64_t get_native_buffer_id() const;
   virtual uint64_t get_native_buffer_id() const;
 
 
   INLINE bool was_modified() const;
   INLINE bool was_modified() const;

+ 5 - 0
panda/src/gsgbase/graphicsStateGuardianBase.h

@@ -23,6 +23,7 @@
 #include "patomic.h"
 #include "patomic.h"
 #include "small_vector.h"
 #include "small_vector.h"
 #include "completionToken.h"
 #include "completionToken.h"
+#include "vector_uchar.h"
 
 
 // A handful of forward references.
 // A handful of forward references.
 
 
@@ -32,6 +33,7 @@ class GraphicsWindow;
 class NodePath;
 class NodePath;
 class GraphicsOutputBase;
 class GraphicsOutputBase;
 class ScreenshotRequest;
 class ScreenshotRequest;
+class AsyncFuture;
 
 
 class VertexBufferContext;
 class VertexBufferContext;
 class IndexBufferContext;
 class IndexBufferContext;
@@ -176,6 +178,9 @@ public:
   virtual BufferContext *prepare_shader_buffer(ShaderBuffer *data)=0;
   virtual BufferContext *prepare_shader_buffer(ShaderBuffer *data)=0;
   virtual void release_shader_buffer(BufferContext *ibc)=0;
   virtual void release_shader_buffer(BufferContext *ibc)=0;
   virtual void release_shader_buffers(const pvector<BufferContext *> &contexts)=0;
   virtual void release_shader_buffers(const pvector<BufferContext *> &contexts)=0;
+  virtual void async_extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data,
+                                                size_t start = 0, size_t size = (size_t)-1,
+                                                CompletionToken token = CompletionToken())=0;
 
 
   virtual void dispatch_compute(int size_x, int size_y, int size_z)=0;
   virtual void dispatch_compute(int size_x, int size_y, int size_z)=0;
 
 

+ 12015 - 2339
panda/src/movies/dr_flac.h

@@ -1,2978 +1,12654 @@
-// Public domain. See "unlicense" statement at the end of this file.
-//NB: modified by rdb to use 16-bit instead of 32-bit samples.
-
-// ABOUT
-//
-// This is a simple library for decoding FLAC files.
-//
-//
-//
-// USAGE
-//
-// This is a single-file library. To use it, do something like the following in one .c file.
-//   #define DR_FLAC_IMPLEMENTATION
-//   #include "dr_flac.h"
-//
-// You can then #include this file in other parts of the program as you would with any other header file. To decode audio data,
-// do something like the following:
-//
-//     drflac* pFlac = drflac_open_file("MySong.flac");
-//     if (pFlac == NULL) {
-//         ... Failed to open FLAC file ...
-//     }
-//
-//     int16_t* pSamples = malloc(pFlac->totalSampleCount * sizeof(int16_t));
-//     uint64_t numberOfSamplesActuallyRead = drflac_read_s16(pFlac, pFlac->totalSampleCount, pSamples);
-//
-//     ... pSamples now contains the decoded samples as interleaved signed 16-bit PCM ...
-//
-// The drflac object represents the decoder. It is a transparent type so all the information you need, such as the number of
-// channels and the bits per sample, should be directly accessible - just make sure you don't change their values.
-//
-// You do not need to decode the entire stream in one go - you just specify how many samples you'd like at any given time and
-// the decoder will give you as many samples as it can, up to the amount requested. Later on when you need the next batch of
-// samples, just call it again. Example:
-//
-//     while (drflac_read_s16(pFlac, chunkSize, pChunkSamples) > 0) {
-//         do_something();
-//     }
-//
-// You can seek to a specific sample with drflac_seek_to_sample(). The given sample is based on interleaving. So for example,
-// if you were to seek to the sample at index 0 in a stereo stream, you'll be seeking to the first sample of the left channel.
-// The sample at index 1 will be the first sample of the right channel. The sample at index 2 will be the second sample of the
-// left channel, etc.
-//
-//
-//
-// OPTIONS
-// #define these options before including this file.
-//
-// #define DR_FLAC_NO_STDIO
-//   Disable drflac_open_file().
-//
-// #define DR_FLAC_NO_WIN32_IO
-//   Don't use the Win32 API internally for drflac_open_file(). Setting this will force stdio FILE APIs instead. This is
-//   mainly for testing, but it's left here in case somebody might find use for it. dr_flac will use the Win32 API by
-//   default. Ignored when DR_FLAC_NO_STDIO is #defined.
-//
-// #define DR_FLAC_BUFFER_SIZE <number>
-//   Defines the size of the internal buffer to store data from onRead(). This buffer is used to reduce the number of calls
-//   back to the client for more data. Larger values means more memory, but better performance. My tests show diminishing
-//   returns after about 4KB (which is the default). Consider reducing this if you have a very efficient implementation of
-//   onRead(), or increase it if it's very inefficient.
-//
-//
-//
-// QUICK NOTES
-//
-// - Based on my own tests, the 32-bit build is about about 1.1x-1.25x slower than the reference implementation. The 64-bit
-//   build is at about parity.
-// - This should work fine with valid native FLAC files, but it won't work very well when the STREAMINFO block is unavailable
-//   and when a stream starts in the middle of a frame. This is something I plan on addressing.
-// - Audio data is retrieved as signed 16-bit PCM, regardless of the bits per sample the FLAC stream is encoded as.
-// - This has not been tested on big-endian architectures.
-// - Rice codes in unencoded binary form (see https://xiph.org/flac/format.html#rice_partition) has not been tested. If anybody
-//   knows where I can find some test files for this, let me know.
-// - Perverse and erroneous files have not been tested. Again, if you know where I can get some test files let me know.
-// - dr_flac is not thread-safe, but it's APIs can be called from any thread so long as you do your own synchronization.
-// - dr_flac does not currently do any CRC checks.
-// - Ogg encapsulation is not supported, but I want to add it at some point.
-//
-//
-//
-// TODO
-// - Implement a proper test suite.
-// - Add support for initializing the decoder without a STREAMINFO block. Build a synthethic test to get support working at at least
-//   a basic level.
-// - Add support for retrieving metadata blocks so applications can retrieve the album art or whatnot.
-// - Add support for Ogg encapsulation.
+/*
+FLAC audio decoder. Choice of public domain or MIT-0. See license statements at the end of this file.
+dr_flac - v0.13.2 - 2025-12-02
 
 
-#ifndef dr_flac_h
-#define dr_flac_h
+David Reid - [email protected]
 
 
-#include <stddef.h>
-#include <stdint.h>
-//#include <stdbool.h>
+GitHub: https://github.com/mackron/dr_libs
+*/
 
 
-// As data is read from the client it is placed into an internal buffer for fast access. This controls the
-// size of that buffer. Larger values means more speed, but also more memory. In my testing there is diminishing
-// returns after about 4KB, but you can fiddle with this to suit your own needs. Must be a multiple of 8.
-#ifndef DR_FLAC_BUFFER_SIZE
-#define DR_FLAC_BUFFER_SIZE   4096
-#endif
+/*
+Introduction
+============
+dr_flac is a single file library. To use it, do something like the following in one .c file.
 
 
-// Check if we can enable 64-bit optimizations.
-#if defined(_WIN64)
-#define DRFLAC_64BIT
-#endif
+    ```c
+    #define DR_FLAC_IMPLEMENTATION
+    #include "dr_flac.h"
+    ```
 
 
-#if defined(__GNUC__)
-#if defined(__x86_64__) || defined(__ppc64__)
-#define DRFLAC_64BIT
-#endif
-#endif
+You can then #include this file in other parts of the program as you would with any other header file. To decode audio data, do something like the following:
 
 
-#ifdef DRFLAC_64BIT
-typedef uint64_t drflac_cache_t;
-#else
-typedef uint32_t drflac_cache_t;
-#endif
+    ```c
+    drflac* pFlac = drflac_open_file("MySong.flac", NULL);
+    if (pFlac == NULL) {
+        // Failed to open FLAC file
+    }
 
 
+    drflac_int32* pSamples = malloc(pFlac->totalPCMFrameCount * pFlac->channels * sizeof(drflac_int32));
+    drflac_uint64 numberOfInterleavedSamplesActuallyRead = drflac_read_pcm_frames_s32(pFlac, pFlac->totalPCMFrameCount, pSamples);
+    ```
 
 
+The drflac object represents the decoder. It is a transparent type so all the information you need, such as the number of channels and the bits per sample,
+should be directly accessible - just make sure you don't change their values. Samples are always output as interleaved signed 32-bit PCM. In the example above
+a native FLAC stream was opened, however dr_flac has seamless support for Ogg encapsulated FLAC streams as well.
 
 
-// Callback for when data is read. Return value is the number of bytes actually read.
-typedef size_t (* drflac_read_proc)(void* userData, void* bufferOut, size_t bytesToRead);
+You do not need to decode the entire stream in one go - you just specify how many samples you'd like at any given time and the decoder will give you as many
+samples as it can, up to the amount requested. Later on when you need the next batch of samples, just call it again. Example:
 
 
-// Callback for when data needs to be seeked. Offset is always relative to the current position. Return value is false on failure, true success.
-typedef bool (* drflac_seek_proc)(void* userData, int offset);
+    ```c
+    while (drflac_read_pcm_frames_s32(pFlac, chunkSizeInPCMFrames, pChunkSamples) > 0) {
+        do_something();
+    }
+    ```
 
 
+You can seek to a specific PCM frame with `drflac_seek_to_pcm_frame()`.
 
 
-typedef struct
-{
-    // The absolute position of the first byte of the data of the block. This is just past the block's header.
-    long long pos;
+If you just want to quickly decode an entire FLAC file in one go you can do something like this:
 
 
-    // The size in bytes of the block's data.
-    unsigned int sizeInBytes;
+    ```c
+    unsigned int channels;
+    unsigned int sampleRate;
+    drflac_uint64 totalPCMFrameCount;
+    drflac_int32* pSampleData = drflac_open_file_and_read_pcm_frames_s32("MySong.flac", &channels, &sampleRate, &totalPCMFrameCount, NULL);
+    if (pSampleData == NULL) {
+        // Failed to open and decode FLAC file.
+    }
 
 
-} drflac_block;
+    ...
 
 
-typedef struct
-{
-    // The type of the subframe: SUBFRAME_CONSTANT, SUBFRAME_VERBATIM, SUBFRAME_FIXED or SUBFRAME_LPC.
-    unsigned char subframeType;
+    drflac_free(pSampleData, NULL);
+    ```
 
 
-    // The number of wasted bits per sample as specified by the sub-frame header.
-    unsigned char wastedBitsPerSample;
+You can read samples as signed 16-bit integer and 32-bit floating-point PCM with the *_s16() and *_f32() family of APIs respectively, but note that these
+should be considered lossy.
 
 
-    // The order to use for the prediction stage for SUBFRAME_FIXED and SUBFRAME_LPC.
-    unsigned char lpcOrder;
 
 
-    // The number of bits per sample for this subframe. This is not always equal to the current frame's bit per sample because
-    // an extra bit is required for side channels when interchannel decorrelation is being used.
-    int bitsPerSample;
+If you need access to metadata (album art, etc.), use `drflac_open_with_metadata()`, `drflac_open_file_with_metdata()` or `drflac_open_memory_with_metadata()`.
+The rationale for keeping these APIs separate is that they're slightly slower than the normal versions and also just a little bit harder to use. dr_flac
+reports metadata to the application through the use of a callback, and every metadata block is reported before `drflac_open_with_metdata()` returns.
 
 
-    // A pointer to the buffer containing the decoded samples in the subframe. This pointer is an offset from drflac::pHeap, or
-    // NULL if the heap is not being used. Note that it's a signed 32-bit integer for each value.
-    int32_t* pDecodedSamples;
+The main opening APIs (`drflac_open()`, etc.) will fail if the header is not present. The presents a problem in certain scenarios such as broadcast style
+streams or internet radio where the header may not be present because the user has started playback mid-stream. To handle this, use the relaxed APIs:
 
 
-} drflac_subframe;
+    `drflac_open_relaxed()`
+    `drflac_open_with_metadata_relaxed()`
 
 
-typedef struct
-{
-    // If the stream uses variable block sizes, this will be set to the index of the first sample. If fixed block sizes are used, this will
-    // always be set to 0.
-    unsigned long long sampleNumber;
+It is not recommended to use these APIs for file based streams because a missing header would usually indicate a corrupt or perverse file. In addition, these
+APIs can take a long time to initialize because they may need to spend a lot of time finding the first frame.
 
 
-    // If the stream uses fixed block sizes, this will be set to the frame number. If variable block sizes are used, this will always be 0.
-    unsigned int frameNumber;
 
 
-    // The sample rate of this frame.
-    unsigned int sampleRate;
 
 
-    // The number of samples in each sub-frame within this frame.
-    unsigned short blockSize;
+Build Options
+=============
+#define these options before including this file.
 
 
-    // The channel assignment of this frame. This is not always set to the channel count. If interchannel decorrelation is being used this
-    // will be set to DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE, DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE or DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE.
-    unsigned char channelAssignment;
+#define DR_FLAC_NO_STDIO
+  Disable `drflac_open_file()` and family.
 
 
-    // The number of bits per sample within this frame.
-    unsigned char bitsPerSample;
+#define DR_FLAC_NO_OGG
+  Disables support for Ogg/FLAC streams.
 
 
-    // The frame's CRC. This is set, but unused at the moment.
-    unsigned char crc8;
+#define DR_FLAC_BUFFER_SIZE <number>
+  Defines the size of the internal buffer to store data from onRead(). This buffer is used to reduce the number of calls back to the client for more data.
+  Larger values means more memory, but better performance. My tests show diminishing returns after about 4KB (which is the default). Consider reducing this if
+  you have a very efficient implementation of onRead(), or increase it if it's very inefficient. Must be a multiple of 8.
 
 
-    // The number of samples left to be read in this frame. This is initially set to the block size multiplied by the channel count. As samples
-    // are read, this will be decremented. When it reaches 0, the decoder will see this frame as fully consumed and load the next frame.
-    unsigned int samplesRemaining;
+#define DR_FLAC_NO_CRC
+  Disables CRC checks. This will offer a performance boost when CRC is unnecessary. This will disable binary search seeking. When seeking, the seek table will
+  be used if available. Otherwise the seek will be performed using brute force.
 
 
-    // The list of sub-frames within the frame. There is one sub-frame for each channel, and there's a maximum of 8 channels.
-    drflac_subframe subframes[8];
+#define DR_FLAC_NO_SIMD
+  Disables SIMD optimizations (SSE on x86/x64 architectures, NEON on ARM architectures). Use this if you are having compatibility issues with your compiler.
+
+#define DR_FLAC_NO_WCHAR
+  Disables all functions ending with `_w`. Use this if your compiler does not provide wchar.h. Not required if DR_FLAC_NO_STDIO is also defined.
 
 
-} drflac_frame;
 
 
+
+Notes
+=====
+- dr_flac does not support changing the sample rate nor channel count mid stream.
+- dr_flac is not thread-safe, but its APIs can be called from any thread so long as you do your own synchronization.
+- When using Ogg encapsulation, a corrupted metadata block will result in `drflac_open_with_metadata()` and `drflac_open()` returning inconsistent samples due
+  to differences in corrupted stream recorvery logic between the two APIs.
+*/
+
+#ifndef dr_flac_h
+#define dr_flac_h
+
+#if defined(CPPPARSER)
+    #define DR_FLAC_NO_SIMD
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DRFLAC_STRINGIFY(x)      #x
+#define DRFLAC_XSTRINGIFY(x)     DRFLAC_STRINGIFY(x)
+
+#define DRFLAC_VERSION_MAJOR     0
+#define DRFLAC_VERSION_MINOR     13
+#define DRFLAC_VERSION_REVISION  2
+#define DRFLAC_VERSION_STRING    DRFLAC_XSTRINGIFY(DRFLAC_VERSION_MAJOR) "." DRFLAC_XSTRINGIFY(DRFLAC_VERSION_MINOR) "." DRFLAC_XSTRINGIFY(DRFLAC_VERSION_REVISION)
+
+#include <stddef.h> /* For size_t. */
+
+/* Sized Types */
+typedef   signed char           drflac_int8;
+typedef unsigned char           drflac_uint8;
+typedef   signed short          drflac_int16;
+typedef unsigned short          drflac_uint16;
+typedef   signed int            drflac_int32;
+typedef unsigned int            drflac_uint32;
+/* Panda3D: Added !defined(CPPPARSER) so that Interrogate can parse these types correctly. */
+#if defined(_MSC_VER) && !defined(__clang__) && !defined(CPPPARSER)
+    typedef   signed __int64    drflac_int64;
+    typedef unsigned __int64    drflac_uint64;
+#else
+    #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+        #pragma GCC diagnostic push
+        #pragma GCC diagnostic ignored "-Wlong-long"
+        #if defined(__clang__)
+            #pragma GCC diagnostic ignored "-Wc++11-long-long"
+        #endif
+    #endif
+    typedef   signed long long  drflac_int64;
+    typedef unsigned long long  drflac_uint64;
+    #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+        #pragma GCC diagnostic pop
+    #endif
+#endif
+#if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined(_M_IA64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
+    typedef drflac_uint64       drflac_uintptr;
+#else
+    typedef drflac_uint32       drflac_uintptr;
+#endif
+typedef drflac_uint8            drflac_bool8;
+typedef drflac_uint32           drflac_bool32;
+#define DRFLAC_TRUE             1
+#define DRFLAC_FALSE            0
+/* End Sized Types */
+
+/* Decorations */
+#if !defined(DRFLAC_API)
+    #if defined(DRFLAC_DLL)
+        #if defined(_WIN32)
+            #define DRFLAC_DLL_IMPORT  __declspec(dllimport)
+            #define DRFLAC_DLL_EXPORT  __declspec(dllexport)
+            #define DRFLAC_DLL_PRIVATE static
+        #else
+            #if defined(__GNUC__) && __GNUC__ >= 4
+                #define DRFLAC_DLL_IMPORT  __attribute__((visibility("default")))
+                #define DRFLAC_DLL_EXPORT  __attribute__((visibility("default")))
+                #define DRFLAC_DLL_PRIVATE __attribute__((visibility("hidden")))
+            #else
+                #define DRFLAC_DLL_IMPORT
+                #define DRFLAC_DLL_EXPORT
+                #define DRFLAC_DLL_PRIVATE static
+            #endif
+        #endif
+
+        #if defined(DR_FLAC_IMPLEMENTATION) || defined(DRFLAC_IMPLEMENTATION)
+            #define DRFLAC_API  DRFLAC_DLL_EXPORT
+        #else
+            #define DRFLAC_API  DRFLAC_DLL_IMPORT
+        #endif
+        #define DRFLAC_PRIVATE DRFLAC_DLL_PRIVATE
+    #else
+        #define DRFLAC_API extern
+        #define DRFLAC_PRIVATE static
+    #endif
+#endif
+/* End Decorations */
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700   /* Visual Studio 2012 */
+    #define DRFLAC_DEPRECATED       __declspec(deprecated)
+#elif (defined(__GNUC__) && __GNUC__ >= 4)  /* GCC 4 */
+    #define DRFLAC_DEPRECATED       __attribute__((deprecated))
+#elif defined(__has_feature)                /* Clang */
+    #if __has_feature(attribute_deprecated)
+        #define DRFLAC_DEPRECATED   __attribute__((deprecated))
+    #else
+        #define DRFLAC_DEPRECATED
+    #endif
+#else
+    #define DRFLAC_DEPRECATED
+#endif
+
+DRFLAC_API void drflac_version(drflac_uint32* pMajor, drflac_uint32* pMinor, drflac_uint32* pRevision);
+DRFLAC_API const char* drflac_version_string(void);
+
+/* Allocation Callbacks */
 typedef struct
 typedef struct
 {
 {
-    // The function to call when more data needs to be read. This is set by drflac_open().
-    drflac_read_proc onRead;
+    void* pUserData;
+    void* (* onMalloc)(size_t sz, void* pUserData);
+    void* (* onRealloc)(void* p, size_t sz, void* pUserData);
+    void  (* onFree)(void* p, void* pUserData);
+} drflac_allocation_callbacks;
+/* End Allocation Callbacks */
 
 
-    // The function to call when the current read position needs to be moved.
-    drflac_seek_proc onSeek;
+/*
+As data is read from the client it is placed into an internal buffer for fast access. This controls the size of that buffer. Larger values means more speed,
+but also more memory. In my testing there is diminishing returns after about 4KB, but you can fiddle with this to suit your own needs. Must be a multiple of 8.
+*/
+#ifndef DR_FLAC_BUFFER_SIZE
+#define DR_FLAC_BUFFER_SIZE   4096
+#endif
 
 
-    // The user data to pass around to onRead and onSeek.
-    void* pUserData;
 
 
+/* Architecture Detection */
+#if defined(_WIN64) || defined(_LP64) || defined(__LP64__)
+#define DRFLAC_64BIT
+#endif
 
 
-    // The sample rate. Will be set to something like 44100.
-    unsigned int sampleRate;
+#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC))
+    #define DRFLAC_X64
+#elif defined(__i386) || defined(_M_IX86)
+    #define DRFLAC_X86
+#elif defined(__arm__) || defined(_M_ARM) || defined(__arm64) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+    #define DRFLAC_ARM
+#endif
+/* End Architecture Detection */
 
 
-    // The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. Maximum 8. This is set based on the
-    // value specified in the STREAMINFO block.
-    unsigned char channels;
 
 
-    // The bits per sample. Will be set to somthing like 16, 24, etc.
-    unsigned char bitsPerSample;
+#ifdef DRFLAC_64BIT
+typedef drflac_uint64 drflac_cache_t;
+#else
+typedef drflac_uint32 drflac_cache_t;
+#endif
 
 
-    // The maximum block size, in samples. This number represents the number of samples in each channel (not combined).
-    unsigned short maxBlockSize;
+/* The various metadata block types. */
+#define DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO       0
+#define DRFLAC_METADATA_BLOCK_TYPE_PADDING          1
+#define DRFLAC_METADATA_BLOCK_TYPE_APPLICATION      2
+#define DRFLAC_METADATA_BLOCK_TYPE_SEEKTABLE        3
+#define DRFLAC_METADATA_BLOCK_TYPE_VORBIS_COMMENT   4
+#define DRFLAC_METADATA_BLOCK_TYPE_CUESHEET         5
+#define DRFLAC_METADATA_BLOCK_TYPE_PICTURE          6
+#define DRFLAC_METADATA_BLOCK_TYPE_INVALID          127
+
+/* The various picture types specified in the PICTURE block. */
+#define DRFLAC_PICTURE_TYPE_OTHER                   0
+#define DRFLAC_PICTURE_TYPE_FILE_ICON               1
+#define DRFLAC_PICTURE_TYPE_OTHER_FILE_ICON         2
+#define DRFLAC_PICTURE_TYPE_COVER_FRONT             3
+#define DRFLAC_PICTURE_TYPE_COVER_BACK              4
+#define DRFLAC_PICTURE_TYPE_LEAFLET_PAGE            5
+#define DRFLAC_PICTURE_TYPE_MEDIA                   6
+#define DRFLAC_PICTURE_TYPE_LEAD_ARTIST             7
+#define DRFLAC_PICTURE_TYPE_ARTIST                  8
+#define DRFLAC_PICTURE_TYPE_CONDUCTOR               9
+#define DRFLAC_PICTURE_TYPE_BAND                    10
+#define DRFLAC_PICTURE_TYPE_COMPOSER                11
+#define DRFLAC_PICTURE_TYPE_LYRICIST                12
+#define DRFLAC_PICTURE_TYPE_RECORDING_LOCATION      13
+#define DRFLAC_PICTURE_TYPE_DURING_RECORDING        14
+#define DRFLAC_PICTURE_TYPE_DURING_PERFORMANCE      15
+#define DRFLAC_PICTURE_TYPE_SCREEN_CAPTURE          16
+#define DRFLAC_PICTURE_TYPE_BRIGHT_COLORED_FISH     17
+#define DRFLAC_PICTURE_TYPE_ILLUSTRATION            18
+#define DRFLAC_PICTURE_TYPE_BAND_LOGOTYPE           19
+#define DRFLAC_PICTURE_TYPE_PUBLISHER_LOGOTYPE      20
+
+typedef enum
+{
+    drflac_container_native,
+    drflac_container_ogg,
+    drflac_container_unknown
+} drflac_container;
 
 
-    // The total number of samples making up the stream. This includes every channel. For example, if the stream has 2 channels,
-    // with each channel having a total of 4096, this value will be set to 2*4096 = 8192.
-    uint64_t totalSampleCount;
+typedef enum
+{
+    DRFLAC_SEEK_SET,
+    DRFLAC_SEEK_CUR,
+    DRFLAC_SEEK_END
+} drflac_seek_origin;
 
 
+/* The order of members in this structure is important because we map this directly to the raw data within the SEEKTABLE metadata block. */
+typedef struct
+{
+    drflac_uint64 firstPCMFrame;
+    drflac_uint64 flacFrameOffset;   /* The offset from the first byte of the header of the first frame. */
+    drflac_uint16 pcmFrameCount;
+} drflac_seekpoint;
 
 
-    // The location and size of the APPLICATION block.
-    drflac_block applicationBlock;
+typedef struct
+{
+    drflac_uint16 minBlockSizeInPCMFrames;
+    drflac_uint16 maxBlockSizeInPCMFrames;
+    drflac_uint32 minFrameSizeInPCMFrames;
+    drflac_uint32 maxFrameSizeInPCMFrames;
+    drflac_uint32 sampleRate;
+    drflac_uint8  channels;
+    drflac_uint8  bitsPerSample;
+    drflac_uint64 totalPCMFrameCount;
+    drflac_uint8  md5[16];
+} drflac_streaminfo;
 
 
-    // The location and size of the SEEKTABLE block.
-    drflac_block seektableBlock;
+typedef struct
+{
+    /*
+    The metadata type. Use this to know how to interpret the data below. Will be set to one of the
+    DRFLAC_METADATA_BLOCK_TYPE_* tokens.
+    */
+    drflac_uint32 type;
+
+    /* The size in bytes of the block and the buffer pointed to by pRawData if it's non-NULL. */
+    drflac_uint32 rawDataSize;
+
+    /* The offset in the stream of the raw data. */
+    drflac_uint64 rawDataOffset;
+
+    /*
+    A pointer to the raw data. This points to a temporary buffer so don't hold on to it. It's best to
+    not modify the contents of this buffer. Use the structures below for more meaningful and structured
+    information about the metadata. It's possible for this to be null.
+    */
+    const void* pRawData;
+
+    union
+    {
+        drflac_streaminfo streaminfo;
 
 
-    // The location and size of the VORBIS_COMMENT block.
-    drflac_block vorbisCommentBlock;
+        struct
+        {
+            int unused;
+        } padding;
 
 
-    // The location and size of the CUESHEET block.
-    drflac_block cuesheetBlock;
+        struct
+        {
+            drflac_uint32 id;
+            const void* pData;
+            drflac_uint32 dataSize;
+        } application;
 
 
-    // The location and size of the PICTURE block.
-    drflac_block pictureBlock;
+        struct
+        {
+            drflac_uint32 seekpointCount;
+            const drflac_seekpoint* pSeekpoints;
+        } seektable;
 
 
+        struct
+        {
+            drflac_uint32 vendorLength;
+            const char* vendor;
+            drflac_uint32 commentCount;
+            const void* pComments;
+        } vorbis_comment;
 
 
-    // Information about the frame the decoder is currently sitting on.
-    drflac_frame currentFrame;
+        struct
+        {
+            char catalog[128];
+            drflac_uint64 leadInSampleCount;
+            drflac_bool32 isCD;
+            drflac_uint8 trackCount;
+            const void* pTrackData;
+        } cuesheet;
+
+        struct
+        {
+            drflac_uint32 type;
+            drflac_uint32 mimeLength;
+            const char* mime;
+            drflac_uint32 descriptionLength;
+            const char* description;
+            drflac_uint32 width;
+            drflac_uint32 height;
+            drflac_uint32 colorDepth;
+            drflac_uint32 indexColorCount;
+            drflac_uint32 pictureDataSize;
+            drflac_uint64 pictureDataOffset;  /* Offset from the start of the stream. */
+            const drflac_uint8* pPictureData;
+        } picture;
+    } data;
+} drflac_metadata;
 
 
-    // The position of the first frame in the stream. This is only ever used for seeking.
-    unsigned long long firstFramePos;
 
 
+/*
+Callback for when data needs to be read from the client.
 
 
 
 
-    // The current byte position in the client's data stream.
-    uint64_t currentBytePos;
+Parameters
+----------
+pUserData (in)
+    The user data that was passed to drflac_open() and family.
 
 
-    // The index of the next valid cache line in the "L2" cache.
-    size_t nextL2Line;
+pBufferOut (out)
+    The output buffer.
 
 
-    // The number of bits that have been consumed by the cache. This is used to determine how many valid bits are remaining.
-    size_t consumedBits;
+bytesToRead (in)
+    The number of bytes to read.
 
 
-    // Unused L2 lines. This will always be 0 until the end of the stream is hit. Used for correctly calculating the current byte
-    // position of the read pointer in the stream.
-    size_t unusedL2Lines;
 
 
-    // The cached data which was most recently read from the client. When data is read from the client, it is placed within this
-    // variable. As data is read, it's bit-shifted such that the next valid bit is sitting on the most significant bit.
-    drflac_cache_t cache;
-    drflac_cache_t cacheL2[DR_FLAC_BUFFER_SIZE/sizeof(drflac_cache_t)];
+Return Value
+------------
+The number of bytes actually read.
 
 
 
 
-    // A pointer to the decoded sample data. This is an offset of pExtraData.
-    int32_t* pDecodedSamples;
+Remarks
+-------
+A return value of less than bytesToRead indicates the end of the stream. Do _not_ return from this callback until either the entire bytesToRead is filled or
+you have reached the end of the stream.
+*/
+typedef size_t (* drflac_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead);
 
 
-    // Variable length extra data. We attach this to the end of the object so we avoid unnecessary mallocs.
-    char pExtraData[1];
+/*
+Callback for when data needs to be seeked.
 
 
-} drflac;
 
 
+Parameters
+----------
+pUserData (in)
+    The user data that was passed to drflac_open() and family.
 
 
+offset (in)
+    The number of bytes to move, relative to the origin. Will never be negative.
 
 
+origin (in)
+    The origin of the seek - the current position, the start of the stream, or the end of the stream.
 
 
-// Opens a FLAC decoder.
-//
-// This is the lowest level function for opening a FLAC stream. You can also use drflac_open_file() and drflac_open_memory()
-// to open the stream from a file or from a block of memory respectively.
-//
-// At the moment the STREAMINFO block must be present for this to succeed.
-//
-// The onRead and onSeek callbacks are used to read and seek data provided by the client.
-static drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData);
 
 
-// Closes the given FLAC decoder.
-static void drflac_close(drflac* pFlac);
+Return Value
+------------
+Whether or not the seek was successful.
 
 
-// Reads sample data from the given FLAC decoder, output as interleaved signed 16-bit PCM.
-//
-// Returns the number of samples actually read.
-static uint64_t drflac_read_s16(drflac* pFlac, uint64_t samplesToRead, int16_t* pBufferOut);
 
 
-// Seeks to the sample at the given index.
-static bool drflac_seek_to_sample(drflac* pFlac, uint64_t sampleIndex);
+Remarks
+-------
+Seeking relative to the start and the current position must always be supported. If seeking from the end of the stream is not supported, return DRFLAC_FALSE.
 
 
+When seeking to a PCM frame using drflac_seek_to_pcm_frame(), dr_flac may call this with an offset beyond the end of the FLAC stream. This needs to be detected
+and handled by returning DRFLAC_FALSE.
+*/
+typedef drflac_bool32 (* drflac_seek_proc)(void* pUserData, int offset, drflac_seek_origin origin);
 
 
+/*
+Callback for when the current position in the stream needs to be retrieved.
 
 
-#ifndef DR_FLAC_NO_STDIO
-// Opens a flac decoder from the file at the given path.
-static drflac* drflac_open_file(const char* pFile);
-#endif
 
 
-// Helper for opening a file from a pre-allocated memory buffer.
-//
-// This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for
-// the lifetime of the decoder.
-static drflac* drflac_open_memory(const void* data, size_t dataSize);
+Parameters
+----------
+pUserData (in)
+    The user data that was passed to drflac_open() and family.
 
 
-#endif  //dr_flac_h
+pCursor (out)
+    A pointer to a variable to receive the current position in the stream.
 
 
 
 
-///////////////////////////////////////////////////////////////////////////////
-//
-// IMPLEMENTATION
-//
-///////////////////////////////////////////////////////////////////////////////
-#ifdef DR_FLAC_IMPLEMENTATION
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
+Return Value
+------------
+Whether or not the operation was successful.
+*/
+typedef drflac_bool32 (* drflac_tell_proc)(void* pUserData, drflac_int64* pCursor);
 
 
-#ifdef _MSC_VER
-#include <intrin.h>     // For _byteswap_ulong and _byteswap_uint64
-#endif
+/*
+Callback for when a metadata block is read.
 
 
-#ifdef __linux__
-#ifndef _BSD_SOURCE
-#define _BSD_SOURCE
-#endif
-#include <endian.h>
-#endif
 
 
-#define DRFLAC_INLINE ALWAYS_INLINE
+Parameters
+----------
+pUserData (in)
+    The user data that was passed to drflac_open() and family.
 
 
-#define DRFLAC_BLOCK_TYPE_STREAMINFO                    0
-#define DRFLAC_BLOCK_TYPE_PADDING                       1
-#define DRFLAC_BLOCK_TYPE_APPLICATION                   2
-#define DRFLAC_BLOCK_TYPE_SEEKTABLE                     3
-#define DRFLAC_BLOCK_TYPE_VORBIS_COMMENT                4
-#define DRFLAC_BLOCK_TYPE_CUESHEET                      5
-#define DRFLAC_BLOCK_TYPE_PICTURE                       6
-#define DRFLAC_BLOCK_TYPE_INVALID                       127
+pMetadata (in)
+    A pointer to a structure containing the data of the metadata block.
 
 
-#define DRFLAC_SUBFRAME_CONSTANT                        0
-#define DRFLAC_SUBFRAME_VERBATIM                        1
-#define DRFLAC_SUBFRAME_FIXED                           8
-#define DRFLAC_SUBFRAME_LPC                             32
-#define DRFLAC_SUBFRAME_RESERVED                        255
 
 
-#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE  0
-#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2 1
+Remarks
+-------
+Use pMetadata->type to determine which metadata block is being handled and how to read the data. This
+will be set to one of the DRFLAC_METADATA_BLOCK_TYPE_* tokens.
+*/
+typedef void (* drflac_meta_proc)(void* pUserData, drflac_metadata* pMetadata);
 
 
-#define DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT           0
-#define DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE             8
-#define DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE            9
-#define DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE              10
 
 
+/* Structure for internal use. Only used for decoders opened with drflac_open_memory. */
 typedef struct
 typedef struct
 {
 {
-    uint64_t firstSample;
-    uint64_t frameOffset;   // The offset from the first byte of the header of the first frame.
-    uint16_t sampleCount;
-} drflac_seekpoint;
-
-#ifndef DR_FLAC_NO_STDIO
-#if defined(DR_FLAC_NO_WIN32_IO) || !defined(_WIN32)
-#include <stdio.h>
+    const drflac_uint8* data;
+    size_t dataSize;
+    size_t currentReadPos;
+} drflac__memory_stream;
 
 
-static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
+/* Structure for internal use. Used for bit streaming. */
+typedef struct
 {
 {
-    return fread(bufferOut, 1, bytesToRead, (FILE*)pUserData);
-}
+    /* The function to call when more data needs to be read. */
+    drflac_read_proc onRead;
 
 
-static bool drflac__on_seek_stdio(void* pUserData, int offset)
-{
-    return fseek((FILE*)pUserData, offset, SEEK_CUR) == 0;
-}
+    /* The function to call when the current read position needs to be moved. */
+    drflac_seek_proc onSeek;
 
 
-drflac* drflac_open_file(const char* filename)
-{
-    FILE* pFile;
-#ifdef _MSC_VER
-    if (fopen_s(&pFile, filename, "rb") != 0) {
-        return NULL;
-    }
-#else
-    pFile = fopen(filename, "rb");
-    if (pFile == NULL) {
-        return NULL;
-    }
-#endif
+    /* The function to call when the current read position needs to be retrieved. */
+    drflac_tell_proc onTell;
 
 
-    return drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, pFile);
-}
-#else
-#include <windows.h>
+    /* The user data to pass around to onRead and onSeek. */
+    void* pUserData;
 
 
-static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
-{
-    assert(bytesToRead < 0xFFFFFFFF);   // dr_flac will never request huge amounts of data at a time. This is a safe assertion.
 
 
-    DWORD bytesRead;
-    ReadFile((HANDLE)pUserData, bufferOut, (DWORD)bytesToRead, &bytesRead, NULL);
+    /*
+    The number of unaligned bytes in the L2 cache. This will always be 0 until the end of the stream is hit. At the end of the
+    stream there will be a number of bytes that don't cleanly fit in an L1 cache line, so we use this variable to know whether
+    or not the bistreamer needs to run on a slower path to read those last bytes. This will never be more than sizeof(drflac_cache_t).
+    */
+    size_t unalignedByteCount;
 
 
-    return (size_t)bytesRead;
-}
+    /* The content of the unaligned bytes. */
+    drflac_cache_t unalignedCache;
 
 
-static bool drflac__on_seek_stdio(void* pUserData, int offset)
-{
-    return SetFilePointer((HANDLE)pUserData, offset, NULL, FILE_CURRENT) != INVALID_SET_FILE_POINTER;
-}
+    /* The index of the next valid cache line in the "L2" cache. */
+    drflac_uint32 nextL2Line;
 
 
-static drflac* drflac_open_file(const char* filename)
-{
-    HANDLE hFile = CreateFileA(filename, FILE_GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
-    if (hFile == INVALID_HANDLE_VALUE) {
-        return NULL;
-    }
+    /* The number of bits that have been consumed by the cache. This is used to determine how many valid bits are remaining. */
+    drflac_uint32 consumedBits;
 
 
-    return drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, (void*)hFile);
-}
-#endif
-#endif  //DR_FLAC_NO_STDIO
+    /*
+    The cached data which was most recently read from the client. There are two levels of cache. Data flows as such:
+    Client -> L2 -> L1. The L2 -> L1 movement is aligned and runs on a fast path in just a few instructions.
+    */
+    drflac_cache_t cacheL2[DR_FLAC_BUFFER_SIZE/sizeof(drflac_cache_t)];
+    drflac_cache_t cache;
 
 
+    /*
+    CRC-16. This is updated whenever bits are read from the bit stream. Manually set this to 0 to reset the CRC. For FLAC, this
+    is reset to 0 at the beginning of each frame.
+    */
+    drflac_uint16 crc16;
+    drflac_cache_t crc16Cache;              /* A cache for optimizing CRC calculations. This is filled when when the L1 cache is reloaded. */
+    drflac_uint32 crc16CacheIgnoredBytes;   /* The number of bytes to ignore when updating the CRC-16 from the CRC-16 cache. */
+} drflac_bs;
 
 
 typedef struct
 typedef struct
 {
 {
-    /// A pointer to the beginning of the data. We use a char as the type here for easy offsetting.
-    const unsigned char* data;
+    /* The type of the subframe: SUBFRAME_CONSTANT, SUBFRAME_VERBATIM, SUBFRAME_FIXED or SUBFRAME_LPC. */
+    drflac_uint8 subframeType;
 
 
-    /// The size of the data.
-    size_t dataSize;
+    /* The number of wasted bits per sample as specified by the sub-frame header. */
+    drflac_uint8 wastedBitsPerSample;
 
 
-    /// The position we're currently sitting at.
-    size_t currentReadPos;
+    /* The order to use for the prediction stage for SUBFRAME_FIXED and SUBFRAME_LPC. */
+    drflac_uint8 lpcOrder;
 
 
-} drflac_memory;
+    /* A pointer to the buffer containing the decoded samples in the subframe. This pointer is an offset from drflac::pExtraData. */
+    drflac_int32* pSamplesS32;
+} drflac_subframe;
 
 
-static size_t drflac__on_read_memory(void* pUserData, void* bufferOut, size_t bytesToRead)
+typedef struct
 {
 {
-    drflac_memory* memory = (drflac_memory*)pUserData;
-    assert(memory != NULL);
-    assert(memory->dataSize >= memory->currentReadPos);
-
-    size_t bytesRemaining = memory->dataSize - memory->currentReadPos;
-    if (bytesToRead > bytesRemaining) {
-        bytesToRead = bytesRemaining;
-    }
+    /*
+    If the stream uses variable block sizes, this will be set to the index of the first PCM frame. If fixed block sizes are used, this will
+    always be set to 0. This is 64-bit because the decoded PCM frame number will be 36 bits.
+    */
+    drflac_uint64 pcmFrameNumber;
 
 
-    if (bytesToRead > 0) {
-        memcpy(bufferOut, memory->data + memory->currentReadPos, bytesToRead);
-        memory->currentReadPos += bytesToRead;
-    }
+    /*
+    If the stream uses fixed block sizes, this will be set to the frame number. If variable block sizes are used, this will always be 0. This
+    is 32-bit because in fixed block sizes, the maximum frame number will be 31 bits.
+    */
+    drflac_uint32 flacFrameNumber;
 
 
-    return bytesToRead;
-}
+    /* The sample rate of this frame. */
+    drflac_uint32 sampleRate;
 
 
-static bool drflac__on_seek_memory(void* pUserData, int offset)
-{
-    drflac_memory* memory = (drflac_memory*)pUserData;
-    assert(memory != NULL);
+    /* The number of PCM frames in each sub-frame within this frame. */
+    drflac_uint16 blockSizeInPCMFrames;
 
 
-    if (offset > 0) {
-        if (memory->currentReadPos + offset > memory->dataSize) {
-            offset = (int)(memory->dataSize - memory->currentReadPos);     // Trying to seek too far forward.
-        }
-    } else {
-        if (memory->currentReadPos < (size_t)-offset) {
-            offset = -(int)memory->currentReadPos;                  // Trying to seek too far backwards.
-        }
-    }
+    /*
+    The channel assignment of this frame. This is not always set to the channel count. If interchannel decorrelation is being used this
+    will be set to DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE, DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE or DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE.
+    */
+    drflac_uint8 channelAssignment;
 
 
-    // This will never underflow thanks to the clamps above.
-    memory->currentReadPos += offset;
+    /* The number of bits per sample within this frame. */
+    drflac_uint8 bitsPerSample;
 
 
-    return 1;
-}
+    /* The frame's CRC. */
+    drflac_uint8 crc8;
+} drflac_frame_header;
 
 
-static drflac* drflac_open_memory(const void* data, size_t dataSize)
+typedef struct
 {
 {
-    drflac_memory* pUserData = (drflac_memory*)malloc(sizeof(*pUserData));
-    if (pUserData == NULL) {
-        return NULL;
-    }
+    /* The header. */
+    drflac_frame_header header;
 
 
-    pUserData->data = (const unsigned char*)data;
-    pUserData->dataSize = dataSize;
-    pUserData->currentReadPos = 0;
-    return drflac_open(drflac__on_read_memory, drflac__on_seek_memory, pUserData);
-}
+    /*
+    The number of PCM frames left to be read in this FLAC frame. This is initially set to the block size. As PCM frames are read,
+    this will be decremented. When it reaches 0, the decoder will see this frame as fully consumed and load the next frame.
+    */
+    drflac_uint32 pcmFramesRemaining;
 
 
+    /* The list of sub-frames within the frame. There is one sub-frame for each channel, and there's a maximum of 8 channels. */
+    drflac_subframe subframes[8];
+} drflac_frame;
 
 
-//// Endian Management ////
-static DRFLAC_INLINE bool drflac__is_little_endian()
+typedef struct
 {
 {
-    int n = 1;
-    return (*(char*)&n) == 1;
-}
+    /* The function to call when a metadata block is read. */
+    drflac_meta_proc onMeta;
 
 
-static DRFLAC_INLINE uint32_t drflac__swap_endian_uint32(uint32_t n)
-{
-#ifdef _MSC_VER
-    return _byteswap_ulong(n);
-#elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
-    return __builtin_bswap32(n);
-#else
-    return ((n & 0xFF000000) >> 24) |
-           ((n & 0x00FF0000) >>  8) |
-           ((n & 0x0000FF00) <<  8) |
-           ((n & 0x000000FF) << 24);
-#endif
-}
+    /* The user data posted to the metadata callback function. */
+    void* pUserDataMD;
 
 
-static DRFLAC_INLINE uint64_t drflac__swap_endian_uint64(uint64_t n)
-{
-#ifdef _MSC_VER
-    return _byteswap_uint64(n);
-#elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
-    return __builtin_bswap64(n);
-#else
-    return ((n & 0xFF00000000000000ULL) >> 56) |
-           ((n & 0x00FF000000000000ULL) >> 40) |
-           ((n & 0x0000FF0000000000ULL) >> 24) |
-           ((n & 0x000000FF00000000ULL) >>  8) |
-           ((n & 0x00000000FF000000ULL) <<  8) |
-           ((n & 0x0000000000FF0000ULL) << 24) |
-           ((n & 0x000000000000FF00ULL) << 40) |
-           ((n & 0x00000000000000FFULL) << 56);
-#endif
-}
+    /* Memory allocation callbacks. */
+    drflac_allocation_callbacks allocationCallbacks;
 
 
 
 
-static DRFLAC_INLINE uint32_t drflac__be2host_32(uint32_t n)
-{
-#if defined(__BYTE_ORDER__) && (__BYTE_ORDER == __ORDER_LITTLE_ENDIAN__)
-    return drflac__swap_endian_uint32(n);
-#elif defined(__linux__)
-    return be32toh(n);
-#else
-    if (drflac__is_little_endian()) {
-        return drflac__swap_endian_uint32(n);
-    }
+    /* The sample rate. Will be set to something like 44100. */
+    drflac_uint32 sampleRate;
 
 
-    return n;
-#endif
-}
+    /*
+    The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. Maximum 8. This is set based on the
+    value specified in the STREAMINFO block.
+    */
+    drflac_uint8 channels;
 
 
-static DRFLAC_INLINE uint64_t drflac__be2host_64(uint64_t n)
-{
-#if defined(__BYTE_ORDER__) && (__BYTE_ORDER == __ORDER_LITTLE_ENDIAN__)
-    return drflac__swap_endian_uint64(n);
-#elif defined(__linux__)
-    return be64toh(n);
-#else
-    if (drflac__is_little_endian()) {
-        return drflac__swap_endian_uint64(n);
-    }
+    /* The bits per sample. Will be set to something like 16, 24, etc. */
+    drflac_uint8 bitsPerSample;
 
 
-    return n;
-#endif
-}
+    /* The maximum block size, in samples. This number represents the number of samples in each channel (not combined). */
+    drflac_uint16 maxBlockSizeInPCMFrames;
 
 
-#ifdef DRFLAC_64BIT
-#define drflac__be2host__cache_line drflac__be2host_64
-#else
-#define drflac__be2host__cache_line drflac__be2host_32
-#endif
+    /*
+    The total number of PCM Frames making up the stream. Can be 0 in which case it's still a valid stream, but just means
+    the total PCM frame count is unknown. Likely the case with streams like internet radio.
+    */
+    drflac_uint64 totalPCMFrameCount;
 
 
 
 
-// BIT READING ATTEMPT #2
-//
-// This uses a 32- or 64-bit bit-shifted cache - as bits are read, the cache is shifted such that the first valid bit is sitting
-// on the most significant bit. It uses the notion of an L1 and L2 cache (borrowed from CPU architecture), where the L1 cache
-// is a 32- or 64-bit unsigned integer (depending on whether or not a 32- or 64-bit build is being compiled) and the L2 is an
-// array of "cache lines", with each cache line being the same size as the L1. The L2 is a buffer of about 4KB and is where data
-// from onRead() is read into.
-#define DRFLAC_CACHE_L1_SIZE_BYTES                  (sizeof(pFlac->cache))
-#define DRFLAC_CACHE_L1_SIZE_BITS                   (sizeof(pFlac->cache)*8)
-#define DRFLAC_CACHE_L1_BITS_REMAINING              (DRFLAC_CACHE_L1_SIZE_BITS - (pFlac->consumedBits))
-#ifdef DRFLAC_64BIT
-#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)   (~(((uint64_t)-1LL) >> (_bitCount)))
-#else
-#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)   (~(((uint32_t)-1) >> (_bitCount)))
-#endif
-#define DRFLAC_CACHE_L1_SELECTION_SHIFT(_bitCount)  (DRFLAC_CACHE_L1_SIZE_BITS - (_bitCount))
-#define DRFLAC_CACHE_L1_SELECT(_bitCount)           ((pFlac->cache) & DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount))
-#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT(_bitCount) (DRFLAC_CACHE_L1_SELECT(_bitCount) >> DRFLAC_CACHE_L1_SELECTION_SHIFT(_bitCount))
-#define DRFLAC_CACHE_L2_SIZE_BYTES                  (sizeof(pFlac->cacheL2))
-#define DRFLAC_CACHE_L2_LINE_COUNT                  (DRFLAC_CACHE_L2_SIZE_BYTES / sizeof(pFlac->cacheL2[0]))
-#define DRFLAC_CACHE_L2_LINES_REMAINING             (DRFLAC_CACHE_L2_LINE_COUNT - pFlac->nextL2Line)
+    /* The container type. This is set based on whether or not the decoder was opened from a native or Ogg stream. */
+    drflac_container container;
 
 
-static DRFLAC_INLINE bool drflac__reload_l1_cache_from_l2(drflac* pFlac)
-{
-    // Fast path. Try loading straight from L2.
-    if (pFlac->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT) {
-        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
-        return true;
-    }
+    /* The number of seekpoints in the seektable. */
+    drflac_uint32 seekpointCount;
 
 
-    // If we get here it means we've run out of data in the L2 cache. We'll need to fetch more from the client.
-    size_t bytesRead = pFlac->onRead(pFlac->pUserData, pFlac->cacheL2, DRFLAC_CACHE_L2_SIZE_BYTES);
-    pFlac->currentBytePos += bytesRead;
 
 
-    pFlac->nextL2Line = 0;
-    if (bytesRead == DRFLAC_CACHE_L2_SIZE_BYTES) {
-        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
-        return true;
-    }
+    /* Information about the frame the decoder is currently sitting on. */
+    drflac_frame currentFLACFrame;
 
 
 
 
-    // If we get here it means we were unable to retrieve enough data to fill the entire L2 cache. It probably
-    // means we've just reached the end of the file. We need to move the valid data down to the end of the buffer
-    // and adjust the index of the next line accordingly. Also keep in mind that the L2 cache must be aligned to
-    // the size of the L1 so we'll need to seek backwards by any misaligned bytes.
-    size_t alignedL1LineCount = bytesRead / DRFLAC_CACHE_L1_SIZE_BYTES;
-    if (alignedL1LineCount > 0)
-    {
-        size_t offset = DRFLAC_CACHE_L2_LINE_COUNT - alignedL1LineCount;
-        for (size_t i = alignedL1LineCount; i > 0; --i) {
-            pFlac->cacheL2[i-1 + offset] = pFlac->cacheL2[i-1];
-        }
+    /* The index of the PCM frame the decoder is currently sitting on. This is only used for seeking. */
+    drflac_uint64 currentPCMFrame;
 
 
-        pFlac->nextL2Line = offset;
-        pFlac->unusedL2Lines = offset;
+    /* The position of the first FLAC frame in the stream. This is only ever used for seeking. */
+    drflac_uint64 firstFLACFramePosInBytes;
 
 
-        // At this point there may be some leftover unaligned bytes. We need to seek backwards so we don't lose
-        // those bytes.
-        size_t unalignedBytes = bytesRead - (alignedL1LineCount * DRFLAC_CACHE_L1_SIZE_BYTES);
-        if (unalignedBytes > 0) {
-            pFlac->onSeek(pFlac->pUserData, -(int)unalignedBytes);
-            pFlac->currentBytePos -= unalignedBytes;
-        }
 
 
-        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
-        return true;
-    }
-    else
-    {
-        // If we get into this branch it means we weren't able to load any L1-aligned data. We just need to seek
-        // backwards by the leftover bytes and return false.
-        if (bytesRead > 0) {
-            pFlac->onSeek(pFlac->pUserData, -(int)bytesRead);
-            pFlac->currentBytePos -= bytesRead;
-        }
+    /* A hack to avoid a malloc() when opening a decoder with drflac_open_memory(). */
+    drflac__memory_stream memoryStream;
 
 
-        pFlac->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT;
-        return false;
-    }
-}
 
 
-static bool drflac__reload_cache(drflac* pFlac)
-{
-    // Fast path. Try just moving the next value in the L2 cache to the L1 cache.
-    if (drflac__reload_l1_cache_from_l2(pFlac)) {
-        pFlac->cache = drflac__be2host__cache_line(pFlac->cache);
-        pFlac->consumedBits = 0;
-        return true;
-    }
+    /* A pointer to the decoded sample data. This is an offset of pExtraData. */
+    drflac_int32* pDecodedSamples;
 
 
-    // Slow path.
+    /* A pointer to the seek table. This is an offset of pExtraData, or NULL if there is no seek table. */
+    drflac_seekpoint* pSeekpoints;
 
 
-    // If we get here it means we have failed to load the L1 cache from the L2. Likely we've just reached the end of the stream and the last
-    // few bytes did not meet the alignment requirements for the L2 cache. In this case we need to fall back to a slower path and read the
-    // data straight from the client into the L1 cache. This should only really happen once per stream so efficiency is not important.
-    size_t bytesRead = pFlac->onRead(pFlac->pUserData, &pFlac->cache, DRFLAC_CACHE_L1_SIZE_BYTES);
-    if (bytesRead == 0) {
-        return false;
-    }
+    /* Internal use only. Only used with Ogg containers. Points to a drflac_oggbs object. This is an offset of pExtraData. */
+    void* _oggbs;
 
 
-    pFlac->currentBytePos += bytesRead;
+    /* Internal use only. Used for profiling and testing different seeking modes. */
+    drflac_bool32 _noSeekTableSeek    : 1;
+    drflac_bool32 _noBinarySearchSeek : 1;
+    drflac_bool32 _noBruteForceSeek   : 1;
 
 
-    assert(bytesRead < DRFLAC_CACHE_L1_SIZE_BYTES);
-    pFlac->consumedBits = (DRFLAC_CACHE_L1_SIZE_BYTES - bytesRead) * 8;
+    /* The bit streamer. The raw FLAC data is fed through this object. */
+    drflac_bs bs;
 
 
-    pFlac->cache = drflac__be2host__cache_line(pFlac->cache);
-    pFlac->cache &= DRFLAC_CACHE_L1_SELECTION_MASK(DRFLAC_CACHE_L1_SIZE_BITS - pFlac->consumedBits);    // <-- Make sure the consumed bits are always set to zero. Other parts of the library depend on this property.
-    return true;
-}
+    /* Variable length extra data. We attach this to the end of the object so we can avoid unnecessary mallocs. */
+    drflac_uint8 pExtraData[1];
+} drflac;
 
 
-static bool drflac__seek_bits(drflac* pFlac, size_t bitsToSeek)
-{
-    if (bitsToSeek <= DRFLAC_CACHE_L1_BITS_REMAINING) {
-        pFlac->consumedBits += bitsToSeek;
-        pFlac->cache <<= bitsToSeek;
-        return true;
-    } else {
-        // It straddles the cached data. This function isn't called too frequently so I'm favouring simplicity here.
-        bitsToSeek -= DRFLAC_CACHE_L1_BITS_REMAINING;
-        pFlac->consumedBits += DRFLAC_CACHE_L1_BITS_REMAINING;
-        pFlac->cache = 0;
 
 
-        size_t wholeBytesRemaining = bitsToSeek/8;
-        if (wholeBytesRemaining > 0)
-        {
-            // The next bytes to seek will be located in the L2 cache. The problem is that the L2 cache is not byte aligned,
-            // but rather DRFLAC_CACHE_L1_SIZE_BYTES aligned (usually 4 or 8). If, for example, the number of bytes to seek is
-            // 3, we'll need to handle it in a special way.
-            size_t wholeCacheLinesRemaining = wholeBytesRemaining / DRFLAC_CACHE_L1_SIZE_BYTES;
-            if (wholeCacheLinesRemaining < DRFLAC_CACHE_L2_LINES_REMAINING)
-            {
-                wholeBytesRemaining -= wholeCacheLinesRemaining * DRFLAC_CACHE_L1_SIZE_BYTES;
-                bitsToSeek -= wholeCacheLinesRemaining * DRFLAC_CACHE_L1_SIZE_BITS;
-                pFlac->nextL2Line += wholeCacheLinesRemaining;
-            }
-            else
-            {
-                wholeBytesRemaining -= DRFLAC_CACHE_L2_LINES_REMAINING * DRFLAC_CACHE_L1_SIZE_BYTES;
-                bitsToSeek -= DRFLAC_CACHE_L2_LINES_REMAINING * DRFLAC_CACHE_L1_SIZE_BITS;
-                pFlac->nextL2Line += DRFLAC_CACHE_L2_LINES_REMAINING;
+/*
+Opens a FLAC decoder.
 
 
-                pFlac->onSeek(pFlac->pUserData, (int)wholeBytesRemaining);
-                pFlac->currentBytePos += wholeBytesRemaining;
-                bitsToSeek -= wholeBytesRemaining*8;
-            }
-        }
 
 
+Parameters
+----------
+onRead (in)
+    The function to call when data needs to be read from the client.
 
 
-        if (bitsToSeek > 0) {
-            if (!drflac__reload_cache(pFlac)) {
-                return false;
-            }
+onSeek (in)
+    The function to call when the read position of the client data needs to move.
 
 
-            return drflac__seek_bits(pFlac, bitsToSeek);
-        }
+pUserData (in, optional)
+    A pointer to application defined data that will be passed to onRead and onSeek.
 
 
-        return true;
-    }
-}
+pAllocationCallbacks (in, optional)
+    A pointer to application defined callbacks for managing memory allocations.
 
 
-static bool drflac__read_uint32(drflac* pFlac, unsigned int bitCount, uint32_t* pResultOut)
-{
-    assert(pFlac != NULL);
-    assert(pResultOut != NULL);
-    assert(bitCount > 0);
-    assert(bitCount <= 32);
 
 
-    if (pFlac->consumedBits == DRFLAC_CACHE_L1_SIZE_BITS) {
-        if (!drflac__reload_cache(pFlac)) {
-            return false;
-        }
-    }
+Return Value
+------------
+Returns a pointer to an object representing the decoder.
 
 
-    if (bitCount <= DRFLAC_CACHE_L1_BITS_REMAINING) {
-        if (bitCount < DRFLAC_CACHE_L1_SIZE_BITS) {
-            *pResultOut = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCount);
-            pFlac->consumedBits += bitCount;
-            pFlac->cache <<= bitCount;
-        } else {
-            *pResultOut = (uint32_t)pFlac->cache;
-            pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
-            pFlac->cache = 0;
+
+Remarks
+-------
+Close the decoder with `drflac_close()`.
+
+`pAllocationCallbacks` can be NULL in which case it will use `DRFLAC_MALLOC`, `DRFLAC_REALLOC` and `DRFLAC_FREE`.
+
+This function will automatically detect whether or not you are attempting to open a native or Ogg encapsulated FLAC, both of which should work seamlessly
+without any manual intervention. Ogg encapsulation also works with multiplexed streams which basically means it can play FLAC encoded audio tracks in videos.
+
+This is the lowest level function for opening a FLAC stream. You can also use `drflac_open_file()` and `drflac_open_memory()` to open the stream from a file or
+from a block of memory respectively.
+
+The STREAMINFO block must be present for this to succeed. Use `drflac_open_relaxed()` to open a FLAC stream where the header may not be present.
+
+Use `drflac_open_with_metadata()` if you need access to metadata.
+
+
+Seek Also
+---------
+drflac_open_file()
+drflac_open_memory()
+drflac_open_with_metadata()
+drflac_close()
+*/
+DRFLAC_API drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/*
+Opens a FLAC stream with relaxed validation of the header block.
+
+
+Parameters
+----------
+onRead (in)
+    The function to call when data needs to be read from the client.
+
+onSeek (in)
+    The function to call when the read position of the client data needs to move.
+
+container (in)
+    Whether or not the FLAC stream is encapsulated using standard FLAC encapsulation or Ogg encapsulation.
+
+pUserData (in, optional)
+    A pointer to application defined data that will be passed to onRead and onSeek.
+
+pAllocationCallbacks (in, optional)
+    A pointer to application defined callbacks for managing memory allocations.
+
+
+Return Value
+------------
+A pointer to an object representing the decoder.
+
+
+Remarks
+-------
+The same as drflac_open(), except attempts to open the stream even when a header block is not present.
+
+Because the header is not necessarily available, the caller must explicitly define the container (Native or Ogg). Do not set this to `drflac_container_unknown`
+as that is for internal use only.
+
+Opening in relaxed mode will continue reading data from onRead until it finds a valid frame. If a frame is never found it will continue forever. To abort,
+force your `onRead` callback to return 0, which dr_flac will use as an indicator that the end of the stream was found.
+
+Use `drflac_open_with_metadata_relaxed()` if you need access to metadata.
+*/
+DRFLAC_API drflac* drflac_open_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/*
+Opens a FLAC decoder and notifies the caller of the metadata chunks (album art, etc.).
+
+
+Parameters
+----------
+onRead (in)
+    The function to call when data needs to be read from the client.
+
+onSeek (in)
+    The function to call when the read position of the client data needs to move.
+
+onMeta (in)
+    The function to call for every metadata block.
+
+pUserData (in, optional)
+    A pointer to application defined data that will be passed to onRead, onSeek and onMeta.
+
+pAllocationCallbacks (in, optional)
+    A pointer to application defined callbacks for managing memory allocations.
+
+
+Return Value
+------------
+A pointer to an object representing the decoder.
+
+
+Remarks
+-------
+Close the decoder with `drflac_close()`.
+
+`pAllocationCallbacks` can be NULL in which case it will use `DRFLAC_MALLOC`, `DRFLAC_REALLOC` and `DRFLAC_FREE`.
+
+This is slower than `drflac_open()`, so avoid this one if you don't need metadata. Internally, this will allocate and free memory on the heap for every
+metadata block except for STREAMINFO and PADDING blocks.
+
+The caller is notified of the metadata via the `onMeta` callback. All metadata blocks will be handled before the function returns. This callback takes a
+pointer to a `drflac_metadata` object which is a union containing the data of all relevant metadata blocks. Use the `type` member to discriminate against
+the different metadata types.
+
+The STREAMINFO block must be present for this to succeed. Use `drflac_open_with_metadata_relaxed()` to open a FLAC stream where the header may not be present.
+
+Note that this will behave inconsistently with `drflac_open()` if the stream is an Ogg encapsulated stream and a metadata block is corrupted. This is due to
+the way the Ogg stream recovers from corrupted pages. When `drflac_open_with_metadata()` is being used, the open routine will try to read the contents of the
+metadata block, whereas `drflac_open()` will simply seek past it (for the sake of efficiency). This inconsistency can result in different samples being
+returned depending on whether or not the stream is being opened with metadata.
+
+
+Seek Also
+---------
+drflac_open_file_with_metadata()
+drflac_open_memory_with_metadata()
+drflac_open()
+drflac_close()
+*/
+DRFLAC_API drflac* drflac_open_with_metadata(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/*
+The same as drflac_open_with_metadata(), except attempts to open the stream even when a header block is not present.
+
+See Also
+--------
+drflac_open_with_metadata()
+drflac_open_relaxed()
+*/
+DRFLAC_API drflac* drflac_open_with_metadata_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, drflac_meta_proc onMeta, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/*
+Closes the given FLAC decoder.
+
+
+Parameters
+----------
+pFlac (in)
+    The decoder to close.
+
+
+Remarks
+-------
+This will destroy the decoder object.
+
+
+See Also
+--------
+drflac_open()
+drflac_open_with_metadata()
+drflac_open_file()
+drflac_open_file_w()
+drflac_open_file_with_metadata()
+drflac_open_file_with_metadata_w()
+drflac_open_memory()
+drflac_open_memory_with_metadata()
+*/
+DRFLAC_API void drflac_close(drflac* pFlac);
+
+
+/*
+Reads sample data from the given FLAC decoder, output as interleaved signed 32-bit PCM.
+
+
+Parameters
+----------
+pFlac (in)
+    The decoder.
+
+framesToRead (in)
+    The number of PCM frames to read.
+
+pBufferOut (out, optional)
+    A pointer to the buffer that will receive the decoded samples.
+
+
+Return Value
+------------
+Returns the number of PCM frames actually read. If the return value is less than `framesToRead` it has reached the end.
+
+
+Remarks
+-------
+pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of frames seeked.
+*/
+DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s32(drflac* pFlac, drflac_uint64 framesToRead, drflac_int32* pBufferOut);
+
+
+/*
+Reads sample data from the given FLAC decoder, output as interleaved signed 16-bit PCM.
+
+
+Parameters
+----------
+pFlac (in)
+    The decoder.
+
+framesToRead (in)
+    The number of PCM frames to read.
+
+pBufferOut (out, optional)
+    A pointer to the buffer that will receive the decoded samples.
+
+
+Return Value
+------------
+Returns the number of PCM frames actually read. If the return value is less than `framesToRead` it has reached the end.
+
+
+Remarks
+-------
+pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of frames seeked.
+
+Note that this is lossy for streams where the bits per sample is larger than 16.
+*/
+DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 framesToRead, drflac_int16* pBufferOut);
+
+/*
+Reads sample data from the given FLAC decoder, output as interleaved 32-bit floating point PCM.
+
+
+Parameters
+----------
+pFlac (in)
+    The decoder.
+
+framesToRead (in)
+    The number of PCM frames to read.
+
+pBufferOut (out, optional)
+    A pointer to the buffer that will receive the decoded samples.
+
+
+Return Value
+------------
+Returns the number of PCM frames actually read. If the return value is less than `framesToRead` it has reached the end.
+
+
+Remarks
+-------
+pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of frames seeked.
+
+Note that this should be considered lossy due to the nature of floating point numbers not being able to exactly represent every possible number.
+*/
+DRFLAC_API drflac_uint64 drflac_read_pcm_frames_f32(drflac* pFlac, drflac_uint64 framesToRead, float* pBufferOut);
+
+/*
+Seeks to the PCM frame at the given index.
+
+
+Parameters
+----------
+pFlac (in)
+    The decoder.
+
+pcmFrameIndex (in)
+    The index of the PCM frame to seek to. See notes below.
+
+
+Return Value
+-------------
+`DRFLAC_TRUE` if successful; `DRFLAC_FALSE` otherwise.
+*/
+DRFLAC_API drflac_bool32 drflac_seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex);
+
+
+
+#ifndef DR_FLAC_NO_STDIO
+/*
+Opens a FLAC decoder from the file at the given path.
+
+
+Parameters
+----------
+pFileName (in)
+    The path of the file to open, either absolute or relative to the current directory.
+
+pAllocationCallbacks (in, optional)
+    A pointer to application defined callbacks for managing memory allocations.
+
+
+Return Value
+------------
+A pointer to an object representing the decoder.
+
+
+Remarks
+-------
+Close the decoder with drflac_close().
+
+
+Remarks
+-------
+This will hold a handle to the file until the decoder is closed with drflac_close(). Some platforms will restrict the number of files a process can have open
+at any given time, so keep this mind if you have many decoders open at the same time.
+
+
+See Also
+--------
+drflac_open_file_with_metadata()
+drflac_open()
+drflac_close()
+*/
+DRFLAC_API drflac* drflac_open_file(const char* pFileName, const drflac_allocation_callbacks* pAllocationCallbacks);
+DRFLAC_API drflac* drflac_open_file_w(const wchar_t* pFileName, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/*
+Opens a FLAC decoder from the file at the given path and notifies the caller of the metadata chunks (album art, etc.)
+
+
+Parameters
+----------
+pFileName (in)
+    The path of the file to open, either absolute or relative to the current directory.
+
+pAllocationCallbacks (in, optional)
+    A pointer to application defined callbacks for managing memory allocations.
+
+onMeta (in)
+    The callback to fire for each metadata block.
+
+pUserData (in)
+    A pointer to the user data to pass to the metadata callback.
+
+pAllocationCallbacks (in)
+    A pointer to application defined callbacks for managing memory allocations.
+
+
+Remarks
+-------
+Look at the documentation for drflac_open_with_metadata() for more information on how metadata is handled.
+
+
+See Also
+--------
+drflac_open_with_metadata()
+drflac_open()
+drflac_close()
+*/
+DRFLAC_API drflac* drflac_open_file_with_metadata(const char* pFileName, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
+DRFLAC_API drflac* drflac_open_file_with_metadata_w(const wchar_t* pFileName, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
+#endif
+
+/*
+Opens a FLAC decoder from a pre-allocated block of memory
+
+
+Parameters
+----------
+pData (in)
+    A pointer to the raw encoded FLAC data.
+
+dataSize (in)
+    The size in bytes of `data`.
+
+pAllocationCallbacks (in)
+    A pointer to application defined callbacks for managing memory allocations.
+
+
+Return Value
+------------
+A pointer to an object representing the decoder.
+
+
+Remarks
+-------
+This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for the lifetime of the decoder.
+
+
+See Also
+--------
+drflac_open()
+drflac_close()
+*/
+DRFLAC_API drflac* drflac_open_memory(const void* pData, size_t dataSize, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/*
+Opens a FLAC decoder from a pre-allocated block of memory and notifies the caller of the metadata chunks (album art, etc.)
+
+
+Parameters
+----------
+pData (in)
+    A pointer to the raw encoded FLAC data.
+
+dataSize (in)
+    The size in bytes of `data`.
+
+onMeta (in)
+    The callback to fire for each metadata block.
+
+pUserData (in)
+    A pointer to the user data to pass to the metadata callback.
+
+pAllocationCallbacks (in)
+    A pointer to application defined callbacks for managing memory allocations.
+
+
+Remarks
+-------
+Look at the documentation for drflac_open_with_metadata() for more information on how metadata is handled.
+
+
+See Also
+-------
+drflac_open_with_metadata()
+drflac_open()
+drflac_close()
+*/
+DRFLAC_API drflac* drflac_open_memory_with_metadata(const void* pData, size_t dataSize, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+
+
+/* High Level APIs */
+
+/*
+Opens a FLAC stream from the given callbacks and fully decodes it in a single operation. The return value is a
+pointer to the sample data as interleaved signed 32-bit PCM. The returned data must be freed with drflac_free().
+
+You can pass in custom memory allocation callbacks via the pAllocationCallbacks parameter. This can be NULL in which
+case it will use DRFLAC_MALLOC, DRFLAC_REALLOC and DRFLAC_FREE.
+
+Sometimes a FLAC file won't keep track of the total sample count. In this situation the function will continuously
+read samples into a dynamically sized buffer on the heap until no samples are left.
+
+Do not call this function on a broadcast type of stream (like internet radio streams and whatnot).
+*/
+DRFLAC_API drflac_int32* drflac_open_and_read_pcm_frames_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/* Same as drflac_open_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples. */
+DRFLAC_API drflac_int16* drflac_open_and_read_pcm_frames_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/* Same as drflac_open_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples. */
+DRFLAC_API float* drflac_open_and_read_pcm_frames_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+#ifndef DR_FLAC_NO_STDIO
+/* Same as drflac_open_and_read_pcm_frames_s32() except opens the decoder from a file. */
+DRFLAC_API drflac_int32* drflac_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/* Same as drflac_open_file_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples. */
+DRFLAC_API drflac_int16* drflac_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/* Same as drflac_open_file_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples. */
+DRFLAC_API float* drflac_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
+#endif
+
+/* Same as drflac_open_and_read_pcm_frames_s32() except opens the decoder from a block of memory. */
+DRFLAC_API drflac_int32* drflac_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/* Same as drflac_open_memory_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples. */
+DRFLAC_API drflac_int16* drflac_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/* Same as drflac_open_memory_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples. */
+DRFLAC_API float* drflac_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+/*
+Frees memory that was allocated internally by dr_flac.
+
+Set pAllocationCallbacks to the same object that was passed to drflac_open_*_and_read_pcm_frames_*(). If you originally passed in NULL, pass in NULL for this.
+*/
+DRFLAC_API void drflac_free(void* p, const drflac_allocation_callbacks* pAllocationCallbacks);
+
+
+/* Structure representing an iterator for vorbis comments in a VORBIS_COMMENT metadata block. */
+typedef struct
+{
+    drflac_uint32 countRemaining;
+    const char* pRunningData;
+} drflac_vorbis_comment_iterator;
+
+/*
+Initializes a vorbis comment iterator. This can be used for iterating over the vorbis comments in a VORBIS_COMMENT
+metadata block.
+*/
+DRFLAC_API void drflac_init_vorbis_comment_iterator(drflac_vorbis_comment_iterator* pIter, drflac_uint32 commentCount, const void* pComments);
+
+/*
+Goes to the next vorbis comment in the given iterator. If null is returned it means there are no more comments. The
+returned string is NOT null terminated.
+*/
+DRFLAC_API const char* drflac_next_vorbis_comment(drflac_vorbis_comment_iterator* pIter, drflac_uint32* pCommentLengthOut);
+
+
+/* Structure representing an iterator for cuesheet tracks in a CUESHEET metadata block. */
+typedef struct
+{
+    drflac_uint32 countRemaining;
+    const char* pRunningData;
+} drflac_cuesheet_track_iterator;
+
+/* The order of members here is important because we map this directly to the raw data within the CUESHEET metadata block. */
+typedef struct
+{
+    drflac_uint64 offset;
+    drflac_uint8 index;
+    drflac_uint8 reserved[3];
+} drflac_cuesheet_track_index;
+
+typedef struct
+{
+    drflac_uint64 offset;
+    drflac_uint8 trackNumber;
+    char ISRC[12];
+    drflac_bool8 isAudio;
+    drflac_bool8 preEmphasis;
+    drflac_uint8 indexCount;
+    const drflac_cuesheet_track_index* pIndexPoints;
+} drflac_cuesheet_track;
+
+/*
+Initializes a cuesheet track iterator. This can be used for iterating over the cuesheet tracks in a CUESHEET metadata
+block.
+*/
+DRFLAC_API void drflac_init_cuesheet_track_iterator(drflac_cuesheet_track_iterator* pIter, drflac_uint32 trackCount, const void* pTrackData);
+
+/* Goes to the next cuesheet track in the given iterator. If DRFLAC_FALSE is returned it means there are no more comments. */
+DRFLAC_API drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter, drflac_cuesheet_track* pCuesheetTrack);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif  /* dr_flac_h */
+
+
+/************************************************************************************************************************************************************
+ ************************************************************************************************************************************************************
+
+ IMPLEMENTATION
+
+ ************************************************************************************************************************************************************
+ ************************************************************************************************************************************************************/
+#if defined(DR_FLAC_IMPLEMENTATION) || defined(DRFLAC_IMPLEMENTATION)
+#ifndef dr_flac_c
+#define dr_flac_c
+
+/* Disable some annoying warnings. */
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+    #pragma GCC diagnostic push
+    #if __GNUC__ >= 7
+    #pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
+    #endif
+#endif
+
+#ifdef __linux__
+    #ifndef _BSD_SOURCE
+        #define _BSD_SOURCE
+    #endif
+    #ifndef _DEFAULT_SOURCE
+        #define _DEFAULT_SOURCE
+    #endif
+    #ifndef __USE_BSD
+        #define __USE_BSD
+    #endif
+    #include <endian.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+/* Inline */
+#ifdef _MSC_VER
+    #if defined(CPPPARSER)
+        #define DRFLAC_INLINE inline
+    #else
+        #define DRFLAC_INLINE __forceinline
+    #endif
+#elif defined(__GNUC__)
+    /*
+    I've had a bug report where GCC is emitting warnings about functions possibly not being inlineable. This warning happens when
+    the __attribute__((always_inline)) attribute is defined without an "inline" statement. I think therefore there must be some
+    case where "__inline__" is not always defined, thus the compiler emitting these warnings. When using -std=c89 or -ansi on the
+    command line, we cannot use the "inline" keyword and instead need to use "__inline__". In an attempt to work around this issue
+    I am using "__inline__" only when we're compiling in strict ANSI mode.
+    */
+    #if defined(__STRICT_ANSI__)
+        #define DRFLAC_GNUC_INLINE_HINT __inline__
+    #else
+        #define DRFLAC_GNUC_INLINE_HINT inline
+    #endif
+
+    #if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 2)) || defined(__clang__)
+        #define DRFLAC_INLINE DRFLAC_GNUC_INLINE_HINT __attribute__((always_inline))
+    #else
+        #define DRFLAC_INLINE DRFLAC_GNUC_INLINE_HINT
+    #endif
+#elif defined(__WATCOMC__)
+    #define DRFLAC_INLINE __inline
+#else
+    #define DRFLAC_INLINE
+#endif
+/* End Inline */
+
+/*
+Intrinsics Support
+
+There's a bug in GCC 4.2.x which results in an incorrect compilation error when using _mm_slli_epi32() where it complains with
+
+    "error: shift must be an immediate"
+
+Unfortuantely dr_flac depends on this for a few things so we're just going to disable SSE on GCC 4.2 and below.
+*/
+#if !defined(DR_FLAC_NO_SIMD)
+    #if defined(DRFLAC_X64) || defined(DRFLAC_X86)
+        #if defined(_MSC_VER) && !defined(__clang__)
+            /* MSVC. */
+            #if _MSC_VER >= 1400 && !defined(DRFLAC_NO_SSE2)    /* 2005 */
+                #define DRFLAC_SUPPORT_SSE2
+            #endif
+            #if _MSC_VER >= 1600 && !defined(DRFLAC_NO_SSE41)   /* 2010 */
+                #define DRFLAC_SUPPORT_SSE41
+            #endif
+        #elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
+            /* Assume GNUC-style. */
+            #if defined(__SSE2__) && !defined(DRFLAC_NO_SSE2)
+                #define DRFLAC_SUPPORT_SSE2
+            #endif
+            #if defined(__SSE4_1__) && !defined(DRFLAC_NO_SSE41)
+                #define DRFLAC_SUPPORT_SSE41
+            #endif
+        #endif
+
+        /* If at this point we still haven't determined compiler support for the intrinsics just fall back to __has_include. */
+        #if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
+            #if !defined(DRFLAC_SUPPORT_SSE2) && !defined(DRFLAC_NO_SSE2) && __has_include(<emmintrin.h>)
+                #define DRFLAC_SUPPORT_SSE2
+            #endif
+            #if !defined(DRFLAC_SUPPORT_SSE41) && !defined(DRFLAC_NO_SSE41) && __has_include(<smmintrin.h>)
+                #define DRFLAC_SUPPORT_SSE41
+            #endif
+        #endif
+
+        #if defined(DRFLAC_SUPPORT_SSE41)
+            #include <smmintrin.h>
+        #elif defined(DRFLAC_SUPPORT_SSE2)
+            #include <emmintrin.h>
+        #endif
+    #endif
+
+    #if defined(DRFLAC_ARM)
+        #if !defined(DRFLAC_NO_NEON) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
+            #define DRFLAC_SUPPORT_NEON
+            #include <arm_neon.h>
+        #endif
+    #endif
+#endif
+
+/* Compile-time CPU feature support. */
+#if !defined(DR_FLAC_NO_SIMD) && (defined(DRFLAC_X86) || defined(DRFLAC_X64))
+    #if defined(_MSC_VER) && !defined(__clang__)
+        #if _MSC_VER >= 1400
+            #include <intrin.h>
+            static void drflac__cpuid(int info[4], int fid)
+            {
+                __cpuid(info, fid);
+            }
+        #else
+            #define DRFLAC_NO_CPUID
+        #endif
+    #else
+        #if defined(__GNUC__) || defined(__clang__)
+            static void drflac__cpuid(int info[4], int fid)
+            {
+                /*
+                It looks like the -fPIC option uses the ebx register which GCC complains about. We can work around this by just using a different register, the
+                specific register of which I'm letting the compiler decide on. The "k" prefix is used to specify a 32-bit register. The {...} syntax is for
+                supporting different assembly dialects.
+
+                What's basically happening is that we're saving and restoring the ebx register manually.
+                */
+                #if defined(DRFLAC_X86) && defined(__PIC__)
+                    __asm__ __volatile__ (
+                        "xchg{l} {%%}ebx, %k1;"
+                        "cpuid;"
+                        "xchg{l} {%%}ebx, %k1;"
+                        : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(fid), "c"(0)
+                    );
+                #else
+                    __asm__ __volatile__ (
+                        "cpuid" : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(fid), "c"(0)
+                    );
+                #endif
+            }
+        #else
+            #define DRFLAC_NO_CPUID
+        #endif
+    #endif
+#else
+    #define DRFLAC_NO_CPUID
+#endif
+
+static DRFLAC_INLINE drflac_bool32 drflac_has_sse2(void)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    #if (defined(DRFLAC_X64) || defined(DRFLAC_X86)) && !defined(DRFLAC_NO_SSE2)
+        #if defined(DRFLAC_X64)
+            return DRFLAC_TRUE;    /* 64-bit targets always support SSE2. */
+        #elif (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)
+            return DRFLAC_TRUE;    /* If the compiler is allowed to freely generate SSE2 code we can assume support. */
+        #else
+            #if defined(DRFLAC_NO_CPUID)
+                return DRFLAC_FALSE;
+            #else
+                int info[4];
+                drflac__cpuid(info, 1);
+                return (info[3] & (1 << 26)) != 0;
+            #endif
+        #endif
+    #else
+        return DRFLAC_FALSE;       /* SSE2 is only supported on x86 and x64 architectures. */
+    #endif
+#else
+    return DRFLAC_FALSE;           /* No compiler support. */
+#endif
+}
+
+static DRFLAC_INLINE drflac_bool32 drflac_has_sse41(void)
+{
+#if defined(DRFLAC_SUPPORT_SSE41)
+    #if (defined(DRFLAC_X64) || defined(DRFLAC_X86)) && !defined(DRFLAC_NO_SSE41)
+        #if defined(__SSE4_1__) || defined(__AVX__)
+            return DRFLAC_TRUE;    /* If the compiler is allowed to freely generate SSE41 code we can assume support. */
+        #else
+            #if defined(DRFLAC_NO_CPUID)
+                return DRFLAC_FALSE;
+            #else
+                int info[4];
+                drflac__cpuid(info, 1);
+                return (info[2] & (1 << 19)) != 0;
+            #endif
+        #endif
+    #else
+        return DRFLAC_FALSE;       /* SSE41 is only supported on x86 and x64 architectures. */
+    #endif
+#else
+    return DRFLAC_FALSE;           /* No compiler support. */
+#endif
+}
+
+
+#if defined(_MSC_VER) && _MSC_VER >= 1500 && (defined(DRFLAC_X86) || defined(DRFLAC_X64)) && !defined(__clang__)
+    #define DRFLAC_HAS_LZCNT_INTRINSIC
+#elif (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
+    #define DRFLAC_HAS_LZCNT_INTRINSIC
+#elif defined(__clang__)
+    #if defined(__has_builtin)
+        #if __has_builtin(__builtin_clzll) || __has_builtin(__builtin_clzl)
+            #define DRFLAC_HAS_LZCNT_INTRINSIC
+        #endif
+    #endif
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400 && !defined(__clang__)
+    #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
+    #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
+    #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
+#elif defined(__clang__)
+    #if defined(__has_builtin)
+        #if __has_builtin(__builtin_bswap16)
+            #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
+        #endif
+        #if __has_builtin(__builtin_bswap32)
+            #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
+        #endif
+        #if __has_builtin(__builtin_bswap64)
+            #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
+        #endif
+    #endif
+#elif defined(__GNUC__)
+    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+        #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
+        #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
+    #endif
+    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+        #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
+    #endif
+#elif defined(__WATCOMC__) && defined(__386__)
+    #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
+    #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
+    #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
+    extern __inline drflac_uint16 _watcom_bswap16(drflac_uint16);
+    extern __inline drflac_uint32 _watcom_bswap32(drflac_uint32);
+    extern __inline drflac_uint64 _watcom_bswap64(drflac_uint64);
+#pragma aux _watcom_bswap16 = \
+    "xchg al, ah" \
+    parm  [ax]    \
+    value [ax]    \
+    modify nomemory;
+#pragma aux _watcom_bswap32 = \
+    "bswap eax" \
+    parm  [eax] \
+    value [eax] \
+    modify nomemory;
+#pragma aux _watcom_bswap64 = \
+    "bswap eax"     \
+    "bswap edx"     \
+    "xchg eax,edx"  \
+    parm [eax edx]  \
+    value [eax edx] \
+    modify nomemory;
+#endif
+
+
+/* Standard library stuff. */
+#ifndef DRFLAC_ASSERT
+#include <assert.h>
+#define DRFLAC_ASSERT(expression)           assert(expression)
+#endif
+#ifndef DRFLAC_MALLOC
+#define DRFLAC_MALLOC(sz)                   malloc((sz))
+#endif
+#ifndef DRFLAC_REALLOC
+#define DRFLAC_REALLOC(p, sz)               realloc((p), (sz))
+#endif
+#ifndef DRFLAC_FREE
+#define DRFLAC_FREE(p)                      free((p))
+#endif
+#ifndef DRFLAC_COPY_MEMORY
+#define DRFLAC_COPY_MEMORY(dst, src, sz)    memcpy((dst), (src), (sz))
+#endif
+#ifndef DRFLAC_ZERO_MEMORY
+#define DRFLAC_ZERO_MEMORY(p, sz)           memset((p), 0, (sz))
+#endif
+#ifndef DRFLAC_ZERO_OBJECT
+#define DRFLAC_ZERO_OBJECT(p)               DRFLAC_ZERO_MEMORY((p), sizeof(*(p)))
+#endif
+
+#define DRFLAC_MAX_SIMD_VECTOR_SIZE                     64  /* 64 for AVX-512 in the future. */
+
+/* Result Codes */
+typedef drflac_int32 drflac_result;
+#define DRFLAC_SUCCESS                                   0
+#define DRFLAC_ERROR                                    -1   /* A generic error. */
+#define DRFLAC_INVALID_ARGS                             -2
+#define DRFLAC_INVALID_OPERATION                        -3
+#define DRFLAC_OUT_OF_MEMORY                            -4
+#define DRFLAC_OUT_OF_RANGE                             -5
+#define DRFLAC_ACCESS_DENIED                            -6
+#define DRFLAC_DOES_NOT_EXIST                           -7
+#define DRFLAC_ALREADY_EXISTS                           -8
+#define DRFLAC_TOO_MANY_OPEN_FILES                      -9
+#define DRFLAC_INVALID_FILE                             -10
+#define DRFLAC_TOO_BIG                                  -11
+#define DRFLAC_PATH_TOO_LONG                            -12
+#define DRFLAC_NAME_TOO_LONG                            -13
+#define DRFLAC_NOT_DIRECTORY                            -14
+#define DRFLAC_IS_DIRECTORY                             -15
+#define DRFLAC_DIRECTORY_NOT_EMPTY                      -16
+#define DRFLAC_END_OF_FILE                              -17
+#define DRFLAC_NO_SPACE                                 -18
+#define DRFLAC_BUSY                                     -19
+#define DRFLAC_IO_ERROR                                 -20
+#define DRFLAC_INTERRUPT                                -21
+#define DRFLAC_UNAVAILABLE                              -22
+#define DRFLAC_ALREADY_IN_USE                           -23
+#define DRFLAC_BAD_ADDRESS                              -24
+#define DRFLAC_BAD_SEEK                                 -25
+#define DRFLAC_BAD_PIPE                                 -26
+#define DRFLAC_DEADLOCK                                 -27
+#define DRFLAC_TOO_MANY_LINKS                           -28
+#define DRFLAC_NOT_IMPLEMENTED                          -29
+#define DRFLAC_NO_MESSAGE                               -30
+#define DRFLAC_BAD_MESSAGE                              -31
+#define DRFLAC_NO_DATA_AVAILABLE                        -32
+#define DRFLAC_INVALID_DATA                             -33
+#define DRFLAC_TIMEOUT                                  -34
+#define DRFLAC_NO_NETWORK                               -35
+#define DRFLAC_NOT_UNIQUE                               -36
+#define DRFLAC_NOT_SOCKET                               -37
+#define DRFLAC_NO_ADDRESS                               -38
+#define DRFLAC_BAD_PROTOCOL                             -39
+#define DRFLAC_PROTOCOL_UNAVAILABLE                     -40
+#define DRFLAC_PROTOCOL_NOT_SUPPORTED                   -41
+#define DRFLAC_PROTOCOL_FAMILY_NOT_SUPPORTED            -42
+#define DRFLAC_ADDRESS_FAMILY_NOT_SUPPORTED             -43
+#define DRFLAC_SOCKET_NOT_SUPPORTED                     -44
+#define DRFLAC_CONNECTION_RESET                         -45
+#define DRFLAC_ALREADY_CONNECTED                        -46
+#define DRFLAC_NOT_CONNECTED                            -47
+#define DRFLAC_CONNECTION_REFUSED                       -48
+#define DRFLAC_NO_HOST                                  -49
+#define DRFLAC_IN_PROGRESS                              -50
+#define DRFLAC_CANCELLED                                -51
+#define DRFLAC_MEMORY_ALREADY_MAPPED                    -52
+#define DRFLAC_AT_END                                   -53
+
+#define DRFLAC_CRC_MISMATCH                             -100
+/* End Result Codes */
+
+
+#define DRFLAC_SUBFRAME_CONSTANT                        0
+#define DRFLAC_SUBFRAME_VERBATIM                        1
+#define DRFLAC_SUBFRAME_FIXED                           8
+#define DRFLAC_SUBFRAME_LPC                             32
+#define DRFLAC_SUBFRAME_RESERVED                        255
+
+#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE  0
+#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2 1
+
+#define DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT           0
+#define DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE             8
+#define DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE            9
+#define DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE              10
+
+#define DRFLAC_SEEKPOINT_SIZE_IN_BYTES                  18
+#define DRFLAC_CUESHEET_TRACK_SIZE_IN_BYTES             36
+#define DRFLAC_CUESHEET_TRACK_INDEX_SIZE_IN_BYTES       12
+
+#define drflac_align(x, a)                              ((((x) + (a) - 1) / (a)) * (a))
+
+
+DRFLAC_API void drflac_version(drflac_uint32* pMajor, drflac_uint32* pMinor, drflac_uint32* pRevision)
+{
+    if (pMajor) {
+        *pMajor = DRFLAC_VERSION_MAJOR;
+    }
+
+    if (pMinor) {
+        *pMinor = DRFLAC_VERSION_MINOR;
+    }
+
+    if (pRevision) {
+        *pRevision = DRFLAC_VERSION_REVISION;
+    }
+}
+
+DRFLAC_API const char* drflac_version_string(void)
+{
+    return DRFLAC_VERSION_STRING;
+}
+
+
+/* CPU caps. */
+#if defined(__has_feature)
+    #if __has_feature(thread_sanitizer)
+        #define DRFLAC_NO_THREAD_SANITIZE __attribute__((no_sanitize("thread")))
+    #else
+        #define DRFLAC_NO_THREAD_SANITIZE
+    #endif
+#else
+    #define DRFLAC_NO_THREAD_SANITIZE
+#endif
+
+#if defined(DRFLAC_HAS_LZCNT_INTRINSIC)
+static drflac_bool32 drflac__gIsLZCNTSupported = DRFLAC_FALSE;
+#endif
+
+#ifndef DRFLAC_NO_CPUID
+static drflac_bool32 drflac__gIsSSE2Supported  = DRFLAC_FALSE;
+static drflac_bool32 drflac__gIsSSE41Supported = DRFLAC_FALSE;
+
+/*
+I've had a bug report that Clang's ThreadSanitizer presents a warning in this function. Having reviewed this, this does
+actually make sense. However, since CPU caps should never differ for a running process, I don't think the trade off of
+complicating internal API's by passing around CPU caps versus just disabling the warnings is worthwhile. I'm therefore
+just going to disable these warnings. This is disabled via the DRFLAC_NO_THREAD_SANITIZE attribute.
+*/
+DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps(void)
+{
+    static drflac_bool32 isCPUCapsInitialized = DRFLAC_FALSE;
+
+    if (!isCPUCapsInitialized) {
+        /* LZCNT */
+#if defined(DRFLAC_HAS_LZCNT_INTRINSIC)
+        int info[4] = {0};
+        drflac__cpuid(info, 0x80000001);
+        drflac__gIsLZCNTSupported = (info[2] & (1 << 5)) != 0;
+#endif
+
+        /* SSE2 */
+        drflac__gIsSSE2Supported = drflac_has_sse2();
+
+        /* SSE4.1 */
+        drflac__gIsSSE41Supported = drflac_has_sse41();
+
+        /* Initialized. */
+        isCPUCapsInitialized = DRFLAC_TRUE;
+    }
+}
+#else
+static drflac_bool32 drflac__gIsNEONSupported  = DRFLAC_FALSE;
+
+static DRFLAC_INLINE drflac_bool32 drflac__has_neon(void)
+{
+#if defined(DRFLAC_SUPPORT_NEON)
+    #if defined(DRFLAC_ARM) && !defined(DRFLAC_NO_NEON)
+        #if (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
+            return DRFLAC_TRUE;    /* If the compiler is allowed to freely generate NEON code we can assume support. */
+        #else
+            /* TODO: Runtime check. */
+            return DRFLAC_FALSE;
+        #endif
+    #else
+        return DRFLAC_FALSE;       /* NEON is only supported on ARM architectures. */
+    #endif
+#else
+    return DRFLAC_FALSE;           /* No compiler support. */
+#endif
+}
+
+DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps(void)
+{
+    drflac__gIsNEONSupported = drflac__has_neon();
+
+#if defined(DRFLAC_HAS_LZCNT_INTRINSIC) && defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5)
+    drflac__gIsLZCNTSupported = DRFLAC_TRUE;
+#endif
+}
+#endif
+
+
+/* Endian Management */
+static DRFLAC_INLINE drflac_bool32 drflac__is_little_endian(void)
+{
+#if defined(DRFLAC_X86) || defined(DRFLAC_X64)
+    return DRFLAC_TRUE;
+#elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN
+    return DRFLAC_TRUE;
+#else
+    int n = 1;
+    return (*(char*)&n) == 1;
+#endif
+}
+
+static DRFLAC_INLINE drflac_uint16 drflac__swap_endian_uint16(drflac_uint16 n)
+{
+#ifdef DRFLAC_HAS_BYTESWAP16_INTRINSIC
+    #if defined(_MSC_VER) && !defined(__clang__)
+        return _byteswap_ushort(n);
+    #elif defined(__GNUC__) || defined(__clang__)
+        return __builtin_bswap16(n);
+    #elif defined(__WATCOMC__) && defined(__386__)
+        return _watcom_bswap16(n);
+    #else
+        #error "This compiler does not support the byte swap intrinsic."
+    #endif
+#else
+    return ((n & 0xFF00) >> 8) |
+           ((n & 0x00FF) << 8);
+#endif
+}
+
+static DRFLAC_INLINE drflac_uint32 drflac__swap_endian_uint32(drflac_uint32 n)
+{
+#ifdef DRFLAC_HAS_BYTESWAP32_INTRINSIC
+    #if defined(_MSC_VER) && !defined(__clang__)
+        return _byteswap_ulong(n);
+    #elif defined(__GNUC__) || defined(__clang__)
+        #if defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 6) && !defined(__ARM_ARCH_6M__) && !defined(DRFLAC_64BIT)   /* <-- 64-bit inline assembly has not been tested, so disabling for now. */
+            /* Inline assembly optimized implementation for ARM. In my testing, GCC does not generate optimized code with __builtin_bswap32(). */
+            drflac_uint32 r;
+            __asm__ __volatile__ (
+            #if defined(DRFLAC_64BIT)
+                "rev %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(n)   /* <-- This is untested. If someone in the community could test this, that would be appreciated! */
+            #else
+                "rev %[out], %[in]" : [out]"=r"(r) : [in]"r"(n)
+            #endif
+            );
+            return r;
+        #else
+            return __builtin_bswap32(n);
+        #endif
+    #elif defined(__WATCOMC__) && defined(__386__)
+        return _watcom_bswap32(n);
+    #else
+        #error "This compiler does not support the byte swap intrinsic."
+    #endif
+#else
+    return ((n & 0xFF000000) >> 24) |
+           ((n & 0x00FF0000) >>  8) |
+           ((n & 0x0000FF00) <<  8) |
+           ((n & 0x000000FF) << 24);
+#endif
+}
+
+static DRFLAC_INLINE drflac_uint64 drflac__swap_endian_uint64(drflac_uint64 n)
+{
+#ifdef DRFLAC_HAS_BYTESWAP64_INTRINSIC
+    #if defined(_MSC_VER) && !defined(__clang__)
+        return _byteswap_uint64(n);
+    #elif defined(__GNUC__) || defined(__clang__)
+        return __builtin_bswap64(n);
+    #elif defined(__WATCOMC__) && defined(__386__)
+        return _watcom_bswap64(n);
+    #else
+        #error "This compiler does not support the byte swap intrinsic."
+    #endif
+#else
+    /* Weird "<< 32" bitshift is required for C89 because it doesn't support 64-bit constants. Should be optimized out by a good compiler. */
+    return ((n & ((drflac_uint64)0xFF000000 << 32)) >> 56) |
+           ((n & ((drflac_uint64)0x00FF0000 << 32)) >> 40) |
+           ((n & ((drflac_uint64)0x0000FF00 << 32)) >> 24) |
+           ((n & ((drflac_uint64)0x000000FF << 32)) >>  8) |
+           ((n & ((drflac_uint64)0xFF000000      )) <<  8) |
+           ((n & ((drflac_uint64)0x00FF0000      )) << 24) |
+           ((n & ((drflac_uint64)0x0000FF00      )) << 40) |
+           ((n & ((drflac_uint64)0x000000FF      )) << 56);
+#endif
+}
+
+
+static DRFLAC_INLINE drflac_uint16 drflac__be2host_16(drflac_uint16 n)
+{
+    if (drflac__is_little_endian()) {
+        return drflac__swap_endian_uint16(n);
+    }
+
+    return n;
+}
+
+static DRFLAC_INLINE drflac_uint32 drflac__be2host_32(drflac_uint32 n)
+{
+    if (drflac__is_little_endian()) {
+        return drflac__swap_endian_uint32(n);
+    }
+
+    return n;
+}
+
+static DRFLAC_INLINE drflac_uint32 drflac__be2host_32_ptr_unaligned(const void* pData)
+{
+    const drflac_uint8* pNum = (drflac_uint8*)pData;
+    return *(pNum) << 24 | *(pNum+1) << 16 | *(pNum+2) << 8 | *(pNum+3);
+}
+
+static DRFLAC_INLINE drflac_uint64 drflac__be2host_64(drflac_uint64 n)
+{
+    if (drflac__is_little_endian()) {
+        return drflac__swap_endian_uint64(n);
+    }
+
+    return n;
+}
+
+
+static DRFLAC_INLINE drflac_uint32 drflac__le2host_32(drflac_uint32 n)
+{
+    if (!drflac__is_little_endian()) {
+        return drflac__swap_endian_uint32(n);
+    }
+
+    return n;
+}
+
+static DRFLAC_INLINE drflac_uint32 drflac__le2host_32_ptr_unaligned(const void* pData)
+{
+    const drflac_uint8* pNum = (drflac_uint8*)pData;
+    return *pNum | *(pNum+1) << 8 |  *(pNum+2) << 16 | *(pNum+3) << 24;
+}
+
+
+static DRFLAC_INLINE drflac_uint32 drflac__unsynchsafe_32(drflac_uint32 n)
+{
+    drflac_uint32 result = 0;
+    result |= (n & 0x7F000000) >> 3;
+    result |= (n & 0x007F0000) >> 2;
+    result |= (n & 0x00007F00) >> 1;
+    result |= (n & 0x0000007F) >> 0;
+
+    return result;
+}
+
+
+
+/* The CRC code below is based on this document: http://zlib.net/crc_v3.txt */
+static drflac_uint8 drflac__crc8_table[] = {
+    0x00, 0x07, 0x0E, 0x09, 0x1C, 0x1B, 0x12, 0x15, 0x38, 0x3F, 0x36, 0x31, 0x24, 0x23, 0x2A, 0x2D,
+    0x70, 0x77, 0x7E, 0x79, 0x6C, 0x6B, 0x62, 0x65, 0x48, 0x4F, 0x46, 0x41, 0x54, 0x53, 0x5A, 0x5D,
+    0xE0, 0xE7, 0xEE, 0xE9, 0xFC, 0xFB, 0xF2, 0xF5, 0xD8, 0xDF, 0xD6, 0xD1, 0xC4, 0xC3, 0xCA, 0xCD,
+    0x90, 0x97, 0x9E, 0x99, 0x8C, 0x8B, 0x82, 0x85, 0xA8, 0xAF, 0xA6, 0xA1, 0xB4, 0xB3, 0xBA, 0xBD,
+    0xC7, 0xC0, 0xC9, 0xCE, 0xDB, 0xDC, 0xD5, 0xD2, 0xFF, 0xF8, 0xF1, 0xF6, 0xE3, 0xE4, 0xED, 0xEA,
+    0xB7, 0xB0, 0xB9, 0xBE, 0xAB, 0xAC, 0xA5, 0xA2, 0x8F, 0x88, 0x81, 0x86, 0x93, 0x94, 0x9D, 0x9A,
+    0x27, 0x20, 0x29, 0x2E, 0x3B, 0x3C, 0x35, 0x32, 0x1F, 0x18, 0x11, 0x16, 0x03, 0x04, 0x0D, 0x0A,
+    0x57, 0x50, 0x59, 0x5E, 0x4B, 0x4C, 0x45, 0x42, 0x6F, 0x68, 0x61, 0x66, 0x73, 0x74, 0x7D, 0x7A,
+    0x89, 0x8E, 0x87, 0x80, 0x95, 0x92, 0x9B, 0x9C, 0xB1, 0xB6, 0xBF, 0xB8, 0xAD, 0xAA, 0xA3, 0xA4,
+    0xF9, 0xFE, 0xF7, 0xF0, 0xE5, 0xE2, 0xEB, 0xEC, 0xC1, 0xC6, 0xCF, 0xC8, 0xDD, 0xDA, 0xD3, 0xD4,
+    0x69, 0x6E, 0x67, 0x60, 0x75, 0x72, 0x7B, 0x7C, 0x51, 0x56, 0x5F, 0x58, 0x4D, 0x4A, 0x43, 0x44,
+    0x19, 0x1E, 0x17, 0x10, 0x05, 0x02, 0x0B, 0x0C, 0x21, 0x26, 0x2F, 0x28, 0x3D, 0x3A, 0x33, 0x34,
+    0x4E, 0x49, 0x40, 0x47, 0x52, 0x55, 0x5C, 0x5B, 0x76, 0x71, 0x78, 0x7F, 0x6A, 0x6D, 0x64, 0x63,
+    0x3E, 0x39, 0x30, 0x37, 0x22, 0x25, 0x2C, 0x2B, 0x06, 0x01, 0x08, 0x0F, 0x1A, 0x1D, 0x14, 0x13,
+    0xAE, 0xA9, 0xA0, 0xA7, 0xB2, 0xB5, 0xBC, 0xBB, 0x96, 0x91, 0x98, 0x9F, 0x8A, 0x8D, 0x84, 0x83,
+    0xDE, 0xD9, 0xD0, 0xD7, 0xC2, 0xC5, 0xCC, 0xCB, 0xE6, 0xE1, 0xE8, 0xEF, 0xFA, 0xFD, 0xF4, 0xF3
+};
+
+static drflac_uint16 drflac__crc16_table[] = {
+    0x0000, 0x8005, 0x800F, 0x000A, 0x801B, 0x001E, 0x0014, 0x8011,
+    0x8033, 0x0036, 0x003C, 0x8039, 0x0028, 0x802D, 0x8027, 0x0022,
+    0x8063, 0x0066, 0x006C, 0x8069, 0x0078, 0x807D, 0x8077, 0x0072,
+    0x0050, 0x8055, 0x805F, 0x005A, 0x804B, 0x004E, 0x0044, 0x8041,
+    0x80C3, 0x00C6, 0x00CC, 0x80C9, 0x00D8, 0x80DD, 0x80D7, 0x00D2,
+    0x00F0, 0x80F5, 0x80FF, 0x00FA, 0x80EB, 0x00EE, 0x00E4, 0x80E1,
+    0x00A0, 0x80A5, 0x80AF, 0x00AA, 0x80BB, 0x00BE, 0x00B4, 0x80B1,
+    0x8093, 0x0096, 0x009C, 0x8099, 0x0088, 0x808D, 0x8087, 0x0082,
+    0x8183, 0x0186, 0x018C, 0x8189, 0x0198, 0x819D, 0x8197, 0x0192,
+    0x01B0, 0x81B5, 0x81BF, 0x01BA, 0x81AB, 0x01AE, 0x01A4, 0x81A1,
+    0x01E0, 0x81E5, 0x81EF, 0x01EA, 0x81FB, 0x01FE, 0x01F4, 0x81F1,
+    0x81D3, 0x01D6, 0x01DC, 0x81D9, 0x01C8, 0x81CD, 0x81C7, 0x01C2,
+    0x0140, 0x8145, 0x814F, 0x014A, 0x815B, 0x015E, 0x0154, 0x8151,
+    0x8173, 0x0176, 0x017C, 0x8179, 0x0168, 0x816D, 0x8167, 0x0162,
+    0x8123, 0x0126, 0x012C, 0x8129, 0x0138, 0x813D, 0x8137, 0x0132,
+    0x0110, 0x8115, 0x811F, 0x011A, 0x810B, 0x010E, 0x0104, 0x8101,
+    0x8303, 0x0306, 0x030C, 0x8309, 0x0318, 0x831D, 0x8317, 0x0312,
+    0x0330, 0x8335, 0x833F, 0x033A, 0x832B, 0x032E, 0x0324, 0x8321,
+    0x0360, 0x8365, 0x836F, 0x036A, 0x837B, 0x037E, 0x0374, 0x8371,
+    0x8353, 0x0356, 0x035C, 0x8359, 0x0348, 0x834D, 0x8347, 0x0342,
+    0x03C0, 0x83C5, 0x83CF, 0x03CA, 0x83DB, 0x03DE, 0x03D4, 0x83D1,
+    0x83F3, 0x03F6, 0x03FC, 0x83F9, 0x03E8, 0x83ED, 0x83E7, 0x03E2,
+    0x83A3, 0x03A6, 0x03AC, 0x83A9, 0x03B8, 0x83BD, 0x83B7, 0x03B2,
+    0x0390, 0x8395, 0x839F, 0x039A, 0x838B, 0x038E, 0x0384, 0x8381,
+    0x0280, 0x8285, 0x828F, 0x028A, 0x829B, 0x029E, 0x0294, 0x8291,
+    0x82B3, 0x02B6, 0x02BC, 0x82B9, 0x02A8, 0x82AD, 0x82A7, 0x02A2,
+    0x82E3, 0x02E6, 0x02EC, 0x82E9, 0x02F8, 0x82FD, 0x82F7, 0x02F2,
+    0x02D0, 0x82D5, 0x82DF, 0x02DA, 0x82CB, 0x02CE, 0x02C4, 0x82C1,
+    0x8243, 0x0246, 0x024C, 0x8249, 0x0258, 0x825D, 0x8257, 0x0252,
+    0x0270, 0x8275, 0x827F, 0x027A, 0x826B, 0x026E, 0x0264, 0x8261,
+    0x0220, 0x8225, 0x822F, 0x022A, 0x823B, 0x023E, 0x0234, 0x8231,
+    0x8213, 0x0216, 0x021C, 0x8219, 0x0208, 0x820D, 0x8207, 0x0202
+};
+
+static DRFLAC_INLINE drflac_uint8 drflac_crc8_byte(drflac_uint8 crc, drflac_uint8 data)
+{
+    return drflac__crc8_table[crc ^ data];
+}
+
+static DRFLAC_INLINE drflac_uint8 drflac_crc8(drflac_uint8 crc, drflac_uint32 data, drflac_uint32 count)
+{
+#ifdef DR_FLAC_NO_CRC
+    (void)crc;
+    (void)data;
+    (void)count;
+    return 0;
+#else
+#if 0
+    /* REFERENCE (use of this implementation requires an explicit flush by doing "drflac_crc8(crc, 0, 8);") */
+    drflac_uint8 p = 0x07;
+    for (int i = count-1; i >= 0; --i) {
+        drflac_uint8 bit = (data & (1 << i)) >> i;
+        if (crc & 0x80) {
+            crc = ((crc << 1) | bit) ^ p;
+        } else {
+            crc = ((crc << 1) | bit);
+        }
+    }
+    return crc;
+#else
+    drflac_uint32 wholeBytes;
+    drflac_uint32 leftoverBits;
+    drflac_uint64 leftoverDataMask;
+
+    static drflac_uint64 leftoverDataMaskTable[8] = {
+        0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F
+    };
+
+    DRFLAC_ASSERT(count <= 32);
+
+    wholeBytes = count >> 3;
+    leftoverBits = count - (wholeBytes*8);
+    leftoverDataMask = leftoverDataMaskTable[leftoverBits];
+
+    switch (wholeBytes) {
+        case 4: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0xFF000000UL << leftoverBits)) >> (24 + leftoverBits)));
+        case 3: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0x00FF0000UL << leftoverBits)) >> (16 + leftoverBits)));
+        case 2: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0x0000FF00UL << leftoverBits)) >> ( 8 + leftoverBits)));
+        case 1: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0x000000FFUL << leftoverBits)) >> ( 0 + leftoverBits)));
+        case 0: if (leftoverBits > 0) crc = (drflac_uint8)((crc << leftoverBits) ^ drflac__crc8_table[(crc >> (8 - leftoverBits)) ^ (data & leftoverDataMask)]);
+    }
+    return crc;
+#endif
+#endif
+}
+
+static DRFLAC_INLINE drflac_uint16 drflac_crc16_byte(drflac_uint16 crc, drflac_uint8 data)
+{
+    return (crc << 8) ^ drflac__crc16_table[(drflac_uint8)(crc >> 8) ^ data];
+}
+
+static DRFLAC_INLINE drflac_uint16 drflac_crc16_cache(drflac_uint16 crc, drflac_cache_t data)
+{
+#ifdef DRFLAC_64BIT
+    crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 56) & 0xFF));
+    crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 48) & 0xFF));
+    crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 40) & 0xFF));
+    crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 32) & 0xFF));
+#endif
+    crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 24) & 0xFF));
+    crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 16) & 0xFF));
+    crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  8) & 0xFF));
+    crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  0) & 0xFF));
+
+    return crc;
+}
+
+static DRFLAC_INLINE drflac_uint16 drflac_crc16_bytes(drflac_uint16 crc, drflac_cache_t data, drflac_uint32 byteCount)
+{
+    switch (byteCount)
+    {
+#ifdef DRFLAC_64BIT
+    case 8: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 56) & 0xFF));
+    case 7: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 48) & 0xFF));
+    case 6: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 40) & 0xFF));
+    case 5: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 32) & 0xFF));
+#endif
+    case 4: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 24) & 0xFF));
+    case 3: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 16) & 0xFF));
+    case 2: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  8) & 0xFF));
+    case 1: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  0) & 0xFF));
+    }
+
+    return crc;
+}
+
+#if 0
+static DRFLAC_INLINE drflac_uint16 drflac_crc16__32bit(drflac_uint16 crc, drflac_uint32 data, drflac_uint32 count)
+{
+#ifdef DR_FLAC_NO_CRC
+    (void)crc;
+    (void)data;
+    (void)count;
+    return 0;
+#else
+#if 0
+    /* REFERENCE (use of this implementation requires an explicit flush by doing "drflac_crc16(crc, 0, 16);") */
+    drflac_uint16 p = 0x8005;
+    for (int i = count-1; i >= 0; --i) {
+        drflac_uint16 bit = (data & (1ULL << i)) >> i;
+        if (r & 0x8000) {
+            r = ((r << 1) | bit) ^ p;
+        } else {
+            r = ((r << 1) | bit);
+        }
+    }
+
+    return crc;
+#else
+    drflac_uint32 wholeBytes;
+    drflac_uint32 leftoverBits;
+    drflac_uint64 leftoverDataMask;
+
+    static drflac_uint64 leftoverDataMaskTable[8] = {
+        0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F
+    };
+
+    DRFLAC_ASSERT(count <= 64);
+
+    wholeBytes = count >> 3;
+    leftoverBits = count & 7;
+    leftoverDataMask = leftoverDataMaskTable[leftoverBits];
+
+    switch (wholeBytes) {
+        default:
+        case 4: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (0xFF000000UL << leftoverBits)) >> (24 + leftoverBits)));
+        case 3: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (0x00FF0000UL << leftoverBits)) >> (16 + leftoverBits)));
+        case 2: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (0x0000FF00UL << leftoverBits)) >> ( 8 + leftoverBits)));
+        case 1: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (0x000000FFUL << leftoverBits)) >> ( 0 + leftoverBits)));
+        case 0: if (leftoverBits > 0) crc = (crc << leftoverBits) ^ drflac__crc16_table[(crc >> (16 - leftoverBits)) ^ (data & leftoverDataMask)];
+    }
+    return crc;
+#endif
+#endif
+}
+
+static DRFLAC_INLINE drflac_uint16 drflac_crc16__64bit(drflac_uint16 crc, drflac_uint64 data, drflac_uint32 count)
+{
+#ifdef DR_FLAC_NO_CRC
+    (void)crc;
+    (void)data;
+    (void)count;
+    return 0;
+#else
+    drflac_uint32 wholeBytes;
+    drflac_uint32 leftoverBits;
+    drflac_uint64 leftoverDataMask;
+
+    static drflac_uint64 leftoverDataMaskTable[8] = {
+        0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F
+    };
+
+    DRFLAC_ASSERT(count <= 64);
+
+    wholeBytes = count >> 3;
+    leftoverBits = count & 7;
+    leftoverDataMask = leftoverDataMaskTable[leftoverBits];
+
+    switch (wholeBytes) {
+        default:
+        case 8: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0xFF000000 << 32) << leftoverBits)) >> (56 + leftoverBits)));    /* Weird "<< 32" bitshift is required for C89 because it doesn't support 64-bit constants. Should be optimized out by a good compiler. */
+        case 7: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x00FF0000 << 32) << leftoverBits)) >> (48 + leftoverBits)));
+        case 6: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x0000FF00 << 32) << leftoverBits)) >> (40 + leftoverBits)));
+        case 5: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x000000FF << 32) << leftoverBits)) >> (32 + leftoverBits)));
+        case 4: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0xFF000000      ) << leftoverBits)) >> (24 + leftoverBits)));
+        case 3: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x00FF0000      ) << leftoverBits)) >> (16 + leftoverBits)));
+        case 2: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x0000FF00      ) << leftoverBits)) >> ( 8 + leftoverBits)));
+        case 1: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x000000FF      ) << leftoverBits)) >> ( 0 + leftoverBits)));
+        case 0: if (leftoverBits > 0) crc = (crc << leftoverBits) ^ drflac__crc16_table[(crc >> (16 - leftoverBits)) ^ (data & leftoverDataMask)];
+    }
+    return crc;
+#endif
+}
+
+
+static DRFLAC_INLINE drflac_uint16 drflac_crc16(drflac_uint16 crc, drflac_cache_t data, drflac_uint32 count)
+{
+#ifdef DRFLAC_64BIT
+    return drflac_crc16__64bit(crc, data, count);
+#else
+    return drflac_crc16__32bit(crc, data, count);
+#endif
+}
+#endif
+
+
+#ifdef DRFLAC_64BIT
+#define drflac__be2host__cache_line drflac__be2host_64
+#else
+#define drflac__be2host__cache_line drflac__be2host_32
+#endif
+
+/*
+BIT READING ATTEMPT #2
+
+This uses a 32- or 64-bit bit-shifted cache - as bits are read, the cache is shifted such that the first valid bit is sitting
+on the most significant bit. It uses the notion of an L1 and L2 cache (borrowed from CPU architecture), where the L1 cache
+is a 32- or 64-bit unsigned integer (depending on whether or not a 32- or 64-bit build is being compiled) and the L2 is an
+array of "cache lines", with each cache line being the same size as the L1. The L2 is a buffer of about 4KB and is where data
+from onRead() is read into.
+*/
+#define DRFLAC_CACHE_L1_SIZE_BYTES(bs)                      (sizeof((bs)->cache))
+#define DRFLAC_CACHE_L1_SIZE_BITS(bs)                       (sizeof((bs)->cache)*8)
+#define DRFLAC_CACHE_L1_BITS_REMAINING(bs)                  (DRFLAC_CACHE_L1_SIZE_BITS(bs) - (bs)->consumedBits)
+#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)           (~((~(drflac_cache_t)0) >> (_bitCount)))
+#define DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, _bitCount)      (DRFLAC_CACHE_L1_SIZE_BITS(bs) - (_bitCount))
+#define DRFLAC_CACHE_L1_SELECT(bs, _bitCount)               (((bs)->cache) & DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount))
+#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, _bitCount)     (DRFLAC_CACHE_L1_SELECT((bs), (_bitCount)) >>  DRFLAC_CACHE_L1_SELECTION_SHIFT((bs), (_bitCount)))
+#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT_SAFE(bs, _bitCount)(DRFLAC_CACHE_L1_SELECT((bs), (_bitCount)) >> (DRFLAC_CACHE_L1_SELECTION_SHIFT((bs), (_bitCount)) & (DRFLAC_CACHE_L1_SIZE_BITS(bs)-1)))
+#define DRFLAC_CACHE_L2_SIZE_BYTES(bs)                      (sizeof((bs)->cacheL2))
+#define DRFLAC_CACHE_L2_LINE_COUNT(bs)                      (DRFLAC_CACHE_L2_SIZE_BYTES(bs) / sizeof((bs)->cacheL2[0]))
+#define DRFLAC_CACHE_L2_LINES_REMAINING(bs)                 (DRFLAC_CACHE_L2_LINE_COUNT(bs) - (bs)->nextL2Line)
+
+
+#ifndef DR_FLAC_NO_CRC
+static DRFLAC_INLINE void drflac__reset_crc16(drflac_bs* bs)
+{
+    bs->crc16 = 0;
+    bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
+}
+
+static DRFLAC_INLINE void drflac__update_crc16(drflac_bs* bs)
+{
+    if (bs->crc16CacheIgnoredBytes == 0) {
+        bs->crc16 = drflac_crc16_cache(bs->crc16, bs->crc16Cache);
+    } else {
+        bs->crc16 = drflac_crc16_bytes(bs->crc16, bs->crc16Cache, DRFLAC_CACHE_L1_SIZE_BYTES(bs) - bs->crc16CacheIgnoredBytes);
+        bs->crc16CacheIgnoredBytes = 0;
+    }
+}
+
+static DRFLAC_INLINE drflac_uint16 drflac__flush_crc16(drflac_bs* bs)
+{
+    /* We should never be flushing in a situation where we are not aligned on a byte boundary. */
+    DRFLAC_ASSERT((DRFLAC_CACHE_L1_BITS_REMAINING(bs) & 7) == 0);
+
+    /*
+    The bits that were read from the L1 cache need to be accumulated. The number of bytes needing to be accumulated is determined
+    by the number of bits that have been consumed.
+    */
+    if (DRFLAC_CACHE_L1_BITS_REMAINING(bs) == 0) {
+        drflac__update_crc16(bs);
+    } else {
+        /* We only accumulate the consumed bits. */
+        bs->crc16 = drflac_crc16_bytes(bs->crc16, bs->crc16Cache >> DRFLAC_CACHE_L1_BITS_REMAINING(bs), (bs->consumedBits >> 3) - bs->crc16CacheIgnoredBytes);
+
+        /*
+        The bits that we just accumulated should never be accumulated again. We need to keep track of how many bytes were accumulated
+        so we can handle that later.
+        */
+        bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
+    }
+
+    return bs->crc16;
+}
+#endif
+
+static DRFLAC_INLINE drflac_bool32 drflac__reload_l1_cache_from_l2(drflac_bs* bs)
+{
+    size_t bytesRead;
+    size_t alignedL1LineCount;
+
+    /* Fast path. Try loading straight from L2. */
+    if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+        bs->cache = bs->cacheL2[bs->nextL2Line++];
+        return DRFLAC_TRUE;
+    }
+
+    /*
+    If we get here it means we've run out of data in the L2 cache. We'll need to fetch more from the client, if there's
+    any left.
+    */
+    if (bs->unalignedByteCount > 0) {
+        return DRFLAC_FALSE;   /* If we have any unaligned bytes it means there's no more aligned bytes left in the client. */
+    }
+
+    bytesRead = bs->onRead(bs->pUserData, bs->cacheL2, DRFLAC_CACHE_L2_SIZE_BYTES(bs));
+
+    bs->nextL2Line = 0;
+    if (bytesRead == DRFLAC_CACHE_L2_SIZE_BYTES(bs)) {
+        bs->cache = bs->cacheL2[bs->nextL2Line++];
+        return DRFLAC_TRUE;
+    }
+
+
+    /*
+    If we get here it means we were unable to retrieve enough data to fill the entire L2 cache. It probably
+    means we've just reached the end of the file. We need to move the valid data down to the end of the buffer
+    and adjust the index of the next line accordingly. Also keep in mind that the L2 cache must be aligned to
+    the size of the L1 so we'll need to seek backwards by any misaligned bytes.
+    */
+    alignedL1LineCount = bytesRead / DRFLAC_CACHE_L1_SIZE_BYTES(bs);
+
+    /* We need to keep track of any unaligned bytes for later use. */
+    bs->unalignedByteCount = bytesRead - (alignedL1LineCount * DRFLAC_CACHE_L1_SIZE_BYTES(bs));
+    if (bs->unalignedByteCount > 0) {
+        bs->unalignedCache = bs->cacheL2[alignedL1LineCount];
+    }
+
+    if (alignedL1LineCount > 0) {
+        size_t offset = DRFLAC_CACHE_L2_LINE_COUNT(bs) - alignedL1LineCount;
+        size_t i;
+        for (i = alignedL1LineCount; i > 0; --i) {
+            bs->cacheL2[i-1 + offset] = bs->cacheL2[i-1];
+        }
+
+        bs->nextL2Line = (drflac_uint32)offset;
+        bs->cache = bs->cacheL2[bs->nextL2Line++];
+        return DRFLAC_TRUE;
+    } else {
+        /* If we get into this branch it means we weren't able to load any L1-aligned data. */
+        bs->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT(bs);
+        return DRFLAC_FALSE;
+    }
+}
+
+static drflac_bool32 drflac__reload_cache(drflac_bs* bs)
+{
+    size_t bytesRead;
+
+#ifndef DR_FLAC_NO_CRC
+    drflac__update_crc16(bs);
+#endif
+
+    /* Fast path. Try just moving the next value in the L2 cache to the L1 cache. */
+    if (drflac__reload_l1_cache_from_l2(bs)) {
+        bs->cache = drflac__be2host__cache_line(bs->cache);
+        bs->consumedBits = 0;
+#ifndef DR_FLAC_NO_CRC
+        bs->crc16Cache = bs->cache;
+#endif
+        return DRFLAC_TRUE;
+    }
+
+    /* Slow path. */
+
+    /*
+    If we get here it means we have failed to load the L1 cache from the L2. Likely we've just reached the end of the stream and the last
+    few bytes did not meet the alignment requirements for the L2 cache. In this case we need to fall back to a slower path and read the
+    data from the unaligned cache.
+    */
+    bytesRead = bs->unalignedByteCount;
+    if (bytesRead == 0) {
+        bs->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs);   /* <-- The stream has been exhausted, so marked the bits as consumed. */
+        return DRFLAC_FALSE;
+    }
+
+    DRFLAC_ASSERT(bytesRead < DRFLAC_CACHE_L1_SIZE_BYTES(bs));
+    bs->consumedBits = (drflac_uint32)(DRFLAC_CACHE_L1_SIZE_BYTES(bs) - bytesRead) * 8;
+
+    bs->cache = drflac__be2host__cache_line(bs->unalignedCache);
+    bs->cache &= DRFLAC_CACHE_L1_SELECTION_MASK(DRFLAC_CACHE_L1_BITS_REMAINING(bs));    /* <-- Make sure the consumed bits are always set to zero. Other parts of the library depend on this property. */
+    bs->unalignedByteCount = 0;     /* <-- At this point the unaligned bytes have been moved into the cache and we thus have no more unaligned bytes. */
+
+#ifndef DR_FLAC_NO_CRC
+    bs->crc16Cache = bs->cache >> bs->consumedBits;
+    bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
+#endif
+    return DRFLAC_TRUE;
+}
+
+static void drflac__reset_cache(drflac_bs* bs)
+{
+    bs->nextL2Line   = DRFLAC_CACHE_L2_LINE_COUNT(bs);  /* <-- This clears the L2 cache. */
+    bs->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs);   /* <-- This clears the L1 cache. */
+    bs->cache = 0;
+    bs->unalignedByteCount = 0;                         /* <-- This clears the trailing unaligned bytes. */
+    bs->unalignedCache = 0;
+
+#ifndef DR_FLAC_NO_CRC
+    bs->crc16Cache = 0;
+    bs->crc16CacheIgnoredBytes = 0;
+#endif
+}
+
+
+static DRFLAC_INLINE drflac_bool32 drflac__read_uint32(drflac_bs* bs, unsigned int bitCount, drflac_uint32* pResultOut)
+{
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pResultOut != NULL);
+    DRFLAC_ASSERT(bitCount > 0);
+    DRFLAC_ASSERT(bitCount <= 32);
+
+    if (bs->consumedBits == DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
+        if (!drflac__reload_cache(bs)) {
+            return DRFLAC_FALSE;
+        }
+    }
+
+    if (bitCount <= DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
+        /*
+        If we want to load all 32-bits from a 32-bit cache we need to do it slightly differently because we can't do
+        a 32-bit shift on a 32-bit integer. This will never be the case on 64-bit caches, so we can have a slightly
+        more optimal solution for this.
+        */
+#ifdef DRFLAC_64BIT
+        *pResultOut = (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCount);
+        bs->consumedBits += bitCount;
+        bs->cache <<= bitCount;
+#else
+        if (bitCount < DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
+            *pResultOut = (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCount);
+            bs->consumedBits += bitCount;
+            bs->cache <<= bitCount;
+        } else {
+            /* Cannot shift by 32-bits, so need to do it differently. */
+            *pResultOut = (drflac_uint32)bs->cache;
+            bs->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs);
+            bs->cache = 0;
+        }
+#endif
+
+        return DRFLAC_TRUE;
+    } else {
+        /* It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them. */
+        drflac_uint32 bitCountHi = DRFLAC_CACHE_L1_BITS_REMAINING(bs);
+        drflac_uint32 bitCountLo = bitCount - bitCountHi;
+        drflac_uint32 resultHi;
+
+        DRFLAC_ASSERT(bitCountHi > 0);
+        DRFLAC_ASSERT(bitCountHi < 32);
+        resultHi = (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCountHi);
+
+        if (!drflac__reload_cache(bs)) {
+            return DRFLAC_FALSE;
+        }
+        if (bitCountLo > DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
+            /* This happens when we get to end of stream */
+            return DRFLAC_FALSE;
+        }
+
+        *pResultOut = (resultHi << bitCountLo) | (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCountLo);
+        bs->consumedBits += bitCountLo;
+        bs->cache <<= bitCountLo;
+        return DRFLAC_TRUE;
+    }
+}
+
+static drflac_bool32 drflac__read_int32(drflac_bs* bs, unsigned int bitCount, drflac_int32* pResult)
+{
+    drflac_uint32 result;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pResult != NULL);
+    DRFLAC_ASSERT(bitCount > 0);
+    DRFLAC_ASSERT(bitCount <= 32);
+
+    if (!drflac__read_uint32(bs, bitCount, &result)) {
+        return DRFLAC_FALSE;
+    }
+
+    /* Do not attempt to shift by 32 as it's undefined. */
+    if (bitCount < 32) {
+        drflac_uint32 signbit;
+        signbit = ((result >> (bitCount-1)) & 0x01);
+        result |= (~signbit + 1) << bitCount;
+    }
+
+    *pResult = (drflac_int32)result;
+    return DRFLAC_TRUE;
+}
+
+#ifdef DRFLAC_64BIT
+static drflac_bool32 drflac__read_uint64(drflac_bs* bs, unsigned int bitCount, drflac_uint64* pResultOut)
+{
+    drflac_uint32 resultHi;
+    drflac_uint32 resultLo;
+
+    DRFLAC_ASSERT(bitCount <= 64);
+    DRFLAC_ASSERT(bitCount >  32);
+
+    if (!drflac__read_uint32(bs, bitCount - 32, &resultHi)) {
+        return DRFLAC_FALSE;
+    }
+
+    if (!drflac__read_uint32(bs, 32, &resultLo)) {
+        return DRFLAC_FALSE;
+    }
+
+    *pResultOut = (((drflac_uint64)resultHi) << 32) | ((drflac_uint64)resultLo);
+    return DRFLAC_TRUE;
+}
+#endif
+
+/* Function below is unused, but leaving it here in case I need to quickly add it again. */
+#if 0
+static drflac_bool32 drflac__read_int64(drflac_bs* bs, unsigned int bitCount, drflac_int64* pResultOut)
+{
+    drflac_uint64 result;
+    drflac_uint64 signbit;
+
+    DRFLAC_ASSERT(bitCount <= 64);
+
+    if (!drflac__read_uint64(bs, bitCount, &result)) {
+        return DRFLAC_FALSE;
+    }
+
+    signbit = ((result >> (bitCount-1)) & 0x01);
+    result |= (~signbit + 1) << bitCount;
+
+    *pResultOut = (drflac_int64)result;
+    return DRFLAC_TRUE;
+}
+#endif
+
+static drflac_bool32 drflac__read_uint16(drflac_bs* bs, unsigned int bitCount, drflac_uint16* pResult)
+{
+    drflac_uint32 result;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pResult != NULL);
+    DRFLAC_ASSERT(bitCount > 0);
+    DRFLAC_ASSERT(bitCount <= 16);
+
+    if (!drflac__read_uint32(bs, bitCount, &result)) {
+        return DRFLAC_FALSE;
+    }
+
+    *pResult = (drflac_uint16)result;
+    return DRFLAC_TRUE;
+}
+
+#if 0
+static drflac_bool32 drflac__read_int16(drflac_bs* bs, unsigned int bitCount, drflac_int16* pResult)
+{
+    drflac_int32 result;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pResult != NULL);
+    DRFLAC_ASSERT(bitCount > 0);
+    DRFLAC_ASSERT(bitCount <= 16);
+
+    if (!drflac__read_int32(bs, bitCount, &result)) {
+        return DRFLAC_FALSE;
+    }
+
+    *pResult = (drflac_int16)result;
+    return DRFLAC_TRUE;
+}
+#endif
+
+static drflac_bool32 drflac__read_uint8(drflac_bs* bs, unsigned int bitCount, drflac_uint8* pResult)
+{
+    drflac_uint32 result;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pResult != NULL);
+    DRFLAC_ASSERT(bitCount > 0);
+    DRFLAC_ASSERT(bitCount <= 8);
+
+    if (!drflac__read_uint32(bs, bitCount, &result)) {
+        return DRFLAC_FALSE;
+    }
+
+    *pResult = (drflac_uint8)result;
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__read_int8(drflac_bs* bs, unsigned int bitCount, drflac_int8* pResult)
+{
+    drflac_int32 result;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pResult != NULL);
+    DRFLAC_ASSERT(bitCount > 0);
+    DRFLAC_ASSERT(bitCount <= 8);
+
+    if (!drflac__read_int32(bs, bitCount, &result)) {
+        return DRFLAC_FALSE;
+    }
+
+    *pResult = (drflac_int8)result;
+    return DRFLAC_TRUE;
+}
+
+
+static drflac_bool32 drflac__seek_bits(drflac_bs* bs, size_t bitsToSeek)
+{
+    if (bitsToSeek <= DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
+        bs->consumedBits += (drflac_uint32)bitsToSeek;
+        bs->cache <<= bitsToSeek;
+        return DRFLAC_TRUE;
+    } else {
+        /* It straddles the cached data. This function isn't called too frequently so I'm favouring simplicity here. */
+        bitsToSeek       -= DRFLAC_CACHE_L1_BITS_REMAINING(bs);
+        bs->consumedBits += DRFLAC_CACHE_L1_BITS_REMAINING(bs);
+        bs->cache         = 0;
+
+        /* Simple case. Seek in groups of the same number as bits that fit within a cache line. */
+#ifdef DRFLAC_64BIT
+        while (bitsToSeek >= DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
+            drflac_uint64 bin;
+            if (!drflac__read_uint64(bs, DRFLAC_CACHE_L1_SIZE_BITS(bs), &bin)) {
+                return DRFLAC_FALSE;
+            }
+            bitsToSeek -= DRFLAC_CACHE_L1_SIZE_BITS(bs);
+        }
+#else
+        while (bitsToSeek >= DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
+            drflac_uint32 bin;
+            if (!drflac__read_uint32(bs, DRFLAC_CACHE_L1_SIZE_BITS(bs), &bin)) {
+                return DRFLAC_FALSE;
+            }
+            bitsToSeek -= DRFLAC_CACHE_L1_SIZE_BITS(bs);
+        }
+#endif
+
+        /* Whole leftover bytes. */
+        while (bitsToSeek >= 8) {
+            drflac_uint8 bin;
+            if (!drflac__read_uint8(bs, 8, &bin)) {
+                return DRFLAC_FALSE;
+            }
+            bitsToSeek -= 8;
+        }
+
+        /* Leftover bits. */
+        if (bitsToSeek > 0) {
+            drflac_uint8 bin;
+            if (!drflac__read_uint8(bs, (drflac_uint32)bitsToSeek, &bin)) {
+                return DRFLAC_FALSE;
+            }
+            bitsToSeek = 0; /* <-- Necessary for the assert below. */
+        }
+
+        DRFLAC_ASSERT(bitsToSeek == 0);
+        return DRFLAC_TRUE;
+    }
+}
+
+
+/* This function moves the bit streamer to the first bit after the sync code (bit 15 of the of the frame header). It will also update the CRC-16. */
+static drflac_bool32 drflac__find_and_seek_to_next_sync_code(drflac_bs* bs)
+{
+    DRFLAC_ASSERT(bs != NULL);
+
+    /*
+    The sync code is always aligned to 8 bits. This is convenient for us because it means we can do byte-aligned movements. The first
+    thing to do is align to the next byte.
+    */
+    if (!drflac__seek_bits(bs, DRFLAC_CACHE_L1_BITS_REMAINING(bs) & 7)) {
+        return DRFLAC_FALSE;
+    }
+
+    for (;;) {
+        drflac_uint8 hi;
+
+#ifndef DR_FLAC_NO_CRC
+        drflac__reset_crc16(bs);
+#endif
+
+        if (!drflac__read_uint8(bs, 8, &hi)) {
+            return DRFLAC_FALSE;
+        }
+
+        if (hi == 0xFF) {
+            drflac_uint8 lo;
+            if (!drflac__read_uint8(bs, 6, &lo)) {
+                return DRFLAC_FALSE;
+            }
+
+            if (lo == 0x3E) {
+                return DRFLAC_TRUE;
+            } else {
+                if (!drflac__seek_bits(bs, DRFLAC_CACHE_L1_BITS_REMAINING(bs) & 7)) {
+                    return DRFLAC_FALSE;
+                }
+            }
+        }
+    }
+
+    /* Should never get here. */
+    /*return DRFLAC_FALSE;*/
+}
+
+
+#if defined(DRFLAC_HAS_LZCNT_INTRINSIC)
+#define DRFLAC_IMPLEMENT_CLZ_LZCNT
+#endif
+#if  defined(_MSC_VER) && _MSC_VER >= 1400 && (defined(DRFLAC_X64) || defined(DRFLAC_X86)) && !defined(__clang__)
+#define DRFLAC_IMPLEMENT_CLZ_MSVC
+#endif
+#if  defined(__WATCOMC__) && defined(__386__)
+#define DRFLAC_IMPLEMENT_CLZ_WATCOM
+#endif
+#ifdef __MRC__
+#include <intrinsics.h>
+#define DRFLAC_IMPLEMENT_CLZ_MRC
+#endif
+
+static DRFLAC_INLINE drflac_uint32 drflac__clz_software(drflac_cache_t x)
+{
+    drflac_uint32 n;
+    static drflac_uint32 clz_table_4[] = {
+        0,
+        4,
+        3, 3,
+        2, 2, 2, 2,
+        1, 1, 1, 1, 1, 1, 1, 1
+    };
+
+    if (x == 0) {
+        return sizeof(x)*8;
+    }
+
+    n = clz_table_4[x >> (sizeof(x)*8 - 4)];
+    if (n == 0) {
+#ifdef DRFLAC_64BIT
+        if ((x & ((drflac_uint64)0xFFFFFFFF << 32)) == 0) { n  = 32; x <<= 32; }
+        if ((x & ((drflac_uint64)0xFFFF0000 << 32)) == 0) { n += 16; x <<= 16; }
+        if ((x & ((drflac_uint64)0xFF000000 << 32)) == 0) { n += 8;  x <<= 8;  }
+        if ((x & ((drflac_uint64)0xF0000000 << 32)) == 0) { n += 4;  x <<= 4;  }
+#else
+        if ((x & 0xFFFF0000) == 0) { n  = 16; x <<= 16; }
+        if ((x & 0xFF000000) == 0) { n += 8;  x <<= 8;  }
+        if ((x & 0xF0000000) == 0) { n += 4;  x <<= 4;  }
+#endif
+        n += clz_table_4[x >> (sizeof(x)*8 - 4)];
+    }
+
+    return n - 1;
+}
+
+#ifdef DRFLAC_IMPLEMENT_CLZ_LZCNT
+static DRFLAC_INLINE drflac_bool32 drflac__is_lzcnt_supported(void)
+{
+    /* Fast compile time check for ARM. */
+#if defined(DRFLAC_HAS_LZCNT_INTRINSIC) && defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5)
+    return DRFLAC_TRUE;
+#elif defined(__MRC__)
+    return DRFLAC_TRUE;
+#else
+    /* If the compiler itself does not support the intrinsic then we'll need to return false. */
+    #ifdef DRFLAC_HAS_LZCNT_INTRINSIC
+        return drflac__gIsLZCNTSupported;
+    #else
+        return DRFLAC_FALSE;
+    #endif
+#endif
+}
+
+static DRFLAC_INLINE drflac_uint32 drflac__clz_lzcnt(drflac_cache_t x)
+{
+    /*
+    It's critical for competitive decoding performance that this function be highly optimal. With MSVC we can use the __lzcnt64() and __lzcnt() intrinsics
+    to achieve good performance, however on GCC and Clang it's a little bit more annoying. The __builtin_clzl() and __builtin_clzll() intrinsics leave
+    it undefined as to the return value when `x` is 0. We need this to be well defined as returning 32 or 64, depending on whether or not it's a 32- or
+    64-bit build. To work around this we would need to add a conditional to check for the x = 0 case, but this creates unnecessary inefficiency. To work
+    around this problem I have written some inline assembly to emit the LZCNT (x86) or CLZ (ARM) instruction directly which removes the need to include
+    the conditional. This has worked well in the past, but for some reason Clang's MSVC compatible driver, clang-cl, does not seem to be handling this
+    in the same way as the normal Clang driver. It seems that `clang-cl` is just outputting the wrong results sometimes, maybe due to some register
+    getting clobbered?
+
+    I'm not sure if this is a bug with dr_flac's inlined assembly (most likely), a bug in `clang-cl` or just a misunderstanding on my part with inline
+    assembly rules for `clang-cl`. If somebody can identify an error in dr_flac's inlined assembly I'm happy to get that fixed.
+
+    Fortunately there is an easy workaround for this. Clang implements MSVC-specific intrinsics for compatibility. It also defines _MSC_VER for extra
+    compatibility. We can therefore just check for _MSC_VER and use the MSVC intrinsic which, fortunately for us, Clang supports. It would still be nice
+    to know how to fix the inlined assembly for correctness sake, however.
+    */
+
+#if defined(_MSC_VER) /*&& !defined(__clang__)*/    /* <-- Intentionally wanting Clang to use the MSVC __lzcnt64/__lzcnt intrinsics due to above ^. */
+    #ifdef DRFLAC_64BIT
+        return (drflac_uint32)__lzcnt64(x);
+    #else
+        return (drflac_uint32)__lzcnt(x);
+    #endif
+#else
+    #if defined(__GNUC__) || defined(__clang__)
+        #if defined(DRFLAC_X64)
+            {
+                drflac_uint64 r;
+                __asm__ __volatile__ (
+                    "lzcnt{ %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc"
+                );
+
+                return (drflac_uint32)r;
+            }
+        #elif defined(DRFLAC_X86)
+            {
+                drflac_uint32 r;
+                __asm__ __volatile__ (
+                    "lzcnt{l %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc"
+                );
+
+                return r;
+            }
+        #elif defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5) && !defined(__ARM_ARCH_6M__) && !(defined(__thumb__) && !defined(__thumb2__)) && !defined(DRFLAC_64BIT)   /* <-- I haven't tested 64-bit inline assembly, so only enabling this for the 32-bit build for now. */
+            {
+                unsigned int r;
+                __asm__ __volatile__ (
+                #if defined(DRFLAC_64BIT)
+                    "clz %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(x)   /* <-- This is untested. If someone in the community could test this, that would be appreciated! */
+                #else
+                    "clz %[out], %[in]" : [out]"=r"(r) : [in]"r"(x)
+                #endif
+                );
+
+                return r;
+            }
+        #else
+            if (x == 0) {
+                return sizeof(x)*8;
+            }
+            #ifdef DRFLAC_64BIT
+                return (drflac_uint32)__builtin_clzll((drflac_uint64)x);
+            #else
+                return (drflac_uint32)__builtin_clzl((drflac_uint32)x);
+            #endif
+        #endif
+    #else
+        /* Unsupported compiler. */
+        #error "This compiler does not support the lzcnt intrinsic."
+    #endif
+#endif
+}
+#endif
+
+#ifdef DRFLAC_IMPLEMENT_CLZ_MSVC
+#include <intrin.h> /* For BitScanReverse(). */
+
+static DRFLAC_INLINE drflac_uint32 drflac__clz_msvc(drflac_cache_t x)
+{
+    drflac_uint32 n;
+
+    if (x == 0) {
+        return sizeof(x)*8;
+    }
+
+#ifdef DRFLAC_64BIT
+    _BitScanReverse64((unsigned long*)&n, x);
+#else
+    _BitScanReverse((unsigned long*)&n, x);
+#endif
+    return sizeof(x)*8 - n - 1;
+}
+#endif
+
+#ifdef DRFLAC_IMPLEMENT_CLZ_WATCOM
+static __inline drflac_uint32 drflac__clz_watcom (drflac_uint32);
+#ifdef DRFLAC_IMPLEMENT_CLZ_WATCOM_LZCNT
+/* Use the LZCNT instruction (only available on some processors since the 2010s). */
+#pragma aux drflac__clz_watcom_lzcnt = \
+    "db 0F3h, 0Fh, 0BDh, 0C0h" /* lzcnt eax, eax */ \
+    parm [eax] \
+    value [eax] \
+    modify nomemory;
+#else
+/* Use the 386+-compatible implementation. */
+#pragma aux drflac__clz_watcom = \
+    "bsr eax, eax" \
+    "xor eax, 31" \
+    parm [eax] nomemory \
+    value [eax] \
+    modify exact [eax] nomemory;
+#endif
+#endif
+
+static DRFLAC_INLINE drflac_uint32 drflac__clz(drflac_cache_t x)
+{
+#ifdef DRFLAC_IMPLEMENT_CLZ_LZCNT
+    if (drflac__is_lzcnt_supported()) {
+        return drflac__clz_lzcnt(x);
+    } else
+#endif
+    {
+#ifdef DRFLAC_IMPLEMENT_CLZ_MSVC
+        return drflac__clz_msvc(x);
+#elif defined(DRFLAC_IMPLEMENT_CLZ_WATCOM_LZCNT)
+        return drflac__clz_watcom_lzcnt(x);
+#elif defined(DRFLAC_IMPLEMENT_CLZ_WATCOM)
+        return (x == 0) ? sizeof(x)*8 : drflac__clz_watcom(x);
+#elif defined(__MRC__)
+        return __cntlzw(x);
+#else
+        return drflac__clz_software(x);
+#endif
+    }
+}
+
+
+static DRFLAC_INLINE drflac_bool32 drflac__seek_past_next_set_bit(drflac_bs* bs, unsigned int* pOffsetOut)
+{
+    drflac_uint32 zeroCounter = 0;
+    drflac_uint32 setBitOffsetPlus1;
+
+    while (bs->cache == 0) {
+        zeroCounter += (drflac_uint32)DRFLAC_CACHE_L1_BITS_REMAINING(bs);
+        if (!drflac__reload_cache(bs)) {
+            return DRFLAC_FALSE;
+        }
+    }
+
+    if (bs->cache == 1) {
+        /* Not catching this would lead to undefined behaviour: a shift of a 32-bit number by 32 or more is undefined */
+        *pOffsetOut = zeroCounter + (drflac_uint32)DRFLAC_CACHE_L1_BITS_REMAINING(bs) - 1;
+        if (!drflac__reload_cache(bs)) {
+            return DRFLAC_FALSE;
+        }
+
+        return DRFLAC_TRUE;
+    }
+
+    setBitOffsetPlus1 = drflac__clz(bs->cache);
+    setBitOffsetPlus1 += 1;
+
+    if (setBitOffsetPlus1 > DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
+        /* This happens when we get to end of stream */
+        return DRFLAC_FALSE;
+    }
+
+    bs->consumedBits += setBitOffsetPlus1;
+    bs->cache <<= setBitOffsetPlus1;
+
+    *pOffsetOut = zeroCounter + setBitOffsetPlus1 - 1;
+    return DRFLAC_TRUE;
+}
+
+
+
+static drflac_bool32 drflac__seek_to_byte(drflac_bs* bs, drflac_uint64 offsetFromStart)
+{
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(offsetFromStart > 0);
+
+    /*
+    Seeking from the start is not quite as trivial as it sounds because the onSeek callback takes a signed 32-bit integer (which
+    is intentional because it simplifies the implementation of the onSeek callbacks), however offsetFromStart is unsigned 64-bit.
+    To resolve we just need to do an initial seek from the start, and then a series of offset seeks to make up the remainder.
+    */
+    if (offsetFromStart > 0x7FFFFFFF) {
+        drflac_uint64 bytesRemaining = offsetFromStart;
+        if (!bs->onSeek(bs->pUserData, 0x7FFFFFFF, DRFLAC_SEEK_SET)) {
+            return DRFLAC_FALSE;
+        }
+        bytesRemaining -= 0x7FFFFFFF;
+
+        while (bytesRemaining > 0x7FFFFFFF) {
+            if (!bs->onSeek(bs->pUserData, 0x7FFFFFFF, DRFLAC_SEEK_CUR)) {
+                return DRFLAC_FALSE;
+            }
+            bytesRemaining -= 0x7FFFFFFF;
+        }
+
+        if (bytesRemaining > 0) {
+            if (!bs->onSeek(bs->pUserData, (int)bytesRemaining, DRFLAC_SEEK_CUR)) {
+                return DRFLAC_FALSE;
+            }
+        }
+    } else {
+        if (!bs->onSeek(bs->pUserData, (int)offsetFromStart, DRFLAC_SEEK_SET)) {
+            return DRFLAC_FALSE;
+        }
+    }
+
+    /* The cache should be reset to force a reload of fresh data from the client. */
+    drflac__reset_cache(bs);
+    return DRFLAC_TRUE;
+}
+
+
+static drflac_result drflac__read_utf8_coded_number(drflac_bs* bs, drflac_uint64* pNumberOut, drflac_uint8* pCRCOut)
+{
+    drflac_uint8 crc;
+    drflac_uint64 result;
+    drflac_uint8 utf8[7] = {0};
+    int byteCount;
+    int i;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pNumberOut != NULL);
+    DRFLAC_ASSERT(pCRCOut != NULL);
+
+    crc = *pCRCOut;
+
+    if (!drflac__read_uint8(bs, 8, utf8)) {
+        *pNumberOut = 0;
+        return DRFLAC_AT_END;
+    }
+    crc = drflac_crc8(crc, utf8[0], 8);
+
+    if ((utf8[0] & 0x80) == 0) {
+        *pNumberOut = utf8[0];
+        *pCRCOut = crc;
+        return DRFLAC_SUCCESS;
+    }
+
+    /*byteCount = 1;*/
+    if ((utf8[0] & 0xE0) == 0xC0) {
+        byteCount = 2;
+    } else if ((utf8[0] & 0xF0) == 0xE0) {
+        byteCount = 3;
+    } else if ((utf8[0] & 0xF8) == 0xF0) {
+        byteCount = 4;
+    } else if ((utf8[0] & 0xFC) == 0xF8) {
+        byteCount = 5;
+    } else if ((utf8[0] & 0xFE) == 0xFC) {
+        byteCount = 6;
+    } else if ((utf8[0] & 0xFF) == 0xFE) {
+        byteCount = 7;
+    } else {
+        *pNumberOut = 0;
+        return DRFLAC_CRC_MISMATCH;     /* Bad UTF-8 encoding. */
+    }
+
+    /* Read extra bytes. */
+    DRFLAC_ASSERT(byteCount > 1);
+
+    result = (drflac_uint64)(utf8[0] & (0xFF >> (byteCount + 1)));
+    for (i = 1; i < byteCount; ++i) {
+        if (!drflac__read_uint8(bs, 8, utf8 + i)) {
+            *pNumberOut = 0;
+            return DRFLAC_AT_END;
+        }
+        crc = drflac_crc8(crc, utf8[i], 8);
+
+        result = (result << 6) | (utf8[i] & 0x3F);
+    }
+
+    *pNumberOut = result;
+    *pCRCOut = crc;
+    return DRFLAC_SUCCESS;
+}
+
+
+static DRFLAC_INLINE drflac_uint32 drflac__ilog2_u32(drflac_uint32 x)
+{
+#if 1   /* Needs optimizing. */
+    drflac_uint32 result = 0;
+    while (x > 0) {
+        result += 1;
+        x >>= 1;
+    }
+
+    return result;
+#endif
+}
+
+static DRFLAC_INLINE drflac_bool32 drflac__use_64_bit_prediction(drflac_uint32 bitsPerSample, drflac_uint32 order, drflac_uint32 precision)
+{
+    /* https://web.archive.org/web/20220205005724/https://github.com/ietf-wg-cellar/flac-specification/blob/37a49aa48ba4ba12e8757badfc59c0df35435fec/rfc_backmatter.md */
+    return bitsPerSample + precision + drflac__ilog2_u32(order) > 32;
+}
+
+
+/*
+The next two functions are responsible for calculating the prediction.
+
+When the bits per sample is >16 we need to use 64-bit integer arithmetic because otherwise we'll run out of precision. It's
+safe to assume this will be slower on 32-bit platforms so we use a more optimal solution when the bits per sample is <=16.
+*/
+#if defined(__clang__)
+__attribute__((no_sanitize("signed-integer-overflow")))
+#endif
+static DRFLAC_INLINE drflac_int32 drflac__calculate_prediction_32(drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pDecodedSamples)
+{
+    drflac_int32 prediction = 0;
+
+    DRFLAC_ASSERT(order <= 32);
+
+    /* 32-bit version. */
+
+    /* VC++ optimizes this to a single jmp. I've not yet verified this for other compilers. */
+    switch (order)
+    {
+    case 32: prediction += coefficients[31] * pDecodedSamples[-32];
+    case 31: prediction += coefficients[30] * pDecodedSamples[-31];
+    case 30: prediction += coefficients[29] * pDecodedSamples[-30];
+    case 29: prediction += coefficients[28] * pDecodedSamples[-29];
+    case 28: prediction += coefficients[27] * pDecodedSamples[-28];
+    case 27: prediction += coefficients[26] * pDecodedSamples[-27];
+    case 26: prediction += coefficients[25] * pDecodedSamples[-26];
+    case 25: prediction += coefficients[24] * pDecodedSamples[-25];
+    case 24: prediction += coefficients[23] * pDecodedSamples[-24];
+    case 23: prediction += coefficients[22] * pDecodedSamples[-23];
+    case 22: prediction += coefficients[21] * pDecodedSamples[-22];
+    case 21: prediction += coefficients[20] * pDecodedSamples[-21];
+    case 20: prediction += coefficients[19] * pDecodedSamples[-20];
+    case 19: prediction += coefficients[18] * pDecodedSamples[-19];
+    case 18: prediction += coefficients[17] * pDecodedSamples[-18];
+    case 17: prediction += coefficients[16] * pDecodedSamples[-17];
+    case 16: prediction += coefficients[15] * pDecodedSamples[-16];
+    case 15: prediction += coefficients[14] * pDecodedSamples[-15];
+    case 14: prediction += coefficients[13] * pDecodedSamples[-14];
+    case 13: prediction += coefficients[12] * pDecodedSamples[-13];
+    case 12: prediction += coefficients[11] * pDecodedSamples[-12];
+    case 11: prediction += coefficients[10] * pDecodedSamples[-11];
+    case 10: prediction += coefficients[ 9] * pDecodedSamples[-10];
+    case  9: prediction += coefficients[ 8] * pDecodedSamples[- 9];
+    case  8: prediction += coefficients[ 7] * pDecodedSamples[- 8];
+    case  7: prediction += coefficients[ 6] * pDecodedSamples[- 7];
+    case  6: prediction += coefficients[ 5] * pDecodedSamples[- 6];
+    case  5: prediction += coefficients[ 4] * pDecodedSamples[- 5];
+    case  4: prediction += coefficients[ 3] * pDecodedSamples[- 4];
+    case  3: prediction += coefficients[ 2] * pDecodedSamples[- 3];
+    case  2: prediction += coefficients[ 1] * pDecodedSamples[- 2];
+    case  1: prediction += coefficients[ 0] * pDecodedSamples[- 1];
+    }
+
+    return (drflac_int32)(prediction >> shift);
+}
+
+static DRFLAC_INLINE drflac_int32 drflac__calculate_prediction_64(drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pDecodedSamples)
+{
+    drflac_int64 prediction;
+
+    DRFLAC_ASSERT(order <= 32);
+
+    /* 64-bit version. */
+
+    /* This method is faster on the 32-bit build when compiling with VC++. See note below. */
+#ifndef DRFLAC_64BIT
+    if (order == 8)
+    {
+        prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
+        prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
+        prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
+        prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
+        prediction += coefficients[5] * (drflac_int64)pDecodedSamples[-6];
+        prediction += coefficients[6] * (drflac_int64)pDecodedSamples[-7];
+        prediction += coefficients[7] * (drflac_int64)pDecodedSamples[-8];
+    }
+    else if (order == 7)
+    {
+        prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
+        prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
+        prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
+        prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
+        prediction += coefficients[5] * (drflac_int64)pDecodedSamples[-6];
+        prediction += coefficients[6] * (drflac_int64)pDecodedSamples[-7];
+    }
+    else if (order == 3)
+    {
+        prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
+        prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
+    }
+    else if (order == 6)
+    {
+        prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
+        prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
+        prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
+        prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
+        prediction += coefficients[5] * (drflac_int64)pDecodedSamples[-6];
+    }
+    else if (order == 5)
+    {
+        prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
+        prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
+        prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
+        prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
+    }
+    else if (order == 4)
+    {
+        prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
+        prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
+        prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
+    }
+    else if (order == 12)
+    {
+        prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
+        prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
+        prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
+        prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
+        prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
+        prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
+        prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
+        prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
+        prediction += coefficients[9]  * (drflac_int64)pDecodedSamples[-10];
+        prediction += coefficients[10] * (drflac_int64)pDecodedSamples[-11];
+        prediction += coefficients[11] * (drflac_int64)pDecodedSamples[-12];
+    }
+    else if (order == 2)
+    {
+        prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
+    }
+    else if (order == 1)
+    {
+        prediction = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
+    }
+    else if (order == 10)
+    {
+        prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
+        prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
+        prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
+        prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
+        prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
+        prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
+        prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
+        prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
+        prediction += coefficients[9]  * (drflac_int64)pDecodedSamples[-10];
+    }
+    else if (order == 9)
+    {
+        prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
+        prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
+        prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
+        prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
+        prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
+        prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
+        prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
+        prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
+    }
+    else if (order == 11)
+    {
+        prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
+        prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
+        prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
+        prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
+        prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
+        prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
+        prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
+        prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
+        prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
+        prediction += coefficients[9]  * (drflac_int64)pDecodedSamples[-10];
+        prediction += coefficients[10] * (drflac_int64)pDecodedSamples[-11];
+    }
+    else
+    {
+        int j;
+
+        prediction = 0;
+        for (j = 0; j < (int)order; ++j) {
+            prediction += coefficients[j] * (drflac_int64)pDecodedSamples[-j-1];
+        }
+    }
+#endif
+
+    /*
+    VC++ optimizes this to a single jmp instruction, but only the 64-bit build. The 32-bit build generates less efficient code for some
+    reason. The ugly version above is faster so we'll just switch between the two depending on the target platform.
+    */
+#ifdef DRFLAC_64BIT
+    prediction = 0;
+    switch (order)
+    {
+    case 32: prediction += coefficients[31] * (drflac_int64)pDecodedSamples[-32];
+    case 31: prediction += coefficients[30] * (drflac_int64)pDecodedSamples[-31];
+    case 30: prediction += coefficients[29] * (drflac_int64)pDecodedSamples[-30];
+    case 29: prediction += coefficients[28] * (drflac_int64)pDecodedSamples[-29];
+    case 28: prediction += coefficients[27] * (drflac_int64)pDecodedSamples[-28];
+    case 27: prediction += coefficients[26] * (drflac_int64)pDecodedSamples[-27];
+    case 26: prediction += coefficients[25] * (drflac_int64)pDecodedSamples[-26];
+    case 25: prediction += coefficients[24] * (drflac_int64)pDecodedSamples[-25];
+    case 24: prediction += coefficients[23] * (drflac_int64)pDecodedSamples[-24];
+    case 23: prediction += coefficients[22] * (drflac_int64)pDecodedSamples[-23];
+    case 22: prediction += coefficients[21] * (drflac_int64)pDecodedSamples[-22];
+    case 21: prediction += coefficients[20] * (drflac_int64)pDecodedSamples[-21];
+    case 20: prediction += coefficients[19] * (drflac_int64)pDecodedSamples[-20];
+    case 19: prediction += coefficients[18] * (drflac_int64)pDecodedSamples[-19];
+    case 18: prediction += coefficients[17] * (drflac_int64)pDecodedSamples[-18];
+    case 17: prediction += coefficients[16] * (drflac_int64)pDecodedSamples[-17];
+    case 16: prediction += coefficients[15] * (drflac_int64)pDecodedSamples[-16];
+    case 15: prediction += coefficients[14] * (drflac_int64)pDecodedSamples[-15];
+    case 14: prediction += coefficients[13] * (drflac_int64)pDecodedSamples[-14];
+    case 13: prediction += coefficients[12] * (drflac_int64)pDecodedSamples[-13];
+    case 12: prediction += coefficients[11] * (drflac_int64)pDecodedSamples[-12];
+    case 11: prediction += coefficients[10] * (drflac_int64)pDecodedSamples[-11];
+    case 10: prediction += coefficients[ 9] * (drflac_int64)pDecodedSamples[-10];
+    case  9: prediction += coefficients[ 8] * (drflac_int64)pDecodedSamples[- 9];
+    case  8: prediction += coefficients[ 7] * (drflac_int64)pDecodedSamples[- 8];
+    case  7: prediction += coefficients[ 6] * (drflac_int64)pDecodedSamples[- 7];
+    case  6: prediction += coefficients[ 5] * (drflac_int64)pDecodedSamples[- 6];
+    case  5: prediction += coefficients[ 4] * (drflac_int64)pDecodedSamples[- 5];
+    case  4: prediction += coefficients[ 3] * (drflac_int64)pDecodedSamples[- 4];
+    case  3: prediction += coefficients[ 2] * (drflac_int64)pDecodedSamples[- 3];
+    case  2: prediction += coefficients[ 1] * (drflac_int64)pDecodedSamples[- 2];
+    case  1: prediction += coefficients[ 0] * (drflac_int64)pDecodedSamples[- 1];
+    }
+#endif
+
+    return (drflac_int32)(prediction >> shift);
+}
+
+
+#if 0
+/*
+Reference implementation for reading and decoding samples with residual. This is intentionally left unoptimized for the
+sake of readability and should only be used as a reference.
+*/
+static drflac_bool32 drflac__decode_samples_with_residual__rice__reference(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 lpcOrder, drflac_int32 lpcShift, drflac_uint32 lpcPrecision, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+    drflac_uint32 i;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pSamplesOut != NULL);
+
+    for (i = 0; i < count; ++i) {
+        drflac_uint32 zeroCounter = 0;
+        for (;;) {
+            drflac_uint8 bit;
+            if (!drflac__read_uint8(bs, 1, &bit)) {
+                return DRFLAC_FALSE;
+            }
+
+            if (bit == 0) {
+                zeroCounter += 1;
+            } else {
+                break;
+            }
+        }
+
+        drflac_uint32 decodedRice;
+        if (riceParam > 0) {
+            if (!drflac__read_uint32(bs, riceParam, &decodedRice)) {
+                return DRFLAC_FALSE;
+            }
+        } else {
+            decodedRice = 0;
+        }
+
+        decodedRice |= (zeroCounter << riceParam);
+        if ((decodedRice & 0x01)) {
+            decodedRice = ~(decodedRice >> 1);
+        } else {
+            decodedRice =  (decodedRice >> 1);
+        }
+
+
+        if (drflac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
+            pSamplesOut[i] = decodedRice + drflac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + i);
+        } else {
+            pSamplesOut[i] = decodedRice + drflac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + i);
+        }
+    }
+
+    return DRFLAC_TRUE;
+}
+#endif
+
+#if 0
+static drflac_bool32 drflac__read_rice_parts__reference(drflac_bs* bs, drflac_uint8 riceParam, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
+{
+    drflac_uint32 zeroCounter = 0;
+    drflac_uint32 decodedRice;
+
+    for (;;) {
+        drflac_uint8 bit;
+        if (!drflac__read_uint8(bs, 1, &bit)) {
+            return DRFLAC_FALSE;
+        }
+
+        if (bit == 0) {
+            zeroCounter += 1;
+        } else {
+            break;
+        }
+    }
+
+    if (riceParam > 0) {
+        if (!drflac__read_uint32(bs, riceParam, &decodedRice)) {
+            return DRFLAC_FALSE;
+        }
+    } else {
+        decodedRice = 0;
+    }
+
+    *pZeroCounterOut = zeroCounter;
+    *pRiceParamPartOut = decodedRice;
+    return DRFLAC_TRUE;
+}
+#endif
+
+#if 0
+static DRFLAC_INLINE drflac_bool32 drflac__read_rice_parts(drflac_bs* bs, drflac_uint8 riceParam, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
+{
+    drflac_cache_t riceParamMask;
+    drflac_uint32 zeroCounter;
+    drflac_uint32 setBitOffsetPlus1;
+    drflac_uint32 riceParamPart;
+    drflac_uint32 riceLength;
+
+    DRFLAC_ASSERT(riceParam > 0);   /* <-- riceParam should never be 0. drflac__read_rice_parts__param_equals_zero() should be used instead for this case. */
+
+    riceParamMask = DRFLAC_CACHE_L1_SELECTION_MASK(riceParam);
+
+    zeroCounter = 0;
+    while (bs->cache == 0) {
+        zeroCounter += (drflac_uint32)DRFLAC_CACHE_L1_BITS_REMAINING(bs);
+        if (!drflac__reload_cache(bs)) {
+            return DRFLAC_FALSE;
+        }
+    }
+
+    setBitOffsetPlus1 = drflac__clz(bs->cache);
+    zeroCounter += setBitOffsetPlus1;
+    setBitOffsetPlus1 += 1;
+
+    riceLength = setBitOffsetPlus1 + riceParam;
+    if (riceLength < DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
+        riceParamPart = (drflac_uint32)((bs->cache & (riceParamMask >> setBitOffsetPlus1)) >> DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceLength));
+
+        bs->consumedBits += riceLength;
+        bs->cache <<= riceLength;
+    } else {
+        drflac_uint32 bitCountLo;
+        drflac_cache_t resultHi;
+
+        bs->consumedBits += riceLength;
+        bs->cache <<= setBitOffsetPlus1 & (DRFLAC_CACHE_L1_SIZE_BITS(bs)-1);    /* <-- Equivalent to "if (setBitOffsetPlus1 < DRFLAC_CACHE_L1_SIZE_BITS(bs)) { bs->cache <<= setBitOffsetPlus1; }" */
+
+        /* It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them. */
+        bitCountLo = bs->consumedBits - DRFLAC_CACHE_L1_SIZE_BITS(bs);
+        resultHi = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, riceParam);  /* <-- Use DRFLAC_CACHE_L1_SELECT_AND_SHIFT_SAFE() if ever this function allows riceParam=0. */
+
+        if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+#ifndef DR_FLAC_NO_CRC
+            drflac__update_crc16(bs);
+#endif
+            bs->cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+            bs->consumedBits = 0;
+#ifndef DR_FLAC_NO_CRC
+            bs->crc16Cache = bs->cache;
+#endif
+        } else {
+            /* Slow path. We need to fetch more data from the client. */
+            if (!drflac__reload_cache(bs)) {
+                return DRFLAC_FALSE;
+            }
+            if (bitCountLo > DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
+                /* This happens when we get to end of stream */
+                return DRFLAC_FALSE;
+            }
+        }
+
+        riceParamPart = (drflac_uint32)(resultHi | DRFLAC_CACHE_L1_SELECT_AND_SHIFT_SAFE(bs, bitCountLo));
+
+        bs->consumedBits += bitCountLo;
+        bs->cache <<= bitCountLo;
+    }
+
+    pZeroCounterOut[0] = zeroCounter;
+    pRiceParamPartOut[0] = riceParamPart;
+
+    return DRFLAC_TRUE;
+}
+#endif
+
+static DRFLAC_INLINE drflac_bool32 drflac__read_rice_parts_x1(drflac_bs* bs, drflac_uint8 riceParam, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
+{
+    drflac_uint32  riceParamPlus1 = riceParam + 1;
+    /*drflac_cache_t riceParamPlus1Mask  = DRFLAC_CACHE_L1_SELECTION_MASK(riceParamPlus1);*/
+    drflac_uint32  riceParamPlus1Shift = DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPlus1);
+    drflac_uint32  riceParamPlus1MaxConsumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
+
+    /*
+    The idea here is to use local variables for the cache in an attempt to encourage the compiler to store them in registers. I have
+    no idea how this will work in practice...
+    */
+    drflac_cache_t bs_cache = bs->cache;
+    drflac_uint32  bs_consumedBits = bs->consumedBits;
+
+    /* The first thing to do is find the first unset bit. Most likely a bit will be set in the current cache line. */
+    drflac_uint32  lzcount = drflac__clz(bs_cache);
+    if (lzcount < sizeof(bs_cache)*8) {
+        pZeroCounterOut[0] = lzcount;
+
+        /*
+        It is most likely that the riceParam part (which comes after the zero counter) is also on this cache line. When extracting
+        this, we include the set bit from the unary coded part because it simplifies cache management. This bit will be handled
+        outside of this function at a higher level.
+        */
+    extract_rice_param_part:
+        bs_cache       <<= lzcount;
+        bs_consumedBits += lzcount;
+
+        if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
+            /* Getting here means the rice parameter part is wholly contained within the current cache line. */
+            pRiceParamPartOut[0] = (drflac_uint32)(bs_cache >> riceParamPlus1Shift);
+            bs_cache       <<= riceParamPlus1;
+            bs_consumedBits += riceParamPlus1;
+        } else {
+            drflac_uint32 riceParamPartHi;
+            drflac_uint32 riceParamPartLo;
+            drflac_uint32 riceParamPartLoBitCount;
+
+            /*
+            Getting here means the rice parameter part straddles the cache line. We need to read from the tail of the current cache
+            line, reload the cache, and then combine it with the head of the next cache line.
+            */
+
+            /* Grab the high part of the rice parameter part. */
+            riceParamPartHi = (drflac_uint32)(bs_cache >> riceParamPlus1Shift);
+
+            /* Before reloading the cache we need to grab the size in bits of the low part. */
+            riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
+            DRFLAC_ASSERT(riceParamPartLoBitCount > 0 && riceParamPartLoBitCount < 32);
+
+            /* Now reload the cache. */
+            if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+            #ifndef DR_FLAC_NO_CRC
+                drflac__update_crc16(bs);
+            #endif
+                bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+                bs_consumedBits = riceParamPartLoBitCount;
+            #ifndef DR_FLAC_NO_CRC
+                bs->crc16Cache = bs_cache;
+            #endif
+            } else {
+                /* Slow path. We need to fetch more data from the client. */
+                if (!drflac__reload_cache(bs)) {
+                    return DRFLAC_FALSE;
+                }
+                if (riceParamPartLoBitCount > DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
+                    /* This happens when we get to end of stream */
+                    return DRFLAC_FALSE;
+                }
+
+                bs_cache = bs->cache;
+                bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
+            }
+
+            /* We should now have enough information to construct the rice parameter part. */
+            riceParamPartLo = (drflac_uint32)(bs_cache >> (DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPartLoBitCount)));
+            pRiceParamPartOut[0] = riceParamPartHi | riceParamPartLo;
+
+            bs_cache <<= riceParamPartLoBitCount;
+        }
+    } else {
+        /*
+        Getting here means there are no bits set on the cache line. This is a less optimal case because we just wasted a call
+        to drflac__clz() and we need to reload the cache.
+        */
+        drflac_uint32 zeroCounter = (drflac_uint32)(DRFLAC_CACHE_L1_SIZE_BITS(bs) - bs_consumedBits);
+        for (;;) {
+            if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+            #ifndef DR_FLAC_NO_CRC
+                drflac__update_crc16(bs);
+            #endif
+                bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+                bs_consumedBits = 0;
+            #ifndef DR_FLAC_NO_CRC
+                bs->crc16Cache = bs_cache;
+            #endif
+            } else {
+                /* Slow path. We need to fetch more data from the client. */
+                if (!drflac__reload_cache(bs)) {
+                    return DRFLAC_FALSE;
+                }
+
+                bs_cache = bs->cache;
+                bs_consumedBits = bs->consumedBits;
+            }
+
+            lzcount = drflac__clz(bs_cache);
+            zeroCounter += lzcount;
+
+            if (lzcount < sizeof(bs_cache)*8) {
+                break;
+            }
+        }
+
+        pZeroCounterOut[0] = zeroCounter;
+        goto extract_rice_param_part;
+    }
+
+    /* Make sure the cache is restored at the end of it all. */
+    bs->cache = bs_cache;
+    bs->consumedBits = bs_consumedBits;
+
+    return DRFLAC_TRUE;
+}
+
+static DRFLAC_INLINE drflac_bool32 drflac__seek_rice_parts(drflac_bs* bs, drflac_uint8 riceParam)
+{
+    drflac_uint32  riceParamPlus1 = riceParam + 1;
+    drflac_uint32  riceParamPlus1MaxConsumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
+
+    /*
+    The idea here is to use local variables for the cache in an attempt to encourage the compiler to store them in registers. I have
+    no idea how this will work in practice...
+    */
+    drflac_cache_t bs_cache = bs->cache;
+    drflac_uint32  bs_consumedBits = bs->consumedBits;
+
+    /* The first thing to do is find the first unset bit. Most likely a bit will be set in the current cache line. */
+    drflac_uint32  lzcount = drflac__clz(bs_cache);
+    if (lzcount < sizeof(bs_cache)*8) {
+        /*
+        It is most likely that the riceParam part (which comes after the zero counter) is also on this cache line. When extracting
+        this, we include the set bit from the unary coded part because it simplifies cache management. This bit will be handled
+        outside of this function at a higher level.
+        */
+    extract_rice_param_part:
+        bs_cache       <<= lzcount;
+        bs_consumedBits += lzcount;
+
+        if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
+            /* Getting here means the rice parameter part is wholly contained within the current cache line. */
+            bs_cache       <<= riceParamPlus1;
+            bs_consumedBits += riceParamPlus1;
+        } else {
+            /*
+            Getting here means the rice parameter part straddles the cache line. We need to read from the tail of the current cache
+            line, reload the cache, and then combine it with the head of the next cache line.
+            */
+
+            /* Before reloading the cache we need to grab the size in bits of the low part. */
+            drflac_uint32 riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
+            DRFLAC_ASSERT(riceParamPartLoBitCount > 0 && riceParamPartLoBitCount < 32);
+
+            /* Now reload the cache. */
+            if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+            #ifndef DR_FLAC_NO_CRC
+                drflac__update_crc16(bs);
+            #endif
+                bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+                bs_consumedBits = riceParamPartLoBitCount;
+            #ifndef DR_FLAC_NO_CRC
+                bs->crc16Cache = bs_cache;
+            #endif
+            } else {
+                /* Slow path. We need to fetch more data from the client. */
+                if (!drflac__reload_cache(bs)) {
+                    return DRFLAC_FALSE;
+                }
+
+                if (riceParamPartLoBitCount > DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
+                    /* This happens when we get to end of stream */
+                    return DRFLAC_FALSE;
+                }
+
+                bs_cache = bs->cache;
+                bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
+            }
+
+            bs_cache <<= riceParamPartLoBitCount;
+        }
+    } else {
+        /*
+        Getting here means there are no bits set on the cache line. This is a less optimal case because we just wasted a call
+        to drflac__clz() and we need to reload the cache.
+        */
+        for (;;) {
+            if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+            #ifndef DR_FLAC_NO_CRC
+                drflac__update_crc16(bs);
+            #endif
+                bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+                bs_consumedBits = 0;
+            #ifndef DR_FLAC_NO_CRC
+                bs->crc16Cache = bs_cache;
+            #endif
+            } else {
+                /* Slow path. We need to fetch more data from the client. */
+                if (!drflac__reload_cache(bs)) {
+                    return DRFLAC_FALSE;
+                }
+
+                bs_cache = bs->cache;
+                bs_consumedBits = bs->consumedBits;
+            }
+
+            lzcount = drflac__clz(bs_cache);
+            if (lzcount < sizeof(bs_cache)*8) {
+                break;
+            }
+        }
+
+        goto extract_rice_param_part;
+    }
+
+    /* Make sure the cache is restored at the end of it all. */
+    bs->cache = bs_cache;
+    bs->consumedBits = bs_consumedBits;
+
+    return DRFLAC_TRUE;
+}
+
+
+static drflac_bool32 drflac__decode_samples_with_residual__rice__scalar_zeroorder(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+    drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
+    drflac_uint32 zeroCountPart0;
+    drflac_uint32 riceParamPart0;
+    drflac_uint32 riceParamMask;
+    drflac_uint32 i;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pSamplesOut != NULL);
+
+    (void)bitsPerSample;
+    (void)order;
+    (void)shift;
+    (void)coefficients;
+
+    riceParamMask  = (drflac_uint32)~((~0UL) << riceParam);
+
+    i = 0;
+    while (i < count) {
+        /* Rice extraction. */
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0)) {
+            return DRFLAC_FALSE;
+        }
+
+        /* Rice reconstruction. */
+        riceParamPart0 &= riceParamMask;
+        riceParamPart0 |= (zeroCountPart0 << riceParam);
+        riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
+
+        pSamplesOut[i] = riceParamPart0;
+
+        i += 1;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__decode_samples_with_residual__rice__scalar(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 lpcOrder, drflac_int32 lpcShift, drflac_uint32 lpcPrecision, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+    drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
+    drflac_uint32 zeroCountPart0 = 0;
+    drflac_uint32 zeroCountPart1 = 0;
+    drflac_uint32 zeroCountPart2 = 0;
+    drflac_uint32 zeroCountPart3 = 0;
+    drflac_uint32 riceParamPart0 = 0;
+    drflac_uint32 riceParamPart1 = 0;
+    drflac_uint32 riceParamPart2 = 0;
+    drflac_uint32 riceParamPart3 = 0;
+    drflac_uint32 riceParamMask;
+    const drflac_int32* pSamplesOutEnd;
+    drflac_uint32 i;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pSamplesOut != NULL);
+
+    if (lpcOrder == 0) {
+        return drflac__decode_samples_with_residual__rice__scalar_zeroorder(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, coefficients, pSamplesOut);
+    }
+
+    riceParamMask  = (drflac_uint32)~((~0UL) << riceParam);
+    pSamplesOutEnd = pSamplesOut + (count & ~3);
+
+    if (drflac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
+        while (pSamplesOut < pSamplesOutEnd) {
+            /*
+            Rice extraction. It's faster to do this one at a time against local variables than it is to use the x4 version
+            against an array. Not sure why, but perhaps it's making more efficient use of registers?
+            */
+            if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
+                return DRFLAC_FALSE;
+            }
+
+            riceParamPart0 &= riceParamMask;
+            riceParamPart1 &= riceParamMask;
+            riceParamPart2 &= riceParamMask;
+            riceParamPart3 &= riceParamMask;
+
+            riceParamPart0 |= (zeroCountPart0 << riceParam);
+            riceParamPart1 |= (zeroCountPart1 << riceParam);
+            riceParamPart2 |= (zeroCountPart2 << riceParam);
+            riceParamPart3 |= (zeroCountPart3 << riceParam);
+
+            riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
+            riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
+            riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
+            riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
+
+            pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + 0);
+            pSamplesOut[1] = riceParamPart1 + drflac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + 1);
+            pSamplesOut[2] = riceParamPart2 + drflac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + 2);
+            pSamplesOut[3] = riceParamPart3 + drflac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + 3);
+
+            pSamplesOut += 4;
+        }
+    } else {
+        while (pSamplesOut < pSamplesOutEnd) {
+            if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
+                return DRFLAC_FALSE;
+            }
+
+            riceParamPart0 &= riceParamMask;
+            riceParamPart1 &= riceParamMask;
+            riceParamPart2 &= riceParamMask;
+            riceParamPart3 &= riceParamMask;
+
+            riceParamPart0 |= (zeroCountPart0 << riceParam);
+            riceParamPart1 |= (zeroCountPart1 << riceParam);
+            riceParamPart2 |= (zeroCountPart2 << riceParam);
+            riceParamPart3 |= (zeroCountPart3 << riceParam);
+
+            riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
+            riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
+            riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
+            riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
+
+            pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + 0);
+            pSamplesOut[1] = riceParamPart1 + drflac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + 1);
+            pSamplesOut[2] = riceParamPart2 + drflac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + 2);
+            pSamplesOut[3] = riceParamPart3 + drflac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + 3);
+
+            pSamplesOut += 4;
+        }
+    }
+
+    i = (count & ~3);
+    while (i < count) {
+        /* Rice extraction. */
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0)) {
+            return DRFLAC_FALSE;
+        }
+
+        /* Rice reconstruction. */
+        riceParamPart0 &= riceParamMask;
+        riceParamPart0 |= (zeroCountPart0 << riceParam);
+        riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
+        /*riceParamPart0  = (riceParamPart0 >> 1) ^ (~(riceParamPart0 & 0x01) + 1);*/
+
+        /* Sample reconstruction. */
+        if (drflac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
+            pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + 0);
+        } else {
+            pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + 0);
+        }
+
+        i += 1;
+        pSamplesOut += 1;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE __m128i drflac__mm_packs_interleaved_epi32(__m128i a, __m128i b)
+{
+    __m128i r;
+
+    /* Pack. */
+    r = _mm_packs_epi32(a, b);
+
+    /* a3a2 a1a0 b3b2 b1b0 -> a3a2 b3b2 a1a0 b1b0 */
+    r = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 1, 2, 0));
+
+    /* a3a2 b3b2 a1a0 b1b0 -> a3b3 a2b2 a1b1 a0b0 */
+    r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(3, 1, 2, 0));
+    r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(3, 1, 2, 0));
+
+    return r;
+}
+#endif
+
+#if defined(DRFLAC_SUPPORT_SSE41)
+static DRFLAC_INLINE __m128i drflac__mm_not_si128(__m128i a)
+{
+    return _mm_xor_si128(a, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+}
+
+static DRFLAC_INLINE __m128i drflac__mm_hadd_epi32(__m128i x)
+{
+    __m128i x64 = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
+    __m128i x32 = _mm_shufflelo_epi16(x64, _MM_SHUFFLE(1, 0, 3, 2));
+    return _mm_add_epi32(x64, x32);
+}
+
+static DRFLAC_INLINE __m128i drflac__mm_hadd_epi64(__m128i x)
+{
+    return _mm_add_epi64(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
+}
+
+static DRFLAC_INLINE __m128i drflac__mm_srai_epi64(__m128i x, int count)
+{
+    /*
+    To simplify this we are assuming count < 32. This restriction allows us to work on a low side and a high side. The low side
+    is shifted with zero bits, whereas the right side is shifted with sign bits.
+    */
+    __m128i lo = _mm_srli_epi64(x, count);
+    __m128i hi = _mm_srai_epi32(x, count);
+
+    hi = _mm_and_si128(hi, _mm_set_epi32(0xFFFFFFFF, 0, 0xFFFFFFFF, 0));    /* The high part needs to have the low part cleared. */
+
+    return _mm_or_si128(lo, hi);
+}
+
+static drflac_bool32 drflac__decode_samples_with_residual__rice__sse41_32(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+    int i;
+    drflac_uint32 riceParamMask;
+    drflac_int32* pDecodedSamples    = pSamplesOut;
+    drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
+    drflac_uint32 zeroCountParts0 = 0;
+    drflac_uint32 zeroCountParts1 = 0;
+    drflac_uint32 zeroCountParts2 = 0;
+    drflac_uint32 zeroCountParts3 = 0;
+    drflac_uint32 riceParamParts0 = 0;
+    drflac_uint32 riceParamParts1 = 0;
+    drflac_uint32 riceParamParts2 = 0;
+    drflac_uint32 riceParamParts3 = 0;
+    __m128i coefficients128_0;
+    __m128i coefficients128_4;
+    __m128i coefficients128_8;
+    __m128i samples128_0;
+    __m128i samples128_4;
+    __m128i samples128_8;
+    __m128i riceParamMask128;
+
+    const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
+
+    riceParamMask    = (drflac_uint32)~((~0UL) << riceParam);
+    riceParamMask128 = _mm_set1_epi32(riceParamMask);
+
+    /* Pre-load. */
+    coefficients128_0 = _mm_setzero_si128();
+    coefficients128_4 = _mm_setzero_si128();
+    coefficients128_8 = _mm_setzero_si128();
+
+    samples128_0 = _mm_setzero_si128();
+    samples128_4 = _mm_setzero_si128();
+    samples128_8 = _mm_setzero_si128();
+
+    /*
+    Pre-loading the coefficients and prior samples is annoying because we need to ensure we don't try reading more than
+    what's available in the input buffers. It would be convenient to use a fall-through switch to do this, but this results
+    in strict aliasing warnings with GCC. To work around this I'm just doing something hacky. This feels a bit convoluted
+    so I think there's opportunity for this to be simplified.
+    */
+#if 1
+    {
+        int runningOrder = order;
+
+        /* 0 - 3. */
+        if (runningOrder >= 4) {
+            coefficients128_0 = _mm_loadu_si128((const __m128i*)(coefficients + 0));
+            samples128_0      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 4));
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: coefficients128_0 = _mm_set_epi32(0, coefficients[2], coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], pSamplesOut[-3], 0); break;
+                case 2: coefficients128_0 = _mm_set_epi32(0, 0,               coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], 0,               0); break;
+                case 1: coefficients128_0 = _mm_set_epi32(0, 0,               0,               coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], 0,               0,               0); break;
+            }
+            runningOrder = 0;
+        }
+
+        /* 4 - 7 */
+        if (runningOrder >= 4) {
+            coefficients128_4 = _mm_loadu_si128((const __m128i*)(coefficients + 4));
+            samples128_4      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 8));
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: coefficients128_4 = _mm_set_epi32(0, coefficients[6], coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], pSamplesOut[-7], 0); break;
+                case 2: coefficients128_4 = _mm_set_epi32(0, 0,               coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], 0,               0); break;
+                case 1: coefficients128_4 = _mm_set_epi32(0, 0,               0,               coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], 0,               0,               0); break;
+            }
+            runningOrder = 0;
+        }
+
+        /* 8 - 11 */
+        if (runningOrder == 4) {
+            coefficients128_8 = _mm_loadu_si128((const __m128i*)(coefficients + 8));
+            samples128_8      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 12));
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: coefficients128_8 = _mm_set_epi32(0, coefficients[10], coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], pSamplesOut[-11], 0); break;
+                case 2: coefficients128_8 = _mm_set_epi32(0, 0,                coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], 0,                0); break;
+                case 1: coefficients128_8 = _mm_set_epi32(0, 0,                0,               coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], 0,                0,                0); break;
+            }
+            runningOrder = 0;
+        }
+
+        /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
+        coefficients128_0 = _mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(0, 1, 2, 3));
+        coefficients128_4 = _mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(0, 1, 2, 3));
+        coefficients128_8 = _mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(0, 1, 2, 3));
+    }
+#else
+    /* This causes strict-aliasing warnings with GCC. */
+    switch (order)
+    {
+    case 12: ((drflac_int32*)&coefficients128_8)[0] = coefficients[11]; ((drflac_int32*)&samples128_8)[0] = pDecodedSamples[-12];
+    case 11: ((drflac_int32*)&coefficients128_8)[1] = coefficients[10]; ((drflac_int32*)&samples128_8)[1] = pDecodedSamples[-11];
+    case 10: ((drflac_int32*)&coefficients128_8)[2] = coefficients[ 9]; ((drflac_int32*)&samples128_8)[2] = pDecodedSamples[-10];
+    case 9:  ((drflac_int32*)&coefficients128_8)[3] = coefficients[ 8]; ((drflac_int32*)&samples128_8)[3] = pDecodedSamples[- 9];
+    case 8:  ((drflac_int32*)&coefficients128_4)[0] = coefficients[ 7]; ((drflac_int32*)&samples128_4)[0] = pDecodedSamples[- 8];
+    case 7:  ((drflac_int32*)&coefficients128_4)[1] = coefficients[ 6]; ((drflac_int32*)&samples128_4)[1] = pDecodedSamples[- 7];
+    case 6:  ((drflac_int32*)&coefficients128_4)[2] = coefficients[ 5]; ((drflac_int32*)&samples128_4)[2] = pDecodedSamples[- 6];
+    case 5:  ((drflac_int32*)&coefficients128_4)[3] = coefficients[ 4]; ((drflac_int32*)&samples128_4)[3] = pDecodedSamples[- 5];
+    case 4:  ((drflac_int32*)&coefficients128_0)[0] = coefficients[ 3]; ((drflac_int32*)&samples128_0)[0] = pDecodedSamples[- 4];
+    case 3:  ((drflac_int32*)&coefficients128_0)[1] = coefficients[ 2]; ((drflac_int32*)&samples128_0)[1] = pDecodedSamples[- 3];
+    case 2:  ((drflac_int32*)&coefficients128_0)[2] = coefficients[ 1]; ((drflac_int32*)&samples128_0)[2] = pDecodedSamples[- 2];
+    case 1:  ((drflac_int32*)&coefficients128_0)[3] = coefficients[ 0]; ((drflac_int32*)&samples128_0)[3] = pDecodedSamples[- 1];
+    }
+#endif
+
+    /* For this version we are doing one sample at a time. */
+    while (pDecodedSamples < pDecodedSamplesEnd) {
+        __m128i prediction128;
+        __m128i zeroCountPart128;
+        __m128i riceParamPart128;
+
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts1, &riceParamParts1) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts2, &riceParamParts2) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts3, &riceParamParts3)) {
+            return DRFLAC_FALSE;
+        }
+
+        zeroCountPart128 = _mm_set_epi32(zeroCountParts3, zeroCountParts2, zeroCountParts1, zeroCountParts0);
+        riceParamPart128 = _mm_set_epi32(riceParamParts3, riceParamParts2, riceParamParts1, riceParamParts0);
+
+        riceParamPart128 = _mm_and_si128(riceParamPart128, riceParamMask128);
+        riceParamPart128 = _mm_or_si128(riceParamPart128, _mm_slli_epi32(zeroCountPart128, riceParam));
+        riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_add_epi32(drflac__mm_not_si128(_mm_and_si128(riceParamPart128, _mm_set1_epi32(0x01))), _mm_set1_epi32(0x01)));  /* <-- SSE2 compatible */
+        /*riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_mullo_epi32(_mm_and_si128(riceParamPart128, _mm_set1_epi32(0x01)), _mm_set1_epi32(0xFFFFFFFF)));*/   /* <-- Only supported from SSE4.1 and is slower in my testing... */
+
+        if (order <= 4) {
+            for (i = 0; i < 4; i += 1) {
+                prediction128 = _mm_mullo_epi32(coefficients128_0, samples128_0);
+
+                /* Horizontal add and shift. */
+                prediction128 = drflac__mm_hadd_epi32(prediction128);
+                prediction128 = _mm_srai_epi32(prediction128, shift);
+                prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
+
+                samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
+                riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
+            }
+        } else if (order <= 8) {
+            for (i = 0; i < 4; i += 1) {
+                prediction128 =                              _mm_mullo_epi32(coefficients128_4, samples128_4);
+                prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_0, samples128_0));
+
+                /* Horizontal add and shift. */
+                prediction128 = drflac__mm_hadd_epi32(prediction128);
+                prediction128 = _mm_srai_epi32(prediction128, shift);
+                prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
+
+                samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
+                samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
+                riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
+            }
+        } else {
+            for (i = 0; i < 4; i += 1) {
+                prediction128 =                              _mm_mullo_epi32(coefficients128_8, samples128_8);
+                prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_4, samples128_4));
+                prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_0, samples128_0));
+
+                /* Horizontal add and shift. */
+                prediction128 = drflac__mm_hadd_epi32(prediction128);
+                prediction128 = _mm_srai_epi32(prediction128, shift);
+                prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
+
+                samples128_8 = _mm_alignr_epi8(samples128_4,  samples128_8, 4);
+                samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
+                samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
+                riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
+            }
+        }
+
+        /* We store samples in groups of 4. */
+        _mm_storeu_si128((__m128i*)pDecodedSamples, samples128_0);
+        pDecodedSamples += 4;
+    }
+
+    /* Make sure we process the last few samples. */
+    i = (count & ~3);
+    while (i < (int)count) {
+        /* Rice extraction. */
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0)) {
+            return DRFLAC_FALSE;
+        }
+
+        /* Rice reconstruction. */
+        riceParamParts0 &= riceParamMask;
+        riceParamParts0 |= (zeroCountParts0 << riceParam);
+        riceParamParts0  = (riceParamParts0 >> 1) ^ t[riceParamParts0 & 0x01];
+
+        /* Sample reconstruction. */
+        pDecodedSamples[0] = riceParamParts0 + drflac__calculate_prediction_32(order, shift, coefficients, pDecodedSamples);
+
+        i += 1;
+        pDecodedSamples += 1;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__decode_samples_with_residual__rice__sse41_64(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+    int i;
+    drflac_uint32 riceParamMask;
+    drflac_int32* pDecodedSamples    = pSamplesOut;
+    drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
+    drflac_uint32 zeroCountParts0 = 0;
+    drflac_uint32 zeroCountParts1 = 0;
+    drflac_uint32 zeroCountParts2 = 0;
+    drflac_uint32 zeroCountParts3 = 0;
+    drflac_uint32 riceParamParts0 = 0;
+    drflac_uint32 riceParamParts1 = 0;
+    drflac_uint32 riceParamParts2 = 0;
+    drflac_uint32 riceParamParts3 = 0;
+    __m128i coefficients128_0;
+    __m128i coefficients128_4;
+    __m128i coefficients128_8;
+    __m128i samples128_0;
+    __m128i samples128_4;
+    __m128i samples128_8;
+    __m128i prediction128;
+    __m128i riceParamMask128;
+
+    const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
+
+    DRFLAC_ASSERT(order <= 12);
+
+    riceParamMask    = (drflac_uint32)~((~0UL) << riceParam);
+    riceParamMask128 = _mm_set1_epi32(riceParamMask);
+
+    prediction128 = _mm_setzero_si128();
+
+    /* Pre-load. */
+    coefficients128_0  = _mm_setzero_si128();
+    coefficients128_4  = _mm_setzero_si128();
+    coefficients128_8  = _mm_setzero_si128();
+
+    samples128_0  = _mm_setzero_si128();
+    samples128_4  = _mm_setzero_si128();
+    samples128_8  = _mm_setzero_si128();
+
+#if 1
+    {
+        int runningOrder = order;
+
+        /* 0 - 3. */
+        if (runningOrder >= 4) {
+            coefficients128_0 = _mm_loadu_si128((const __m128i*)(coefficients + 0));
+            samples128_0      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 4));
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: coefficients128_0 = _mm_set_epi32(0, coefficients[2], coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], pSamplesOut[-3], 0); break;
+                case 2: coefficients128_0 = _mm_set_epi32(0, 0,               coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], 0,               0); break;
+                case 1: coefficients128_0 = _mm_set_epi32(0, 0,               0,               coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], 0,               0,               0); break;
+            }
+            runningOrder = 0;
+        }
+
+        /* 4 - 7 */
+        if (runningOrder >= 4) {
+            coefficients128_4 = _mm_loadu_si128((const __m128i*)(coefficients + 4));
+            samples128_4      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 8));
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: coefficients128_4 = _mm_set_epi32(0, coefficients[6], coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], pSamplesOut[-7], 0); break;
+                case 2: coefficients128_4 = _mm_set_epi32(0, 0,               coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], 0,               0); break;
+                case 1: coefficients128_4 = _mm_set_epi32(0, 0,               0,               coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], 0,               0,               0); break;
+            }
+            runningOrder = 0;
+        }
+
+        /* 8 - 11 */
+        if (runningOrder == 4) {
+            coefficients128_8 = _mm_loadu_si128((const __m128i*)(coefficients + 8));
+            samples128_8      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 12));
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: coefficients128_8 = _mm_set_epi32(0, coefficients[10], coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], pSamplesOut[-11], 0); break;
+                case 2: coefficients128_8 = _mm_set_epi32(0, 0,                coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], 0,                0); break;
+                case 1: coefficients128_8 = _mm_set_epi32(0, 0,                0,               coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], 0,                0,                0); break;
+            }
+            runningOrder = 0;
+        }
+
+        /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
+        coefficients128_0 = _mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(0, 1, 2, 3));
+        coefficients128_4 = _mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(0, 1, 2, 3));
+        coefficients128_8 = _mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(0, 1, 2, 3));
+    }
+#else
+    switch (order)
+    {
+    case 12: ((drflac_int32*)&coefficients128_8)[0] = coefficients[11]; ((drflac_int32*)&samples128_8)[0] = pDecodedSamples[-12];
+    case 11: ((drflac_int32*)&coefficients128_8)[1] = coefficients[10]; ((drflac_int32*)&samples128_8)[1] = pDecodedSamples[-11];
+    case 10: ((drflac_int32*)&coefficients128_8)[2] = coefficients[ 9]; ((drflac_int32*)&samples128_8)[2] = pDecodedSamples[-10];
+    case 9:  ((drflac_int32*)&coefficients128_8)[3] = coefficients[ 8]; ((drflac_int32*)&samples128_8)[3] = pDecodedSamples[- 9];
+    case 8:  ((drflac_int32*)&coefficients128_4)[0] = coefficients[ 7]; ((drflac_int32*)&samples128_4)[0] = pDecodedSamples[- 8];
+    case 7:  ((drflac_int32*)&coefficients128_4)[1] = coefficients[ 6]; ((drflac_int32*)&samples128_4)[1] = pDecodedSamples[- 7];
+    case 6:  ((drflac_int32*)&coefficients128_4)[2] = coefficients[ 5]; ((drflac_int32*)&samples128_4)[2] = pDecodedSamples[- 6];
+    case 5:  ((drflac_int32*)&coefficients128_4)[3] = coefficients[ 4]; ((drflac_int32*)&samples128_4)[3] = pDecodedSamples[- 5];
+    case 4:  ((drflac_int32*)&coefficients128_0)[0] = coefficients[ 3]; ((drflac_int32*)&samples128_0)[0] = pDecodedSamples[- 4];
+    case 3:  ((drflac_int32*)&coefficients128_0)[1] = coefficients[ 2]; ((drflac_int32*)&samples128_0)[1] = pDecodedSamples[- 3];
+    case 2:  ((drflac_int32*)&coefficients128_0)[2] = coefficients[ 1]; ((drflac_int32*)&samples128_0)[2] = pDecodedSamples[- 2];
+    case 1:  ((drflac_int32*)&coefficients128_0)[3] = coefficients[ 0]; ((drflac_int32*)&samples128_0)[3] = pDecodedSamples[- 1];
+    }
+#endif
+
+    /* For this version we are doing one sample at a time. */
+    while (pDecodedSamples < pDecodedSamplesEnd) {
+        __m128i zeroCountPart128;
+        __m128i riceParamPart128;
+
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts1, &riceParamParts1) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts2, &riceParamParts2) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts3, &riceParamParts3)) {
+            return DRFLAC_FALSE;
+        }
+
+        zeroCountPart128 = _mm_set_epi32(zeroCountParts3, zeroCountParts2, zeroCountParts1, zeroCountParts0);
+        riceParamPart128 = _mm_set_epi32(riceParamParts3, riceParamParts2, riceParamParts1, riceParamParts0);
+
+        riceParamPart128 = _mm_and_si128(riceParamPart128, riceParamMask128);
+        riceParamPart128 = _mm_or_si128(riceParamPart128, _mm_slli_epi32(zeroCountPart128, riceParam));
+        riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_add_epi32(drflac__mm_not_si128(_mm_and_si128(riceParamPart128, _mm_set1_epi32(1))), _mm_set1_epi32(1)));
+
+        for (i = 0; i < 4; i += 1) {
+            prediction128 = _mm_xor_si128(prediction128, prediction128);    /* Reset to 0. */
+
+            switch (order)
+            {
+            case 12:
+            case 11: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_8, _MM_SHUFFLE(1, 1, 0, 0))));
+            case 10:
+            case  9: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_8, _MM_SHUFFLE(3, 3, 2, 2))));
+            case  8:
+            case  7: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_4, _MM_SHUFFLE(1, 1, 0, 0))));
+            case  6:
+            case  5: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_4, _MM_SHUFFLE(3, 3, 2, 2))));
+            case  4:
+            case  3: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_0, _MM_SHUFFLE(1, 1, 0, 0))));
+            case  2:
+            case  1: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_0, _MM_SHUFFLE(3, 3, 2, 2))));
+            }
+
+            /* Horizontal add and shift. */
+            prediction128 = drflac__mm_hadd_epi64(prediction128);
+            prediction128 = drflac__mm_srai_epi64(prediction128, shift);
+            prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
+
+            /* Our value should be sitting in prediction128[0]. We need to combine this with our SSE samples. */
+            samples128_8 = _mm_alignr_epi8(samples128_4,  samples128_8, 4);
+            samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
+            samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
+
+            /* Slide our rice parameter down so that the value in position 0 contains the next one to process. */
+            riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
+        }
+
+        /* We store samples in groups of 4. */
+        _mm_storeu_si128((__m128i*)pDecodedSamples, samples128_0);
+        pDecodedSamples += 4;
+    }
+
+    /* Make sure we process the last few samples. */
+    i = (count & ~3);
+    while (i < (int)count) {
+        /* Rice extraction. */
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0)) {
+            return DRFLAC_FALSE;
+        }
+
+        /* Rice reconstruction. */
+        riceParamParts0 &= riceParamMask;
+        riceParamParts0 |= (zeroCountParts0 << riceParam);
+        riceParamParts0  = (riceParamParts0 >> 1) ^ t[riceParamParts0 & 0x01];
+
+        /* Sample reconstruction. */
+        pDecodedSamples[0] = riceParamParts0 + drflac__calculate_prediction_64(order, shift, coefficients, pDecodedSamples);
+
+        i += 1;
+        pDecodedSamples += 1;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__decode_samples_with_residual__rice__sse41(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 lpcOrder, drflac_int32 lpcShift, drflac_uint32 lpcPrecision, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pSamplesOut != NULL);
+
+    /* In my testing the order is rarely > 12, so in this case I'm going to simplify the SSE implementation by only handling order <= 12. */
+    if (lpcOrder > 0 && lpcOrder <= 12) {
+        if (drflac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
+            return drflac__decode_samples_with_residual__rice__sse41_64(bs, count, riceParam, lpcOrder, lpcShift, coefficients, pSamplesOut);
+        } else {
+            return drflac__decode_samples_with_residual__rice__sse41_32(bs, count, riceParam, lpcOrder, lpcShift, coefficients, pSamplesOut);
+        }
+    } else {
+        return drflac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
+    }
+}
+#endif
+
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac__vst2q_s32(drflac_int32* p, int32x4x2_t x)
+{
+    vst1q_s32(p+0, x.val[0]);
+    vst1q_s32(p+4, x.val[1]);
+}
+
+static DRFLAC_INLINE void drflac__vst2q_u32(drflac_uint32* p, uint32x4x2_t x)
+{
+    vst1q_u32(p+0, x.val[0]);
+    vst1q_u32(p+4, x.val[1]);
+}
+
+static DRFLAC_INLINE void drflac__vst2q_f32(float* p, float32x4x2_t x)
+{
+    vst1q_f32(p+0, x.val[0]);
+    vst1q_f32(p+4, x.val[1]);
+}
+
+static DRFLAC_INLINE void drflac__vst2q_s16(drflac_int16* p, int16x4x2_t x)
+{
+    vst1q_s16(p, vcombine_s16(x.val[0], x.val[1]));
+}
+
+static DRFLAC_INLINE void drflac__vst2q_u16(drflac_uint16* p, uint16x4x2_t x)
+{
+    vst1q_u16(p, vcombine_u16(x.val[0], x.val[1]));
+}
+
+static DRFLAC_INLINE int32x4_t drflac__vdupq_n_s32x4(drflac_int32 x3, drflac_int32 x2, drflac_int32 x1, drflac_int32 x0)
+{
+    drflac_int32 x[4];
+    x[3] = x3;
+    x[2] = x2;
+    x[1] = x1;
+    x[0] = x0;
+    return vld1q_s32(x);
+}
+
+static DRFLAC_INLINE int32x4_t drflac__valignrq_s32_1(int32x4_t a, int32x4_t b)
+{
+    /* Equivalent to SSE's _mm_alignr_epi8(a, b, 4) */
+
+    /* Reference */
+    /*return drflac__vdupq_n_s32x4(
+        vgetq_lane_s32(a, 0),
+        vgetq_lane_s32(b, 3),
+        vgetq_lane_s32(b, 2),
+        vgetq_lane_s32(b, 1)
+    );*/
+
+    return vextq_s32(b, a, 1);
+}
+
+static DRFLAC_INLINE uint32x4_t drflac__valignrq_u32_1(uint32x4_t a, uint32x4_t b)
+{
+    /* Equivalent to SSE's _mm_alignr_epi8(a, b, 4) */
+
+    /* Reference */
+    /*return drflac__vdupq_n_s32x4(
+        vgetq_lane_s32(a, 0),
+        vgetq_lane_s32(b, 3),
+        vgetq_lane_s32(b, 2),
+        vgetq_lane_s32(b, 1)
+    );*/
+
+    return vextq_u32(b, a, 1);
+}
+
+static DRFLAC_INLINE int32x2_t drflac__vhaddq_s32(int32x4_t x)
+{
+    /* The sum must end up in position 0. */
+
+    /* Reference */
+    /*return vdupq_n_s32(
+        vgetq_lane_s32(x, 3) +
+        vgetq_lane_s32(x, 2) +
+        vgetq_lane_s32(x, 1) +
+        vgetq_lane_s32(x, 0)
+    );*/
+
+    int32x2_t r = vadd_s32(vget_high_s32(x), vget_low_s32(x));
+    return vpadd_s32(r, r);
+}
+
+static DRFLAC_INLINE int64x1_t drflac__vhaddq_s64(int64x2_t x)
+{
+    return vadd_s64(vget_high_s64(x), vget_low_s64(x));
+}
+
+static DRFLAC_INLINE int32x4_t drflac__vrevq_s32(int32x4_t x)
+{
+    /* Reference */
+    /*return drflac__vdupq_n_s32x4(
+        vgetq_lane_s32(x, 0),
+        vgetq_lane_s32(x, 1),
+        vgetq_lane_s32(x, 2),
+        vgetq_lane_s32(x, 3)
+    );*/
+
+    return vrev64q_s32(vcombine_s32(vget_high_s32(x), vget_low_s32(x)));
+}
+
+static DRFLAC_INLINE int32x4_t drflac__vnotq_s32(int32x4_t x)
+{
+    return veorq_s32(x, vdupq_n_s32(0xFFFFFFFF));
+}
+
+static DRFLAC_INLINE uint32x4_t drflac__vnotq_u32(uint32x4_t x)
+{
+    return veorq_u32(x, vdupq_n_u32(0xFFFFFFFF));
+}
+
+static drflac_bool32 drflac__decode_samples_with_residual__rice__neon_32(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+    int i;
+    drflac_uint32 riceParamMask;
+    drflac_int32* pDecodedSamples    = pSamplesOut;
+    drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
+    drflac_uint32 zeroCountParts[4];
+    drflac_uint32 riceParamParts[4];
+    int32x4_t coefficients128_0;
+    int32x4_t coefficients128_4;
+    int32x4_t coefficients128_8;
+    int32x4_t samples128_0;
+    int32x4_t samples128_4;
+    int32x4_t samples128_8;
+    uint32x4_t riceParamMask128;
+    int32x4_t riceParam128;
+    int32x2_t shift64;
+    uint32x4_t one128;
+
+    const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
+
+    riceParamMask    = (drflac_uint32)~((~0UL) << riceParam);
+    riceParamMask128 = vdupq_n_u32(riceParamMask);
+
+    riceParam128 = vdupq_n_s32(riceParam);
+    shift64 = vdup_n_s32(-shift); /* Negate the shift because we'll be doing a variable shift using vshlq_s32(). */
+    one128 = vdupq_n_u32(1);
+
+    /*
+    Pre-loading the coefficients and prior samples is annoying because we need to ensure we don't try reading more than
+    what's available in the input buffers. It would be conenient to use a fall-through switch to do this, but this results
+    in strict aliasing warnings with GCC. To work around this I'm just doing something hacky. This feels a bit convoluted
+    so I think there's opportunity for this to be simplified.
+    */
+    {
+        int runningOrder = order;
+        drflac_int32 tempC[4] = {0, 0, 0, 0};
+        drflac_int32 tempS[4] = {0, 0, 0, 0};
+
+        /* 0 - 3. */
+        if (runningOrder >= 4) {
+            coefficients128_0 = vld1q_s32(coefficients + 0);
+            samples128_0      = vld1q_s32(pSamplesOut  - 4);
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: tempC[2] = coefficients[2]; tempS[1] = pSamplesOut[-3]; /* fallthrough */
+                case 2: tempC[1] = coefficients[1]; tempS[2] = pSamplesOut[-2]; /* fallthrough */
+                case 1: tempC[0] = coefficients[0]; tempS[3] = pSamplesOut[-1]; /* fallthrough */
+            }
+
+            coefficients128_0 = vld1q_s32(tempC);
+            samples128_0      = vld1q_s32(tempS);
+            runningOrder = 0;
+        }
+
+        /* 4 - 7 */
+        if (runningOrder >= 4) {
+            coefficients128_4 = vld1q_s32(coefficients + 4);
+            samples128_4      = vld1q_s32(pSamplesOut  - 8);
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: tempC[2] = coefficients[6]; tempS[1] = pSamplesOut[-7]; /* fallthrough */
+                case 2: tempC[1] = coefficients[5]; tempS[2] = pSamplesOut[-6]; /* fallthrough */
+                case 1: tempC[0] = coefficients[4]; tempS[3] = pSamplesOut[-5]; /* fallthrough */
+            }
+
+            coefficients128_4 = vld1q_s32(tempC);
+            samples128_4      = vld1q_s32(tempS);
+            runningOrder = 0;
+        }
+
+        /* 8 - 11 */
+        if (runningOrder == 4) {
+            coefficients128_8 = vld1q_s32(coefficients + 8);
+            samples128_8      = vld1q_s32(pSamplesOut  - 12);
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: tempC[2] = coefficients[10]; tempS[1] = pSamplesOut[-11]; /* fallthrough */
+                case 2: tempC[1] = coefficients[ 9]; tempS[2] = pSamplesOut[-10]; /* fallthrough */
+                case 1: tempC[0] = coefficients[ 8]; tempS[3] = pSamplesOut[- 9]; /* fallthrough */
+            }
+
+            coefficients128_8 = vld1q_s32(tempC);
+            samples128_8      = vld1q_s32(tempS);
+            runningOrder = 0;
+        }
+
+        /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
+        coefficients128_0 = drflac__vrevq_s32(coefficients128_0);
+        coefficients128_4 = drflac__vrevq_s32(coefficients128_4);
+        coefficients128_8 = drflac__vrevq_s32(coefficients128_8);
+    }
+
+    /* For this version we are doing one sample at a time. */
+    while (pDecodedSamples < pDecodedSamplesEnd) {
+        int32x4_t prediction128;
+        int32x2_t prediction64;
+        uint32x4_t zeroCountPart128;
+        uint32x4_t riceParamPart128;
+
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0]) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[1], &riceParamParts[1]) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[2], &riceParamParts[2]) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[3], &riceParamParts[3])) {
+            return DRFLAC_FALSE;
+        }
+
+        zeroCountPart128 = vld1q_u32(zeroCountParts);
+        riceParamPart128 = vld1q_u32(riceParamParts);
+
+        riceParamPart128 = vandq_u32(riceParamPart128, riceParamMask128);
+        riceParamPart128 = vorrq_u32(riceParamPart128, vshlq_u32(zeroCountPart128, riceParam128));
+        riceParamPart128 = veorq_u32(vshrq_n_u32(riceParamPart128, 1), vaddq_u32(drflac__vnotq_u32(vandq_u32(riceParamPart128, one128)), one128));
+
+        if (order <= 4) {
+            for (i = 0; i < 4; i += 1) {
+                prediction128 = vmulq_s32(coefficients128_0, samples128_0);
+
+                /* Horizontal add and shift. */
+                prediction64 = drflac__vhaddq_s32(prediction128);
+                prediction64 = vshl_s32(prediction64, shift64);
+                prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
+
+                samples128_0 = drflac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
+                riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
+            }
+        } else if (order <= 8) {
+            for (i = 0; i < 4; i += 1) {
+                prediction128 =                vmulq_s32(coefficients128_4, samples128_4);
+                prediction128 = vmlaq_s32(prediction128, coefficients128_0, samples128_0);
+
+                /* Horizontal add and shift. */
+                prediction64 = drflac__vhaddq_s32(prediction128);
+                prediction64 = vshl_s32(prediction64, shift64);
+                prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
+
+                samples128_4 = drflac__valignrq_s32_1(samples128_0, samples128_4);
+                samples128_0 = drflac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
+                riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
+            }
+        } else {
+            for (i = 0; i < 4; i += 1) {
+                prediction128 =                vmulq_s32(coefficients128_8, samples128_8);
+                prediction128 = vmlaq_s32(prediction128, coefficients128_4, samples128_4);
+                prediction128 = vmlaq_s32(prediction128, coefficients128_0, samples128_0);
+
+                /* Horizontal add and shift. */
+                prediction64 = drflac__vhaddq_s32(prediction128);
+                prediction64 = vshl_s32(prediction64, shift64);
+                prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
+
+                samples128_8 = drflac__valignrq_s32_1(samples128_4, samples128_8);
+                samples128_4 = drflac__valignrq_s32_1(samples128_0, samples128_4);
+                samples128_0 = drflac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
+                riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
+            }
+        }
+
+        /* We store samples in groups of 4. */
+        vst1q_s32(pDecodedSamples, samples128_0);
+        pDecodedSamples += 4;
+    }
+
+    /* Make sure we process the last few samples. */
+    i = (count & ~3);
+    while (i < (int)count) {
+        /* Rice extraction. */
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0])) {
+            return DRFLAC_FALSE;
+        }
+
+        /* Rice reconstruction. */
+        riceParamParts[0] &= riceParamMask;
+        riceParamParts[0] |= (zeroCountParts[0] << riceParam);
+        riceParamParts[0]  = (riceParamParts[0] >> 1) ^ t[riceParamParts[0] & 0x01];
+
+        /* Sample reconstruction. */
+        pDecodedSamples[0] = riceParamParts[0] + drflac__calculate_prediction_32(order, shift, coefficients, pDecodedSamples);
+
+        i += 1;
+        pDecodedSamples += 1;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__decode_samples_with_residual__rice__neon_64(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+    int i;
+    drflac_uint32 riceParamMask;
+    drflac_int32* pDecodedSamples    = pSamplesOut;
+    drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
+    drflac_uint32 zeroCountParts[4];
+    drflac_uint32 riceParamParts[4];
+    int32x4_t coefficients128_0;
+    int32x4_t coefficients128_4;
+    int32x4_t coefficients128_8;
+    int32x4_t samples128_0;
+    int32x4_t samples128_4;
+    int32x4_t samples128_8;
+    uint32x4_t riceParamMask128;
+    int32x4_t riceParam128;
+    int64x1_t shift64;
+    uint32x4_t one128;
+    int64x2_t prediction128 = { 0 };
+    uint32x4_t zeroCountPart128;
+    uint32x4_t riceParamPart128;
+
+    const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
+
+    riceParamMask    = (drflac_uint32)~((~0UL) << riceParam);
+    riceParamMask128 = vdupq_n_u32(riceParamMask);
+
+    riceParam128 = vdupq_n_s32(riceParam);
+    shift64 = vdup_n_s64(-shift); /* Negate the shift because we'll be doing a variable shift using vshlq_s32(). */
+    one128 = vdupq_n_u32(1);
+
+    /*
+    Pre-loading the coefficients and prior samples is annoying because we need to ensure we don't try reading more than
+    what's available in the input buffers. It would be convenient to use a fall-through switch to do this, but this results
+    in strict aliasing warnings with GCC. To work around this I'm just doing something hacky. This feels a bit convoluted
+    so I think there's opportunity for this to be simplified.
+    */
+    {
+        int runningOrder = order;
+        drflac_int32 tempC[4] = {0, 0, 0, 0};
+        drflac_int32 tempS[4] = {0, 0, 0, 0};
+
+        /* 0 - 3. */
+        if (runningOrder >= 4) {
+            coefficients128_0 = vld1q_s32(coefficients + 0);
+            samples128_0      = vld1q_s32(pSamplesOut  - 4);
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: tempC[2] = coefficients[2]; tempS[1] = pSamplesOut[-3]; /* fallthrough */
+                case 2: tempC[1] = coefficients[1]; tempS[2] = pSamplesOut[-2]; /* fallthrough */
+                case 1: tempC[0] = coefficients[0]; tempS[3] = pSamplesOut[-1]; /* fallthrough */
+            }
+
+            coefficients128_0 = vld1q_s32(tempC);
+            samples128_0      = vld1q_s32(tempS);
+            runningOrder = 0;
+        }
+
+        /* 4 - 7 */
+        if (runningOrder >= 4) {
+            coefficients128_4 = vld1q_s32(coefficients + 4);
+            samples128_4      = vld1q_s32(pSamplesOut  - 8);
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: tempC[2] = coefficients[6]; tempS[1] = pSamplesOut[-7]; /* fallthrough */
+                case 2: tempC[1] = coefficients[5]; tempS[2] = pSamplesOut[-6]; /* fallthrough */
+                case 1: tempC[0] = coefficients[4]; tempS[3] = pSamplesOut[-5]; /* fallthrough */
+            }
+
+            coefficients128_4 = vld1q_s32(tempC);
+            samples128_4      = vld1q_s32(tempS);
+            runningOrder = 0;
+        }
+
+        /* 8 - 11 */
+        if (runningOrder == 4) {
+            coefficients128_8 = vld1q_s32(coefficients + 8);
+            samples128_8      = vld1q_s32(pSamplesOut  - 12);
+            runningOrder -= 4;
+        } else {
+            switch (runningOrder) {
+                case 3: tempC[2] = coefficients[10]; tempS[1] = pSamplesOut[-11]; /* fallthrough */
+                case 2: tempC[1] = coefficients[ 9]; tempS[2] = pSamplesOut[-10]; /* fallthrough */
+                case 1: tempC[0] = coefficients[ 8]; tempS[3] = pSamplesOut[- 9]; /* fallthrough */
+            }
+
+            coefficients128_8 = vld1q_s32(tempC);
+            samples128_8      = vld1q_s32(tempS);
+            runningOrder = 0;
+        }
+
+        /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
+        coefficients128_0 = drflac__vrevq_s32(coefficients128_0);
+        coefficients128_4 = drflac__vrevq_s32(coefficients128_4);
+        coefficients128_8 = drflac__vrevq_s32(coefficients128_8);
+    }
+
+    /* For this version we are doing one sample at a time. */
+    while (pDecodedSamples < pDecodedSamplesEnd) {
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0]) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[1], &riceParamParts[1]) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[2], &riceParamParts[2]) ||
+            !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[3], &riceParamParts[3])) {
+            return DRFLAC_FALSE;
+        }
+
+        zeroCountPart128 = vld1q_u32(zeroCountParts);
+        riceParamPart128 = vld1q_u32(riceParamParts);
+
+        riceParamPart128 = vandq_u32(riceParamPart128, riceParamMask128);
+        riceParamPart128 = vorrq_u32(riceParamPart128, vshlq_u32(zeroCountPart128, riceParam128));
+        riceParamPart128 = veorq_u32(vshrq_n_u32(riceParamPart128, 1), vaddq_u32(drflac__vnotq_u32(vandq_u32(riceParamPart128, one128)), one128));
+
+        for (i = 0; i < 4; i += 1) {
+            int64x1_t prediction64;
+
+            prediction128 = veorq_s64(prediction128, prediction128);    /* Reset to 0. */
+            switch (order)
+            {
+            case 12:
+            case 11: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_8), vget_low_s32(samples128_8)));
+            case 10:
+            case  9: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_8), vget_high_s32(samples128_8)));
+            case  8:
+            case  7: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_4), vget_low_s32(samples128_4)));
+            case  6:
+            case  5: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_4), vget_high_s32(samples128_4)));
+            case  4:
+            case  3: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_0), vget_low_s32(samples128_0)));
+            case  2:
+            case  1: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_0), vget_high_s32(samples128_0)));
+            }
+
+            /* Horizontal add and shift. */
+            prediction64 = drflac__vhaddq_s64(prediction128);
+            prediction64 = vshl_s64(prediction64, shift64);
+            prediction64 = vadd_s64(prediction64, vdup_n_s64(vgetq_lane_u32(riceParamPart128, 0)));
+
+            /* Our value should be sitting in prediction64[0]. We need to combine this with our SSE samples. */
+            samples128_8 = drflac__valignrq_s32_1(samples128_4, samples128_8);
+            samples128_4 = drflac__valignrq_s32_1(samples128_0, samples128_4);
+            samples128_0 = drflac__valignrq_s32_1(vcombine_s32(vreinterpret_s32_s64(prediction64), vdup_n_s32(0)), samples128_0);
+
+            /* Slide our rice parameter down so that the value in position 0 contains the next one to process. */
+            riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
+        }
+
+        /* We store samples in groups of 4. */
+        vst1q_s32(pDecodedSamples, samples128_0);
+        pDecodedSamples += 4;
+    }
+
+    /* Make sure we process the last few samples. */
+    i = (count & ~3);
+    while (i < (int)count) {
+        /* Rice extraction. */
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0])) {
+            return DRFLAC_FALSE;
+        }
+
+        /* Rice reconstruction. */
+        riceParamParts[0] &= riceParamMask;
+        riceParamParts[0] |= (zeroCountParts[0] << riceParam);
+        riceParamParts[0]  = (riceParamParts[0] >> 1) ^ t[riceParamParts[0] & 0x01];
+
+        /* Sample reconstruction. */
+        pDecodedSamples[0] = riceParamParts[0] + drflac__calculate_prediction_64(order, shift, coefficients, pDecodedSamples);
+
+        i += 1;
+        pDecodedSamples += 1;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__decode_samples_with_residual__rice__neon(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 lpcOrder, drflac_int32 lpcShift, drflac_uint32 lpcPrecision, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(pSamplesOut != NULL);
+
+    /* In my testing the order is rarely > 12, so in this case I'm going to simplify the NEON implementation by only handling order <= 12. */
+    if (lpcOrder > 0 && lpcOrder <= 12) {
+        if (drflac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
+            return drflac__decode_samples_with_residual__rice__neon_64(bs, count, riceParam, lpcOrder, lpcShift, coefficients, pSamplesOut);
+        } else {
+            return drflac__decode_samples_with_residual__rice__neon_32(bs, count, riceParam, lpcOrder, lpcShift, coefficients, pSamplesOut);
+        }
+    } else {
+        return drflac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
+    }
+}
+#endif
+
+static drflac_bool32 drflac__decode_samples_with_residual__rice(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 lpcOrder, drflac_int32 lpcShift, drflac_uint32 lpcPrecision, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+#if defined(DRFLAC_SUPPORT_SSE41)
+    if (drflac__gIsSSE41Supported) {
+        return drflac__decode_samples_with_residual__rice__sse41(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported) {
+        return drflac__decode_samples_with_residual__rice__neon(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+    #if 0
+        return drflac__decode_samples_with_residual__rice__reference(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
+    #else
+        return drflac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
+    #endif
+    }
+}
+
+/* Reads and seeks past a string of residual values as Rice codes. The decoder should be sitting on the first bit of the Rice codes. */
+static drflac_bool32 drflac__read_and_seek_residual__rice(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam)
+{
+    drflac_uint32 i;
+
+    DRFLAC_ASSERT(bs != NULL);
+
+    for (i = 0; i < count; ++i) {
+        if (!drflac__seek_rice_parts(bs, riceParam)) {
+            return DRFLAC_FALSE;
+        }
+    }
+
+    return DRFLAC_TRUE;
+}
+
+#if defined(__clang__)
+__attribute__((no_sanitize("signed-integer-overflow")))
+#endif
+static drflac_bool32 drflac__decode_samples_with_residual__unencoded(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 unencodedBitsPerSample, drflac_uint32 lpcOrder, drflac_int32 lpcShift, drflac_uint32 lpcPrecision, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+{
+    drflac_uint32 i;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(unencodedBitsPerSample <= 31);    /* <-- unencodedBitsPerSample is a 5 bit number, so cannot exceed 31. */
+    DRFLAC_ASSERT(pSamplesOut != NULL);
+
+    for (i = 0; i < count; ++i) {
+        if (unencodedBitsPerSample > 0) {
+            if (!drflac__read_int32(bs, unencodedBitsPerSample, pSamplesOut + i)) {
+                return DRFLAC_FALSE;
+            }
+        } else {
+            pSamplesOut[i] = 0;
+        }
+
+        if (drflac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
+            pSamplesOut[i] += drflac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + i);
+        } else {
+            pSamplesOut[i] += drflac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + i);
+        }
+    }
+
+    return DRFLAC_TRUE;
+}
+
+
+/*
+Reads and decodes the residual for the sub-frame the decoder is currently sitting on. This function should be called
+when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be ignored. The
+<blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
+*/
+static drflac_bool32 drflac__decode_samples_with_residual(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 blockSize, drflac_uint32 lpcOrder, drflac_int32 lpcShift, drflac_uint32 lpcPrecision, const drflac_int32* coefficients, drflac_int32* pDecodedSamples)
+{
+    drflac_uint8 residualMethod;
+    drflac_uint8 partitionOrder;
+    drflac_uint32 samplesInPartition;
+    drflac_uint32 partitionsRemaining;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(blockSize != 0);
+    DRFLAC_ASSERT(pDecodedSamples != NULL);       /* <-- Should we allow NULL, in which case we just seek past the residual rather than do a full decode? */
+
+    if (!drflac__read_uint8(bs, 2, &residualMethod)) {
+        return DRFLAC_FALSE;
+    }
+
+    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+        return DRFLAC_FALSE;    /* Unknown or unsupported residual coding method. */
+    }
+
+    /* Ignore the first <order> values. */
+    pDecodedSamples += lpcOrder;
+
+    if (!drflac__read_uint8(bs, 4, &partitionOrder)) {
+        return DRFLAC_FALSE;
+    }
+
+    /*
+    From the FLAC spec:
+      The Rice partition order in a Rice-coded residual section must be less than or equal to 8.
+    */
+    if (partitionOrder > 8) {
+        return DRFLAC_FALSE;
+    }
+
+    /* Validation check. */
+    if ((blockSize / (1 << partitionOrder)) < lpcOrder) {
+        return DRFLAC_FALSE;
+    }
+
+    samplesInPartition = (blockSize / (1 << partitionOrder)) - lpcOrder;
+    partitionsRemaining = (1 << partitionOrder);
+    for (;;) {
+        drflac_uint8 riceParam = 0;
+        if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
+            if (!drflac__read_uint8(bs, 4, &riceParam)) {
+                return DRFLAC_FALSE;
+            }
+            if (riceParam == 15) {
+                riceParam = 0xFF;
+            }
+        } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+            if (!drflac__read_uint8(bs, 5, &riceParam)) {
+                return DRFLAC_FALSE;
+            }
+            if (riceParam == 31) {
+                riceParam = 0xFF;
+            }
+        }
+
+        if (riceParam != 0xFF) {
+            if (!drflac__decode_samples_with_residual__rice(bs, bitsPerSample, samplesInPartition, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pDecodedSamples)) {
+                return DRFLAC_FALSE;
+            }
+        } else {
+            drflac_uint8 unencodedBitsPerSample = 0;
+            if (!drflac__read_uint8(bs, 5, &unencodedBitsPerSample)) {
+                return DRFLAC_FALSE;
+            }
+
+            if (!drflac__decode_samples_with_residual__unencoded(bs, bitsPerSample, samplesInPartition, unencodedBitsPerSample, lpcOrder, lpcShift, lpcPrecision, coefficients, pDecodedSamples)) {
+                return DRFLAC_FALSE;
+            }
+        }
+
+        pDecodedSamples += samplesInPartition;
+
+        if (partitionsRemaining == 1) {
+            break;
+        }
+
+        partitionsRemaining -= 1;
+
+        if (partitionOrder != 0) {
+            samplesInPartition = blockSize / (1 << partitionOrder);
+        }
+    }
+
+    return DRFLAC_TRUE;
+}
+
+/*
+Reads and seeks past the residual for the sub-frame the decoder is currently sitting on. This function should be called
+when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be set to 0. The
+<blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
+*/
+static drflac_bool32 drflac__read_and_seek_residual(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 order)
+{
+    drflac_uint8 residualMethod;
+    drflac_uint8 partitionOrder;
+    drflac_uint32 samplesInPartition;
+    drflac_uint32 partitionsRemaining;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(blockSize != 0);
+
+    if (!drflac__read_uint8(bs, 2, &residualMethod)) {
+        return DRFLAC_FALSE;
+    }
+
+    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+        return DRFLAC_FALSE;    /* Unknown or unsupported residual coding method. */
+    }
+
+    if (!drflac__read_uint8(bs, 4, &partitionOrder)) {
+        return DRFLAC_FALSE;
+    }
+
+    /*
+    From the FLAC spec:
+      The Rice partition order in a Rice-coded residual section must be less than or equal to 8.
+    */
+    if (partitionOrder > 8) {
+        return DRFLAC_FALSE;
+    }
+
+    /* Validation check. */
+    if ((blockSize / (1 << partitionOrder)) <= order) {
+        return DRFLAC_FALSE;
+    }
+
+    samplesInPartition = (blockSize / (1 << partitionOrder)) - order;
+    partitionsRemaining = (1 << partitionOrder);
+    for (;;)
+    {
+        drflac_uint8 riceParam = 0;
+        if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
+            if (!drflac__read_uint8(bs, 4, &riceParam)) {
+                return DRFLAC_FALSE;
+            }
+            if (riceParam == 15) {
+                riceParam = 0xFF;
+            }
+        } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+            if (!drflac__read_uint8(bs, 5, &riceParam)) {
+                return DRFLAC_FALSE;
+            }
+            if (riceParam == 31) {
+                riceParam = 0xFF;
+            }
+        }
+
+        if (riceParam != 0xFF) {
+            if (!drflac__read_and_seek_residual__rice(bs, samplesInPartition, riceParam)) {
+                return DRFLAC_FALSE;
+            }
+        } else {
+            drflac_uint8 unencodedBitsPerSample = 0;
+            if (!drflac__read_uint8(bs, 5, &unencodedBitsPerSample)) {
+                return DRFLAC_FALSE;
+            }
+
+            if (!drflac__seek_bits(bs, unencodedBitsPerSample * samplesInPartition)) {
+                return DRFLAC_FALSE;
+            }
+        }
+
+
+        if (partitionsRemaining == 1) {
+            break;
+        }
+
+        partitionsRemaining -= 1;
+        samplesInPartition = blockSize / (1 << partitionOrder);
+    }
+
+    return DRFLAC_TRUE;
+}
+
+
+static drflac_bool32 drflac__decode_samples__constant(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 subframeBitsPerSample, drflac_int32* pDecodedSamples)
+{
+    drflac_uint32 i;
+
+    /* Only a single sample needs to be decoded here. */
+    drflac_int32 sample;
+    if (!drflac__read_int32(bs, subframeBitsPerSample, &sample)) {
+        return DRFLAC_FALSE;
+    }
+
+    /*
+    We don't really need to expand this, but it does simplify the process of reading samples. If this becomes a performance issue (unlikely)
+    we'll want to look at a more efficient way.
+    */
+    for (i = 0; i < blockSize; ++i) {
+        pDecodedSamples[i] = sample;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__decode_samples__verbatim(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 subframeBitsPerSample, drflac_int32* pDecodedSamples)
+{
+    drflac_uint32 i;
+
+    for (i = 0; i < blockSize; ++i) {
+        drflac_int32 sample;
+        if (!drflac__read_int32(bs, subframeBitsPerSample, &sample)) {
+            return DRFLAC_FALSE;
+        }
+
+        pDecodedSamples[i] = sample;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__decode_samples__fixed(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 subframeBitsPerSample, drflac_uint8 lpcOrder, drflac_int32* pDecodedSamples)
+{
+    drflac_uint32 i;
+
+    static drflac_int32 lpcCoefficientsTable[5][4] = {
+        {0,  0, 0,  0},
+        {1,  0, 0,  0},
+        {2, -1, 0,  0},
+        {3, -3, 1,  0},
+        {4, -6, 4, -1}
+    };
+
+    /* Warm up samples and coefficients. */
+    for (i = 0; i < lpcOrder; ++i) {
+        drflac_int32 sample;
+        if (!drflac__read_int32(bs, subframeBitsPerSample, &sample)) {
+            return DRFLAC_FALSE;
+        }
+
+        pDecodedSamples[i] = sample;
+    }
+
+    if (!drflac__decode_samples_with_residual(bs, subframeBitsPerSample, blockSize, lpcOrder, 0, 4, lpcCoefficientsTable[lpcOrder], pDecodedSamples)) {
+        return DRFLAC_FALSE;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__decode_samples__lpc(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 bitsPerSample, drflac_uint8 lpcOrder, drflac_int32* pDecodedSamples)
+{
+    drflac_uint8 i;
+    drflac_uint8 lpcPrecision;
+    drflac_int8 lpcShift;
+    drflac_int32 coefficients[32];
+
+    /* Warm up samples. */
+    for (i = 0; i < lpcOrder; ++i) {
+        drflac_int32 sample;
+        if (!drflac__read_int32(bs, bitsPerSample, &sample)) {
+            return DRFLAC_FALSE;
+        }
+
+        pDecodedSamples[i] = sample;
+    }
+
+    if (!drflac__read_uint8(bs, 4, &lpcPrecision)) {
+        return DRFLAC_FALSE;
+    }
+    if (lpcPrecision == 15) {
+        return DRFLAC_FALSE;    /* Invalid. */
+    }
+    lpcPrecision += 1;
+
+    if (!drflac__read_int8(bs, 5, &lpcShift)) {
+        return DRFLAC_FALSE;
+    }
+
+    /*
+    From the FLAC specification:
+
+        Quantized linear predictor coefficient shift needed in bits (NOTE: this number is signed two's-complement)
+
+    Emphasis on the "signed two's-complement". In practice there does not seem to be any encoders nor decoders supporting negative shifts. For now dr_flac is
+    not going to support negative shifts as I don't have any reference files. However, when a reference file comes through I will consider adding support.
+    */
+    if (lpcShift < 0) {
+        return DRFLAC_FALSE;
+    }
+
+    DRFLAC_ZERO_MEMORY(coefficients, sizeof(coefficients));
+    for (i = 0; i < lpcOrder; ++i) {
+        if (!drflac__read_int32(bs, lpcPrecision, coefficients + i)) {
+            return DRFLAC_FALSE;
+        }
+    }
+
+    if (!drflac__decode_samples_with_residual(bs, bitsPerSample, blockSize, lpcOrder, lpcShift, lpcPrecision, coefficients, pDecodedSamples)) {
+        return DRFLAC_FALSE;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+
+static drflac_bool32 drflac__read_next_flac_frame_header(drflac_bs* bs, drflac_uint8 streaminfoBitsPerSample, drflac_frame_header* header)
+{
+    const drflac_uint32 sampleRateTable[12]  = {0, 88200, 176400, 192000, 8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000};
+    const drflac_uint8 bitsPerSampleTable[8] = {0, 8, 12, (drflac_uint8)-1, 16, 20, 24, (drflac_uint8)-1};   /* -1 = reserved. */
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(header != NULL);
+
+    /* Keep looping until we find a valid sync code. */
+    for (;;) {
+        drflac_uint8 crc8 = 0xCE; /* 0xCE = drflac_crc8(0, 0x3FFE, 14); */
+        drflac_uint8 reserved = 0;
+        drflac_uint8 blockingStrategy = 0;
+        drflac_uint8 blockSize = 0;
+        drflac_uint8 sampleRate = 0;
+        drflac_uint8 channelAssignment = 0;
+        drflac_uint8 bitsPerSample = 0;
+        drflac_bool32 isVariableBlockSize;
+
+        if (!drflac__find_and_seek_to_next_sync_code(bs)) {
+            return DRFLAC_FALSE;
+        }
+
+        if (!drflac__read_uint8(bs, 1, &reserved)) {
+            return DRFLAC_FALSE;
+        }
+        if (reserved == 1) {
+            continue;
+        }
+        crc8 = drflac_crc8(crc8, reserved, 1);
+
+        if (!drflac__read_uint8(bs, 1, &blockingStrategy)) {
+            return DRFLAC_FALSE;
+        }
+        crc8 = drflac_crc8(crc8, blockingStrategy, 1);
+
+        if (!drflac__read_uint8(bs, 4, &blockSize)) {
+            return DRFLAC_FALSE;
+        }
+        if (blockSize == 0) {
+            continue;
+        }
+        crc8 = drflac_crc8(crc8, blockSize, 4);
+
+        if (!drflac__read_uint8(bs, 4, &sampleRate)) {
+            return DRFLAC_FALSE;
+        }
+        crc8 = drflac_crc8(crc8, sampleRate, 4);
+
+        if (!drflac__read_uint8(bs, 4, &channelAssignment)) {
+            return DRFLAC_FALSE;
+        }
+        if (channelAssignment > 10) {
+            continue;
+        }
+        crc8 = drflac_crc8(crc8, channelAssignment, 4);
+
+        if (!drflac__read_uint8(bs, 3, &bitsPerSample)) {
+            return DRFLAC_FALSE;
+        }
+        if (bitsPerSample == 3 || bitsPerSample == 7) {
+            continue;
+        }
+        crc8 = drflac_crc8(crc8, bitsPerSample, 3);
+
+
+        if (!drflac__read_uint8(bs, 1, &reserved)) {
+            return DRFLAC_FALSE;
+        }
+        if (reserved == 1) {
+            continue;
+        }
+        crc8 = drflac_crc8(crc8, reserved, 1);
+
+
+        isVariableBlockSize = blockingStrategy == 1;
+        if (isVariableBlockSize) {
+            drflac_uint64 pcmFrameNumber;
+            drflac_result result = drflac__read_utf8_coded_number(bs, &pcmFrameNumber, &crc8);
+            if (result != DRFLAC_SUCCESS) {
+                if (result == DRFLAC_AT_END) {
+                    return DRFLAC_FALSE;
+                } else {
+                    continue;
+                }
+            }
+            header->flacFrameNumber  = 0;
+            header->pcmFrameNumber = pcmFrameNumber;
+        } else {
+            drflac_uint64 flacFrameNumber = 0;
+            drflac_result result = drflac__read_utf8_coded_number(bs, &flacFrameNumber, &crc8);
+            if (result != DRFLAC_SUCCESS) {
+                if (result == DRFLAC_AT_END) {
+                    return DRFLAC_FALSE;
+                } else {
+                    continue;
+                }
+            }
+            header->flacFrameNumber  = (drflac_uint32)flacFrameNumber;   /* <-- Safe cast. */
+            header->pcmFrameNumber = 0;
+        }
+
+
+        DRFLAC_ASSERT(blockSize > 0);
+        if (blockSize == 1) {
+            header->blockSizeInPCMFrames = 192;
+        } else if (blockSize <= 5) {
+            DRFLAC_ASSERT(blockSize >= 2);
+            header->blockSizeInPCMFrames = 576 * (1 << (blockSize - 2));
+        } else if (blockSize == 6) {
+            if (!drflac__read_uint16(bs, 8, &header->blockSizeInPCMFrames)) {
+                return DRFLAC_FALSE;
+            }
+            crc8 = drflac_crc8(crc8, header->blockSizeInPCMFrames, 8);
+            header->blockSizeInPCMFrames += 1;
+        } else if (blockSize == 7) {
+            if (!drflac__read_uint16(bs, 16, &header->blockSizeInPCMFrames)) {
+                return DRFLAC_FALSE;
+            }
+            crc8 = drflac_crc8(crc8, header->blockSizeInPCMFrames, 16);
+            if (header->blockSizeInPCMFrames == 0xFFFF) {
+                return DRFLAC_FALSE;    /* Frame is too big. This is the size of the frame minus 1. The STREAMINFO block defines the max block size which is 16-bits. Adding one will make it 17 bits and therefore too big. */
+            }
+            header->blockSizeInPCMFrames += 1;
+        } else {
+            DRFLAC_ASSERT(blockSize >= 8);
+            header->blockSizeInPCMFrames = 256 * (1 << (blockSize - 8));
+        }
+
+
+        if (sampleRate <= 11) {
+            header->sampleRate = sampleRateTable[sampleRate];
+        } else if (sampleRate == 12) {
+            if (!drflac__read_uint32(bs, 8, &header->sampleRate)) {
+                return DRFLAC_FALSE;
+            }
+            crc8 = drflac_crc8(crc8, header->sampleRate, 8);
+            header->sampleRate *= 1000;
+        } else if (sampleRate == 13) {
+            if (!drflac__read_uint32(bs, 16, &header->sampleRate)) {
+                return DRFLAC_FALSE;
+            }
+            crc8 = drflac_crc8(crc8, header->sampleRate, 16);
+        } else if (sampleRate == 14) {
+            if (!drflac__read_uint32(bs, 16, &header->sampleRate)) {
+                return DRFLAC_FALSE;
+            }
+            crc8 = drflac_crc8(crc8, header->sampleRate, 16);
+            header->sampleRate *= 10;
+        } else {
+            continue;  /* Invalid. Assume an invalid block. */
+        }
+
+
+        header->channelAssignment = channelAssignment;
+
+        header->bitsPerSample = bitsPerSampleTable[bitsPerSample];
+        if (header->bitsPerSample == 0) {
+            header->bitsPerSample = streaminfoBitsPerSample;
+        }
+
+        if (header->bitsPerSample != streaminfoBitsPerSample) {
+            /* If this subframe has a different bitsPerSample then streaminfo or the first frame, reject it */
+            return DRFLAC_FALSE;
+        }
+
+        if (!drflac__read_uint8(bs, 8, &header->crc8)) {
+            return DRFLAC_FALSE;
+        }
+
+#ifndef DR_FLAC_NO_CRC
+        if (header->crc8 != crc8) {
+            continue;    /* CRC mismatch. Loop back to the top and find the next sync code. */
+        }
+#endif
+        return DRFLAC_TRUE;
+    }
+}
+
+static drflac_bool32 drflac__read_subframe_header(drflac_bs* bs, drflac_subframe* pSubframe)
+{
+    drflac_uint8 header;
+    int type;
+
+    if (!drflac__read_uint8(bs, 8, &header)) {
+        return DRFLAC_FALSE;
+    }
+
+    /* First bit should always be 0. */
+    if ((header & 0x80) != 0) {
+        return DRFLAC_FALSE;
+    }
+
+    /*
+    Default to 0 for the LPC order. It's important that we always set this to 0 for non LPC
+    and FIXED subframes because we'll be using it in a generic validation check later.
+    */
+    pSubframe->lpcOrder = 0;
+
+    type = (header & 0x7E) >> 1;
+    if (type == 0) {
+        pSubframe->subframeType = DRFLAC_SUBFRAME_CONSTANT;
+    } else if (type == 1) {
+        pSubframe->subframeType = DRFLAC_SUBFRAME_VERBATIM;
+    } else {
+        if ((type & 0x20) != 0) {
+            pSubframe->subframeType = DRFLAC_SUBFRAME_LPC;
+            pSubframe->lpcOrder = (drflac_uint8)(type & 0x1F) + 1;
+        } else if ((type & 0x08) != 0) {
+            pSubframe->subframeType = DRFLAC_SUBFRAME_FIXED;
+            pSubframe->lpcOrder = (drflac_uint8)(type & 0x07);
+            if (pSubframe->lpcOrder > 4) {
+                pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
+                pSubframe->lpcOrder = 0;
+            }
+        } else {
+            pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
+        }
+    }
+
+    if (pSubframe->subframeType == DRFLAC_SUBFRAME_RESERVED) {
+        return DRFLAC_FALSE;
+    }
+
+    /* Wasted bits per sample. */
+    pSubframe->wastedBitsPerSample = 0;
+    if ((header & 0x01) == 1) {
+        unsigned int wastedBitsPerSample;
+        if (!drflac__seek_past_next_set_bit(bs, &wastedBitsPerSample)) {
+            return DRFLAC_FALSE;
+        }
+        pSubframe->wastedBitsPerSample = (drflac_uint8)wastedBitsPerSample + 1;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__decode_subframe(drflac_bs* bs, drflac_frame* frame, int subframeIndex, drflac_int32* pDecodedSamplesOut)
+{
+    drflac_subframe* pSubframe;
+    drflac_uint32 subframeBitsPerSample;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(frame != NULL);
+
+    pSubframe = frame->subframes + subframeIndex;
+    if (!drflac__read_subframe_header(bs, pSubframe)) {
+        return DRFLAC_FALSE;
+    }
+
+    /* Side channels require an extra bit per sample. Took a while to figure that one out... */
+    subframeBitsPerSample = frame->header.bitsPerSample;
+    if ((frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
+        subframeBitsPerSample += 1;
+    } else if (frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
+        subframeBitsPerSample += 1;
+    }
+
+    if (subframeBitsPerSample > 32) {
+        /* libFLAC and ffmpeg reject 33-bit subframes as well */
+        return DRFLAC_FALSE;
+    }
+
+    /* Need to handle wasted bits per sample. */
+    if (pSubframe->wastedBitsPerSample >= subframeBitsPerSample) {
+        return DRFLAC_FALSE;
+    }
+    subframeBitsPerSample -= pSubframe->wastedBitsPerSample;
+
+    pSubframe->pSamplesS32 = pDecodedSamplesOut;
+
+    /*
+    pDecodedSamplesOut will be pointing to a buffer that was allocated with enough memory to store
+    maxBlockSizeInPCMFrames samples (as specified in the FLAC header). We need to guard against an
+    overflow here. At a higher level we are checking maxBlockSizeInPCMFrames from the header, but
+    here we need to do an additional check to ensure this frame's block size fully encompasses any
+    warmup samples which is determined by the LPC order. For non LPC and FIXED subframes, the LPC
+    order will be have been set to 0 in drflac__read_subframe_header().
+    */
+    if (frame->header.blockSizeInPCMFrames < pSubframe->lpcOrder) {
+        return DRFLAC_FALSE;
+    }
+
+    switch (pSubframe->subframeType)
+    {
+        case DRFLAC_SUBFRAME_CONSTANT:
+        {
+            drflac__decode_samples__constant(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->pSamplesS32);
+        } break;
+
+        case DRFLAC_SUBFRAME_VERBATIM:
+        {
+            drflac__decode_samples__verbatim(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->pSamplesS32);
+        } break;
+
+        case DRFLAC_SUBFRAME_FIXED:
+        {
+            drflac__decode_samples__fixed(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->lpcOrder, pSubframe->pSamplesS32);
+        } break;
+
+        case DRFLAC_SUBFRAME_LPC:
+        {
+            drflac__decode_samples__lpc(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->lpcOrder, pSubframe->pSamplesS32);
+        } break;
+
+        default: return DRFLAC_FALSE;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__seek_subframe(drflac_bs* bs, drflac_frame* frame, int subframeIndex)
+{
+    drflac_subframe* pSubframe;
+    drflac_uint32 subframeBitsPerSample;
+
+    DRFLAC_ASSERT(bs != NULL);
+    DRFLAC_ASSERT(frame != NULL);
+
+    pSubframe = frame->subframes + subframeIndex;
+    if (!drflac__read_subframe_header(bs, pSubframe)) {
+        return DRFLAC_FALSE;
+    }
+
+    /* Side channels require an extra bit per sample. Took a while to figure that one out... */
+    subframeBitsPerSample = frame->header.bitsPerSample;
+    if ((frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
+        subframeBitsPerSample += 1;
+    } else if (frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
+        subframeBitsPerSample += 1;
+    }
+
+    /* Need to handle wasted bits per sample. */
+    if (pSubframe->wastedBitsPerSample >= subframeBitsPerSample) {
+        return DRFLAC_FALSE;
+    }
+    subframeBitsPerSample -= pSubframe->wastedBitsPerSample;
+
+    pSubframe->pSamplesS32 = NULL;
+
+    switch (pSubframe->subframeType)
+    {
+        case DRFLAC_SUBFRAME_CONSTANT:
+        {
+            if (!drflac__seek_bits(bs, subframeBitsPerSample)) {
+                return DRFLAC_FALSE;
+            }
+        } break;
+
+        case DRFLAC_SUBFRAME_VERBATIM:
+        {
+            unsigned int bitsToSeek = frame->header.blockSizeInPCMFrames * subframeBitsPerSample;
+            if (!drflac__seek_bits(bs, bitsToSeek)) {
+                return DRFLAC_FALSE;
+            }
+        } break;
+
+        case DRFLAC_SUBFRAME_FIXED:
+        {
+            unsigned int bitsToSeek = pSubframe->lpcOrder * subframeBitsPerSample;
+            if (!drflac__seek_bits(bs, bitsToSeek)) {
+                return DRFLAC_FALSE;
+            }
+
+            if (!drflac__read_and_seek_residual(bs, frame->header.blockSizeInPCMFrames, pSubframe->lpcOrder)) {
+                return DRFLAC_FALSE;
+            }
+        } break;
+
+        case DRFLAC_SUBFRAME_LPC:
+        {
+            drflac_uint8 lpcPrecision;
+
+            unsigned int bitsToSeek = pSubframe->lpcOrder * subframeBitsPerSample;
+            if (!drflac__seek_bits(bs, bitsToSeek)) {
+                return DRFLAC_FALSE;
+            }
+
+            if (!drflac__read_uint8(bs, 4, &lpcPrecision)) {
+                return DRFLAC_FALSE;
+            }
+            if (lpcPrecision == 15) {
+                return DRFLAC_FALSE;    /* Invalid. */
+            }
+            lpcPrecision += 1;
+
+
+            bitsToSeek = (pSubframe->lpcOrder * lpcPrecision) + 5;    /* +5 for shift. */
+            if (!drflac__seek_bits(bs, bitsToSeek)) {
+                return DRFLAC_FALSE;
+            }
+
+            if (!drflac__read_and_seek_residual(bs, frame->header.blockSizeInPCMFrames, pSubframe->lpcOrder)) {
+                return DRFLAC_FALSE;
+            }
+        } break;
+
+        default: return DRFLAC_FALSE;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+
+static DRFLAC_INLINE drflac_uint8 drflac__get_channel_count_from_channel_assignment(drflac_int8 channelAssignment)
+{
+    drflac_uint8 lookup[] = {1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2};
+
+    DRFLAC_ASSERT(channelAssignment <= 10);
+    return lookup[channelAssignment];
+}
+
+static drflac_result drflac__decode_flac_frame(drflac* pFlac)
+{
+    int channelCount;
+    int i;
+    drflac_uint8 paddingSizeInBits;
+    drflac_uint16 desiredCRC16;
+#ifndef DR_FLAC_NO_CRC
+    drflac_uint16 actualCRC16;
+#endif
+
+    /* This function should be called while the stream is sitting on the first byte after the frame header. */
+    DRFLAC_ZERO_MEMORY(pFlac->currentFLACFrame.subframes, sizeof(pFlac->currentFLACFrame.subframes));
+
+    /* The frame block size must never be larger than the maximum block size defined by the FLAC stream. */
+    if (pFlac->currentFLACFrame.header.blockSizeInPCMFrames > pFlac->maxBlockSizeInPCMFrames) {
+        return DRFLAC_ERROR;
+    }
+
+    /* The number of channels in the frame must match the channel count from the STREAMINFO block. */
+    channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
+    if (channelCount != (int)pFlac->channels) {
+        return DRFLAC_ERROR;
+    }
+
+    for (i = 0; i < channelCount; ++i) {
+        if (!drflac__decode_subframe(&pFlac->bs, &pFlac->currentFLACFrame, i, pFlac->pDecodedSamples + (pFlac->currentFLACFrame.header.blockSizeInPCMFrames * i))) {
+            return DRFLAC_ERROR;
+        }
+    }
+
+    paddingSizeInBits = (drflac_uint8)(DRFLAC_CACHE_L1_BITS_REMAINING(&pFlac->bs) & 7);
+    if (paddingSizeInBits > 0) {
+        drflac_uint8 padding = 0;
+        if (!drflac__read_uint8(&pFlac->bs, paddingSizeInBits, &padding)) {
+            return DRFLAC_AT_END;
+        }
+    }
+
+#ifndef DR_FLAC_NO_CRC
+    actualCRC16 = drflac__flush_crc16(&pFlac->bs);
+#endif
+    if (!drflac__read_uint16(&pFlac->bs, 16, &desiredCRC16)) {
+        return DRFLAC_AT_END;
+    }
+
+#ifndef DR_FLAC_NO_CRC
+    if (actualCRC16 != desiredCRC16) {
+        return DRFLAC_CRC_MISMATCH;    /* CRC mismatch. */
+    }
+#endif
+
+    pFlac->currentFLACFrame.pcmFramesRemaining = pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
+
+    return DRFLAC_SUCCESS;
+}
+
+static drflac_result drflac__seek_flac_frame(drflac* pFlac)
+{
+    int channelCount;
+    int i;
+    drflac_uint16 desiredCRC16;
+#ifndef DR_FLAC_NO_CRC
+    drflac_uint16 actualCRC16;
+#endif
+
+    channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
+    for (i = 0; i < channelCount; ++i) {
+        if (!drflac__seek_subframe(&pFlac->bs, &pFlac->currentFLACFrame, i)) {
+            return DRFLAC_ERROR;
+        }
+    }
+
+    /* Padding. */
+    if (!drflac__seek_bits(&pFlac->bs, DRFLAC_CACHE_L1_BITS_REMAINING(&pFlac->bs) & 7)) {
+        return DRFLAC_ERROR;
+    }
+
+    /* CRC. */
+#ifndef DR_FLAC_NO_CRC
+    actualCRC16 = drflac__flush_crc16(&pFlac->bs);
+#endif
+    if (!drflac__read_uint16(&pFlac->bs, 16, &desiredCRC16)) {
+        return DRFLAC_AT_END;
+    }
+
+#ifndef DR_FLAC_NO_CRC
+    if (actualCRC16 != desiredCRC16) {
+        return DRFLAC_CRC_MISMATCH;    /* CRC mismatch. */
+    }
+#endif
+
+    return DRFLAC_SUCCESS;
+}
+
+static drflac_bool32 drflac__read_and_decode_next_flac_frame(drflac* pFlac)
+{
+    DRFLAC_ASSERT(pFlac != NULL);
+
+    for (;;) {
+        drflac_result result;
+
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+            return DRFLAC_FALSE;
+        }
+
+        result = drflac__decode_flac_frame(pFlac);
+        if (result != DRFLAC_SUCCESS) {
+            if (result == DRFLAC_CRC_MISMATCH) {
+                continue;   /* CRC mismatch. Skip to the next frame. */
+            } else {
+                return DRFLAC_FALSE;
+            }
+        }
+
+        return DRFLAC_TRUE;
+    }
+}
+
+static void drflac__get_pcm_frame_range_of_current_flac_frame(drflac* pFlac, drflac_uint64* pFirstPCMFrame, drflac_uint64* pLastPCMFrame)
+{
+    drflac_uint64 firstPCMFrame;
+    drflac_uint64 lastPCMFrame;
+
+    DRFLAC_ASSERT(pFlac != NULL);
+
+    firstPCMFrame = pFlac->currentFLACFrame.header.pcmFrameNumber;
+    if (firstPCMFrame == 0) {
+        firstPCMFrame = ((drflac_uint64)pFlac->currentFLACFrame.header.flacFrameNumber) * pFlac->maxBlockSizeInPCMFrames;
+    }
+
+    lastPCMFrame = firstPCMFrame + pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
+    if (lastPCMFrame > 0) {
+        lastPCMFrame -= 1; /* Needs to be zero based. */
+    }
+
+    if (pFirstPCMFrame) {
+        *pFirstPCMFrame = firstPCMFrame;
+    }
+    if (pLastPCMFrame) {
+        *pLastPCMFrame = lastPCMFrame;
+    }
+}
+
+static drflac_bool32 drflac__seek_to_first_frame(drflac* pFlac)
+{
+    drflac_bool32 result;
+
+    DRFLAC_ASSERT(pFlac != NULL);
+
+    result = drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes);
+
+    DRFLAC_ZERO_MEMORY(&pFlac->currentFLACFrame, sizeof(pFlac->currentFLACFrame));
+    pFlac->currentPCMFrame = 0;
+
+    return result;
+}
+
+static DRFLAC_INLINE drflac_result drflac__seek_to_next_flac_frame(drflac* pFlac)
+{
+    /* This function should only ever be called while the decoder is sitting on the first byte past the FRAME_HEADER section. */
+    DRFLAC_ASSERT(pFlac != NULL);
+    return drflac__seek_flac_frame(pFlac);
+}
+
+
+static drflac_uint64 drflac__seek_forward_by_pcm_frames(drflac* pFlac, drflac_uint64 pcmFramesToSeek)
+{
+    drflac_uint64 pcmFramesRead = 0;
+    while (pcmFramesToSeek > 0) {
+        if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
+            if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
+                break;  /* Couldn't read the next frame, so just break from the loop and return. */
+            }
+        } else {
+            if (pFlac->currentFLACFrame.pcmFramesRemaining > pcmFramesToSeek) {
+                pcmFramesRead   += pcmFramesToSeek;
+                pFlac->currentFLACFrame.pcmFramesRemaining -= (drflac_uint32)pcmFramesToSeek;   /* <-- Safe cast. Will always be < currentFrame.pcmFramesRemaining < 65536. */
+                pcmFramesToSeek  = 0;
+            } else {
+                pcmFramesRead   += pFlac->currentFLACFrame.pcmFramesRemaining;
+                pcmFramesToSeek -= pFlac->currentFLACFrame.pcmFramesRemaining;
+                pFlac->currentFLACFrame.pcmFramesRemaining = 0;
+            }
+        }
+    }
+
+    pFlac->currentPCMFrame += pcmFramesRead;
+    return pcmFramesRead;
+}
+
+
+static drflac_bool32 drflac__seek_to_pcm_frame__brute_force(drflac* pFlac, drflac_uint64 pcmFrameIndex)
+{
+    drflac_bool32 isMidFrame = DRFLAC_FALSE;
+    drflac_uint64 runningPCMFrameCount;
+
+    DRFLAC_ASSERT(pFlac != NULL);
+
+    /* If we are seeking forward we start from the current position. Otherwise we need to start all the way from the start of the file. */
+    if (pcmFrameIndex >= pFlac->currentPCMFrame) {
+        /* Seeking forward. Need to seek from the current position. */
+        runningPCMFrameCount = pFlac->currentPCMFrame;
+
+        /* The frame header for the first frame may not yet have been read. We need to do that if necessary. */
+        if (pFlac->currentPCMFrame == 0 && pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
+            if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+                return DRFLAC_FALSE;
+            }
+        } else {
+            isMidFrame = DRFLAC_TRUE;
+        }
+    } else {
+        /* Seeking backwards. Need to seek from the start of the file. */
+        runningPCMFrameCount = 0;
+
+        /* Move back to the start. */
+        if (!drflac__seek_to_first_frame(pFlac)) {
+            return DRFLAC_FALSE;
+        }
+
+        /* Decode the first frame in preparation for sample-exact seeking below. */
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+            return DRFLAC_FALSE;
+        }
+    }
+
+    /*
+    We need to as quickly as possible find the frame that contains the target sample. To do this, we iterate over each frame and inspect its
+    header. If based on the header we can determine that the frame contains the sample, we do a full decode of that frame.
+    */
+    for (;;) {
+        drflac_uint64 pcmFrameCountInThisFLACFrame;
+        drflac_uint64 firstPCMFrameInFLACFrame = 0;
+        drflac_uint64 lastPCMFrameInFLACFrame = 0;
+
+        drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);
+
+        pcmFrameCountInThisFLACFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
+        if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFLACFrame)) {
+            /*
+            The sample should be in this frame. We need to fully decode it, however if it's an invalid frame (a CRC mismatch), we need to pretend
+            it never existed and keep iterating.
+            */
+            drflac_uint64 pcmFramesToDecode = pcmFrameIndex - runningPCMFrameCount;
+
+            if (!isMidFrame) {
+                drflac_result result = drflac__decode_flac_frame(pFlac);
+                if (result == DRFLAC_SUCCESS) {
+                    /* The frame is valid. We just need to skip over some samples to ensure it's sample-exact. */
+                    return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;  /* <-- If this fails, something bad has happened (it should never fail). */
+                } else {
+                    if (result == DRFLAC_CRC_MISMATCH) {
+                        goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
+                    } else {
+                        return DRFLAC_FALSE;
+                    }
+                }
+            } else {
+                /* We started seeking mid-frame which means we need to skip the frame decoding part. */
+                return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
+            }
+        } else {
+            /*
+            It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
+            frame never existed and leave the running sample count untouched.
+            */
+            if (!isMidFrame) {
+                drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
+                if (result == DRFLAC_SUCCESS) {
+                    runningPCMFrameCount += pcmFrameCountInThisFLACFrame;
+                } else {
+                    if (result == DRFLAC_CRC_MISMATCH) {
+                        goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
+                    } else {
+                        return DRFLAC_FALSE;
+                    }
+                }
+            } else {
+                /*
+                We started seeking mid-frame which means we need to seek by reading to the end of the frame instead of with
+                drflac__seek_to_next_flac_frame() which only works if the decoder is sitting on the byte just after the frame header.
+                */
+                runningPCMFrameCount += pFlac->currentFLACFrame.pcmFramesRemaining;
+                pFlac->currentFLACFrame.pcmFramesRemaining = 0;
+                isMidFrame = DRFLAC_FALSE;
+            }
+
+            /* If we are seeking to the end of the file and we've just hit it, we're done. */
+            if (pcmFrameIndex == pFlac->totalPCMFrameCount && runningPCMFrameCount == pFlac->totalPCMFrameCount) {
+                return DRFLAC_TRUE;
+            }
+        }
+
+    next_iteration:
+        /* Grab the next frame in preparation for the next iteration. */
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+            return DRFLAC_FALSE;
+        }
+    }
+}
+
+
+#if !defined(DR_FLAC_NO_CRC)
+/*
+We use an average compression ratio to determine our approximate start location. FLAC files are generally about 50%-70% the size of their
+uncompressed counterparts so we'll use this as a basis. I'm going to split the middle and use a factor of 0.6 to determine the starting
+location.
+*/
+#define DRFLAC_BINARY_SEARCH_APPROX_COMPRESSION_RATIO 0.6f
+
+static drflac_bool32 drflac__seek_to_approximate_flac_frame_to_byte(drflac* pFlac, drflac_uint64 targetByte, drflac_uint64 rangeLo, drflac_uint64 rangeHi, drflac_uint64* pLastSuccessfulSeekOffset)
+{
+    DRFLAC_ASSERT(pFlac != NULL);
+    DRFLAC_ASSERT(pLastSuccessfulSeekOffset != NULL);
+    DRFLAC_ASSERT(targetByte >= rangeLo);
+    DRFLAC_ASSERT(targetByte <= rangeHi);
+
+    *pLastSuccessfulSeekOffset = pFlac->firstFLACFramePosInBytes;
+
+    for (;;) {
+        /* After rangeLo == rangeHi == targetByte fails, we need to break out. */
+        drflac_uint64 lastTargetByte = targetByte;
+
+        /* When seeking to a byte, failure probably means we've attempted to seek beyond the end of the stream. To counter this we just halve it each attempt. */
+        if (!drflac__seek_to_byte(&pFlac->bs, targetByte)) {
+            /* If we couldn't even seek to the first byte in the stream we have a problem. Just abandon the whole thing. */
+            if (targetByte == 0) {
+                drflac__seek_to_first_frame(pFlac); /* Try to recover. */
+                return DRFLAC_FALSE;
+            }
+
+            /* Halve the byte location and continue. */
+            targetByte = rangeLo + ((rangeHi - rangeLo)/2);
+            rangeHi = targetByte;
+        } else {
+            /* Getting here should mean that we have seeked to an appropriate byte. */
+
+            /* Clear the details of the FLAC frame so we don't misreport data. */
+            DRFLAC_ZERO_MEMORY(&pFlac->currentFLACFrame, sizeof(pFlac->currentFLACFrame));
+
+            /*
+            Now seek to the next FLAC frame. We need to decode the entire frame (not just the header) because it's possible for the header to incorrectly pass the
+            CRC check and return bad data. We need to decode the entire frame to be more certain. Although this seems unlikely, this has happened to me in testing
+            so it needs to stay this way for now.
+            */
+#if 1
+            if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
+                /* Halve the byte location and continue. */
+                targetByte = rangeLo + ((rangeHi - rangeLo)/2);
+                rangeHi = targetByte;
+            } else {
+                break;
+            }
+#else
+            if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+                /* Halve the byte location and continue. */
+                targetByte = rangeLo + ((rangeHi - rangeLo)/2);
+                rangeHi = targetByte;
+            } else {
+                break;
+            }
+#endif
+        }
+
+        /* We already tried this byte and there are no more to try, break out. */
+        if(targetByte == lastTargetByte) {
+            return DRFLAC_FALSE;
+        }
+    }
+
+    /* The current PCM frame needs to be updated based on the frame we just seeked to. */
+    drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &pFlac->currentPCMFrame, NULL);
+
+    DRFLAC_ASSERT(targetByte <= rangeHi);
+
+    *pLastSuccessfulSeekOffset = targetByte;
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(drflac* pFlac, drflac_uint64 offset)
+{
+    /* This section of code would be used if we were only decoding the FLAC frame header when calling drflac__seek_to_approximate_flac_frame_to_byte(). */
+#if 0
+    if (drflac__decode_flac_frame(pFlac) != DRFLAC_SUCCESS) {
+        /* We failed to decode this frame which may be due to it being corrupt. We'll just use the next valid FLAC frame. */
+        if (drflac__read_and_decode_next_flac_frame(pFlac) == DRFLAC_FALSE) {
+            return DRFLAC_FALSE;
+        }
+    }
+#endif
+
+    return drflac__seek_forward_by_pcm_frames(pFlac, offset) == offset;
+}
+
+
+static drflac_bool32 drflac__seek_to_pcm_frame__binary_search_internal(drflac* pFlac, drflac_uint64 pcmFrameIndex, drflac_uint64 byteRangeLo, drflac_uint64 byteRangeHi)
+{
+    /* This assumes pFlac->currentPCMFrame is sitting on byteRangeLo upon entry. */
+
+    drflac_uint64 targetByte;
+    drflac_uint64 pcmRangeLo = pFlac->totalPCMFrameCount;
+    drflac_uint64 pcmRangeHi = 0;
+    drflac_uint64 lastSuccessfulSeekOffset = (drflac_uint64)-1;
+    drflac_uint64 closestSeekOffsetBeforeTargetPCMFrame = byteRangeLo;
+    drflac_uint32 seekForwardThreshold = (pFlac->maxBlockSizeInPCMFrames != 0) ? pFlac->maxBlockSizeInPCMFrames*2 : 4096;
+
+    targetByte = byteRangeLo + (drflac_uint64)(((drflac_int64)((pcmFrameIndex - pFlac->currentPCMFrame) * pFlac->channels * pFlac->bitsPerSample)/8.0f) * DRFLAC_BINARY_SEARCH_APPROX_COMPRESSION_RATIO);
+    if (targetByte > byteRangeHi) {
+        targetByte = byteRangeHi;
+    }
+
+    for (;;) {
+        if (drflac__seek_to_approximate_flac_frame_to_byte(pFlac, targetByte, byteRangeLo, byteRangeHi, &lastSuccessfulSeekOffset)) {
+            /* We found a FLAC frame. We need to check if it contains the sample we're looking for. */
+            drflac_uint64 newPCMRangeLo;
+            drflac_uint64 newPCMRangeHi;
+            drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &newPCMRangeLo, &newPCMRangeHi);
+
+            /* If we selected the same frame, it means we should be pretty close. Just decode the rest. */
+            if (pcmRangeLo == newPCMRangeLo) {
+                if (!drflac__seek_to_approximate_flac_frame_to_byte(pFlac, closestSeekOffsetBeforeTargetPCMFrame, closestSeekOffsetBeforeTargetPCMFrame, byteRangeHi, &lastSuccessfulSeekOffset)) {
+                    break;  /* Failed to seek to closest frame. */
+                }
+
+                if (drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame)) {
+                    return DRFLAC_TRUE;
+                } else {
+                    break;  /* Failed to seek forward. */
+                }
+            }
+
+            pcmRangeLo = newPCMRangeLo;
+            pcmRangeHi = newPCMRangeHi;
+
+            if (pcmRangeLo <= pcmFrameIndex && pcmRangeHi >= pcmFrameIndex) {
+                /* The target PCM frame is in this FLAC frame. */
+                if (drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame) ) {
+                    return DRFLAC_TRUE;
+                } else {
+                    break;  /* Failed to seek to FLAC frame. */
+                }
+            } else {
+                const float approxCompressionRatio = (drflac_int64)(lastSuccessfulSeekOffset - pFlac->firstFLACFramePosInBytes) / ((drflac_int64)(pcmRangeLo * pFlac->channels * pFlac->bitsPerSample)/8.0f);
+
+                if (pcmRangeLo > pcmFrameIndex) {
+                    /* We seeked too far forward. We need to move our target byte backward and try again. */
+                    byteRangeHi = lastSuccessfulSeekOffset;
+                    if (byteRangeLo > byteRangeHi) {
+                        byteRangeLo = byteRangeHi;
+                    }
+
+                    targetByte = byteRangeLo + ((byteRangeHi - byteRangeLo) / 2);
+                    if (targetByte < byteRangeLo) {
+                        targetByte = byteRangeLo;
+                    }
+                } else /*if (pcmRangeHi < pcmFrameIndex)*/ {
+                    /* We didn't seek far enough. We need to move our target byte forward and try again. */
+
+                    /* If we're close enough we can just seek forward. */
+                    if ((pcmFrameIndex - pcmRangeLo) < seekForwardThreshold) {
+                        if (drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame)) {
+                            return DRFLAC_TRUE;
+                        } else {
+                            break;  /* Failed to seek to FLAC frame. */
+                        }
+                    } else {
+                        byteRangeLo = lastSuccessfulSeekOffset;
+                        if (byteRangeHi < byteRangeLo) {
+                            byteRangeHi = byteRangeLo;
+                        }
+
+                        targetByte = lastSuccessfulSeekOffset + (drflac_uint64)(((drflac_int64)((pcmFrameIndex-pcmRangeLo) * pFlac->channels * pFlac->bitsPerSample)/8.0f) * approxCompressionRatio);
+                        if (targetByte > byteRangeHi) {
+                            targetByte = byteRangeHi;
+                        }
+
+                        if (closestSeekOffsetBeforeTargetPCMFrame < lastSuccessfulSeekOffset) {
+                            closestSeekOffsetBeforeTargetPCMFrame = lastSuccessfulSeekOffset;
+                        }
+                    }
+                }
+            }
+        } else {
+            /* Getting here is really bad. We just recover as best we can, but moving to the first frame in the stream, and then abort. */
+            break;
+        }
+    }
+
+    drflac__seek_to_first_frame(pFlac); /* <-- Try to recover. */
+    return DRFLAC_FALSE;
+}
+
+static drflac_bool32 drflac__seek_to_pcm_frame__binary_search(drflac* pFlac, drflac_uint64 pcmFrameIndex)
+{
+    drflac_uint64 byteRangeLo;
+    drflac_uint64 byteRangeHi;
+    drflac_uint32 seekForwardThreshold = (pFlac->maxBlockSizeInPCMFrames != 0) ? pFlac->maxBlockSizeInPCMFrames*2 : 4096;
+
+    /* Our algorithm currently assumes the FLAC stream is currently sitting at the start. */
+    if (drflac__seek_to_first_frame(pFlac) == DRFLAC_FALSE) {
+        return DRFLAC_FALSE;
+    }
+
+    /* If we're close enough to the start, just move to the start and seek forward. */
+    if (pcmFrameIndex < seekForwardThreshold) {
+        return drflac__seek_forward_by_pcm_frames(pFlac, pcmFrameIndex) == pcmFrameIndex;
+    }
+
+    /*
+    Our starting byte range is the byte position of the first FLAC frame and the approximate end of the file as if it were completely uncompressed. This ensures
+    the entire file is included, even though most of the time it'll exceed the end of the actual stream. This is OK as the frame searching logic will handle it.
+    */
+    byteRangeLo = pFlac->firstFLACFramePosInBytes;
+    byteRangeHi = pFlac->firstFLACFramePosInBytes + (drflac_uint64)((drflac_int64)(pFlac->totalPCMFrameCount * pFlac->channels * pFlac->bitsPerSample)/8.0f);
+
+    return drflac__seek_to_pcm_frame__binary_search_internal(pFlac, pcmFrameIndex, byteRangeLo, byteRangeHi);
+}
+#endif  /* !DR_FLAC_NO_CRC */
+
+static drflac_bool32 drflac__seek_to_pcm_frame__seek_table(drflac* pFlac, drflac_uint64 pcmFrameIndex)
+{
+    drflac_uint32 iClosestSeekpoint = 0;
+    drflac_bool32 isMidFrame = DRFLAC_FALSE;
+    drflac_uint64 runningPCMFrameCount;
+    drflac_uint32 iSeekpoint;
+
+
+    DRFLAC_ASSERT(pFlac != NULL);
+
+    if (pFlac->pSeekpoints == NULL || pFlac->seekpointCount == 0) {
+        return DRFLAC_FALSE;
+    }
+
+    /* Do not use the seektable if pcmFramIndex is not coverd by it. */
+    if (pFlac->pSeekpoints[0].firstPCMFrame > pcmFrameIndex) {
+        return DRFLAC_FALSE;
+    }
+
+    for (iSeekpoint = 0; iSeekpoint < pFlac->seekpointCount; ++iSeekpoint) {
+        if (pFlac->pSeekpoints[iSeekpoint].firstPCMFrame >= pcmFrameIndex) {
+            break;
+        }
+
+        iClosestSeekpoint = iSeekpoint;
+    }
+
+    /* There's been cases where the seek table contains only zeros. We need to do some basic validation on the closest seekpoint. */
+    if (pFlac->pSeekpoints[iClosestSeekpoint].pcmFrameCount == 0 || pFlac->pSeekpoints[iClosestSeekpoint].pcmFrameCount > pFlac->maxBlockSizeInPCMFrames) {
+        return DRFLAC_FALSE;
+    }
+    if (pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame > pFlac->totalPCMFrameCount && pFlac->totalPCMFrameCount > 0) {
+        return DRFLAC_FALSE;
+    }
+
+#if !defined(DR_FLAC_NO_CRC)
+    /* At this point we should know the closest seek point. We can use a binary search for this. We need to know the total sample count for this. */
+    if (pFlac->totalPCMFrameCount > 0) {
+        drflac_uint64 byteRangeLo;
+        drflac_uint64 byteRangeHi;
+
+        byteRangeHi = pFlac->firstFLACFramePosInBytes + (drflac_uint64)((drflac_int64)(pFlac->totalPCMFrameCount * pFlac->channels * pFlac->bitsPerSample)/8.0f);
+        byteRangeLo = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset;
+
+        /*
+        If our closest seek point is not the last one, we only need to search between it and the next one. The section below calculates an appropriate starting
+        value for byteRangeHi which will clamp it appropriately.
+
+        Note that the next seekpoint must have an offset greater than the closest seekpoint because otherwise our binary search algorithm will break down. There
+        have been cases where a seektable consists of seek points where every byte offset is set to 0 which causes problems. If this happens we need to abort.
+        */
+        if (iClosestSeekpoint < pFlac->seekpointCount-1) {
+            drflac_uint32 iNextSeekpoint = iClosestSeekpoint + 1;
+
+            /* Basic validation on the seekpoints to ensure they're usable. */
+            if (pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset >= pFlac->pSeekpoints[iNextSeekpoint].flacFrameOffset || pFlac->pSeekpoints[iNextSeekpoint].pcmFrameCount == 0) {
+                return DRFLAC_FALSE;    /* The next seekpoint doesn't look right. The seek table cannot be trusted from here. Abort. */
+            }
+
+            if (pFlac->pSeekpoints[iNextSeekpoint].firstPCMFrame != (((drflac_uint64)0xFFFFFFFF << 32) | 0xFFFFFFFF)) { /* Make sure it's not a placeholder seekpoint. */
+                byteRangeHi = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iNextSeekpoint].flacFrameOffset - 1; /* byteRangeHi must be zero based. */
+            }
+        }
+
+        if (drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset)) {
+            if (drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+                drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &pFlac->currentPCMFrame, NULL);
+
+                if (drflac__seek_to_pcm_frame__binary_search_internal(pFlac, pcmFrameIndex, byteRangeLo, byteRangeHi)) {
+                    return DRFLAC_TRUE;
+                }
+            }
+        }
+    }
+#endif  /* !DR_FLAC_NO_CRC */
+
+    /* Getting here means we need to use a slower algorithm because the binary search method failed or cannot be used. */
+
+    /*
+    If we are seeking forward and the closest seekpoint is _before_ the current sample, we just seek forward from where we are. Otherwise we start seeking
+    from the seekpoint's first sample.
+    */
+    if (pcmFrameIndex >= pFlac->currentPCMFrame && pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame <= pFlac->currentPCMFrame) {
+        /* Optimized case. Just seek forward from where we are. */
+        runningPCMFrameCount = pFlac->currentPCMFrame;
+
+        /* The frame header for the first frame may not yet have been read. We need to do that if necessary. */
+        if (pFlac->currentPCMFrame == 0 && pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
+            if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+                return DRFLAC_FALSE;
+            }
+        } else {
+            isMidFrame = DRFLAC_TRUE;
+        }
+    } else {
+        /* Slower case. Seek to the start of the seekpoint and then seek forward from there. */
+        runningPCMFrameCount = pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame;
+
+        if (!drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset)) {
+            return DRFLAC_FALSE;
+        }
+
+        /* Grab the frame the seekpoint is sitting on in preparation for the sample-exact seeking below. */
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+            return DRFLAC_FALSE;
+        }
+    }
+
+    for (;;) {
+        drflac_uint64 pcmFrameCountInThisFLACFrame;
+        drflac_uint64 firstPCMFrameInFLACFrame = 0;
+        drflac_uint64 lastPCMFrameInFLACFrame = 0;
+
+        drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);
+
+        pcmFrameCountInThisFLACFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
+        if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFLACFrame)) {
+            /*
+            The sample should be in this frame. We need to fully decode it, but if it's an invalid frame (a CRC mismatch) we need to pretend
+            it never existed and keep iterating.
+            */
+            drflac_uint64 pcmFramesToDecode = pcmFrameIndex - runningPCMFrameCount;
+
+            if (!isMidFrame) {
+                drflac_result result = drflac__decode_flac_frame(pFlac);
+                if (result == DRFLAC_SUCCESS) {
+                    /* The frame is valid. We just need to skip over some samples to ensure it's sample-exact. */
+                    return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;  /* <-- If this fails, something bad has happened (it should never fail). */
+                } else {
+                    if (result == DRFLAC_CRC_MISMATCH) {
+                        goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
+                    } else {
+                        return DRFLAC_FALSE;
+                    }
+                }
+            } else {
+                /* We started seeking mid-frame which means we need to skip the frame decoding part. */
+                return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
+            }
+        } else {
+            /*
+            It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
+            frame never existed and leave the running sample count untouched.
+            */
+            if (!isMidFrame) {
+                drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
+                if (result == DRFLAC_SUCCESS) {
+                    runningPCMFrameCount += pcmFrameCountInThisFLACFrame;
+                } else {
+                    if (result == DRFLAC_CRC_MISMATCH) {
+                        goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
+                    } else {
+                        return DRFLAC_FALSE;
+                    }
+                }
+            } else {
+                /*
+                We started seeking mid-frame which means we need to seek by reading to the end of the frame instead of with
+                drflac__seek_to_next_flac_frame() which only works if the decoder is sitting on the byte just after the frame header.
+                */
+                runningPCMFrameCount += pFlac->currentFLACFrame.pcmFramesRemaining;
+                pFlac->currentFLACFrame.pcmFramesRemaining = 0;
+                isMidFrame = DRFLAC_FALSE;
+            }
+
+            /* If we are seeking to the end of the file and we've just hit it, we're done. */
+            if (pcmFrameIndex == pFlac->totalPCMFrameCount && runningPCMFrameCount == pFlac->totalPCMFrameCount) {
+                return DRFLAC_TRUE;
+            }
+        }
+
+    next_iteration:
+        /* Grab the next frame in preparation for the next iteration. */
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+            return DRFLAC_FALSE;
+        }
+    }
+}
+
+
+#ifndef DR_FLAC_NO_OGG
+typedef struct
+{
+    drflac_uint8 capturePattern[4];  /* Should be "OggS" */
+    drflac_uint8 structureVersion;   /* Always 0. */
+    drflac_uint8 headerType;
+    drflac_uint64 granulePosition;
+    drflac_uint32 serialNumber;
+    drflac_uint32 sequenceNumber;
+    drflac_uint32 checksum;
+    drflac_uint8 segmentCount;
+    drflac_uint8 segmentTable[255];
+} drflac_ogg_page_header;
+#endif
+
+typedef struct
+{
+    drflac_read_proc onRead;
+    drflac_seek_proc onSeek;
+    drflac_tell_proc onTell;
+    drflac_meta_proc onMeta;
+    drflac_container container;
+    void* pUserData;
+    void* pUserDataMD;
+    drflac_uint32 sampleRate;
+    drflac_uint8  channels;
+    drflac_uint8  bitsPerSample;
+    drflac_uint64 totalPCMFrameCount;
+    drflac_uint16 maxBlockSizeInPCMFrames;
+    drflac_uint64 runningFilePos;
+    drflac_bool32 hasStreamInfoBlock;
+    drflac_bool32 hasMetadataBlocks;
+    drflac_bs bs;                           /* <-- A bit streamer is required for loading data during initialization. */
+    drflac_frame_header firstFrameHeader;   /* <-- The header of the first frame that was read during relaxed initalization. Only set if there is no STREAMINFO block. */
+
+#ifndef DR_FLAC_NO_OGG
+    drflac_uint32 oggSerial;
+    drflac_uint64 oggFirstBytePos;
+    drflac_ogg_page_header oggBosHeader;
+#endif
+} drflac_init_info;
+
+static DRFLAC_INLINE void drflac__decode_block_header(drflac_uint32 blockHeader, drflac_uint8* isLastBlock, drflac_uint8* blockType, drflac_uint32* blockSize)
+{
+    blockHeader = drflac__be2host_32(blockHeader);
+    *isLastBlock = (drflac_uint8)((blockHeader & 0x80000000UL) >> 31);
+    *blockType   = (drflac_uint8)((blockHeader & 0x7F000000UL) >> 24);
+    *blockSize   =                (blockHeader & 0x00FFFFFFUL);
+}
+
+static DRFLAC_INLINE drflac_bool32 drflac__read_and_decode_block_header(drflac_read_proc onRead, void* pUserData, drflac_uint8* isLastBlock, drflac_uint8* blockType, drflac_uint32* blockSize)
+{
+    drflac_uint32 blockHeader;
+
+    *blockSize = 0;
+    if (onRead(pUserData, &blockHeader, 4) != 4) {
+        return DRFLAC_FALSE;
+    }
+
+    drflac__decode_block_header(blockHeader, isLastBlock, blockType, blockSize);
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__read_streaminfo(drflac_read_proc onRead, void* pUserData, drflac_streaminfo* pStreamInfo)
+{
+    drflac_uint32 blockSizes;
+    drflac_uint64 frameSizes = 0;
+    drflac_uint64 importantProps;
+    drflac_uint8 md5[16];
+
+    /* min/max block size. */
+    if (onRead(pUserData, &blockSizes, 4) != 4) {
+        return DRFLAC_FALSE;
+    }
+
+    /* min/max frame size. */
+    if (onRead(pUserData, &frameSizes, 6) != 6) {
+        return DRFLAC_FALSE;
+    }
+
+    /* Sample rate, channels, bits per sample and total sample count. */
+    if (onRead(pUserData, &importantProps, 8) != 8) {
+        return DRFLAC_FALSE;
+    }
+
+    /* MD5 */
+    if (onRead(pUserData, md5, sizeof(md5)) != sizeof(md5)) {
+        return DRFLAC_FALSE;
+    }
+
+    blockSizes     = drflac__be2host_32(blockSizes);
+    frameSizes     = drflac__be2host_64(frameSizes);
+    importantProps = drflac__be2host_64(importantProps);
+
+    pStreamInfo->minBlockSizeInPCMFrames = (drflac_uint16)((blockSizes & 0xFFFF0000) >> 16);
+    pStreamInfo->maxBlockSizeInPCMFrames = (drflac_uint16) (blockSizes & 0x0000FFFF);
+    pStreamInfo->minFrameSizeInPCMFrames = (drflac_uint32)((frameSizes     &  (((drflac_uint64)0x00FFFFFF << 16) << 24)) >> 40);
+    pStreamInfo->maxFrameSizeInPCMFrames = (drflac_uint32)((frameSizes     &  (((drflac_uint64)0x00FFFFFF << 16) <<  0)) >> 16);
+    pStreamInfo->sampleRate              = (drflac_uint32)((importantProps &  (((drflac_uint64)0x000FFFFF << 16) << 28)) >> 44);
+    pStreamInfo->channels                = (drflac_uint8 )((importantProps &  (((drflac_uint64)0x0000000E << 16) << 24)) >> 41) + 1;
+    pStreamInfo->bitsPerSample           = (drflac_uint8 )((importantProps &  (((drflac_uint64)0x0000001F << 16) << 20)) >> 36) + 1;
+    pStreamInfo->totalPCMFrameCount      =                ((importantProps & ((((drflac_uint64)0x0000000F << 16) << 16) | 0xFFFFFFFF)));
+    DRFLAC_COPY_MEMORY(pStreamInfo->md5, md5, sizeof(md5));
+
+    return DRFLAC_TRUE;
+}
+
+
+static void* drflac__malloc_default(size_t sz, void* pUserData)
+{
+    (void)pUserData;
+    return DRFLAC_MALLOC(sz);
+}
+
+static void* drflac__realloc_default(void* p, size_t sz, void* pUserData)
+{
+    (void)pUserData;
+    return DRFLAC_REALLOC(p, sz);
+}
+
+static void drflac__free_default(void* p, void* pUserData)
+{
+    (void)pUserData;
+    DRFLAC_FREE(p);
+}
+
+
+static void* drflac__malloc_from_callbacks(size_t sz, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    if (pAllocationCallbacks == NULL) {
+        return NULL;
+    }
+
+    if (pAllocationCallbacks->onMalloc != NULL) {
+        return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData);
+    }
+
+    /* Try using realloc(). */
+    if (pAllocationCallbacks->onRealloc != NULL) {
+        return pAllocationCallbacks->onRealloc(NULL, sz, pAllocationCallbacks->pUserData);
+    }
+
+    return NULL;
+}
+
+static void* drflac__realloc_from_callbacks(void* p, size_t szNew, size_t szOld, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    if (pAllocationCallbacks == NULL) {
+        return NULL;
+    }
+
+    if (pAllocationCallbacks->onRealloc != NULL) {
+        return pAllocationCallbacks->onRealloc(p, szNew, pAllocationCallbacks->pUserData);
+    }
+
+    /* Try emulating realloc() in terms of malloc()/free(). */
+    if (pAllocationCallbacks->onMalloc != NULL && pAllocationCallbacks->onFree != NULL) {
+        void* p2;
+
+        p2 = pAllocationCallbacks->onMalloc(szNew, pAllocationCallbacks->pUserData);
+        if (p2 == NULL) {
+            return NULL;
+        }
+
+        if (p != NULL) {
+            DRFLAC_COPY_MEMORY(p2, p, szOld);
+            pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
+        }
+
+        return p2;
+    }
+
+    return NULL;
+}
+
+static void drflac__free_from_callbacks(void* p, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    if (p == NULL || pAllocationCallbacks == NULL) {
+        return;
+    }
+
+    if (pAllocationCallbacks->onFree != NULL) {
+        pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
+    }
+}
+
+
+static drflac_bool32 drflac__read_and_decode_metadata(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, drflac_meta_proc onMeta, void* pUserData, void* pUserDataMD, drflac_uint64* pFirstFramePos, drflac_uint64* pSeektablePos, drflac_uint32* pSeekpointCount, drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    /*
+    We want to keep track of the byte position in the stream of the seektable. At the time of calling this function we know that
+    we'll be sitting on byte 42.
+    */
+    drflac_uint64 runningFilePos = 42;
+    drflac_uint64 seektablePos   = 0;
+    drflac_uint32 seektableSize  = 0;
+
+    (void)onTell;
+
+    for (;;) {
+        drflac_metadata metadata;
+        drflac_uint8 isLastBlock = 0;
+        drflac_uint8 blockType = 0;
+        drflac_uint32 blockSize;
+        if (drflac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize) == DRFLAC_FALSE) {
+            return DRFLAC_FALSE;
+        }
+        runningFilePos += 4;
+
+        metadata.type = blockType;
+        metadata.rawDataSize = 0;
+        metadata.rawDataOffset = runningFilePos;
+        metadata.pRawData = NULL;
+
+        switch (blockType)
+        {
+            case DRFLAC_METADATA_BLOCK_TYPE_APPLICATION:
+            {
+                if (blockSize < 4) {
+                    return DRFLAC_FALSE;
+                }
+
+                if (onMeta) {
+                    void* pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
+                    if (pRawData == NULL) {
+                        return DRFLAC_FALSE;
+                    }
+
+                    if (onRead(pUserData, pRawData, blockSize) != blockSize) {
+                        drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                        return DRFLAC_FALSE;
+                    }
+
+                    metadata.pRawData = pRawData;
+                    metadata.rawDataSize = blockSize;
+                    metadata.data.application.id       = drflac__be2host_32(*(drflac_uint32*)pRawData);
+                    metadata.data.application.pData    = (const void*)((drflac_uint8*)pRawData + sizeof(drflac_uint32));
+                    metadata.data.application.dataSize = blockSize - sizeof(drflac_uint32);
+                    onMeta(pUserDataMD, &metadata);
+
+                    drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                }
+            } break;
+
+            case DRFLAC_METADATA_BLOCK_TYPE_SEEKTABLE:
+            {
+                seektablePos  = runningFilePos;
+                seektableSize = blockSize;
+
+                if (onMeta) {
+                    drflac_uint32 seekpointCount;
+                    drflac_uint32 iSeekpoint;
+                    void* pRawData;
+
+                    seekpointCount = blockSize/DRFLAC_SEEKPOINT_SIZE_IN_BYTES;
+
+                    pRawData = drflac__malloc_from_callbacks(seekpointCount * sizeof(drflac_seekpoint), pAllocationCallbacks);
+                    if (pRawData == NULL) {
+                        return DRFLAC_FALSE;
+                    }
+
+                    /* We need to read seekpoint by seekpoint and do some processing. */
+                    for (iSeekpoint = 0; iSeekpoint < seekpointCount; ++iSeekpoint) {
+                        drflac_seekpoint* pSeekpoint = (drflac_seekpoint*)pRawData + iSeekpoint;
+
+                        if (onRead(pUserData, pSeekpoint, DRFLAC_SEEKPOINT_SIZE_IN_BYTES) != DRFLAC_SEEKPOINT_SIZE_IN_BYTES) {
+                            drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                            return DRFLAC_FALSE;
+                        }
+
+                        /* Endian swap. */
+                        pSeekpoint->firstPCMFrame   = drflac__be2host_64(pSeekpoint->firstPCMFrame);
+                        pSeekpoint->flacFrameOffset = drflac__be2host_64(pSeekpoint->flacFrameOffset);
+                        pSeekpoint->pcmFrameCount   = drflac__be2host_16(pSeekpoint->pcmFrameCount);
+                    }
+
+                    metadata.pRawData = pRawData;
+                    metadata.rawDataSize = blockSize;
+                    metadata.data.seektable.seekpointCount = seekpointCount;
+                    metadata.data.seektable.pSeekpoints = (const drflac_seekpoint*)pRawData;
+
+                    onMeta(pUserDataMD, &metadata);
+
+                    drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                }
+            } break;
+
+            case DRFLAC_METADATA_BLOCK_TYPE_VORBIS_COMMENT:
+            {
+                if (blockSize < 8) {
+                    return DRFLAC_FALSE;
+                }
+
+                if (onMeta) {
+                    void* pRawData;
+                    const char* pRunningData;
+                    const char* pRunningDataEnd;
+                    drflac_uint32 i;
+
+                    pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
+                    if (pRawData == NULL) {
+                        return DRFLAC_FALSE;
+                    }
+
+                    if (onRead(pUserData, pRawData, blockSize) != blockSize) {
+                        drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                        return DRFLAC_FALSE;
+                    }
+
+                    metadata.pRawData = pRawData;
+                    metadata.rawDataSize = blockSize;
+
+                    pRunningData    = (const char*)pRawData;
+                    pRunningDataEnd = (const char*)pRawData + blockSize;
+
+                    metadata.data.vorbis_comment.vendorLength = drflac__le2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
+
+                    /* Need space for the rest of the block */
+                    if ((pRunningDataEnd - pRunningData) - 4 < (drflac_int64)metadata.data.vorbis_comment.vendorLength) { /* <-- Note the order of operations to avoid overflow to a valid value */
+                        drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                        return DRFLAC_FALSE;
+                    }
+                    metadata.data.vorbis_comment.vendor       = pRunningData;                                            pRunningData += metadata.data.vorbis_comment.vendorLength;
+                    metadata.data.vorbis_comment.commentCount = drflac__le2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
+
+                    /* Need space for 'commentCount' comments after the block, which at minimum is a drflac_uint32 per comment */
+                    if ((pRunningDataEnd - pRunningData) / sizeof(drflac_uint32) < metadata.data.vorbis_comment.commentCount) { /* <-- Note the order of operations to avoid overflow to a valid value */
+                        drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                        return DRFLAC_FALSE;
+                    }
+                    metadata.data.vorbis_comment.pComments    = pRunningData;
+
+                    /* Check that the comments section is valid before passing it to the callback */
+                    for (i = 0; i < metadata.data.vorbis_comment.commentCount; ++i) {
+                        drflac_uint32 commentLength;
+
+                        if (pRunningDataEnd - pRunningData < 4) {
+                            drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                            return DRFLAC_FALSE;
+                        }
+
+                        commentLength = drflac__le2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
+                        if (pRunningDataEnd - pRunningData < (drflac_int64)commentLength) { /* <-- Note the order of operations to avoid overflow to a valid value */
+                            drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                            return DRFLAC_FALSE;
+                        }
+                        pRunningData += commentLength;
+                    }
+
+                    onMeta(pUserDataMD, &metadata);
+
+                    drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                }
+            } break;
+
+            case DRFLAC_METADATA_BLOCK_TYPE_CUESHEET:
+            {
+                if (blockSize < 396) {
+                    return DRFLAC_FALSE;
+                }
+
+                if (onMeta) {
+                    void* pRawData;
+                    const char* pRunningData;
+                    const char* pRunningDataEnd;
+                    size_t bufferSize;
+                    drflac_uint8 iTrack;
+                    drflac_uint8 iIndex;
+                    void* pTrackData;
+
+                    /*
+                    This needs to be loaded in two passes. The first pass is used to calculate the size of the memory allocation
+                    we need for storing the necessary data. The second pass will fill that buffer with usable data.
+                    */
+                    pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
+                    if (pRawData == NULL) {
+                        return DRFLAC_FALSE;
+                    }
+
+                    if (onRead(pUserData, pRawData, blockSize) != blockSize) {
+                        drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                        return DRFLAC_FALSE;
+                    }
+
+                    metadata.pRawData = pRawData;
+                    metadata.rawDataSize = blockSize;
+
+                    pRunningData    = (const char*)pRawData;
+                    pRunningDataEnd = (const char*)pRawData + blockSize;
+
+                    DRFLAC_COPY_MEMORY(metadata.data.cuesheet.catalog, pRunningData, 128);                              pRunningData += 128;
+                    metadata.data.cuesheet.leadInSampleCount = drflac__be2host_64(*(const drflac_uint64*)pRunningData); pRunningData += 8;
+                    metadata.data.cuesheet.isCD              = (pRunningData[0] & 0x80) != 0;                           pRunningData += 259;
+                    metadata.data.cuesheet.trackCount        = pRunningData[0];                                         pRunningData += 1;
+                    metadata.data.cuesheet.pTrackData        = NULL;    /* Will be filled later. */
+
+                    /* Pass 1: Calculate the size of the buffer for the track data. */
+                    {
+                        const char* pRunningDataSaved = pRunningData;   /* Will be restored at the end in preparation for the second pass. */
+
+                        bufferSize = metadata.data.cuesheet.trackCount * DRFLAC_CUESHEET_TRACK_SIZE_IN_BYTES;
+
+                        for (iTrack = 0; iTrack < metadata.data.cuesheet.trackCount; ++iTrack) {
+                            drflac_uint8 indexCount;
+                            drflac_uint32 indexPointSize;
+
+                            if (pRunningDataEnd - pRunningData < DRFLAC_CUESHEET_TRACK_SIZE_IN_BYTES) {
+                                drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                                return DRFLAC_FALSE;
+                            }
+
+                            /* Skip to the index point count */
+                            pRunningData += 35;
+
+                            indexCount = pRunningData[0];
+                            pRunningData += 1;
+
+                            bufferSize += indexCount * sizeof(drflac_cuesheet_track_index);
+
+                            /* Quick validation check. */
+                            indexPointSize = indexCount * DRFLAC_CUESHEET_TRACK_INDEX_SIZE_IN_BYTES;
+                            if (pRunningDataEnd - pRunningData < (drflac_int64)indexPointSize) {
+                                drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                                return DRFLAC_FALSE;
+                            }
+
+                            pRunningData += indexPointSize;
+                        }
+
+                        pRunningData = pRunningDataSaved;
+                    }
+
+                    /* Pass 2: Allocate a buffer and fill the data. Validation was done in the step above so can be skipped. */
+                    {
+                        char* pRunningTrackData;
+
+                        pTrackData = drflac__malloc_from_callbacks(bufferSize, pAllocationCallbacks);
+                        if (pTrackData == NULL) {
+                            drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                            return DRFLAC_FALSE;
+                        }
+
+                        pRunningTrackData = (char*)pTrackData;
+
+                        for (iTrack = 0; iTrack < metadata.data.cuesheet.trackCount; ++iTrack) {
+                            drflac_uint8 indexCount;
+
+                            DRFLAC_COPY_MEMORY(pRunningTrackData, pRunningData, DRFLAC_CUESHEET_TRACK_SIZE_IN_BYTES);
+                            pRunningData      += DRFLAC_CUESHEET_TRACK_SIZE_IN_BYTES-1; /* Skip forward, but not beyond the last byte in the CUESHEET_TRACK block which is the index count. */
+                            pRunningTrackData += DRFLAC_CUESHEET_TRACK_SIZE_IN_BYTES-1;
+
+                            /* Grab the index count for the next part. */
+                            indexCount = pRunningData[0];
+                            pRunningData      += 1;
+                            pRunningTrackData += 1;
+
+                            /* Extract each track index. */
+                            for (iIndex = 0; iIndex < indexCount; ++iIndex) {
+                                drflac_cuesheet_track_index* pTrackIndex = (drflac_cuesheet_track_index*)pRunningTrackData;
+
+                                DRFLAC_COPY_MEMORY(pRunningTrackData, pRunningData, DRFLAC_CUESHEET_TRACK_INDEX_SIZE_IN_BYTES);
+                                pRunningData      += DRFLAC_CUESHEET_TRACK_INDEX_SIZE_IN_BYTES;
+                                pRunningTrackData += sizeof(drflac_cuesheet_track_index);
+
+                                pTrackIndex->offset = drflac__be2host_64(pTrackIndex->offset);
+                            }
+                        }
+
+                        metadata.data.cuesheet.pTrackData = pTrackData;
+                    }
+
+                    /* The original data is no longer needed. */
+                    drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                    pRawData = NULL;
+
+                    onMeta(pUserDataMD, &metadata);
+
+                    drflac__free_from_callbacks(pTrackData, pAllocationCallbacks);
+                    pTrackData = NULL;
+                }
+            } break;
+
+            case DRFLAC_METADATA_BLOCK_TYPE_PICTURE:
+            {
+                if (blockSize < 32) {
+                    return DRFLAC_FALSE;
+                }
+
+                if (onMeta) {
+                    drflac_bool32 result = DRFLAC_TRUE;
+                    drflac_uint32 blockSizeRemaining = blockSize;
+                    char* pMime = NULL;
+                    char* pDescription = NULL;
+                    void* pPictureData = NULL;
+
+                    if (blockSizeRemaining < 4 || onRead(pUserData, &metadata.data.picture.type, 4) != 4) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+                    blockSizeRemaining -= 4;
+                    metadata.data.picture.type = drflac__be2host_32(metadata.data.picture.type);
+
+
+                    if (blockSizeRemaining < 4 || onRead(pUserData, &metadata.data.picture.mimeLength, 4) != 4) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+                    blockSizeRemaining -= 4;
+                    metadata.data.picture.mimeLength = drflac__be2host_32(metadata.data.picture.mimeLength);
+
+                    pMime = (char*)drflac__malloc_from_callbacks(metadata.data.picture.mimeLength + 1, pAllocationCallbacks); /* +1 for null terminator. */
+                    if (pMime == NULL) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+
+                    if (blockSizeRemaining < metadata.data.picture.mimeLength || onRead(pUserData, pMime, metadata.data.picture.mimeLength) != metadata.data.picture.mimeLength) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+                    blockSizeRemaining -= metadata.data.picture.mimeLength;
+                    pMime[metadata.data.picture.mimeLength] = '\0';  /* Null terminate for safety. */
+                    metadata.data.picture.mime = (const char*)pMime;
+
+
+                    if (blockSizeRemaining < 4 || onRead(pUserData, &metadata.data.picture.descriptionLength, 4) != 4) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+                    blockSizeRemaining -= 4;
+                    metadata.data.picture.descriptionLength = drflac__be2host_32(metadata.data.picture.descriptionLength);
+
+                    pDescription = (char*)drflac__malloc_from_callbacks(metadata.data.picture.descriptionLength + 1, pAllocationCallbacks); /* +1 for null terminator. */
+                    if (pDescription == NULL) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+
+                    if (blockSizeRemaining < metadata.data.picture.descriptionLength || onRead(pUserData, pDescription, metadata.data.picture.descriptionLength) != metadata.data.picture.descriptionLength) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+                    blockSizeRemaining -= metadata.data.picture.descriptionLength;
+                    pDescription[metadata.data.picture.descriptionLength] = '\0';  /* Null terminate for safety. */
+                    metadata.data.picture.description = (const char*)pDescription;
+
+
+                    if (blockSizeRemaining < 4 || onRead(pUserData, &metadata.data.picture.width, 4) != 4) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+                    blockSizeRemaining -= 4;
+                    metadata.data.picture.width = drflac__be2host_32(metadata.data.picture.width);
+
+                    if (blockSizeRemaining < 4 || onRead(pUserData, &metadata.data.picture.height, 4) != 4) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+                    blockSizeRemaining -= 4;
+                    metadata.data.picture.height = drflac__be2host_32(metadata.data.picture.height);
+
+                    if (blockSizeRemaining < 4 || onRead(pUserData, &metadata.data.picture.colorDepth, 4) != 4) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+                    blockSizeRemaining -= 4;
+                    metadata.data.picture.colorDepth = drflac__be2host_32(metadata.data.picture.colorDepth);
+
+                    if (blockSizeRemaining < 4 || onRead(pUserData, &metadata.data.picture.indexColorCount, 4) != 4) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+                    blockSizeRemaining -= 4;
+                    metadata.data.picture.indexColorCount = drflac__be2host_32(metadata.data.picture.indexColorCount);
+
+
+                    /* Picture data. */
+                    if (blockSizeRemaining < 4 || onRead(pUserData, &metadata.data.picture.pictureDataSize, 4) != 4) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+                    blockSizeRemaining -= 4;
+                    metadata.data.picture.pictureDataSize = drflac__be2host_32(metadata.data.picture.pictureDataSize);
+
+                    if (blockSizeRemaining < metadata.data.picture.pictureDataSize) {
+                        result = DRFLAC_FALSE;
+                        goto done_flac;
+                    }
+
+                    /* For the actual image data we want to store the offset to the start of the stream. */
+                    metadata.data.picture.pictureDataOffset = runningFilePos + (blockSize - blockSizeRemaining);
+
+                    /*
+                    For the allocation of image data, we can allow memory allocation to fail, in which case we just leave
+                    the pointer as null. If it fails, we need to fall back to seeking past the image data.
+                    */
+                #ifndef DR_FLAC_NO_PICTURE_METADATA_MALLOC
+                    pPictureData = drflac__malloc_from_callbacks(metadata.data.picture.pictureDataSize, pAllocationCallbacks);
+                    if (pPictureData != NULL) {
+                        if (onRead(pUserData, pPictureData, metadata.data.picture.pictureDataSize) != metadata.data.picture.pictureDataSize) {
+                            result = DRFLAC_FALSE;
+                            goto done_flac;
+                        }
+                    } else
+                #endif
+                    {
+                        /* Allocation failed. We need to seek past the picture data. */
+                        if (!onSeek(pUserData, metadata.data.picture.pictureDataSize, DRFLAC_SEEK_CUR)) {
+                            result = DRFLAC_FALSE;
+                            goto done_flac;
+                        }
+                    }
+
+                    blockSizeRemaining -= metadata.data.picture.pictureDataSize;
+                    metadata.data.picture.pPictureData = (const drflac_uint8*)pPictureData;
+
+
+                    /* Only fire the callback if we actually have a way to read the image data. We must have either a valid offset, or a valid data pointer. */
+                    if (metadata.data.picture.pictureDataOffset != 0 || metadata.data.picture.pPictureData != NULL) {
+                        onMeta(pUserDataMD, &metadata);
+                    } else {
+                        /* Don't have a valid offset or data pointer, so just pretend we don't have a picture metadata. */
+                    }
+
+                done_flac:
+                    drflac__free_from_callbacks(pMime,        pAllocationCallbacks);
+                    drflac__free_from_callbacks(pDescription, pAllocationCallbacks);
+                    drflac__free_from_callbacks(pPictureData, pAllocationCallbacks);
+
+                    if (result != DRFLAC_TRUE) {
+                        return DRFLAC_FALSE;
+                    }
+                }
+            } break;
+
+            case DRFLAC_METADATA_BLOCK_TYPE_PADDING:
+            {
+                if (onMeta) {
+                    metadata.data.padding.unused = 0;
+
+                    /* Padding doesn't have anything meaningful in it, so just skip over it, but make sure the caller is aware of it by firing the callback. */
+                    if (!onSeek(pUserData, blockSize, DRFLAC_SEEK_CUR)) {
+                        isLastBlock = DRFLAC_TRUE;  /* An error occurred while seeking. Attempt to recover by treating this as the last block which will in turn terminate the loop. */
+                    } else {
+                        onMeta(pUserDataMD, &metadata);
+                    }
+                }
+            } break;
+
+            case DRFLAC_METADATA_BLOCK_TYPE_INVALID:
+            {
+                /* Invalid chunk. Just skip over this one. */
+                if (onMeta) {
+                    if (!onSeek(pUserData, blockSize, DRFLAC_SEEK_CUR)) {
+                        isLastBlock = DRFLAC_TRUE;  /* An error occurred while seeking. Attempt to recover by treating this as the last block which will in turn terminate the loop. */
+                    }
+                }
+            } break;
+
+            default:
+            {
+                /*
+                It's an unknown chunk, but not necessarily invalid. There's a chance more metadata blocks might be defined later on, so we
+                can at the very least report the chunk to the application and let it look at the raw data.
+                */
+                if (onMeta) {
+                    void* pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
+                    if (pRawData != NULL) {
+                        if (onRead(pUserData, pRawData, blockSize) != blockSize) {
+                            drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                            return DRFLAC_FALSE;
+                        }
+                    } else {
+                        /* Allocation failed. We need to seek past the block. */
+                        if (!onSeek(pUserData, blockSize, DRFLAC_SEEK_CUR)) {
+                            return DRFLAC_FALSE;
+                        }
+                    }
+
+                    metadata.pRawData = pRawData;
+                    metadata.rawDataSize = blockSize;
+                    onMeta(pUserDataMD, &metadata);
+
+                    drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
+                }
+            } break;
+        }
+
+        /* If we're not handling metadata, just skip over the block. If we are, it will have been handled earlier in the switch statement above. */
+        if (onMeta == NULL && blockSize > 0) {
+            if (!onSeek(pUserData, blockSize, DRFLAC_SEEK_CUR)) {
+                isLastBlock = DRFLAC_TRUE;
+            }
+        }
+
+        runningFilePos += blockSize;
+        if (isLastBlock) {
+            break;
+        }
+    }
+
+    *pSeektablePos   = seektablePos;
+    *pSeekpointCount = seektableSize / DRFLAC_SEEKPOINT_SIZE_IN_BYTES;
+    *pFirstFramePos  = runningFilePos;
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__init_private__native(drflac_init_info* pInit, drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, void* pUserDataMD, drflac_bool32 relaxed)
+{
+    /* Pre Condition: The bit stream should be sitting just past the 4-byte id header. */
+
+    drflac_uint8 isLastBlock;
+    drflac_uint8 blockType;
+    drflac_uint32 blockSize;
+
+    (void)onSeek;
+
+    pInit->container = drflac_container_native;
+
+    /* The first metadata block should be the STREAMINFO block. */
+    if (!drflac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize)) {
+        return DRFLAC_FALSE;
+    }
+
+    if (blockType != DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
+        if (!relaxed) {
+            /* We're opening in strict mode and the first block is not the STREAMINFO block. Error. */
+            return DRFLAC_FALSE;
+        } else {
+            /*
+            Relaxed mode. To open from here we need to just find the first frame and set the sample rate, etc. to whatever is defined
+            for that frame.
+            */
+            pInit->hasStreamInfoBlock = DRFLAC_FALSE;
+            pInit->hasMetadataBlocks  = DRFLAC_FALSE;
+
+            if (!drflac__read_next_flac_frame_header(&pInit->bs, 0, &pInit->firstFrameHeader)) {
+                return DRFLAC_FALSE;    /* Couldn't find a frame. */
+            }
+
+            if (pInit->firstFrameHeader.bitsPerSample == 0) {
+                return DRFLAC_FALSE;    /* Failed to initialize because the first frame depends on the STREAMINFO block, which does not exist. */
+            }
+
+            pInit->sampleRate              = pInit->firstFrameHeader.sampleRate;
+            pInit->channels                = drflac__get_channel_count_from_channel_assignment(pInit->firstFrameHeader.channelAssignment);
+            pInit->bitsPerSample           = pInit->firstFrameHeader.bitsPerSample;
+            pInit->maxBlockSizeInPCMFrames = 65535;   /* <-- See notes here: https://xiph.org/flac/format.html#metadata_block_streaminfo */
+            return DRFLAC_TRUE;
+        }
+    } else {
+        drflac_streaminfo streaminfo;
+        if (!drflac__read_streaminfo(onRead, pUserData, &streaminfo)) {
+            return DRFLAC_FALSE;
+        }
+
+        pInit->hasStreamInfoBlock      = DRFLAC_TRUE;
+        pInit->sampleRate              = streaminfo.sampleRate;
+        pInit->channels                = streaminfo.channels;
+        pInit->bitsPerSample           = streaminfo.bitsPerSample;
+        pInit->totalPCMFrameCount      = streaminfo.totalPCMFrameCount;
+        pInit->maxBlockSizeInPCMFrames = streaminfo.maxBlockSizeInPCMFrames;    /* Don't care about the min block size - only the max (used for determining the size of the memory allocation). */
+        pInit->hasMetadataBlocks       = !isLastBlock;
+
+        if (onMeta) {
+            drflac_metadata metadata;
+            metadata.type = DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO;
+            metadata.pRawData = NULL;
+            metadata.rawDataSize = 0;
+            metadata.data.streaminfo = streaminfo;
+            onMeta(pUserDataMD, &metadata);
+        }
+
+        return DRFLAC_TRUE;
+    }
+}
+
+#ifndef DR_FLAC_NO_OGG
+#define DRFLAC_OGG_MAX_PAGE_SIZE            65307
+#define DRFLAC_OGG_CAPTURE_PATTERN_CRC32    1605413199  /* CRC-32 of "OggS". */
+
+typedef enum
+{
+    drflac_ogg_recover_on_crc_mismatch,
+    drflac_ogg_fail_on_crc_mismatch
+} drflac_ogg_crc_mismatch_recovery;
+
+#ifndef DR_FLAC_NO_CRC
+static drflac_uint32 drflac__crc32_table[] = {
+    0x00000000L, 0x04C11DB7L, 0x09823B6EL, 0x0D4326D9L,
+    0x130476DCL, 0x17C56B6BL, 0x1A864DB2L, 0x1E475005L,
+    0x2608EDB8L, 0x22C9F00FL, 0x2F8AD6D6L, 0x2B4BCB61L,
+    0x350C9B64L, 0x31CD86D3L, 0x3C8EA00AL, 0x384FBDBDL,
+    0x4C11DB70L, 0x48D0C6C7L, 0x4593E01EL, 0x4152FDA9L,
+    0x5F15ADACL, 0x5BD4B01BL, 0x569796C2L, 0x52568B75L,
+    0x6A1936C8L, 0x6ED82B7FL, 0x639B0DA6L, 0x675A1011L,
+    0x791D4014L, 0x7DDC5DA3L, 0x709F7B7AL, 0x745E66CDL,
+    0x9823B6E0L, 0x9CE2AB57L, 0x91A18D8EL, 0x95609039L,
+    0x8B27C03CL, 0x8FE6DD8BL, 0x82A5FB52L, 0x8664E6E5L,
+    0xBE2B5B58L, 0xBAEA46EFL, 0xB7A96036L, 0xB3687D81L,
+    0xAD2F2D84L, 0xA9EE3033L, 0xA4AD16EAL, 0xA06C0B5DL,
+    0xD4326D90L, 0xD0F37027L, 0xDDB056FEL, 0xD9714B49L,
+    0xC7361B4CL, 0xC3F706FBL, 0xCEB42022L, 0xCA753D95L,
+    0xF23A8028L, 0xF6FB9D9FL, 0xFBB8BB46L, 0xFF79A6F1L,
+    0xE13EF6F4L, 0xE5FFEB43L, 0xE8BCCD9AL, 0xEC7DD02DL,
+    0x34867077L, 0x30476DC0L, 0x3D044B19L, 0x39C556AEL,
+    0x278206ABL, 0x23431B1CL, 0x2E003DC5L, 0x2AC12072L,
+    0x128E9DCFL, 0x164F8078L, 0x1B0CA6A1L, 0x1FCDBB16L,
+    0x018AEB13L, 0x054BF6A4L, 0x0808D07DL, 0x0CC9CDCAL,
+    0x7897AB07L, 0x7C56B6B0L, 0x71159069L, 0x75D48DDEL,
+    0x6B93DDDBL, 0x6F52C06CL, 0x6211E6B5L, 0x66D0FB02L,
+    0x5E9F46BFL, 0x5A5E5B08L, 0x571D7DD1L, 0x53DC6066L,
+    0x4D9B3063L, 0x495A2DD4L, 0x44190B0DL, 0x40D816BAL,
+    0xACA5C697L, 0xA864DB20L, 0xA527FDF9L, 0xA1E6E04EL,
+    0xBFA1B04BL, 0xBB60ADFCL, 0xB6238B25L, 0xB2E29692L,
+    0x8AAD2B2FL, 0x8E6C3698L, 0x832F1041L, 0x87EE0DF6L,
+    0x99A95DF3L, 0x9D684044L, 0x902B669DL, 0x94EA7B2AL,
+    0xE0B41DE7L, 0xE4750050L, 0xE9362689L, 0xEDF73B3EL,
+    0xF3B06B3BL, 0xF771768CL, 0xFA325055L, 0xFEF34DE2L,
+    0xC6BCF05FL, 0xC27DEDE8L, 0xCF3ECB31L, 0xCBFFD686L,
+    0xD5B88683L, 0xD1799B34L, 0xDC3ABDEDL, 0xD8FBA05AL,
+    0x690CE0EEL, 0x6DCDFD59L, 0x608EDB80L, 0x644FC637L,
+    0x7A089632L, 0x7EC98B85L, 0x738AAD5CL, 0x774BB0EBL,
+    0x4F040D56L, 0x4BC510E1L, 0x46863638L, 0x42472B8FL,
+    0x5C007B8AL, 0x58C1663DL, 0x558240E4L, 0x51435D53L,
+    0x251D3B9EL, 0x21DC2629L, 0x2C9F00F0L, 0x285E1D47L,
+    0x36194D42L, 0x32D850F5L, 0x3F9B762CL, 0x3B5A6B9BL,
+    0x0315D626L, 0x07D4CB91L, 0x0A97ED48L, 0x0E56F0FFL,
+    0x1011A0FAL, 0x14D0BD4DL, 0x19939B94L, 0x1D528623L,
+    0xF12F560EL, 0xF5EE4BB9L, 0xF8AD6D60L, 0xFC6C70D7L,
+    0xE22B20D2L, 0xE6EA3D65L, 0xEBA91BBCL, 0xEF68060BL,
+    0xD727BBB6L, 0xD3E6A601L, 0xDEA580D8L, 0xDA649D6FL,
+    0xC423CD6AL, 0xC0E2D0DDL, 0xCDA1F604L, 0xC960EBB3L,
+    0xBD3E8D7EL, 0xB9FF90C9L, 0xB4BCB610L, 0xB07DABA7L,
+    0xAE3AFBA2L, 0xAAFBE615L, 0xA7B8C0CCL, 0xA379DD7BL,
+    0x9B3660C6L, 0x9FF77D71L, 0x92B45BA8L, 0x9675461FL,
+    0x8832161AL, 0x8CF30BADL, 0x81B02D74L, 0x857130C3L,
+    0x5D8A9099L, 0x594B8D2EL, 0x5408ABF7L, 0x50C9B640L,
+    0x4E8EE645L, 0x4A4FFBF2L, 0x470CDD2BL, 0x43CDC09CL,
+    0x7B827D21L, 0x7F436096L, 0x7200464FL, 0x76C15BF8L,
+    0x68860BFDL, 0x6C47164AL, 0x61043093L, 0x65C52D24L,
+    0x119B4BE9L, 0x155A565EL, 0x18197087L, 0x1CD86D30L,
+    0x029F3D35L, 0x065E2082L, 0x0B1D065BL, 0x0FDC1BECL,
+    0x3793A651L, 0x3352BBE6L, 0x3E119D3FL, 0x3AD08088L,
+    0x2497D08DL, 0x2056CD3AL, 0x2D15EBE3L, 0x29D4F654L,
+    0xC5A92679L, 0xC1683BCEL, 0xCC2B1D17L, 0xC8EA00A0L,
+    0xD6AD50A5L, 0xD26C4D12L, 0xDF2F6BCBL, 0xDBEE767CL,
+    0xE3A1CBC1L, 0xE760D676L, 0xEA23F0AFL, 0xEEE2ED18L,
+    0xF0A5BD1DL, 0xF464A0AAL, 0xF9278673L, 0xFDE69BC4L,
+    0x89B8FD09L, 0x8D79E0BEL, 0x803AC667L, 0x84FBDBD0L,
+    0x9ABC8BD5L, 0x9E7D9662L, 0x933EB0BBL, 0x97FFAD0CL,
+    0xAFB010B1L, 0xAB710D06L, 0xA6322BDFL, 0xA2F33668L,
+    0xBCB4666DL, 0xB8757BDAL, 0xB5365D03L, 0xB1F740B4L
+};
+#endif
+
+static DRFLAC_INLINE drflac_uint32 drflac_crc32_byte(drflac_uint32 crc32, drflac_uint8 data)
+{
+#ifndef DR_FLAC_NO_CRC
+    return (crc32 << 8) ^ drflac__crc32_table[(drflac_uint8)((crc32 >> 24) & 0xFF) ^ data];
+#else
+    (void)data;
+    return crc32;
+#endif
+}
+
+#if 0
+static DRFLAC_INLINE drflac_uint32 drflac_crc32_uint32(drflac_uint32 crc32, drflac_uint32 data)
+{
+    crc32 = drflac_crc32_byte(crc32, (drflac_uint8)((data >> 24) & 0xFF));
+    crc32 = drflac_crc32_byte(crc32, (drflac_uint8)((data >> 16) & 0xFF));
+    crc32 = drflac_crc32_byte(crc32, (drflac_uint8)((data >>  8) & 0xFF));
+    crc32 = drflac_crc32_byte(crc32, (drflac_uint8)((data >>  0) & 0xFF));
+    return crc32;
+}
+
+static DRFLAC_INLINE drflac_uint32 drflac_crc32_uint64(drflac_uint32 crc32, drflac_uint64 data)
+{
+    crc32 = drflac_crc32_uint32(crc32, (drflac_uint32)((data >> 32) & 0xFFFFFFFF));
+    crc32 = drflac_crc32_uint32(crc32, (drflac_uint32)((data >>  0) & 0xFFFFFFFF));
+    return crc32;
+}
+#endif
+
+static DRFLAC_INLINE drflac_uint32 drflac_crc32_buffer(drflac_uint32 crc32, drflac_uint8* pData, drflac_uint32 dataSize)
+{
+    /* This can be optimized. */
+    drflac_uint32 i;
+    for (i = 0; i < dataSize; ++i) {
+        crc32 = drflac_crc32_byte(crc32, pData[i]);
+    }
+    return crc32;
+}
+
+
+static DRFLAC_INLINE drflac_bool32 drflac_ogg__is_capture_pattern(drflac_uint8 pattern[4])
+{
+    return pattern[0] == 'O' && pattern[1] == 'g' && pattern[2] == 'g' && pattern[3] == 'S';
+}
+
+static DRFLAC_INLINE drflac_uint32 drflac_ogg__get_page_header_size(drflac_ogg_page_header* pHeader)
+{
+    return 27 + pHeader->segmentCount;
+}
+
+static DRFLAC_INLINE drflac_uint32 drflac_ogg__get_page_body_size(drflac_ogg_page_header* pHeader)
+{
+    drflac_uint32 pageBodySize = 0;
+    int i;
+
+    for (i = 0; i < pHeader->segmentCount; ++i) {
+        pageBodySize += pHeader->segmentTable[i];
+    }
+
+    return pageBodySize;
+}
+
+static drflac_result drflac_ogg__read_page_header_after_capture_pattern(drflac_read_proc onRead, void* pUserData, drflac_ogg_page_header* pHeader, drflac_uint32* pBytesRead, drflac_uint32* pCRC32)
+{
+    drflac_uint8 data[23];
+    drflac_uint32 i;
+
+    DRFLAC_ASSERT(*pCRC32 == DRFLAC_OGG_CAPTURE_PATTERN_CRC32);
+
+    if (onRead(pUserData, data, 23) != 23) {
+        return DRFLAC_AT_END;
+    }
+    *pBytesRead += 23;
+
+    /*
+    It's not actually used, but set the capture pattern to 'OggS' for completeness. Not doing this will cause static analysers to complain about
+    us trying to access uninitialized data. We could alternatively just comment out this member of the drflac_ogg_page_header structure, but I
+    like to have it map to the structure of the underlying data.
+    */
+    pHeader->capturePattern[0] = 'O';
+    pHeader->capturePattern[1] = 'g';
+    pHeader->capturePattern[2] = 'g';
+    pHeader->capturePattern[3] = 'S';
+
+    pHeader->structureVersion = data[0];
+    pHeader->headerType       = data[1];
+    DRFLAC_COPY_MEMORY(&pHeader->granulePosition, &data[ 2], 8);
+    DRFLAC_COPY_MEMORY(&pHeader->serialNumber,    &data[10], 4);
+    DRFLAC_COPY_MEMORY(&pHeader->sequenceNumber,  &data[14], 4);
+    DRFLAC_COPY_MEMORY(&pHeader->checksum,        &data[18], 4);
+    pHeader->segmentCount     = data[22];
+
+    /* Calculate the CRC. Note that for the calculation the checksum part of the page needs to be set to 0. */
+    data[18] = 0;
+    data[19] = 0;
+    data[20] = 0;
+    data[21] = 0;
+
+    for (i = 0; i < 23; ++i) {
+        *pCRC32 = drflac_crc32_byte(*pCRC32, data[i]);
+    }
+
+
+    if (onRead(pUserData, pHeader->segmentTable, pHeader->segmentCount) != pHeader->segmentCount) {
+        return DRFLAC_AT_END;
+    }
+    *pBytesRead += pHeader->segmentCount;
+
+    for (i = 0; i < pHeader->segmentCount; ++i) {
+        *pCRC32 = drflac_crc32_byte(*pCRC32, pHeader->segmentTable[i]);
+    }
+
+    return DRFLAC_SUCCESS;
+}
+
+static drflac_result drflac_ogg__read_page_header(drflac_read_proc onRead, void* pUserData, drflac_ogg_page_header* pHeader, drflac_uint32* pBytesRead, drflac_uint32* pCRC32)
+{
+    drflac_uint8 id[4];
+
+    *pBytesRead = 0;
+
+    if (onRead(pUserData, id, 4) != 4) {
+        return DRFLAC_AT_END;
+    }
+    *pBytesRead += 4;
+
+    /* We need to read byte-by-byte until we find the OggS capture pattern. */
+    for (;;) {
+        if (drflac_ogg__is_capture_pattern(id)) {
+            drflac_result result;
+
+            *pCRC32 = DRFLAC_OGG_CAPTURE_PATTERN_CRC32;
+
+            result = drflac_ogg__read_page_header_after_capture_pattern(onRead, pUserData, pHeader, pBytesRead, pCRC32);
+            if (result == DRFLAC_SUCCESS) {
+                return DRFLAC_SUCCESS;
+            } else {
+                if (result == DRFLAC_CRC_MISMATCH) {
+                    continue;
+                } else {
+                    return result;
+                }
+            }
+        } else {
+            /* The first 4 bytes did not equal the capture pattern. Read the next byte and try again. */
+            id[0] = id[1];
+            id[1] = id[2];
+            id[2] = id[3];
+            if (onRead(pUserData, &id[3], 1) != 1) {
+                return DRFLAC_AT_END;
+            }
+            *pBytesRead += 1;
+        }
+    }
+}
+
+
+/*
+The main part of the Ogg encapsulation is the conversion from the physical Ogg bitstream to the native FLAC bitstream. It works
+in three general stages: Ogg Physical Bitstream -> Ogg/FLAC Logical Bitstream -> FLAC Native Bitstream. dr_flac is designed
+in such a way that the core sections assume everything is delivered in native format. Therefore, for each encapsulation type
+dr_flac is supporting there needs to be a layer sitting on top of the onRead and onSeek callbacks that ensures the bits read from
+the physical Ogg bitstream are converted and delivered in native FLAC format.
+*/
+typedef struct
+{
+    drflac_read_proc onRead;                /* The original onRead callback from drflac_open() and family. */
+    drflac_seek_proc onSeek;                /* The original onSeek callback from drflac_open() and family. */
+    drflac_tell_proc onTell;                /* The original onTell callback from drflac_open() and family. */
+    void* pUserData;                        /* The user data passed on onRead and onSeek. This is the user data that was passed on drflac_open() and family. */
+    drflac_uint64 currentBytePos;           /* The position of the byte we are sitting on in the physical byte stream. Used for efficient seeking. */
+    drflac_uint64 firstBytePos;             /* The position of the first byte in the physical bitstream. Points to the start of the "OggS" identifier of the FLAC bos page. */
+    drflac_uint32 serialNumber;             /* The serial number of the FLAC audio pages. This is determined by the initial header page that was read during initialization. */
+    drflac_ogg_page_header bosPageHeader;   /* Used for seeking. */
+    drflac_ogg_page_header currentPageHeader;
+    drflac_uint32 bytesRemainingInPage;
+    drflac_uint32 pageDataSize;
+    drflac_uint8 pageData[DRFLAC_OGG_MAX_PAGE_SIZE];
+} drflac_oggbs; /* oggbs = Ogg Bitstream */
+
+static size_t drflac_oggbs__read_physical(drflac_oggbs* oggbs, void* bufferOut, size_t bytesToRead)
+{
+    size_t bytesActuallyRead = oggbs->onRead(oggbs->pUserData, bufferOut, bytesToRead);
+    oggbs->currentBytePos += bytesActuallyRead;
+
+    return bytesActuallyRead;
+}
+
+static drflac_bool32 drflac_oggbs__seek_physical(drflac_oggbs* oggbs, drflac_uint64 offset, drflac_seek_origin origin)
+{
+    if (origin == DRFLAC_SEEK_SET) {
+        if (offset <= 0x7FFFFFFF) {
+            if (!oggbs->onSeek(oggbs->pUserData, (int)offset, DRFLAC_SEEK_SET)) {
+                return DRFLAC_FALSE;
+            }
+            oggbs->currentBytePos = offset;
+
+            return DRFLAC_TRUE;
+        } else {
+            if (!oggbs->onSeek(oggbs->pUserData, 0x7FFFFFFF, DRFLAC_SEEK_SET)) {
+                return DRFLAC_FALSE;
+            }
+            oggbs->currentBytePos = offset;
+
+            return drflac_oggbs__seek_physical(oggbs, offset - 0x7FFFFFFF, DRFLAC_SEEK_CUR);
+        }
+    } else {
+        while (offset > 0x7FFFFFFF) {
+            if (!oggbs->onSeek(oggbs->pUserData, 0x7FFFFFFF, DRFLAC_SEEK_CUR)) {
+                return DRFLAC_FALSE;
+            }
+            oggbs->currentBytePos += 0x7FFFFFFF;
+            offset -= 0x7FFFFFFF;
+        }
+
+        if (!oggbs->onSeek(oggbs->pUserData, (int)offset, DRFLAC_SEEK_CUR)) {    /* <-- Safe cast thanks to the loop above. */
+            return DRFLAC_FALSE;
+        }
+        oggbs->currentBytePos += offset;
+
+        return DRFLAC_TRUE;
+    }
+}
+
+static drflac_bool32 drflac_oggbs__goto_next_page(drflac_oggbs* oggbs, drflac_ogg_crc_mismatch_recovery recoveryMethod)
+{
+    drflac_ogg_page_header header;
+    for (;;) {
+        drflac_uint32 crc32 = 0;
+        drflac_uint32 bytesRead;
+        drflac_uint32 pageBodySize;
+#ifndef DR_FLAC_NO_CRC
+        drflac_uint32 actualCRC32;
+#endif
+
+        if (drflac_ogg__read_page_header(oggbs->onRead, oggbs->pUserData, &header, &bytesRead, &crc32) != DRFLAC_SUCCESS) {
+            return DRFLAC_FALSE;
+        }
+        oggbs->currentBytePos += bytesRead;
+
+        pageBodySize = drflac_ogg__get_page_body_size(&header);
+        if (pageBodySize > DRFLAC_OGG_MAX_PAGE_SIZE) {
+            continue;   /* Invalid page size. Assume it's corrupted and just move to the next page. */
+        }
+
+        if (header.serialNumber != oggbs->serialNumber) {
+            /* It's not a FLAC page. Skip it. */
+            if (pageBodySize > 0 && !drflac_oggbs__seek_physical(oggbs, pageBodySize, DRFLAC_SEEK_CUR)) {
+                return DRFLAC_FALSE;
+            }
+            continue;
+        }
+
+
+        /* We need to read the entire page and then do a CRC check on it. If there's a CRC mismatch we need to skip this page. */
+        if (drflac_oggbs__read_physical(oggbs, oggbs->pageData, pageBodySize) != pageBodySize) {
+            return DRFLAC_FALSE;
+        }
+        oggbs->pageDataSize = pageBodySize;
+
+#ifndef DR_FLAC_NO_CRC
+        actualCRC32 = drflac_crc32_buffer(crc32, oggbs->pageData, oggbs->pageDataSize);
+        if (actualCRC32 != header.checksum) {
+            if (recoveryMethod == drflac_ogg_recover_on_crc_mismatch) {
+                continue;   /* CRC mismatch. Skip this page. */
+            } else {
+                /*
+                Even though we are failing on a CRC mismatch, we still want our stream to be in a good state. Therefore we
+                go to the next valid page to ensure we're in a good state, but return false to let the caller know that the
+                seek did not fully complete.
+                */
+                drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch);
+                return DRFLAC_FALSE;
+            }
+        }
+#else
+        (void)recoveryMethod;   /* <-- Silence a warning. */
+#endif
+
+        oggbs->currentPageHeader = header;
+        oggbs->bytesRemainingInPage = pageBodySize;
+        return DRFLAC_TRUE;
+    }
+}
+
+/* Function below is unused at the moment, but I might be re-adding it later. */
+#if 0
+static drflac_uint8 drflac_oggbs__get_current_segment_index(drflac_oggbs* oggbs, drflac_uint8* pBytesRemainingInSeg)
+{
+    drflac_uint32 bytesConsumedInPage = drflac_ogg__get_page_body_size(&oggbs->currentPageHeader) - oggbs->bytesRemainingInPage;
+    drflac_uint8 iSeg = 0;
+    drflac_uint32 iByte = 0;
+    while (iByte < bytesConsumedInPage) {
+        drflac_uint8 segmentSize = oggbs->currentPageHeader.segmentTable[iSeg];
+        if (iByte + segmentSize > bytesConsumedInPage) {
+            break;
+        } else {
+            iSeg += 1;
+            iByte += segmentSize;
+        }
+    }
+
+    *pBytesRemainingInSeg = oggbs->currentPageHeader.segmentTable[iSeg] - (drflac_uint8)(bytesConsumedInPage - iByte);
+    return iSeg;
+}
+
+static drflac_bool32 drflac_oggbs__seek_to_next_packet(drflac_oggbs* oggbs)
+{
+    /* The current packet ends when we get to the segment with a lacing value of < 255 which is not at the end of a page. */
+    for (;;) {
+        drflac_bool32 atEndOfPage = DRFLAC_FALSE;
+
+        drflac_uint8 bytesRemainingInSeg;
+        drflac_uint8 iFirstSeg = drflac_oggbs__get_current_segment_index(oggbs, &bytesRemainingInSeg);
+
+        drflac_uint32 bytesToEndOfPacketOrPage = bytesRemainingInSeg;
+        for (drflac_uint8 iSeg = iFirstSeg; iSeg < oggbs->currentPageHeader.segmentCount; ++iSeg) {
+            drflac_uint8 segmentSize = oggbs->currentPageHeader.segmentTable[iSeg];
+            if (segmentSize < 255) {
+                if (iSeg == oggbs->currentPageHeader.segmentCount-1) {
+                    atEndOfPage = DRFLAC_TRUE;
+                }
+
+                break;
+            }
+
+            bytesToEndOfPacketOrPage += segmentSize;
+        }
+
+        /*
+        At this point we will have found either the packet or the end of the page. If were at the end of the page we'll
+        want to load the next page and keep searching for the end of the packet.
+        */
+        drflac_oggbs__seek_physical(oggbs, bytesToEndOfPacketOrPage, DRFLAC_SEEK_CUR);
+        oggbs->bytesRemainingInPage -= bytesToEndOfPacketOrPage;
+
+        if (atEndOfPage) {
+            /*
+            We're potentially at the next packet, but we need to check the next page first to be sure because the packet may
+            straddle pages.
+            */
+            if (!drflac_oggbs__goto_next_page(oggbs)) {
+                return DRFLAC_FALSE;
+            }
+
+            /* If it's a fresh packet it most likely means we're at the next packet. */
+            if ((oggbs->currentPageHeader.headerType & 0x01) == 0) {
+                return DRFLAC_TRUE;
+            }
+        } else {
+            /* We're at the next packet. */
+            return DRFLAC_TRUE;
+        }
+    }
+}
+
+static drflac_bool32 drflac_oggbs__seek_to_next_frame(drflac_oggbs* oggbs)
+{
+    /* The bitstream should be sitting on the first byte just after the header of the frame. */
+
+    /* What we're actually doing here is seeking to the start of the next packet. */
+    return drflac_oggbs__seek_to_next_packet(oggbs);
+}
+#endif
+
+static size_t drflac__on_read_ogg(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    drflac_oggbs* oggbs = (drflac_oggbs*)pUserData;
+    drflac_uint8* pRunningBufferOut = (drflac_uint8*)bufferOut;
+    size_t bytesRead = 0;
+
+    DRFLAC_ASSERT(oggbs != NULL);
+    DRFLAC_ASSERT(pRunningBufferOut != NULL);
+
+    /* Reading is done page-by-page. If we've run out of bytes in the page we need to move to the next one. */
+    while (bytesRead < bytesToRead) {
+        size_t bytesRemainingToRead = bytesToRead - bytesRead;
+
+        if (oggbs->bytesRemainingInPage >= bytesRemainingToRead) {
+            DRFLAC_COPY_MEMORY(pRunningBufferOut, oggbs->pageData + (oggbs->pageDataSize - oggbs->bytesRemainingInPage), bytesRemainingToRead);
+            bytesRead += bytesRemainingToRead;
+            oggbs->bytesRemainingInPage -= (drflac_uint32)bytesRemainingToRead;
+            break;
+        }
+
+        /* If we get here it means some of the requested data is contained in the next pages. */
+        if (oggbs->bytesRemainingInPage > 0) {
+            DRFLAC_COPY_MEMORY(pRunningBufferOut, oggbs->pageData + (oggbs->pageDataSize - oggbs->bytesRemainingInPage), oggbs->bytesRemainingInPage);
+            bytesRead += oggbs->bytesRemainingInPage;
+            pRunningBufferOut += oggbs->bytesRemainingInPage;
+            oggbs->bytesRemainingInPage = 0;
+        }
+
+        DRFLAC_ASSERT(bytesRemainingToRead > 0);
+        if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch)) {
+            break;  /* Failed to go to the next page. Might have simply hit the end of the stream. */
+        }
+    }
+
+    return bytesRead;
+}
+
+static drflac_bool32 drflac__on_seek_ogg(void* pUserData, int offset, drflac_seek_origin origin)
+{
+    drflac_oggbs* oggbs = (drflac_oggbs*)pUserData;
+    int bytesSeeked = 0;
+
+    DRFLAC_ASSERT(oggbs != NULL);
+    DRFLAC_ASSERT(offset >= 0);  /* <-- Never seek backwards. */
+
+    /* Seeking is always forward which makes things a lot simpler. */
+    if (origin == DRFLAC_SEEK_SET) {
+        if (!drflac_oggbs__seek_physical(oggbs, (int)oggbs->firstBytePos, DRFLAC_SEEK_SET)) {
+            return DRFLAC_FALSE;
+        }
+
+        if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_fail_on_crc_mismatch)) {
+            return DRFLAC_FALSE;
+        }
+
+        return drflac__on_seek_ogg(pUserData, offset, DRFLAC_SEEK_CUR);
+    } else if (origin == DRFLAC_SEEK_CUR) {
+        while (bytesSeeked < offset) {
+            int bytesRemainingToSeek = offset - bytesSeeked;
+            DRFLAC_ASSERT(bytesRemainingToSeek >= 0);
+
+            if (oggbs->bytesRemainingInPage >= (size_t)bytesRemainingToSeek) {
+                bytesSeeked += bytesRemainingToSeek;
+                (void)bytesSeeked;  /* <-- Silence a dead store warning emitted by Clang Static Analyzer. */
+                oggbs->bytesRemainingInPage -= bytesRemainingToSeek;
+                break;
+            }
+
+            /* If we get here it means some of the requested data is contained in the next pages. */
+            if (oggbs->bytesRemainingInPage > 0) {
+                bytesSeeked += (int)oggbs->bytesRemainingInPage;
+                oggbs->bytesRemainingInPage = 0;
+            }
+
+            DRFLAC_ASSERT(bytesRemainingToSeek > 0);
+            if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_fail_on_crc_mismatch)) {
+                /* Failed to go to the next page. We either hit the end of the stream or had a CRC mismatch. */
+                return DRFLAC_FALSE;
+            }
+        }
+    } else if (origin == DRFLAC_SEEK_END) {
+        /* Seeking to the end is not supported. */
+        return DRFLAC_FALSE;
+    }
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__on_tell_ogg(void* pUserData, drflac_int64* pCursor)
+{
+    /*
+    Not implemented for Ogg containers because we don't currently track the byte position of the logical bitstream. To support this, we'll need
+    to track the position in drflac__on_read_ogg and drflac__on_seek_ogg.
+    */
+    (void)pUserData;
+    (void)pCursor;
+    return DRFLAC_FALSE;
+}
+
+
+static drflac_bool32 drflac_ogg__seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex)
+{
+    drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
+    drflac_uint64 originalBytePos;
+    drflac_uint64 runningGranulePosition;
+    drflac_uint64 runningFrameBytePos;
+    drflac_uint64 runningPCMFrameCount;
+
+    DRFLAC_ASSERT(oggbs != NULL);
+
+    originalBytePos = oggbs->currentBytePos;   /* For recovery. Points to the OggS identifier. */
+
+    /* First seek to the first frame. */
+    if (!drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes)) {
+        return DRFLAC_FALSE;
+    }
+    oggbs->bytesRemainingInPage = 0;
+
+    runningGranulePosition = 0;
+    for (;;) {
+        if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch)) {
+            drflac_oggbs__seek_physical(oggbs, originalBytePos, DRFLAC_SEEK_SET);
+            return DRFLAC_FALSE;   /* Never did find that sample... */
+        }
+
+        runningFrameBytePos = oggbs->currentBytePos - drflac_ogg__get_page_header_size(&oggbs->currentPageHeader) - oggbs->pageDataSize;
+        if (oggbs->currentPageHeader.granulePosition >= pcmFrameIndex) {
+            break; /* The sample is somewhere in the previous page. */
+        }
+
+        /*
+        At this point we know the sample is not in the previous page. It could possibly be in this page. For simplicity we
+        disregard any pages that do not begin a fresh packet.
+        */
+        if ((oggbs->currentPageHeader.headerType & 0x01) == 0) {    /* <-- Is it a fresh page? */
+            if (oggbs->currentPageHeader.segmentTable[0] >= 2) {
+                drflac_uint8 firstBytesInPage[2];
+                firstBytesInPage[0] = oggbs->pageData[0];
+                firstBytesInPage[1] = oggbs->pageData[1];
+
+                if ((firstBytesInPage[0] == 0xFF) && (firstBytesInPage[1] & 0xFC) == 0xF8) {    /* <-- Does the page begin with a frame's sync code? */
+                    runningGranulePosition = oggbs->currentPageHeader.granulePosition;
+                }
+
+                continue;
+            }
+        }
+    }
+
+    /*
+    We found the page that that is closest to the sample, so now we need to find it. The first thing to do is seek to the
+    start of that page. In the loop above we checked that it was a fresh page which means this page is also the start of
+    a new frame. This property means that after we've seeked to the page we can immediately start looping over frames until
+    we find the one containing the target sample.
+    */
+    if (!drflac_oggbs__seek_physical(oggbs, runningFrameBytePos, DRFLAC_SEEK_SET)) {
+        return DRFLAC_FALSE;
+    }
+    if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch)) {
+        return DRFLAC_FALSE;
+    }
+
+    /*
+    At this point we'll be sitting on the first byte of the frame header of the first frame in the page. We just keep
+    looping over these frames until we find the one containing the sample we're after.
+    */
+    runningPCMFrameCount = runningGranulePosition;
+    for (;;) {
+        /*
+        There are two ways to find the sample and seek past irrelevant frames:
+          1) Use the native FLAC decoder.
+          2) Use Ogg's framing system.
+
+        Both of these options have their own pros and cons. Using the native FLAC decoder is slower because it needs to
+        do a full decode of the frame. Using Ogg's framing system is faster, but more complicated and involves some code
+        duplication for the decoding of frame headers.
+
+        Another thing to consider is that using the Ogg framing system will perform direct seeking of the physical Ogg
+        bitstream. This is important to consider because it means we cannot read data from the drflac_bs object using the
+        standard drflac__*() APIs because that will read in extra data for its own internal caching which in turn breaks
+        the positioning of the read pointer of the physical Ogg bitstream. Therefore, anything that would normally be read
+        using the native FLAC decoding APIs, such as drflac__read_next_flac_frame_header(), need to be re-implemented so as to
+        avoid the use of the drflac_bs object.
+
+        Considering these issues, I have decided to use the slower native FLAC decoding method for the following reasons:
+          1) Seeking is already partially accelerated using Ogg's paging system in the code block above.
+          2) Seeking in an Ogg encapsulated FLAC stream is probably quite uncommon.
+          3) Simplicity.
+        */
+        drflac_uint64 firstPCMFrameInFLACFrame = 0;
+        drflac_uint64 lastPCMFrameInFLACFrame = 0;
+        drflac_uint64 pcmFrameCountInThisFrame;
+
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+            return DRFLAC_FALSE;
+        }
+
+        drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);
+
+        pcmFrameCountInThisFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
+
+        /* If we are seeking to the end of the file and we've just hit it, we're done. */
+        if (pcmFrameIndex == pFlac->totalPCMFrameCount && (runningPCMFrameCount + pcmFrameCountInThisFrame) == pFlac->totalPCMFrameCount) {
+            drflac_result result = drflac__decode_flac_frame(pFlac);
+            if (result == DRFLAC_SUCCESS) {
+                pFlac->currentPCMFrame = pcmFrameIndex;
+                pFlac->currentFLACFrame.pcmFramesRemaining = 0;
+                return DRFLAC_TRUE;
+            } else {
+                return DRFLAC_FALSE;
+            }
+        }
+
+        if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFrame)) {
+            /*
+            The sample should be in this FLAC frame. We need to fully decode it, however if it's an invalid frame (a CRC mismatch), we need to pretend
+            it never existed and keep iterating.
+            */
+            drflac_result result = drflac__decode_flac_frame(pFlac);
+            if (result == DRFLAC_SUCCESS) {
+                /* The frame is valid. We just need to skip over some samples to ensure it's sample-exact. */
+                drflac_uint64 pcmFramesToDecode = (size_t)(pcmFrameIndex - runningPCMFrameCount);    /* <-- Safe cast because the maximum number of samples in a frame is 65535. */
+                if (pcmFramesToDecode == 0) {
+                    return DRFLAC_TRUE;
+                }
+
+                pFlac->currentPCMFrame = runningPCMFrameCount;
+
+                return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;  /* <-- If this fails, something bad has happened (it should never fail). */
+            } else {
+                if (result == DRFLAC_CRC_MISMATCH) {
+                    continue;   /* CRC mismatch. Pretend this frame never existed. */
+                } else {
+                    return DRFLAC_FALSE;
+                }
+            }
+        } else {
+            /*
+            It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
+            frame never existed and leave the running sample count untouched.
+            */
+            drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
+            if (result == DRFLAC_SUCCESS) {
+                runningPCMFrameCount += pcmFrameCountInThisFrame;
+            } else {
+                if (result == DRFLAC_CRC_MISMATCH) {
+                    continue;   /* CRC mismatch. Pretend this frame never existed. */
+                } else {
+                    return DRFLAC_FALSE;
+                }
+            }
+        }
+    }
+}
+
+
+
+static drflac_bool32 drflac__init_private__ogg(drflac_init_info* pInit, drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, void* pUserDataMD, drflac_bool32 relaxed)
+{
+    drflac_ogg_page_header header;
+    drflac_uint32 crc32 = DRFLAC_OGG_CAPTURE_PATTERN_CRC32;
+    drflac_uint32 bytesRead = 0;
+
+    /* Pre Condition: The bit stream should be sitting just past the 4-byte OggS capture pattern. */
+    (void)relaxed;
+
+    pInit->container = drflac_container_ogg;
+    pInit->oggFirstBytePos = 0;
+
+    /*
+    We'll get here if the first 4 bytes of the stream were the OggS capture pattern, however it doesn't necessarily mean the
+    stream includes FLAC encoded audio. To check for this we need to scan the beginning-of-stream page markers and check if
+    any match the FLAC specification. Important to keep in mind that the stream may be multiplexed.
+    */
+    if (drflac_ogg__read_page_header_after_capture_pattern(onRead, pUserData, &header, &bytesRead, &crc32) != DRFLAC_SUCCESS) {
+        return DRFLAC_FALSE;
+    }
+    pInit->runningFilePos += bytesRead;
+
+    for (;;) {
+        int pageBodySize;
+
+        /* Break if we're past the beginning of stream page. */
+        if ((header.headerType & 0x02) == 0) {
+            return DRFLAC_FALSE;
+        }
+
+        /* Check if it's a FLAC header. */
+        pageBodySize = drflac_ogg__get_page_body_size(&header);
+        if (pageBodySize == 51) {   /* 51 = the lacing value of the FLAC header packet. */
+            /* It could be a FLAC page... */
+            drflac_uint32 bytesRemainingInPage = pageBodySize;
+            drflac_uint8 packetType;
+
+            if (onRead(pUserData, &packetType, 1) != 1) {
+                return DRFLAC_FALSE;
+            }
+
+            bytesRemainingInPage -= 1;
+            if (packetType == 0x7F) {
+                /* Increasingly more likely to be a FLAC page... */
+                drflac_uint8 sig[4];
+                if (onRead(pUserData, sig, 4) != 4) {
+                    return DRFLAC_FALSE;
+                }
+
+                bytesRemainingInPage -= 4;
+                if (sig[0] == 'F' && sig[1] == 'L' && sig[2] == 'A' && sig[3] == 'C') {
+                    /* Almost certainly a FLAC page... */
+                    drflac_uint8 mappingVersion[2];
+                    if (onRead(pUserData, mappingVersion, 2) != 2) {
+                        return DRFLAC_FALSE;
+                    }
+
+                    if (mappingVersion[0] != 1) {
+                        return DRFLAC_FALSE;   /* Only supporting version 1.x of the Ogg mapping. */
+                    }
+
+                    /*
+                    The next 2 bytes are the non-audio packets, not including this one. We don't care about this because we're going to
+                    be handling it in a generic way based on the serial number and packet types.
+                    */
+                    if (!onSeek(pUserData, 2, DRFLAC_SEEK_CUR)) {
+                        return DRFLAC_FALSE;
+                    }
+
+                    /* Expecting the native FLAC signature "fLaC". */
+                    if (onRead(pUserData, sig, 4) != 4) {
+                        return DRFLAC_FALSE;
+                    }
+
+                    if (sig[0] == 'f' && sig[1] == 'L' && sig[2] == 'a' && sig[3] == 'C') {
+                        /* The remaining data in the page should be the STREAMINFO block. */
+                        drflac_streaminfo streaminfo;
+                        drflac_uint8 isLastBlock;
+                        drflac_uint8 blockType;
+                        drflac_uint32 blockSize;
+                        if (!drflac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize)) {
+                            return DRFLAC_FALSE;
+                        }
+
+                        if (blockType != DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
+                            return DRFLAC_FALSE;    /* Invalid block type. First block must be the STREAMINFO block. */
+                        }
+
+                        if (drflac__read_streaminfo(onRead, pUserData, &streaminfo)) {
+                            /* Success! */
+                            pInit->hasStreamInfoBlock      = DRFLAC_TRUE;
+                            pInit->sampleRate              = streaminfo.sampleRate;
+                            pInit->channels                = streaminfo.channels;
+                            pInit->bitsPerSample           = streaminfo.bitsPerSample;
+                            pInit->totalPCMFrameCount      = streaminfo.totalPCMFrameCount;
+                            pInit->maxBlockSizeInPCMFrames = streaminfo.maxBlockSizeInPCMFrames;
+                            pInit->hasMetadataBlocks       = !isLastBlock;
+
+                            if (onMeta) {
+                                drflac_metadata metadata;
+                                metadata.type = DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO;
+                                metadata.pRawData = NULL;
+                                metadata.rawDataSize = 0;
+                                metadata.data.streaminfo = streaminfo;
+                                onMeta(pUserDataMD, &metadata);
+                            }
+
+                            pInit->runningFilePos  += pageBodySize;
+                            pInit->oggFirstBytePos  = pInit->runningFilePos - 79;   /* Subtracting 79 will place us right on top of the "OggS" identifier of the FLAC bos page. */
+                            pInit->oggSerial        = header.serialNumber;
+                            pInit->oggBosHeader     = header;
+                            break;
+                        } else {
+                            /* Failed to read STREAMINFO block. Aww, so close... */
+                            return DRFLAC_FALSE;
+                        }
+                    } else {
+                        /* Invalid file. */
+                        return DRFLAC_FALSE;
+                    }
+                } else {
+                    /* Not a FLAC header. Skip it. */
+                    if (!onSeek(pUserData, bytesRemainingInPage, DRFLAC_SEEK_CUR)) {
+                        return DRFLAC_FALSE;
+                    }
+                }
+            } else {
+                /* Not a FLAC header. Seek past the entire page and move on to the next. */
+                if (!onSeek(pUserData, bytesRemainingInPage, DRFLAC_SEEK_CUR)) {
+                    return DRFLAC_FALSE;
+                }
+            }
+        } else {
+            if (!onSeek(pUserData, pageBodySize, DRFLAC_SEEK_CUR)) {
+                return DRFLAC_FALSE;
+            }
+        }
+
+        pInit->runningFilePos += pageBodySize;
+
+
+        /* Read the header of the next page. */
+        if (drflac_ogg__read_page_header(onRead, pUserData, &header, &bytesRead, &crc32) != DRFLAC_SUCCESS) {
+            return DRFLAC_FALSE;
+        }
+        pInit->runningFilePos += bytesRead;
+    }
+
+    /*
+    If we get here it means we found a FLAC audio stream. We should be sitting on the first byte of the header of the next page. The next
+    packets in the FLAC logical stream contain the metadata. The only thing left to do in the initialization phase for Ogg is to create the
+    Ogg bistream object.
+    */
+    pInit->hasMetadataBlocks = DRFLAC_TRUE;    /* <-- Always have at least VORBIS_COMMENT metadata block. */
+    return DRFLAC_TRUE;
+}
+#endif
+
+static drflac_bool32 drflac__init_private(drflac_init_info* pInit, drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, drflac_meta_proc onMeta, drflac_container container, void* pUserData, void* pUserDataMD)
+{
+    drflac_bool32 relaxed;
+    drflac_uint8 id[4];
+
+    if (pInit == NULL || onRead == NULL || onSeek == NULL) {    /* <-- onTell is optional. */
+        return DRFLAC_FALSE;
+    }
+
+    DRFLAC_ZERO_MEMORY(pInit, sizeof(*pInit));
+    pInit->onRead       = onRead;
+    pInit->onSeek       = onSeek;
+    pInit->onTell       = onTell;
+    pInit->onMeta       = onMeta;
+    pInit->container    = container;
+    pInit->pUserData    = pUserData;
+    pInit->pUserDataMD  = pUserDataMD;
+
+    pInit->bs.onRead    = onRead;
+    pInit->bs.onSeek    = onSeek;
+    pInit->bs.onTell    = onTell;
+    pInit->bs.pUserData = pUserData;
+    drflac__reset_cache(&pInit->bs);
+
+
+    /* If the container is explicitly defined then we can try opening in relaxed mode. */
+    relaxed = container != drflac_container_unknown;
+
+    /* Skip over any ID3 tags. */
+    for (;;) {
+        if (onRead(pUserData, id, 4) != 4) {
+            return DRFLAC_FALSE;    /* Ran out of data. */
+        }
+        pInit->runningFilePos += 4;
+
+        if (id[0] == 'I' && id[1] == 'D' && id[2] == '3') {
+            drflac_uint8 header[6];
+            drflac_uint8 flags;
+            drflac_uint32 headerSize;
+
+            if (onRead(pUserData, header, 6) != 6) {
+                return DRFLAC_FALSE;    /* Ran out of data. */
+            }
+            pInit->runningFilePos += 6;
+
+            flags = header[1];
+
+            DRFLAC_COPY_MEMORY(&headerSize, header+2, 4);
+            headerSize = drflac__unsynchsafe_32(drflac__be2host_32(headerSize));
+            if (flags & 0x10) {
+                headerSize += 10;
+            }
+
+            if (!onSeek(pUserData, headerSize, DRFLAC_SEEK_CUR)) {
+                return DRFLAC_FALSE;    /* Failed to seek past the tag. */
+            }
+            pInit->runningFilePos += headerSize;
+        } else {
+            break;
+        }
+    }
+
+    if (id[0] == 'f' && id[1] == 'L' && id[2] == 'a' && id[3] == 'C') {
+        return drflac__init_private__native(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
+    }
+#ifndef DR_FLAC_NO_OGG
+    if (id[0] == 'O' && id[1] == 'g' && id[2] == 'g' && id[3] == 'S') {
+        return drflac__init_private__ogg(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
+    }
+#endif
+
+    /* If we get here it means we likely don't have a header. Try opening in relaxed mode, if applicable. */
+    if (relaxed) {
+        if (container == drflac_container_native) {
+            return drflac__init_private__native(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
+        }
+#ifndef DR_FLAC_NO_OGG
+        if (container == drflac_container_ogg) {
+            return drflac__init_private__ogg(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
+        }
+#endif
+    }
+
+    /* Unsupported container. */
+    return DRFLAC_FALSE;
+}
+
+static void drflac__init_from_info(drflac* pFlac, const drflac_init_info* pInit)
+{
+    DRFLAC_ASSERT(pFlac != NULL);
+    DRFLAC_ASSERT(pInit != NULL);
+
+    DRFLAC_ZERO_MEMORY(pFlac, sizeof(*pFlac));
+    pFlac->bs                      = pInit->bs;
+    pFlac->onMeta                  = pInit->onMeta;
+    pFlac->pUserDataMD             = pInit->pUserDataMD;
+    pFlac->maxBlockSizeInPCMFrames = pInit->maxBlockSizeInPCMFrames;
+    pFlac->sampleRate              = pInit->sampleRate;
+    pFlac->channels                = (drflac_uint8)pInit->channels;
+    pFlac->bitsPerSample           = (drflac_uint8)pInit->bitsPerSample;
+    pFlac->totalPCMFrameCount      = pInit->totalPCMFrameCount;
+    pFlac->container               = pInit->container;
+}
+
+
+static drflac* drflac_open_with_metadata_private(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, drflac_meta_proc onMeta, drflac_container container, void* pUserData, void* pUserDataMD, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    drflac_init_info init;
+    drflac_uint32 allocationSize;
+    drflac_uint32 wholeSIMDVectorCountPerChannel;
+    drflac_uint32 decodedSamplesAllocationSize;
+#ifndef DR_FLAC_NO_OGG
+    drflac_oggbs* pOggbs = NULL;
+#endif
+    drflac_uint64 firstFramePos;
+    drflac_uint64 seektablePos;
+    drflac_uint32 seekpointCount;
+    drflac_allocation_callbacks allocationCallbacks;
+    drflac* pFlac;
+
+    /* CPU support first. */
+    drflac__init_cpu_caps();
+
+    if (!drflac__init_private(&init, onRead, onSeek, onTell, onMeta, container, pUserData, pUserDataMD)) {
+        return NULL;
+    }
+
+    if (pAllocationCallbacks != NULL) {
+        allocationCallbacks = *pAllocationCallbacks;
+        if (allocationCallbacks.onFree == NULL || (allocationCallbacks.onMalloc == NULL && allocationCallbacks.onRealloc == NULL)) {
+            return NULL;    /* Invalid allocation callbacks. */
+        }
+    } else {
+        allocationCallbacks.pUserData = NULL;
+        allocationCallbacks.onMalloc  = drflac__malloc_default;
+        allocationCallbacks.onRealloc = drflac__realloc_default;
+        allocationCallbacks.onFree    = drflac__free_default;
+    }
+
+
+    /*
+    The size of the allocation for the drflac object needs to be large enough to fit the following:
+      1) The main members of the drflac structure
+      2) A block of memory large enough to store the decoded samples of the largest frame in the stream
+      3) If the container is Ogg, a drflac_oggbs object
+
+    The complicated part of the allocation is making sure there's enough room the decoded samples, taking into consideration
+    the different SIMD instruction sets.
+    */
+    allocationSize = sizeof(drflac);
+
+    /*
+    The allocation size for decoded frames depends on the number of 32-bit integers that fit inside the largest SIMD vector
+    we are supporting.
+    */
+    if ((init.maxBlockSizeInPCMFrames % (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32))) == 0) {
+        wholeSIMDVectorCountPerChannel = (init.maxBlockSizeInPCMFrames / (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32)));
+    } else {
+        wholeSIMDVectorCountPerChannel = (init.maxBlockSizeInPCMFrames / (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32))) + 1;
+    }
+
+    decodedSamplesAllocationSize = wholeSIMDVectorCountPerChannel * DRFLAC_MAX_SIMD_VECTOR_SIZE * init.channels;
+
+    allocationSize += decodedSamplesAllocationSize;
+    allocationSize += DRFLAC_MAX_SIMD_VECTOR_SIZE;  /* Allocate extra bytes to ensure we have enough for alignment. */
+
+#ifndef DR_FLAC_NO_OGG
+    /* There's additional data required for Ogg streams. */
+    if (init.container == drflac_container_ogg) {
+        allocationSize += sizeof(drflac_oggbs);
+
+        pOggbs = (drflac_oggbs*)drflac__malloc_from_callbacks(sizeof(*pOggbs), &allocationCallbacks);
+        if (pOggbs == NULL) {
+            return NULL; /*DRFLAC_OUT_OF_MEMORY;*/
+        }
+
+        DRFLAC_ZERO_MEMORY(pOggbs, sizeof(*pOggbs));
+        pOggbs->onRead = onRead;
+        pOggbs->onSeek = onSeek;
+        pOggbs->onTell = onTell;
+        pOggbs->pUserData = pUserData;
+        pOggbs->currentBytePos = init.oggFirstBytePos;
+        pOggbs->firstBytePos = init.oggFirstBytePos;
+        pOggbs->serialNumber = init.oggSerial;
+        pOggbs->bosPageHeader = init.oggBosHeader;
+        pOggbs->bytesRemainingInPage = 0;
+    }
+#endif
+
+    /*
+    This part is a bit awkward. We need to load the seektable so that it can be referenced in-memory, but I want the drflac object to
+    consist of only a single heap allocation. To this, the size of the seek table needs to be known, which we determine when reading
+    and decoding the metadata.
+    */
+    firstFramePos  = 42;   /* <-- We know we are at byte 42 at this point. */
+    seektablePos   = 0;
+    seekpointCount = 0;
+    if (init.hasMetadataBlocks) {
+        drflac_read_proc onReadOverride = onRead;
+        drflac_seek_proc onSeekOverride = onSeek;
+        drflac_tell_proc onTellOverride = onTell;
+        void* pUserDataOverride = pUserData;
+
+#ifndef DR_FLAC_NO_OGG
+        if (init.container == drflac_container_ogg) {
+            onReadOverride = drflac__on_read_ogg;
+            onSeekOverride = drflac__on_seek_ogg;
+            onTellOverride = drflac__on_tell_ogg;
+            pUserDataOverride = (void*)pOggbs;
+        }
+#endif
+
+        if (!drflac__read_and_decode_metadata(onReadOverride, onSeekOverride, onTellOverride, onMeta, pUserDataOverride, pUserDataMD, &firstFramePos, &seektablePos, &seekpointCount, &allocationCallbacks)) {
+        #ifndef DR_FLAC_NO_OGG
+            drflac__free_from_callbacks(pOggbs, &allocationCallbacks);
+        #endif
+            return NULL;
+        }
+
+        allocationSize += seekpointCount * sizeof(drflac_seekpoint);
+    }
+
+
+    pFlac = (drflac*)drflac__malloc_from_callbacks(allocationSize, &allocationCallbacks);
+    if (pFlac == NULL) {
+    #ifndef DR_FLAC_NO_OGG
+        drflac__free_from_callbacks(pOggbs, &allocationCallbacks);
+    #endif
+        return NULL;
+    }
+
+    drflac__init_from_info(pFlac, &init);
+    pFlac->allocationCallbacks = allocationCallbacks;
+    pFlac->pDecodedSamples = (drflac_int32*)drflac_align((size_t)pFlac->pExtraData, DRFLAC_MAX_SIMD_VECTOR_SIZE);
+
+#ifndef DR_FLAC_NO_OGG
+    if (init.container == drflac_container_ogg) {
+        drflac_oggbs* pInternalOggbs = (drflac_oggbs*)((drflac_uint8*)pFlac->pDecodedSamples + decodedSamplesAllocationSize + (seekpointCount * sizeof(drflac_seekpoint)));
+        DRFLAC_COPY_MEMORY(pInternalOggbs, pOggbs, sizeof(*pOggbs));
+
+        /* At this point the pOggbs object has been handed over to pInternalOggbs and can be freed. */
+        drflac__free_from_callbacks(pOggbs, &allocationCallbacks);
+        pOggbs = NULL;
+
+        /* The Ogg bistream needs to be layered on top of the original bitstream. */
+        pFlac->bs.onRead = drflac__on_read_ogg;
+        pFlac->bs.onSeek = drflac__on_seek_ogg;
+        pFlac->bs.onTell = drflac__on_tell_ogg;
+        pFlac->bs.pUserData = (void*)pInternalOggbs;
+        pFlac->_oggbs = (void*)pInternalOggbs;
+    }
+#endif
+
+    pFlac->firstFLACFramePosInBytes = firstFramePos;
+
+    /* NOTE: Seektables are not currently compatible with Ogg encapsulation (Ogg has its own accelerated seeking system). I may change this later, so I'm leaving this here for now. */
+#ifndef DR_FLAC_NO_OGG
+    if (init.container == drflac_container_ogg)
+    {
+        pFlac->pSeekpoints = NULL;
+        pFlac->seekpointCount = 0;
+    }
+    else
+#endif
+    {
+        /* If we have a seektable we need to load it now, making sure we move back to where we were previously. */
+        if (seektablePos != 0) {
+            pFlac->seekpointCount = seekpointCount;
+            pFlac->pSeekpoints = (drflac_seekpoint*)((drflac_uint8*)pFlac->pDecodedSamples + decodedSamplesAllocationSize);
+
+            DRFLAC_ASSERT(pFlac->bs.onSeek != NULL);
+            DRFLAC_ASSERT(pFlac->bs.onRead != NULL);
+
+            /* Seek to the seektable, then just read directly into our seektable buffer. */
+            if (pFlac->bs.onSeek(pFlac->bs.pUserData, (int)seektablePos, DRFLAC_SEEK_SET)) {
+                drflac_uint32 iSeekpoint;
+
+                for (iSeekpoint = 0; iSeekpoint < seekpointCount; iSeekpoint += 1) {
+                    if (pFlac->bs.onRead(pFlac->bs.pUserData, pFlac->pSeekpoints + iSeekpoint, DRFLAC_SEEKPOINT_SIZE_IN_BYTES) == DRFLAC_SEEKPOINT_SIZE_IN_BYTES) {
+                        /* Endian swap. */
+                        pFlac->pSeekpoints[iSeekpoint].firstPCMFrame   = drflac__be2host_64(pFlac->pSeekpoints[iSeekpoint].firstPCMFrame);
+                        pFlac->pSeekpoints[iSeekpoint].flacFrameOffset = drflac__be2host_64(pFlac->pSeekpoints[iSeekpoint].flacFrameOffset);
+                        pFlac->pSeekpoints[iSeekpoint].pcmFrameCount   = drflac__be2host_16(pFlac->pSeekpoints[iSeekpoint].pcmFrameCount);
+                    } else {
+                        /* Failed to read the seektable. Pretend we don't have one. */
+                        pFlac->pSeekpoints = NULL;
+                        pFlac->seekpointCount = 0;
+                        break;
+                    }
+                }
+
+                /* We need to seek back to where we were. If this fails it's a critical error. */
+                if (!pFlac->bs.onSeek(pFlac->bs.pUserData, (int)pFlac->firstFLACFramePosInBytes, DRFLAC_SEEK_SET)) {
+                    drflac__free_from_callbacks(pFlac, &allocationCallbacks);
+                    return NULL;
+                }
+            } else {
+                /* Failed to seek to the seektable. Ominous sign, but for now we can just pretend we don't have one. */
+                pFlac->pSeekpoints = NULL;
+                pFlac->seekpointCount = 0;
+            }
+        }
+    }
+
+
+    /*
+    If we get here, but don't have a STREAMINFO block, it means we've opened the stream in relaxed mode and need to decode
+    the first frame.
+    */
+    if (!init.hasStreamInfoBlock) {
+        pFlac->currentFLACFrame.header = init.firstFrameHeader;
+        for (;;) {
+            drflac_result result = drflac__decode_flac_frame(pFlac);
+            if (result == DRFLAC_SUCCESS) {
+                break;
+            } else {
+                if (result == DRFLAC_CRC_MISMATCH) {
+                    if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
+                        drflac__free_from_callbacks(pFlac, &allocationCallbacks);
+                        return NULL;
+                    }
+                    continue;
+                } else {
+                    drflac__free_from_callbacks(pFlac, &allocationCallbacks);
+                    return NULL;
+                }
+            }
+        }
+    }
+
+    return pFlac;
+}
+
+
+
+#ifndef DR_FLAC_NO_STDIO
+#include <stdio.h>
+#ifndef DR_FLAC_NO_WCHAR
+#include <wchar.h>      /* For wcslen(), wcsrtombs() */
+#endif
+
+/* Errno */
+/* drflac_result_from_errno() is only used for fopen() and wfopen() so putting it inside DR_WAV_NO_STDIO for now. If something else needs this later we can move it out. */
+#include <errno.h>
+static drflac_result drflac_result_from_errno(int e)
+{
+    switch (e)
+    {
+        case 0: return DRFLAC_SUCCESS;
+    #ifdef EPERM
+        case EPERM: return DRFLAC_INVALID_OPERATION;
+    #endif
+    #ifdef ENOENT
+        case ENOENT: return DRFLAC_DOES_NOT_EXIST;
+    #endif
+    #ifdef ESRCH
+        case ESRCH: return DRFLAC_DOES_NOT_EXIST;
+    #endif
+    #ifdef EINTR
+        case EINTR: return DRFLAC_INTERRUPT;
+    #endif
+    #ifdef EIO
+        case EIO: return DRFLAC_IO_ERROR;
+    #endif
+    #ifdef ENXIO
+        case ENXIO: return DRFLAC_DOES_NOT_EXIST;
+    #endif
+    #ifdef E2BIG
+        case E2BIG: return DRFLAC_INVALID_ARGS;
+    #endif
+    #ifdef ENOEXEC
+        case ENOEXEC: return DRFLAC_INVALID_FILE;
+    #endif
+    #ifdef EBADF
+        case EBADF: return DRFLAC_INVALID_FILE;
+    #endif
+    #ifdef ECHILD
+        case ECHILD: return DRFLAC_ERROR;
+    #endif
+    #ifdef EAGAIN
+        case EAGAIN: return DRFLAC_UNAVAILABLE;
+    #endif
+    #ifdef ENOMEM
+        case ENOMEM: return DRFLAC_OUT_OF_MEMORY;
+    #endif
+    #ifdef EACCES
+        case EACCES: return DRFLAC_ACCESS_DENIED;
+    #endif
+    #ifdef EFAULT
+        case EFAULT: return DRFLAC_BAD_ADDRESS;
+    #endif
+    #ifdef ENOTBLK
+        case ENOTBLK: return DRFLAC_ERROR;
+    #endif
+    #ifdef EBUSY
+        case EBUSY: return DRFLAC_BUSY;
+    #endif
+    #ifdef EEXIST
+        case EEXIST: return DRFLAC_ALREADY_EXISTS;
+    #endif
+    #ifdef EXDEV
+        case EXDEV: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENODEV
+        case ENODEV: return DRFLAC_DOES_NOT_EXIST;
+    #endif
+    #ifdef ENOTDIR
+        case ENOTDIR: return DRFLAC_NOT_DIRECTORY;
+    #endif
+    #ifdef EISDIR
+        case EISDIR: return DRFLAC_IS_DIRECTORY;
+    #endif
+    #ifdef EINVAL
+        case EINVAL: return DRFLAC_INVALID_ARGS;
+    #endif
+    #ifdef ENFILE
+        case ENFILE: return DRFLAC_TOO_MANY_OPEN_FILES;
+    #endif
+    #ifdef EMFILE
+        case EMFILE: return DRFLAC_TOO_MANY_OPEN_FILES;
+    #endif
+    #ifdef ENOTTY
+        case ENOTTY: return DRFLAC_INVALID_OPERATION;
+    #endif
+    #ifdef ETXTBSY
+        case ETXTBSY: return DRFLAC_BUSY;
+    #endif
+    #ifdef EFBIG
+        case EFBIG: return DRFLAC_TOO_BIG;
+    #endif
+    #ifdef ENOSPC
+        case ENOSPC: return DRFLAC_NO_SPACE;
+    #endif
+    #ifdef ESPIPE
+        case ESPIPE: return DRFLAC_BAD_SEEK;
+    #endif
+    #ifdef EROFS
+        case EROFS: return DRFLAC_ACCESS_DENIED;
+    #endif
+    #ifdef EMLINK
+        case EMLINK: return DRFLAC_TOO_MANY_LINKS;
+    #endif
+    #ifdef EPIPE
+        case EPIPE: return DRFLAC_BAD_PIPE;
+    #endif
+    #ifdef EDOM
+        case EDOM: return DRFLAC_OUT_OF_RANGE;
+    #endif
+    #ifdef ERANGE
+        case ERANGE: return DRFLAC_OUT_OF_RANGE;
+    #endif
+    #ifdef EDEADLK
+        case EDEADLK: return DRFLAC_DEADLOCK;
+    #endif
+    #ifdef ENAMETOOLONG
+        case ENAMETOOLONG: return DRFLAC_PATH_TOO_LONG;
+    #endif
+    #ifdef ENOLCK
+        case ENOLCK: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENOSYS
+        case ENOSYS: return DRFLAC_NOT_IMPLEMENTED;
+    #endif
+    #if defined(ENOTEMPTY) && ENOTEMPTY != EEXIST   /* In AIX, ENOTEMPTY and EEXIST use the same value. */
+        case ENOTEMPTY: return DRFLAC_DIRECTORY_NOT_EMPTY;
+    #endif
+    #ifdef ELOOP
+        case ELOOP: return DRFLAC_TOO_MANY_LINKS;
+    #endif
+    #ifdef ENOMSG
+        case ENOMSG: return DRFLAC_NO_MESSAGE;
+    #endif
+    #ifdef EIDRM
+        case EIDRM: return DRFLAC_ERROR;
+    #endif
+    #ifdef ECHRNG
+        case ECHRNG: return DRFLAC_ERROR;
+    #endif
+    #ifdef EL2NSYNC
+        case EL2NSYNC: return DRFLAC_ERROR;
+    #endif
+    #ifdef EL3HLT
+        case EL3HLT: return DRFLAC_ERROR;
+    #endif
+    #ifdef EL3RST
+        case EL3RST: return DRFLAC_ERROR;
+    #endif
+    #ifdef ELNRNG
+        case ELNRNG: return DRFLAC_OUT_OF_RANGE;
+    #endif
+    #ifdef EUNATCH
+        case EUNATCH: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENOCSI
+        case ENOCSI: return DRFLAC_ERROR;
+    #endif
+    #ifdef EL2HLT
+        case EL2HLT: return DRFLAC_ERROR;
+    #endif
+    #ifdef EBADE
+        case EBADE: return DRFLAC_ERROR;
+    #endif
+    #ifdef EBADR
+        case EBADR: return DRFLAC_ERROR;
+    #endif
+    #ifdef EXFULL
+        case EXFULL: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENOANO
+        case ENOANO: return DRFLAC_ERROR;
+    #endif
+    #ifdef EBADRQC
+        case EBADRQC: return DRFLAC_ERROR;
+    #endif
+    #ifdef EBADSLT
+        case EBADSLT: return DRFLAC_ERROR;
+    #endif
+    #ifdef EBFONT
+        case EBFONT: return DRFLAC_INVALID_FILE;
+    #endif
+    #ifdef ENOSTR
+        case ENOSTR: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENODATA
+        case ENODATA: return DRFLAC_NO_DATA_AVAILABLE;
+    #endif
+    #ifdef ETIME
+        case ETIME: return DRFLAC_TIMEOUT;
+    #endif
+    #ifdef ENOSR
+        case ENOSR: return DRFLAC_NO_DATA_AVAILABLE;
+    #endif
+    #ifdef ENONET
+        case ENONET: return DRFLAC_NO_NETWORK;
+    #endif
+    #ifdef ENOPKG
+        case ENOPKG: return DRFLAC_ERROR;
+    #endif
+    #ifdef EREMOTE
+        case EREMOTE: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENOLINK
+        case ENOLINK: return DRFLAC_ERROR;
+    #endif
+    #ifdef EADV
+        case EADV: return DRFLAC_ERROR;
+    #endif
+    #ifdef ESRMNT
+        case ESRMNT: return DRFLAC_ERROR;
+    #endif
+    #ifdef ECOMM
+        case ECOMM: return DRFLAC_ERROR;
+    #endif
+    #ifdef EPROTO
+        case EPROTO: return DRFLAC_ERROR;
+    #endif
+    #ifdef EMULTIHOP
+        case EMULTIHOP: return DRFLAC_ERROR;
+    #endif
+    #ifdef EDOTDOT
+        case EDOTDOT: return DRFLAC_ERROR;
+    #endif
+    #ifdef EBADMSG
+        case EBADMSG: return DRFLAC_BAD_MESSAGE;
+    #endif
+    #ifdef EOVERFLOW
+        case EOVERFLOW: return DRFLAC_TOO_BIG;
+    #endif
+    #ifdef ENOTUNIQ
+        case ENOTUNIQ: return DRFLAC_NOT_UNIQUE;
+    #endif
+    #ifdef EBADFD
+        case EBADFD: return DRFLAC_ERROR;
+    #endif
+    #ifdef EREMCHG
+        case EREMCHG: return DRFLAC_ERROR;
+    #endif
+    #ifdef ELIBACC
+        case ELIBACC: return DRFLAC_ACCESS_DENIED;
+    #endif
+    #ifdef ELIBBAD
+        case ELIBBAD: return DRFLAC_INVALID_FILE;
+    #endif
+    #ifdef ELIBSCN
+        case ELIBSCN: return DRFLAC_INVALID_FILE;
+    #endif
+    #ifdef ELIBMAX
+        case ELIBMAX: return DRFLAC_ERROR;
+    #endif
+    #ifdef ELIBEXEC
+        case ELIBEXEC: return DRFLAC_ERROR;
+    #endif
+    #ifdef EILSEQ
+        case EILSEQ: return DRFLAC_INVALID_DATA;
+    #endif
+    #ifdef ERESTART
+        case ERESTART: return DRFLAC_ERROR;
+    #endif
+    #ifdef ESTRPIPE
+        case ESTRPIPE: return DRFLAC_ERROR;
+    #endif
+    #ifdef EUSERS
+        case EUSERS: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENOTSOCK
+        case ENOTSOCK: return DRFLAC_NOT_SOCKET;
+    #endif
+    #ifdef EDESTADDRREQ
+        case EDESTADDRREQ: return DRFLAC_NO_ADDRESS;
+    #endif
+    #ifdef EMSGSIZE
+        case EMSGSIZE: return DRFLAC_TOO_BIG;
+    #endif
+    #ifdef EPROTOTYPE
+        case EPROTOTYPE: return DRFLAC_BAD_PROTOCOL;
+    #endif
+    #ifdef ENOPROTOOPT
+        case ENOPROTOOPT: return DRFLAC_PROTOCOL_UNAVAILABLE;
+    #endif
+    #ifdef EPROTONOSUPPORT
+        case EPROTONOSUPPORT: return DRFLAC_PROTOCOL_NOT_SUPPORTED;
+    #endif
+    #ifdef ESOCKTNOSUPPORT
+        case ESOCKTNOSUPPORT: return DRFLAC_SOCKET_NOT_SUPPORTED;
+    #endif
+    #ifdef EOPNOTSUPP
+        case EOPNOTSUPP: return DRFLAC_INVALID_OPERATION;
+    #endif
+    #ifdef EPFNOSUPPORT
+        case EPFNOSUPPORT: return DRFLAC_PROTOCOL_FAMILY_NOT_SUPPORTED;
+    #endif
+    #ifdef EAFNOSUPPORT
+        case EAFNOSUPPORT: return DRFLAC_ADDRESS_FAMILY_NOT_SUPPORTED;
+    #endif
+    #ifdef EADDRINUSE
+        case EADDRINUSE: return DRFLAC_ALREADY_IN_USE;
+    #endif
+    #ifdef EADDRNOTAVAIL
+        case EADDRNOTAVAIL: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENETDOWN
+        case ENETDOWN: return DRFLAC_NO_NETWORK;
+    #endif
+    #ifdef ENETUNREACH
+        case ENETUNREACH: return DRFLAC_NO_NETWORK;
+    #endif
+    #ifdef ENETRESET
+        case ENETRESET: return DRFLAC_NO_NETWORK;
+    #endif
+    #ifdef ECONNABORTED
+        case ECONNABORTED: return DRFLAC_NO_NETWORK;
+    #endif
+    #ifdef ECONNRESET
+        case ECONNRESET: return DRFLAC_CONNECTION_RESET;
+    #endif
+    #ifdef ENOBUFS
+        case ENOBUFS: return DRFLAC_NO_SPACE;
+    #endif
+    #ifdef EISCONN
+        case EISCONN: return DRFLAC_ALREADY_CONNECTED;
+    #endif
+    #ifdef ENOTCONN
+        case ENOTCONN: return DRFLAC_NOT_CONNECTED;
+    #endif
+    #ifdef ESHUTDOWN
+        case ESHUTDOWN: return DRFLAC_ERROR;
+    #endif
+    #ifdef ETOOMANYREFS
+        case ETOOMANYREFS: return DRFLAC_ERROR;
+    #endif
+    #ifdef ETIMEDOUT
+        case ETIMEDOUT: return DRFLAC_TIMEOUT;
+    #endif
+    #ifdef ECONNREFUSED
+        case ECONNREFUSED: return DRFLAC_CONNECTION_REFUSED;
+    #endif
+    #ifdef EHOSTDOWN
+        case EHOSTDOWN: return DRFLAC_NO_HOST;
+    #endif
+    #ifdef EHOSTUNREACH
+        case EHOSTUNREACH: return DRFLAC_NO_HOST;
+    #endif
+    #ifdef EALREADY
+        case EALREADY: return DRFLAC_IN_PROGRESS;
+    #endif
+    #ifdef EINPROGRESS
+        case EINPROGRESS: return DRFLAC_IN_PROGRESS;
+    #endif
+    #ifdef ESTALE
+        case ESTALE: return DRFLAC_INVALID_FILE;
+    #endif
+    #ifdef EUCLEAN
+        case EUCLEAN: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENOTNAM
+        case ENOTNAM: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENAVAIL
+        case ENAVAIL: return DRFLAC_ERROR;
+    #endif
+    #ifdef EISNAM
+        case EISNAM: return DRFLAC_ERROR;
+    #endif
+    #ifdef EREMOTEIO
+        case EREMOTEIO: return DRFLAC_IO_ERROR;
+    #endif
+    #ifdef EDQUOT
+        case EDQUOT: return DRFLAC_NO_SPACE;
+    #endif
+    #ifdef ENOMEDIUM
+        case ENOMEDIUM: return DRFLAC_DOES_NOT_EXIST;
+    #endif
+    #ifdef EMEDIUMTYPE
+        case EMEDIUMTYPE: return DRFLAC_ERROR;
+    #endif
+    #ifdef ECANCELED
+        case ECANCELED: return DRFLAC_CANCELLED;
+    #endif
+    #ifdef ENOKEY
+        case ENOKEY: return DRFLAC_ERROR;
+    #endif
+    #ifdef EKEYEXPIRED
+        case EKEYEXPIRED: return DRFLAC_ERROR;
+    #endif
+    #ifdef EKEYREVOKED
+        case EKEYREVOKED: return DRFLAC_ERROR;
+    #endif
+    #ifdef EKEYREJECTED
+        case EKEYREJECTED: return DRFLAC_ERROR;
+    #endif
+    #ifdef EOWNERDEAD
+        case EOWNERDEAD: return DRFLAC_ERROR;
+    #endif
+    #ifdef ENOTRECOVERABLE
+        case ENOTRECOVERABLE: return DRFLAC_ERROR;
+    #endif
+    #ifdef ERFKILL
+        case ERFKILL: return DRFLAC_ERROR;
+    #endif
+    #ifdef EHWPOISON
+        case EHWPOISON: return DRFLAC_ERROR;
+    #endif
+        default: return DRFLAC_ERROR;
+    }
+}
+/* End Errno */
+
+/* fopen */
+static drflac_result drflac_fopen(FILE** ppFile, const char* pFilePath, const char* pOpenMode)
+{
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+    errno_t err;
+#endif
+
+    if (ppFile != NULL) {
+        *ppFile = NULL;  /* Safety. */
+    }
+
+    if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) {
+        return DRFLAC_INVALID_ARGS;
+    }
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+    err = fopen_s(ppFile, pFilePath, pOpenMode);
+    if (err != 0) {
+        return drflac_result_from_errno(err);
+    }
+#else
+#if defined(_WIN32) || defined(__APPLE__)
+    *ppFile = fopen(pFilePath, pOpenMode);
+#else
+    #if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64 && defined(_LARGEFILE64_SOURCE)
+        *ppFile = fopen64(pFilePath, pOpenMode);
+    #else
+        *ppFile = fopen(pFilePath, pOpenMode);
+    #endif
+#endif
+    if (*ppFile == NULL) {
+        drflac_result result = drflac_result_from_errno(errno);
+        if (result == DRFLAC_SUCCESS) {
+            result = DRFLAC_ERROR;   /* Just a safety check to make sure we never ever return success when pFile == NULL. */
+        }
+
+        return result;
+    }
+#endif
+
+    return DRFLAC_SUCCESS;
+}
+
+/*
+_wfopen() isn't always available in all compilation environments.
+
+    * Windows only.
+    * MSVC seems to support it universally as far back as VC6 from what I can tell (haven't checked further back).
+    * MinGW-64 (both 32- and 64-bit) seems to support it.
+    * MinGW wraps it in !defined(__STRICT_ANSI__).
+    * OpenWatcom wraps it in !defined(_NO_EXT_KEYS).
+
+This can be reviewed as compatibility issues arise. The preference is to use _wfopen_s() and _wfopen() as opposed to the wcsrtombs()
+fallback, so if you notice your compiler not detecting this properly I'm happy to look at adding support.
+*/
+#if defined(_WIN32)
+    #if defined(_MSC_VER) || defined(__MINGW64__) || (!defined(__STRICT_ANSI__) && !defined(_NO_EXT_KEYS))
+        #define DRFLAC_HAS_WFOPEN
+    #endif
+#endif
+
+#ifndef DR_FLAC_NO_WCHAR
+static drflac_result drflac_wfopen(FILE** ppFile, const wchar_t* pFilePath, const wchar_t* pOpenMode, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    if (ppFile != NULL) {
+        *ppFile = NULL;  /* Safety. */
+    }
+
+    if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) {
+        return DRFLAC_INVALID_ARGS;
+    }
+
+#if defined(DRFLAC_HAS_WFOPEN)
+    {
+        /* Use _wfopen() on Windows. */
+    #if defined(_MSC_VER) && _MSC_VER >= 1400
+        errno_t err = _wfopen_s(ppFile, pFilePath, pOpenMode);
+        if (err != 0) {
+            return drflac_result_from_errno(err);
+        }
+    #else
+        *ppFile = _wfopen(pFilePath, pOpenMode);
+        if (*ppFile == NULL) {
+            return drflac_result_from_errno(errno);
+        }
+    #endif
+        (void)pAllocationCallbacks;
+    }
+#else
+    /*
+    Use fopen() on anything other than Windows. Requires a conversion. This is annoying because
+	fopen() is locale specific. The only real way I can think of to do this is with wcsrtombs(). Note
+	that wcstombs() is apparently not thread-safe because it uses a static global mbstate_t object for
+    maintaining state. I've checked this with -std=c89 and it works, but if somebody get's a compiler
+	error I'll look into improving compatibility.
+    */
+
+	/*
+	Some compilers don't support wchar_t or wcsrtombs() which we're using below. In this case we just
+	need to abort with an error. If you encounter a compiler lacking such support, add it to this list
+	and submit a bug report and it'll be added to the library upstream.
+	*/
+	#if defined(__DJGPP__)
+	{
+		/* Nothing to do here. This will fall through to the error check below. */
+	}
+	#else
+    {
+        mbstate_t mbs;
+        size_t lenMB;
+        const wchar_t* pFilePathTemp = pFilePath;
+        char* pFilePathMB = NULL;
+        char pOpenModeMB[32] = {0};
+
+        /* Get the length first. */
+        DRFLAC_ZERO_OBJECT(&mbs);
+        lenMB = wcsrtombs(NULL, &pFilePathTemp, 0, &mbs);
+        if (lenMB == (size_t)-1) {
+            return drflac_result_from_errno(errno);
+        }
+
+        pFilePathMB = (char*)drflac__malloc_from_callbacks(lenMB + 1, pAllocationCallbacks);
+        if (pFilePathMB == NULL) {
+            return DRFLAC_OUT_OF_MEMORY;
+        }
+
+        pFilePathTemp = pFilePath;
+        DRFLAC_ZERO_OBJECT(&mbs);
+        wcsrtombs(pFilePathMB, &pFilePathTemp, lenMB + 1, &mbs);
+
+        /* The open mode should always consist of ASCII characters so we should be able to do a trivial conversion. */
+        {
+            size_t i = 0;
+            for (;;) {
+                if (pOpenMode[i] == 0) {
+                    pOpenModeMB[i] = '\0';
+                    break;
+                }
+
+                pOpenModeMB[i] = (char)pOpenMode[i];
+                i += 1;
+            }
+        }
+
+        *ppFile = fopen(pFilePathMB, pOpenModeMB);
+
+        drflac__free_from_callbacks(pFilePathMB, pAllocationCallbacks);
+    }
+	#endif
+
+    if (*ppFile == NULL) {
+        return DRFLAC_ERROR;
+    }
+#endif
+
+    return DRFLAC_SUCCESS;
+}
+#endif
+/* End fopen */
+
+static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    return fread(bufferOut, 1, bytesToRead, (FILE*)pUserData);
+}
+
+static drflac_bool32 drflac__on_seek_stdio(void* pUserData, int offset, drflac_seek_origin origin)
+{
+    int whence = SEEK_SET;
+    if (origin == DRFLAC_SEEK_CUR) {
+        whence = SEEK_CUR;
+    } else if (origin == DRFLAC_SEEK_END) {
+        whence = SEEK_END;
+    }
+
+    return fseek((FILE*)pUserData, offset, whence) == 0;
+}
+
+static drflac_bool32 drflac__on_tell_stdio(void* pUserData, drflac_int64* pCursor)
+{
+    FILE* pFileStdio = (FILE*)pUserData;
+    drflac_int64 result;
+
+    /* These were all validated at a higher level. */
+    DRFLAC_ASSERT(pFileStdio != NULL);
+    DRFLAC_ASSERT(pCursor    != NULL);
+
+#if defined(_WIN32) && !defined(NXDK)
+    #if defined(_MSC_VER) && _MSC_VER > 1200
+        result = _ftelli64(pFileStdio);
+    #else
+        result = ftell(pFileStdio);
+    #endif
+#else
+    result = ftell(pFileStdio);
+#endif
+
+    *pCursor = result;
+
+    return DRFLAC_TRUE;
+}
+
+
+
+DRFLAC_API drflac* drflac_open_file(const char* pFileName, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    drflac* pFlac;
+    FILE* pFile;
+
+    if (drflac_fopen(&pFile, pFileName, "rb") != DRFLAC_SUCCESS) {
+        return NULL;
+    }
+
+    pFlac = drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, drflac__on_tell_stdio, (void*)pFile, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        fclose(pFile);
+        return NULL;
+    }
+
+    return pFlac;
+}
+
+#ifndef DR_FLAC_NO_WCHAR
+DRFLAC_API drflac* drflac_open_file_w(const wchar_t* pFileName, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    drflac* pFlac;
+    FILE* pFile;
+
+    if (drflac_wfopen(&pFile, pFileName, L"rb", pAllocationCallbacks) != DRFLAC_SUCCESS) {
+        return NULL;
+    }
+
+    pFlac = drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, drflac__on_tell_stdio, (void*)pFile, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        fclose(pFile);
+        return NULL;
+    }
+
+    return pFlac;
+}
+#endif
+
+DRFLAC_API drflac* drflac_open_file_with_metadata(const char* pFileName, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    drflac* pFlac;
+    FILE* pFile;
+
+    if (drflac_fopen(&pFile, pFileName, "rb") != DRFLAC_SUCCESS) {
+        return NULL;
+    }
+
+    pFlac = drflac_open_with_metadata_private(drflac__on_read_stdio, drflac__on_seek_stdio, drflac__on_tell_stdio, onMeta, drflac_container_unknown, (void*)pFile, pUserData, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        fclose(pFile);
+        return pFlac;
+    }
+
+    return pFlac;
+}
+
+#ifndef DR_FLAC_NO_WCHAR
+DRFLAC_API drflac* drflac_open_file_with_metadata_w(const wchar_t* pFileName, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    drflac* pFlac;
+    FILE* pFile;
+
+    if (drflac_wfopen(&pFile, pFileName, L"rb", pAllocationCallbacks) != DRFLAC_SUCCESS) {
+        return NULL;
+    }
+
+    pFlac = drflac_open_with_metadata_private(drflac__on_read_stdio, drflac__on_seek_stdio, drflac__on_tell_stdio, onMeta, drflac_container_unknown, (void*)pFile, pUserData, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        fclose(pFile);
+        return pFlac;
+    }
+
+    return pFlac;
+}
+#endif
+#endif  /* DR_FLAC_NO_STDIO */
+
+static size_t drflac__on_read_memory(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    drflac__memory_stream* memoryStream = (drflac__memory_stream*)pUserData;
+    size_t bytesRemaining;
+
+    DRFLAC_ASSERT(memoryStream != NULL);
+    DRFLAC_ASSERT(memoryStream->dataSize >= memoryStream->currentReadPos);
+
+    bytesRemaining = memoryStream->dataSize - memoryStream->currentReadPos;
+    if (bytesToRead > bytesRemaining) {
+        bytesToRead = bytesRemaining;
+    }
+
+    if (bytesToRead > 0) {
+        DRFLAC_COPY_MEMORY(bufferOut, memoryStream->data + memoryStream->currentReadPos, bytesToRead);
+        memoryStream->currentReadPos += bytesToRead;
+    }
+
+    return bytesToRead;
+}
+
+static drflac_bool32 drflac__on_seek_memory(void* pUserData, int offset, drflac_seek_origin origin)
+{
+    drflac__memory_stream* memoryStream = (drflac__memory_stream*)pUserData;
+    drflac_int64 newCursor;
+
+    DRFLAC_ASSERT(memoryStream != NULL);
+
+    if (origin == DRFLAC_SEEK_SET) {
+        newCursor = 0;
+    } else if (origin == DRFLAC_SEEK_CUR) {
+        newCursor = (drflac_int64)memoryStream->currentReadPos;
+    } else if (origin == DRFLAC_SEEK_END) {
+        newCursor = (drflac_int64)memoryStream->dataSize;
+    } else {
+        DRFLAC_ASSERT(!"Invalid seek origin");
+        return DRFLAC_FALSE;
+    }
+
+    newCursor += offset;
+
+    if (newCursor < 0) {
+        return DRFLAC_FALSE;  /* Trying to seek prior to the start of the buffer. */
+    }
+    if ((size_t)newCursor > memoryStream->dataSize) {
+        return DRFLAC_FALSE;  /* Trying to seek beyond the end of the buffer. */
+    }
+
+    memoryStream->currentReadPos = (size_t)newCursor;
+
+    return DRFLAC_TRUE;
+}
+
+static drflac_bool32 drflac__on_tell_memory(void* pUserData, drflac_int64* pCursor)
+{
+    drflac__memory_stream* memoryStream = (drflac__memory_stream*)pUserData;
+
+    DRFLAC_ASSERT(memoryStream != NULL);
+    DRFLAC_ASSERT(pCursor != NULL);
+
+    *pCursor = (drflac_int64)memoryStream->currentReadPos;
+    return DRFLAC_TRUE;
+}
+
+DRFLAC_API drflac* drflac_open_memory(const void* pData, size_t dataSize, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    drflac__memory_stream memoryStream;
+    drflac* pFlac;
+
+    memoryStream.data = (const drflac_uint8*)pData;
+    memoryStream.dataSize = dataSize;
+    memoryStream.currentReadPos = 0;
+    pFlac = drflac_open(drflac__on_read_memory, drflac__on_seek_memory, drflac__on_tell_memory, &memoryStream, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        return NULL;
+    }
+
+    pFlac->memoryStream = memoryStream;
+
+    /* This is an awful hack... */
+#ifndef DR_FLAC_NO_OGG
+    if (pFlac->container == drflac_container_ogg)
+    {
+        drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
+        oggbs->pUserData = &pFlac->memoryStream;
+    }
+    else
+#endif
+    {
+        pFlac->bs.pUserData = &pFlac->memoryStream;
+    }
+
+    return pFlac;
+}
+
+DRFLAC_API drflac* drflac_open_memory_with_metadata(const void* pData, size_t dataSize, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    drflac__memory_stream memoryStream;
+    drflac* pFlac;
+
+    memoryStream.data = (const drflac_uint8*)pData;
+    memoryStream.dataSize = dataSize;
+    memoryStream.currentReadPos = 0;
+    pFlac = drflac_open_with_metadata_private(drflac__on_read_memory, drflac__on_seek_memory, drflac__on_tell_memory, onMeta, drflac_container_unknown, &memoryStream, pUserData, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        return NULL;
+    }
+
+    pFlac->memoryStream = memoryStream;
+
+    /* This is an awful hack... */
+#ifndef DR_FLAC_NO_OGG
+    if (pFlac->container == drflac_container_ogg)
+    {
+        drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
+        oggbs->pUserData = &pFlac->memoryStream;
+    }
+    else
+#endif
+    {
+        pFlac->bs.pUserData = &pFlac->memoryStream;
+    }
+
+    return pFlac;
+}
+
+
+
+DRFLAC_API drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    return drflac_open_with_metadata_private(onRead, onSeek, onTell, NULL, drflac_container_unknown, pUserData, pUserData, pAllocationCallbacks);
+}
+DRFLAC_API drflac* drflac_open_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    return drflac_open_with_metadata_private(onRead, onSeek, onTell, NULL, container, pUserData, pUserData, pAllocationCallbacks);
+}
+
+DRFLAC_API drflac* drflac_open_with_metadata(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    return drflac_open_with_metadata_private(onRead, onSeek, onTell, onMeta, drflac_container_unknown, pUserData, pUserData, pAllocationCallbacks);
+}
+DRFLAC_API drflac* drflac_open_with_metadata_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, drflac_meta_proc onMeta, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    return drflac_open_with_metadata_private(onRead, onSeek, onTell, onMeta, container, pUserData, pUserData, pAllocationCallbacks);
+}
+
+DRFLAC_API void drflac_close(drflac* pFlac)
+{
+    if (pFlac == NULL) {
+        return;
+    }
+
+#ifndef DR_FLAC_NO_STDIO
+    /*
+    If we opened the file with drflac_open_file() we will want to close the file handle. We can know whether or not drflac_open_file()
+    was used by looking at the callbacks.
+    */
+    if (pFlac->bs.onRead == drflac__on_read_stdio) {
+        fclose((FILE*)pFlac->bs.pUserData);
+    }
+
+#ifndef DR_FLAC_NO_OGG
+    /* Need to clean up Ogg streams a bit differently due to the way the bit streaming is chained. */
+    if (pFlac->container == drflac_container_ogg) {
+        drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
+        DRFLAC_ASSERT(pFlac->bs.onRead == drflac__on_read_ogg);
+
+        if (oggbs->onRead == drflac__on_read_stdio) {
+            fclose((FILE*)oggbs->pUserData);
+        }
+    }
+#endif
+#endif
+
+    drflac__free_from_callbacks(pFlac, &pFlac->allocationCallbacks);
+}
+
+
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    for (i = 0; i < frameCount; ++i) {
+        drflac_uint32 left  = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+        drflac_uint32 side  = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
+        drflac_uint32 right = left - side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left;
+        pOutputSamples[i*2+1] = (drflac_int32)right;
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    for (i = 0; i < frameCount4; ++i) {
+        drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
+        drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
+        drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
+        drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
+
+        drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
+        drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
+        drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
+        drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
+
+        drflac_uint32 right0 = left0 - side0;
+        drflac_uint32 right1 = left1 - side1;
+        drflac_uint32 right2 = left2 - side2;
+        drflac_uint32 right3 = left3 - side3;
+
+        pOutputSamples[i*8+0] = (drflac_int32)left0;
+        pOutputSamples[i*8+1] = (drflac_int32)right0;
+        pOutputSamples[i*8+2] = (drflac_int32)left1;
+        pOutputSamples[i*8+3] = (drflac_int32)right1;
+        pOutputSamples[i*8+4] = (drflac_int32)left2;
+        pOutputSamples[i*8+5] = (drflac_int32)right2;
+        pOutputSamples[i*8+6] = (drflac_int32)left3;
+        pOutputSamples[i*8+7] = (drflac_int32)right3;
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 left  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 side  = pInputSamples1U32[i] << shift1;
+        drflac_uint32 right = left - side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left;
+        pOutputSamples[i*2+1] = (drflac_int32)right;
+    }
+}
+
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    for (i = 0; i < frameCount4; ++i) {
+        __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
+        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
+        __m128i right = _mm_sub_epi32(left, side);
+
+        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
+        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 left  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 side  = pInputSamples1U32[i] << shift1;
+        drflac_uint32 right = left - side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left;
+        pOutputSamples[i*2+1] = (drflac_int32)right;
+    }
+}
+#endif
+
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+    int32x4_t shift0_4;
+    int32x4_t shift1_4;
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    shift0_4 = vdupq_n_s32(shift0);
+    shift1_4 = vdupq_n_s32(shift1);
+
+    for (i = 0; i < frameCount4; ++i) {
+        uint32x4_t left;
+        uint32x4_t side;
+        uint32x4_t right;
+
+        left  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
+        side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
+        right = vsubq_u32(left, side);
+
+        drflac__vst2q_u32((drflac_uint32*)pOutputSamples + i*8, vzipq_u32(left, right));
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 left  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 side  = pInputSamples1U32[i] << shift1;
+        drflac_uint32 right = left - side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left;
+        pOutputSamples[i*2+1] = (drflac_int32)right;
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s32__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s32__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_s32__decode_left_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_s32__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
+    }
+}
+
+
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    for (i = 0; i < frameCount; ++i) {
+        drflac_uint32 side  = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+        drflac_uint32 right = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
+        drflac_uint32 left  = right + side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left;
+        pOutputSamples[i*2+1] = (drflac_int32)right;
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    for (i = 0; i < frameCount4; ++i) {
+        drflac_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
+        drflac_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
+        drflac_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
+        drflac_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
+
+        drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
+        drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
+        drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
+        drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
+
+        drflac_uint32 left0 = right0 + side0;
+        drflac_uint32 left1 = right1 + side1;
+        drflac_uint32 left2 = right2 + side2;
+        drflac_uint32 left3 = right3 + side3;
+
+        pOutputSamples[i*8+0] = (drflac_int32)left0;
+        pOutputSamples[i*8+1] = (drflac_int32)right0;
+        pOutputSamples[i*8+2] = (drflac_int32)left1;
+        pOutputSamples[i*8+3] = (drflac_int32)right1;
+        pOutputSamples[i*8+4] = (drflac_int32)left2;
+        pOutputSamples[i*8+5] = (drflac_int32)right2;
+        pOutputSamples[i*8+6] = (drflac_int32)left3;
+        pOutputSamples[i*8+7] = (drflac_int32)right3;
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 side  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 right = pInputSamples1U32[i] << shift1;
+        drflac_uint32 left  = right + side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left;
+        pOutputSamples[i*2+1] = (drflac_int32)right;
+    }
+}
+
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    for (i = 0; i < frameCount4; ++i) {
+        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
+        __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
+        __m128i left  = _mm_add_epi32(right, side);
+
+        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
+        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 side  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 right = pInputSamples1U32[i] << shift1;
+        drflac_uint32 left  = right + side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left;
+        pOutputSamples[i*2+1] = (drflac_int32)right;
+    }
+}
+#endif
+
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+    int32x4_t shift0_4;
+    int32x4_t shift1_4;
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    shift0_4 = vdupq_n_s32(shift0);
+    shift1_4 = vdupq_n_s32(shift1);
+
+    for (i = 0; i < frameCount4; ++i) {
+        uint32x4_t side;
+        uint32x4_t right;
+        uint32x4_t left;
+
+        side  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
+        right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
+        left  = vaddq_u32(right, side);
+
+        drflac__vst2q_u32((drflac_uint32*)pOutputSamples + i*8, vzipq_u32(left, right));
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 side  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 right = pInputSamples1U32[i] << shift1;
+        drflac_uint32 left  = right + side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left;
+        pOutputSamples[i*2+1] = (drflac_int32)right;
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s32__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s32__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_s32__decode_right_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_s32__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
+    }
+}
+
+
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    for (drflac_uint64 i = 0; i < frameCount; ++i) {
+        drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+        drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+        mid = (mid << 1) | (side & 0x01);
+
+        pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample);
+        pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample);
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_int32 shift = unusedBitsPerSample;
+
+    if (shift > 0) {
+        shift -= 1;
+        for (i = 0; i < frameCount4; ++i) {
+            drflac_uint32 temp0L;
+            drflac_uint32 temp1L;
+            drflac_uint32 temp2L;
+            drflac_uint32 temp3L;
+            drflac_uint32 temp0R;
+            drflac_uint32 temp1R;
+            drflac_uint32 temp2R;
+            drflac_uint32 temp3R;
+
+            drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+
+            drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+            mid0 = (mid0 << 1) | (side0 & 0x01);
+            mid1 = (mid1 << 1) | (side1 & 0x01);
+            mid2 = (mid2 << 1) | (side2 & 0x01);
+            mid3 = (mid3 << 1) | (side3 & 0x01);
+
+            temp0L = (mid0 + side0) << shift;
+            temp1L = (mid1 + side1) << shift;
+            temp2L = (mid2 + side2) << shift;
+            temp3L = (mid3 + side3) << shift;
+
+            temp0R = (mid0 - side0) << shift;
+            temp1R = (mid1 - side1) << shift;
+            temp2R = (mid2 - side2) << shift;
+            temp3R = (mid3 - side3) << shift;
+
+            pOutputSamples[i*8+0] = (drflac_int32)temp0L;
+            pOutputSamples[i*8+1] = (drflac_int32)temp0R;
+            pOutputSamples[i*8+2] = (drflac_int32)temp1L;
+            pOutputSamples[i*8+3] = (drflac_int32)temp1R;
+            pOutputSamples[i*8+4] = (drflac_int32)temp2L;
+            pOutputSamples[i*8+5] = (drflac_int32)temp2R;
+            pOutputSamples[i*8+6] = (drflac_int32)temp3L;
+            pOutputSamples[i*8+7] = (drflac_int32)temp3R;
         }
         }
-        return true;
     } else {
     } else {
-        // It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them.
-        size_t bitCountHi = DRFLAC_CACHE_L1_BITS_REMAINING;
-        size_t bitCountLo = bitCount - bitCountHi;
-        uint32_t resultHi = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountHi);
-
-        if (!drflac__reload_cache(pFlac)) {
-            return false;
+        for (i = 0; i < frameCount4; ++i) {
+            drflac_uint32 temp0L;
+            drflac_uint32 temp1L;
+            drflac_uint32 temp2L;
+            drflac_uint32 temp3L;
+            drflac_uint32 temp0R;
+            drflac_uint32 temp1R;
+            drflac_uint32 temp2R;
+            drflac_uint32 temp3R;
+
+            drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+
+            drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+            mid0 = (mid0 << 1) | (side0 & 0x01);
+            mid1 = (mid1 << 1) | (side1 & 0x01);
+            mid2 = (mid2 << 1) | (side2 & 0x01);
+            mid3 = (mid3 << 1) | (side3 & 0x01);
+
+            temp0L = (drflac_uint32)((drflac_int32)(mid0 + side0) >> 1);
+            temp1L = (drflac_uint32)((drflac_int32)(mid1 + side1) >> 1);
+            temp2L = (drflac_uint32)((drflac_int32)(mid2 + side2) >> 1);
+            temp3L = (drflac_uint32)((drflac_int32)(mid3 + side3) >> 1);
+
+            temp0R = (drflac_uint32)((drflac_int32)(mid0 - side0) >> 1);
+            temp1R = (drflac_uint32)((drflac_int32)(mid1 - side1) >> 1);
+            temp2R = (drflac_uint32)((drflac_int32)(mid2 - side2) >> 1);
+            temp3R = (drflac_uint32)((drflac_int32)(mid3 - side3) >> 1);
+
+            pOutputSamples[i*8+0] = (drflac_int32)temp0L;
+            pOutputSamples[i*8+1] = (drflac_int32)temp0R;
+            pOutputSamples[i*8+2] = (drflac_int32)temp1L;
+            pOutputSamples[i*8+3] = (drflac_int32)temp1R;
+            pOutputSamples[i*8+4] = (drflac_int32)temp2L;
+            pOutputSamples[i*8+5] = (drflac_int32)temp2R;
+            pOutputSamples[i*8+6] = (drflac_int32)temp3L;
+            pOutputSamples[i*8+7] = (drflac_int32)temp3R;
         }
         }
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+        drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-        *pResultOut = (resultHi << bitCountLo) | DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountLo);
-        pFlac->consumedBits += bitCountLo;
-        pFlac->cache <<= bitCountLo;
-        return true;
+        mid = (mid << 1) | (side & 0x01);
+
+        pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample);
+        pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample);
     }
     }
 }
 }
 
 
-static bool drflac__read_int32(drflac* pFlac, unsigned int bitCount, int32_t* pResult)
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
 {
 {
-    assert(pFlac != NULL);
-    assert(pResult != NULL);
-    assert(bitCount > 0);
-    assert(bitCount <= 32);
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_int32 shift = unusedBitsPerSample;
 
 
-    uint32_t result;
-    if (!drflac__read_uint32(pFlac, bitCount, &result)) {
-        return false;
-    }
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
 
 
-    if ((result & (1 << (bitCount - 1)))) {  // TODO: See if we can get rid of this branch.
-        result |= (-1 << bitCount);
-    }
+    if (shift == 0) {
+        for (i = 0; i < frameCount4; ++i) {
+            __m128i mid;
+            __m128i side;
+            __m128i left;
+            __m128i right;
 
 
-    *pResult = (int32_t)result;
-    return true;
-}
+            mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+            side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
 
 
-static bool drflac__read_uint64(drflac* pFlac, unsigned int bitCount, uint64_t* pResultOut)
-{
-    assert(bitCount <= 64);
-    assert(bitCount >  32);
+            mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
 
 
-    uint32_t resultHi;
-    if (!drflac__read_uint32(pFlac, bitCount - 32, &resultHi)) {
-        return false;
-    }
+            left  = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
+            right = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);
 
 
-    uint32_t resultLo;
-    if (!drflac__read_uint32(pFlac, 32, &resultLo)) {
-        return false;
-    }
+            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
+            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
+        }
 
 
-    *pResultOut = (((uint64_t)resultHi) << 32) | ((uint64_t)resultLo);
-    return true;
-}
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-static bool drflac__read_int64(drflac* pFlac, unsigned int bitCount, int64_t* pResultOut)
-{
-    assert(bitCount <= 64);
+            mid = (mid << 1) | (side & 0x01);
 
 
-    uint64_t result;
-    if (!drflac__read_uint64(pFlac, bitCount, &result)) {
-        return false;
-    }
+            pOutputSamples[i*2+0] = (drflac_int32)(mid + side) >> 1;
+            pOutputSamples[i*2+1] = (drflac_int32)(mid - side) >> 1;
+        }
+    } else {
+        shift -= 1;
+        for (i = 0; i < frameCount4; ++i) {
+            __m128i mid;
+            __m128i side;
+            __m128i left;
+            __m128i right;
 
 
-    if ((result & (1ULL << (bitCount - 1)))) {  // TODO: See if we can get rid of this branch.
-        result |= (-1LL << bitCount);
-    }
+            mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+            side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
 
 
-    *pResultOut = (int64_t)result;
-    return true;
-}
+            mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
 
 
-static bool drflac__read_uint16(drflac* pFlac, unsigned int bitCount, uint16_t* pResult)
-{
-    assert(pFlac != NULL);
-    assert(pResult != NULL);
-    assert(bitCount > 0);
-    assert(bitCount <= 16);
+            left  = _mm_slli_epi32(_mm_add_epi32(mid, side), shift);
+            right = _mm_slli_epi32(_mm_sub_epi32(mid, side), shift);
 
 
-    uint32_t result;
-    if (!drflac__read_uint32(pFlac, bitCount, &result)) {
-        return false;
-    }
+            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
+            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
+        }
 
 
-    *pResult = (uint16_t)result;
-    return true;
-}
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-static bool drflac__read_int16(drflac* pFlac, unsigned int bitCount, int16_t* pResult)
-{
-    assert(pFlac != NULL);
-    assert(pResult != NULL);
-    assert(bitCount > 0);
-    assert(bitCount <= 16);
+            mid = (mid << 1) | (side & 0x01);
 
 
-    int32_t result;
-    if (!drflac__read_int32(pFlac, bitCount, &result)) {
-        return false;
+            pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift);
+            pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift);
+        }
     }
     }
-
-    *pResult = (int16_t)result;
-    return true;
 }
 }
+#endif
 
 
-static bool drflac__read_uint8(drflac* pFlac, unsigned int bitCount, uint8_t* pResult)
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
 {
 {
-    assert(pFlac != NULL);
-    assert(pResult != NULL);
-    assert(bitCount > 0);
-    assert(bitCount <= 8);
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_int32 shift = unusedBitsPerSample;
+    int32x4_t  wbpsShift0_4; /* wbps = Wasted Bits Per Sample */
+    int32x4_t  wbpsShift1_4; /* wbps = Wasted Bits Per Sample */
+    uint32x4_t one4;
 
 
-    uint32_t result;
-    if (!drflac__read_uint32(pFlac, bitCount, &result)) {
-        return false;
-    }
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
 
 
-    *pResult = (uint8_t)result;
-    return true;
-}
+    wbpsShift0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+    wbpsShift1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
+    one4         = vdupq_n_u32(1);
 
 
-static bool drflac__read_int8(drflac* pFlac, unsigned int bitCount, int8_t* pResult)
-{
-    assert(pFlac != NULL);
-    assert(pResult != NULL);
-    assert(bitCount > 0);
-    assert(bitCount <= 8);
+    if (shift == 0) {
+        for (i = 0; i < frameCount4; ++i) {
+            uint32x4_t mid;
+            uint32x4_t side;
+            int32x4_t left;
+            int32x4_t right;
 
 
-    int32_t result;
-    if (!drflac__read_int32(pFlac, bitCount, &result)) {
-        return false;
-    }
+            mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
+            side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
 
 
-    *pResult = (int8_t)result;
-    return true;
-}
+            mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, one4));
 
 
+            left  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1);
+            right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1);
 
 
-static inline bool drflac__seek_past_next_set_bit(drflac* pFlac, unsigned int* pOffsetOut)
-{
-    unsigned int zeroCounter = 0;
-    while (pFlac->cache == 0) {
-        zeroCounter += (unsigned int)DRFLAC_CACHE_L1_BITS_REMAINING;
-        if (!drflac__reload_cache(pFlac)) {
-            return false;
+            drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right));
         }
         }
-    }
 
 
-    // At this point the cache should not be zero, in which case we know the first set bit should be somewhere in here. There is
-    // no need for us to perform any cache reloading logic here which should make things much faster.
-    assert(pFlac->cache != 0);
-
-    unsigned int bitOffsetTable[] = {
-        0,
-        4,
-        3, 3,
-        2, 2, 2, 2,
-        1, 1, 1, 1, 1, 1, 1, 1
-    };
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-    unsigned int setBitOffsetPlus1 = bitOffsetTable[DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4)];
-    if (setBitOffsetPlus1 == 0) {
-        if (pFlac->cache == 1) {
-            setBitOffsetPlus1 = DRFLAC_CACHE_L1_SIZE_BITS;
-        } else {
-            setBitOffsetPlus1 = 5;
-            for (;;)
-            {
-                if ((pFlac->cache & DRFLAC_CACHE_L1_SELECT(setBitOffsetPlus1))) {
-                    break;
-                }
+            mid = (mid << 1) | (side & 0x01);
 
 
-                setBitOffsetPlus1 += 1;
-            }
+            pOutputSamples[i*2+0] = (drflac_int32)(mid + side) >> 1;
+            pOutputSamples[i*2+1] = (drflac_int32)(mid - side) >> 1;
         }
         }
-    }
+    } else {
+        int32x4_t shift4;
 
 
-    pFlac->consumedBits += setBitOffsetPlus1;
-    pFlac->cache <<= setBitOffsetPlus1;
+        shift -= 1;
+        shift4 = vdupq_n_s32(shift);
 
 
-    *pOffsetOut = zeroCounter + setBitOffsetPlus1 - 1;
-    return true;
-}
+        for (i = 0; i < frameCount4; ++i) {
+            uint32x4_t mid;
+            uint32x4_t side;
+            int32x4_t left;
+            int32x4_t right;
 
 
+            mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
+            side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
 
 
+            mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, one4));
 
 
-static bool drflac__seek_to_byte(drflac* pFlac, long long offsetFromStart)
-{
-    assert(pFlac != NULL);
+            left  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4));
+            right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4));
 
 
-    long long bytesToMove = offsetFromStart - pFlac->currentBytePos;
-    if (bytesToMove == 0) {
-        return 1;
-    }
+            drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right));
+        }
 
 
-    if (bytesToMove > 0x7FFFFFFF) {
-        while (bytesToMove > 0x7FFFFFFF) {
-            if (!pFlac->onSeek(pFlac->pUserData, 0x7FFFFFFF)) {
-                return 0;
-            }
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-            pFlac->currentBytePos += 0x7FFFFFFF;
-            bytesToMove -= 0x7FFFFFFF;
-        }
-    } else {
-        while (bytesToMove < (int)0x80000000) {
-            if (!pFlac->onSeek(pFlac->pUserData, (int)0x80000000)) {
-                return 0;
-            }
+            mid = (mid << 1) | (side & 0x01);
 
 
-            pFlac->currentBytePos += (int)0x80000000;
-            bytesToMove -= (int)0x80000000;
+            pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift);
+            pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift);
         }
         }
     }
     }
+}
+#endif
 
 
-    assert(bytesToMove <= 0x7FFFFFFF && bytesToMove >= (int)0x80000000);
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s32__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s32__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_s32__decode_mid_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_s32__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
+    }
+}
 
 
-    bool result = pFlac->onSeek(pFlac->pUserData, (int)bytesToMove);    // <-- Safe cast as per the assert above.
-    pFlac->currentBytePos += (int)bytesToMove;
 
 
-    pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
-    pFlac->cache = 0;
-    pFlac->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT; // <-- This clears the L2 cache.
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    for (drflac_uint64 i = 0; i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample));
+        pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample));
+    }
+}
+#endif
 
 
-    return result;
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    for (i = 0; i < frameCount4; ++i) {
+        drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
+        drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
+        drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
+        drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
+
+        drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
+        drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
+        drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
+        drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
+
+        pOutputSamples[i*8+0] = (drflac_int32)tempL0;
+        pOutputSamples[i*8+1] = (drflac_int32)tempR0;
+        pOutputSamples[i*8+2] = (drflac_int32)tempL1;
+        pOutputSamples[i*8+3] = (drflac_int32)tempR1;
+        pOutputSamples[i*8+4] = (drflac_int32)tempL2;
+        pOutputSamples[i*8+5] = (drflac_int32)tempR2;
+        pOutputSamples[i*8+6] = (drflac_int32)tempL3;
+        pOutputSamples[i*8+7] = (drflac_int32)tempR3;
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0);
+        pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1);
+    }
 }
 }
 
 
-static long long drflac__tell(drflac* pFlac)
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
 {
 {
-    assert(pFlac != NULL);
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    for (i = 0; i < frameCount4; ++i) {
+        __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
+        __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
 
 
-    size_t unreadBytesFromL1 = (DRFLAC_CACHE_L1_SIZE_BYTES - (pFlac->consumedBits/8));
-    size_t unreadBytesFromL2 = (DRFLAC_CACHE_L2_SIZE_BYTES - ((pFlac->nextL2Line - pFlac->unusedL2Lines)*DRFLAC_CACHE_L1_SIZE_BYTES));
+        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
+        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
+    }
 
 
-    return pFlac->currentBytePos - unreadBytesFromL1 - unreadBytesFromL2;
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0);
+        pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1);
+    }
 }
 }
+#endif
 
 
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
+    int32x4_t shift4_0 = vdupq_n_s32(shift0);
+    int32x4_t shift4_1 = vdupq_n_s32(shift1);
 
 
-static bool drflac__read_utf8_coded_number(drflac* pFlac, unsigned long long* pNumberOut)
-{
-    assert(pFlac != NULL);
-    assert(pNumberOut != NULL);
+    for (i = 0; i < frameCount4; ++i) {
+        int32x4_t left;
+        int32x4_t right;
 
 
-    // We should never need to read UTF-8 data while not being aligned to a byte boundary. Therefore we can grab the data
-    // directly from the input stream rather than using drflac__read_uint8().
-    assert((pFlac->consumedBits & 7) == 0);
+        left  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift4_0));
+        right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift4_1));
 
 
-    unsigned char utf8[7] = {0};
-    if (!drflac__read_uint8(pFlac, 8, utf8)) {
-        *pNumberOut = 0;
-        return false;
+        drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right));
     }
     }
 
 
-    if ((utf8[0] & 0x80) == 0) {
-        *pNumberOut = utf8[0];
-        return true;
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0);
+        pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1);
     }
     }
+}
+#endif
 
 
-    int byteCount = 1;
-    if ((utf8[0] & 0xE0) == 0xC0) {
-        byteCount = 2;
-    } else if ((utf8[0] & 0xF0) == 0xE0) {
-        byteCount = 3;
-    } else if ((utf8[0] & 0xF8) == 0xF0) {
-        byteCount = 4;
-    } else if ((utf8[0] & 0xFC) == 0xF8) {
-        byteCount = 5;
-    } else if ((utf8[0] & 0xFE) == 0xFC) {
-        byteCount = 6;
-    } else if ((utf8[0] & 0xFF) == 0xFE) {
-        byteCount = 7;
-    } else {
-        *pNumberOut = 0;
-        return false;     // Bad UTF-8 encoding.
+static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s32__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s32__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_s32__decode_independent_stereo__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_s32__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
     }
     }
+}
 
 
-    // Read extra bytes.
-    assert(byteCount > 1);
 
 
-    unsigned long long result = ((long long)(utf8[0] & (0xFF >> (byteCount + 1))));
-    for (int i = 1; i < byteCount; ++i) {
-        if (!drflac__read_uint8(pFlac, 8, utf8 + i)) {
-            *pNumberOut = 0;
-            return false;
-        }
+DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s32(drflac* pFlac, drflac_uint64 framesToRead, drflac_int32* pBufferOut)
+{
+    drflac_uint64 framesRead;
+    drflac_uint32 unusedBitsPerSample;
 
 
-        result = (result << 6) | (utf8[i] & 0x3F);
+    if (pFlac == NULL || framesToRead == 0) {
+        return 0;
     }
     }
 
 
-    *pNumberOut = result;
-    return true;
-}
+    if (pBufferOut == NULL) {
+        return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead);
+    }
 
 
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 32);
+    unusedBitsPerSample = 32 - pFlac->bitsPerSample;
 
 
+    framesRead = 0;
+    while (framesToRead > 0) {
+        /* If we've run out of samples in this frame, go to the next. */
+        if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
+            if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
+                break;  /* Couldn't read the next frame, so just break from the loop and return. */
+            }
+        } else {
+            unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
+            drflac_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
+            drflac_uint64 frameCountThisIteration = framesToRead;
 
 
-static DRFLAC_INLINE bool drflac__read_and_seek_rice(drflac* pFlac, unsigned char m)
-{
-    unsigned int unused;
-    if (!drflac__seek_past_next_set_bit(pFlac, &unused)) {
-        return false;
-    }
+            if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
+                frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
+            }
+
+            if (channelCount == 2) {
+                const drflac_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
+                const drflac_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
+
+                switch (pFlac->currentFLACFrame.header.channelAssignment)
+                {
+                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                    {
+                        drflac_read_pcm_frames_s32__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
 
 
-    if (m > 0) {
-        if (!drflac__seek_bits(pFlac, m)) {
-            return false;
+                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                    {
+                        drflac_read_pcm_frames_s32__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+                    {
+                        drflac_read_pcm_frames_s32__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+                    default:
+                    {
+                        drflac_read_pcm_frames_s32__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
+                }
+            } else {
+                /* Generic interleaving. */
+                drflac_uint64 i;
+                for (i = 0; i < frameCountThisIteration; ++i) {
+                    unsigned int j;
+                    for (j = 0; j < channelCount; ++j) {
+                        pBufferOut[(i*channelCount)+j] = (drflac_int32)((drflac_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
+                    }
+                }
+            }
+
+            framesRead                += frameCountThisIteration;
+            pBufferOut                += frameCountThisIteration * channelCount;
+            framesToRead              -= frameCountThisIteration;
+            pFlac->currentPCMFrame    += frameCountThisIteration;
+            pFlac->currentFLACFrame.pcmFramesRemaining -= (drflac_uint32)frameCountThisIteration;
         }
         }
     }
     }
 
 
-    return true;
+    return framesRead;
 }
 }
 
 
 
 
-// The next two functions are responsible for calculating the prediction.
-//
-// When the bits per sample is >16 we need to use 64-bit integer arithmetic because otherwise we'll run out of precision. It's
-// safe to assume this will be slower on 32-bit platforms so we use a more optimal solution when the bits per sample is <=16.
-//
-//
-// Optimization Experiment #1
-//
-// The first optimization experiment I'm trying here is a loop unroll for the most common LPC orders. I've done a little test
-// and the results are as follows, in order of most common:
-// 1)  order = 8  : 93.1M
-// 2)  order = 7  : 36.6M
-// 3)  order = 3  : 33.2M
-// 4)  order = 6  : 20.9M
-// 5)  order = 5  : 18.1M
-// 6)  order = 4  : 15.8M
-// 7)  order = 12 : 10.8M
-// 8)  order = 2  :  9.8M
-// 9)  order = 1  :  1.6M
-// 10) order = 10 :  1.0M
-// 11) order = 9  :  0.8M
-// 12) order = 11 :  0.8M
-//
-// We'll experiment with unrolling the top 8 most common ones. We'll ignore the least common ones since there seems to be a
-// large drop off there.
-//
-// Result: There's a tiny improvement in some cases, but it could just be within margin of error so unsure if it's worthwhile
-// just yet.
-static DRFLAC_INLINE int32_t drflac__calculate_prediction_32(unsigned int order, int shift, const short* coefficients, int32_t* pDecodedSamples)
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
 {
 {
-    assert(order <= 32);
+    drflac_uint64 i;
+    for (i = 0; i < frameCount; ++i) {
+        drflac_uint32 left  = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+        drflac_uint32 side  = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
+        drflac_uint32 right = left - side;
 
 
-    // 32-bit version.
+        left  >>= 16;
+        right >>= 16;
 
 
-    // This method is slower on both 32- and 64-bit builds with VC++. Leaving this here for now just in case we need it later
-    // for whatever reason.
-#if 0
-    int prediction;
-    if (order == 8)
-    {
-        prediction  = coefficients[0] * pDecodedSamples[-1];
-        prediction += coefficients[1] * pDecodedSamples[-2];
-        prediction += coefficients[2] * pDecodedSamples[-3];
-        prediction += coefficients[3] * pDecodedSamples[-4];
-        prediction += coefficients[4] * pDecodedSamples[-5];
-        prediction += coefficients[5] * pDecodedSamples[-6];
-        prediction += coefficients[6] * pDecodedSamples[-7];
-        prediction += coefficients[7] * pDecodedSamples[-8];
-    }
-    else if (order == 7)
-    {
-        prediction  = coefficients[0] * pDecodedSamples[-1];
-        prediction += coefficients[1] * pDecodedSamples[-2];
-        prediction += coefficients[2] * pDecodedSamples[-3];
-        prediction += coefficients[3] * pDecodedSamples[-4];
-        prediction += coefficients[4] * pDecodedSamples[-5];
-        prediction += coefficients[5] * pDecodedSamples[-6];
-        prediction += coefficients[6] * pDecodedSamples[-7];
-    }
-    else if (order == 3)
-    {
-        prediction  = coefficients[0] * pDecodedSamples[-1];
-        prediction += coefficients[1] * pDecodedSamples[-2];
-        prediction += coefficients[2] * pDecodedSamples[-3];
-    }
-    else if (order == 6)
-    {
-        prediction  = coefficients[0] * pDecodedSamples[-1];
-        prediction += coefficients[1] * pDecodedSamples[-2];
-        prediction += coefficients[2] * pDecodedSamples[-3];
-        prediction += coefficients[3] * pDecodedSamples[-4];
-        prediction += coefficients[4] * pDecodedSamples[-5];
-        prediction += coefficients[5] * pDecodedSamples[-6];
-    }
-    else if (order == 5)
-    {
-        prediction  = coefficients[0] * pDecodedSamples[-1];
-        prediction += coefficients[1] * pDecodedSamples[-2];
-        prediction += coefficients[2] * pDecodedSamples[-3];
-        prediction += coefficients[3] * pDecodedSamples[-4];
-        prediction += coefficients[4] * pDecodedSamples[-5];
-    }
-    else if (order == 4)
-    {
-        prediction  = coefficients[0] * pDecodedSamples[-1];
-        prediction += coefficients[1] * pDecodedSamples[-2];
-        prediction += coefficients[2] * pDecodedSamples[-3];
-        prediction += coefficients[3] * pDecodedSamples[-4];
-    }
-    else if (order == 12)
-    {
-        prediction  = coefficients[0]  * pDecodedSamples[-1];
-        prediction += coefficients[1]  * pDecodedSamples[-2];
-        prediction += coefficients[2]  * pDecodedSamples[-3];
-        prediction += coefficients[3]  * pDecodedSamples[-4];
-        prediction += coefficients[4]  * pDecodedSamples[-5];
-        prediction += coefficients[5]  * pDecodedSamples[-6];
-        prediction += coefficients[6]  * pDecodedSamples[-7];
-        prediction += coefficients[7]  * pDecodedSamples[-8];
-        prediction += coefficients[8]  * pDecodedSamples[-9];
-        prediction += coefficients[9]  * pDecodedSamples[-10];
-        prediction += coefficients[10] * pDecodedSamples[-11];
-        prediction += coefficients[11] * pDecodedSamples[-12];
+        pOutputSamples[i*2+0] = (drflac_int16)left;
+        pOutputSamples[i*2+1] = (drflac_int16)right;
     }
     }
-    else if (order == 2)
-    {
-        prediction  = coefficients[0] * pDecodedSamples[-1];
-        prediction += coefficients[1] * pDecodedSamples[-2];
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    for (i = 0; i < frameCount4; ++i) {
+        drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
+        drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
+        drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
+        drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
+
+        drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
+        drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
+        drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
+        drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
+
+        drflac_uint32 right0 = left0 - side0;
+        drflac_uint32 right1 = left1 - side1;
+        drflac_uint32 right2 = left2 - side2;
+        drflac_uint32 right3 = left3 - side3;
+
+        left0  >>= 16;
+        left1  >>= 16;
+        left2  >>= 16;
+        left3  >>= 16;
+
+        right0 >>= 16;
+        right1 >>= 16;
+        right2 >>= 16;
+        right3 >>= 16;
+
+        pOutputSamples[i*8+0] = (drflac_int16)left0;
+        pOutputSamples[i*8+1] = (drflac_int16)right0;
+        pOutputSamples[i*8+2] = (drflac_int16)left1;
+        pOutputSamples[i*8+3] = (drflac_int16)right1;
+        pOutputSamples[i*8+4] = (drflac_int16)left2;
+        pOutputSamples[i*8+5] = (drflac_int16)right2;
+        pOutputSamples[i*8+6] = (drflac_int16)left3;
+        pOutputSamples[i*8+7] = (drflac_int16)right3;
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 left  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 side  = pInputSamples1U32[i] << shift1;
+        drflac_uint32 right = left - side;
+
+        left  >>= 16;
+        right >>= 16;
+
+        pOutputSamples[i*2+0] = (drflac_int16)left;
+        pOutputSamples[i*2+1] = (drflac_int16)right;
     }
     }
-    else if (order == 1)
-    {
-        prediction = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+}
+
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    for (i = 0; i < frameCount4; ++i) {
+        __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
+        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
+        __m128i right = _mm_sub_epi32(left, side);
+
+        left  = _mm_srai_epi32(left,  16);
+        right = _mm_srai_epi32(right, 16);
+
+        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
     }
     }
-    else if (order == 10)
-    {
-        prediction  = coefficients[0]  * pDecodedSamples[-1];
-        prediction += coefficients[1]  * pDecodedSamples[-2];
-        prediction += coefficients[2]  * pDecodedSamples[-3];
-        prediction += coefficients[3]  * pDecodedSamples[-4];
-        prediction += coefficients[4]  * pDecodedSamples[-5];
-        prediction += coefficients[5]  * pDecodedSamples[-6];
-        prediction += coefficients[6]  * pDecodedSamples[-7];
-        prediction += coefficients[7]  * pDecodedSamples[-8];
-        prediction += coefficients[8]  * pDecodedSamples[-9];
-        prediction += coefficients[9]  * pDecodedSamples[-10];
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 left  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 side  = pInputSamples1U32[i] << shift1;
+        drflac_uint32 right = left - side;
+
+        left  >>= 16;
+        right >>= 16;
+
+        pOutputSamples[i*2+0] = (drflac_int16)left;
+        pOutputSamples[i*2+1] = (drflac_int16)right;
     }
     }
-    else if (order == 9)
-    {
-        prediction  = coefficients[0]  * pDecodedSamples[-1];
-        prediction += coefficients[1]  * pDecodedSamples[-2];
-        prediction += coefficients[2]  * pDecodedSamples[-3];
-        prediction += coefficients[3]  * pDecodedSamples[-4];
-        prediction += coefficients[4]  * pDecodedSamples[-5];
-        prediction += coefficients[5]  * pDecodedSamples[-6];
-        prediction += coefficients[6]  * pDecodedSamples[-7];
-        prediction += coefficients[7]  * pDecodedSamples[-8];
-        prediction += coefficients[8]  * pDecodedSamples[-9];
+}
+#endif
+
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+    int32x4_t shift0_4;
+    int32x4_t shift1_4;
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    shift0_4 = vdupq_n_s32(shift0);
+    shift1_4 = vdupq_n_s32(shift1);
+
+    for (i = 0; i < frameCount4; ++i) {
+        uint32x4_t left;
+        uint32x4_t side;
+        uint32x4_t right;
+
+        left  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
+        side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
+        right = vsubq_u32(left, side);
+
+        left  = vshrq_n_u32(left,  16);
+        right = vshrq_n_u32(right, 16);
+
+        drflac__vst2q_u16((drflac_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right)));
     }
     }
-    else if (order == 11)
-    {
-        prediction  = coefficients[0]  * pDecodedSamples[-1];
-        prediction += coefficients[1]  * pDecodedSamples[-2];
-        prediction += coefficients[2]  * pDecodedSamples[-3];
-        prediction += coefficients[3]  * pDecodedSamples[-4];
-        prediction += coefficients[4]  * pDecodedSamples[-5];
-        prediction += coefficients[5]  * pDecodedSamples[-6];
-        prediction += coefficients[6]  * pDecodedSamples[-7];
-        prediction += coefficients[7]  * pDecodedSamples[-8];
-        prediction += coefficients[8]  * pDecodedSamples[-9];
-        prediction += coefficients[9]  * pDecodedSamples[-10];
-        prediction += coefficients[10] * pDecodedSamples[-11];
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 left  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 side  = pInputSamples1U32[i] << shift1;
+        drflac_uint32 right = left - side;
+
+        left  >>= 16;
+        right >>= 16;
+
+        pOutputSamples[i*2+0] = (drflac_int16)left;
+        pOutputSamples[i*2+1] = (drflac_int16)right;
     }
     }
-    else
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s16__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s16__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
     {
     {
-        prediction = 0;
-        for (int j = 0; j < (int)order; ++j) {
-            prediction += coefficients[j] * pDecodedSamples[-j-1];
-        }
-    }
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_s16__decode_left_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_s16__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
 #endif
 #endif
+    }
+}
 
 
-    // Experiment #2. See if we can use a switch and let the compiler optimize it to a jump table.
-    // Result: VC++ definitely optimizes this to a single jmp as expected. I expect other compilers should do the same, but I've
-    // not verified yet.
-#if 1
-    int prediction = 0;
 
 
-    switch (order)
-    {
-    case 32: prediction += coefficients[31] * pDecodedSamples[-32];
-    case 31: prediction += coefficients[30] * pDecodedSamples[-31];
-    case 30: prediction += coefficients[29] * pDecodedSamples[-30];
-    case 29: prediction += coefficients[28] * pDecodedSamples[-29];
-    case 28: prediction += coefficients[27] * pDecodedSamples[-28];
-    case 27: prediction += coefficients[26] * pDecodedSamples[-27];
-    case 26: prediction += coefficients[25] * pDecodedSamples[-26];
-    case 25: prediction += coefficients[24] * pDecodedSamples[-25];
-    case 24: prediction += coefficients[23] * pDecodedSamples[-24];
-    case 23: prediction += coefficients[22] * pDecodedSamples[-23];
-    case 22: prediction += coefficients[21] * pDecodedSamples[-22];
-    case 21: prediction += coefficients[20] * pDecodedSamples[-21];
-    case 20: prediction += coefficients[19] * pDecodedSamples[-20];
-    case 19: prediction += coefficients[18] * pDecodedSamples[-19];
-    case 18: prediction += coefficients[17] * pDecodedSamples[-18];
-    case 17: prediction += coefficients[16] * pDecodedSamples[-17];
-    case 16: prediction += coefficients[15] * pDecodedSamples[-16];
-    case 15: prediction += coefficients[14] * pDecodedSamples[-15];
-    case 14: prediction += coefficients[13] * pDecodedSamples[-14];
-    case 13: prediction += coefficients[12] * pDecodedSamples[-13];
-    case 12: prediction += coefficients[11] * pDecodedSamples[-12];
-    case 11: prediction += coefficients[10] * pDecodedSamples[-11];
-    case 10: prediction += coefficients[ 9] * pDecodedSamples[-10];
-    case  9: prediction += coefficients[ 8] * pDecodedSamples[- 9];
-    case  8: prediction += coefficients[ 7] * pDecodedSamples[- 8];
-    case  7: prediction += coefficients[ 6] * pDecodedSamples[- 7];
-    case  6: prediction += coefficients[ 5] * pDecodedSamples[- 6];
-    case  5: prediction += coefficients[ 4] * pDecodedSamples[- 5];
-    case  4: prediction += coefficients[ 3] * pDecodedSamples[- 4];
-    case  3: prediction += coefficients[ 2] * pDecodedSamples[- 3];
-    case  2: prediction += coefficients[ 1] * pDecodedSamples[- 2];
-    case  1: prediction += coefficients[ 0] * pDecodedSamples[- 1];
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    drflac_uint64 i;
+    for (i = 0; i < frameCount; ++i) {
+        drflac_uint32 side  = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+        drflac_uint32 right = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
+        drflac_uint32 left  = right + side;
+
+        left  >>= 16;
+        right >>= 16;
+
+        pOutputSamples[i*2+0] = (drflac_int16)left;
+        pOutputSamples[i*2+1] = (drflac_int16)right;
     }
     }
+}
 #endif
 #endif
 
 
-    return (int32_t)(prediction >> shift);
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    for (i = 0; i < frameCount4; ++i) {
+        drflac_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
+        drflac_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
+        drflac_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
+        drflac_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
+
+        drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
+        drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
+        drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
+        drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
+
+        drflac_uint32 left0 = right0 + side0;
+        drflac_uint32 left1 = right1 + side1;
+        drflac_uint32 left2 = right2 + side2;
+        drflac_uint32 left3 = right3 + side3;
+
+        left0  >>= 16;
+        left1  >>= 16;
+        left2  >>= 16;
+        left3  >>= 16;
+
+        right0 >>= 16;
+        right1 >>= 16;
+        right2 >>= 16;
+        right3 >>= 16;
+
+        pOutputSamples[i*8+0] = (drflac_int16)left0;
+        pOutputSamples[i*8+1] = (drflac_int16)right0;
+        pOutputSamples[i*8+2] = (drflac_int16)left1;
+        pOutputSamples[i*8+3] = (drflac_int16)right1;
+        pOutputSamples[i*8+4] = (drflac_int16)left2;
+        pOutputSamples[i*8+5] = (drflac_int16)right2;
+        pOutputSamples[i*8+6] = (drflac_int16)left3;
+        pOutputSamples[i*8+7] = (drflac_int16)right3;
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 side  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 right = pInputSamples1U32[i] << shift1;
+        drflac_uint32 left  = right + side;
+
+        left  >>= 16;
+        right >>= 16;
+
+        pOutputSamples[i*2+0] = (drflac_int16)left;
+        pOutputSamples[i*2+1] = (drflac_int16)right;
+    }
 }
 }
 
 
-static DRFLAC_INLINE int32_t drflac__calculate_prediction(unsigned int order, int shift, const short* coefficients, int32_t* pDecodedSamples)
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
 {
 {
-    assert(order <= 32);
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-    // 64-bit version.
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
 
 
-    // This method is faster on the 32-bit build when compiling with VC++. See note below.
-#ifndef DRFLAC_64BIT
-    long long prediction;
-    if (order == 8)
-    {
-        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
-        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
-        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
-        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
-        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
-        prediction += (long long)coefficients[6] * (long long)pDecodedSamples[-7];
-        prediction += (long long)coefficients[7] * (long long)pDecodedSamples[-8];
-    }
-    else if (order == 7)
-    {
-        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
-        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
-        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
-        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
-        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
-        prediction += (long long)coefficients[6] * (long long)pDecodedSamples[-7];
-    }
-    else if (order == 3)
-    {
-        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
-        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
-    }
-    else if (order == 6)
-    {
-        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
-        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
-        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
-        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
-        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
-    }
-    else if (order == 5)
-    {
-        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
-        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
-        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
-        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
+    for (i = 0; i < frameCount4; ++i) {
+        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
+        __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
+        __m128i left  = _mm_add_epi32(right, side);
+
+        left  = _mm_srai_epi32(left,  16);
+        right = _mm_srai_epi32(right, 16);
+
+        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
     }
     }
-    else if (order == 4)
-    {
-        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
-        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
-        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 side  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 right = pInputSamples1U32[i] << shift1;
+        drflac_uint32 left  = right + side;
+
+        left  >>= 16;
+        right >>= 16;
+
+        pOutputSamples[i*2+0] = (drflac_int16)left;
+        pOutputSamples[i*2+1] = (drflac_int16)right;
     }
     }
-    else if (order == 12)
-    {
-        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
-        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
-        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
-        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
-        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
-        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
-        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
-        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
-        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
-        prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
-        prediction += (long long)coefficients[11] * (long long)pDecodedSamples[-12];
+}
+#endif
+
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+    int32x4_t shift0_4;
+    int32x4_t shift1_4;
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    shift0_4 = vdupq_n_s32(shift0);
+    shift1_4 = vdupq_n_s32(shift1);
+
+    for (i = 0; i < frameCount4; ++i) {
+        uint32x4_t side;
+        uint32x4_t right;
+        uint32x4_t left;
+
+        side  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
+        right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
+        left  = vaddq_u32(right, side);
+
+        left  = vshrq_n_u32(left,  16);
+        right = vshrq_n_u32(right, 16);
+
+        drflac__vst2q_u16((drflac_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right)));
     }
     }
-    else if (order == 2)
-    {
-        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 side  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 right = pInputSamples1U32[i] << shift1;
+        drflac_uint32 left  = right + side;
+
+        left  >>= 16;
+        right >>= 16;
+
+        pOutputSamples[i*2+0] = (drflac_int16)left;
+        pOutputSamples[i*2+1] = (drflac_int16)right;
     }
     }
-    else if (order == 1)
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s16__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s16__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
     {
     {
-        prediction = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_s16__decode_right_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_s16__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
     }
     }
-    else if (order == 10)
-    {
-        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
-        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
-        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
-        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
-        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
-        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
-        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
-        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
-        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
+}
+
+
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    for (drflac_uint64 i = 0; i < frameCount; ++i) {
+        drflac_uint32 mid  = (drflac_uint32)pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+        drflac_uint32 side = (drflac_uint32)pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+        mid = (mid << 1) | (side & 0x01);
+
+        pOutputSamples[i*2+0] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) >> 16);
+        pOutputSamples[i*2+1] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) >> 16);
     }
     }
-    else if (order == 9)
-    {
-        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
-        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
-        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
-        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
-        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
-        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
-        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
-        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift = unusedBitsPerSample;
+
+    if (shift > 0) {
+        shift -= 1;
+        for (i = 0; i < frameCount4; ++i) {
+            drflac_uint32 temp0L;
+            drflac_uint32 temp1L;
+            drflac_uint32 temp2L;
+            drflac_uint32 temp3L;
+            drflac_uint32 temp0R;
+            drflac_uint32 temp1R;
+            drflac_uint32 temp2R;
+            drflac_uint32 temp3R;
+
+            drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+
+            drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+            mid0 = (mid0 << 1) | (side0 & 0x01);
+            mid1 = (mid1 << 1) | (side1 & 0x01);
+            mid2 = (mid2 << 1) | (side2 & 0x01);
+            mid3 = (mid3 << 1) | (side3 & 0x01);
+
+            temp0L = (mid0 + side0) << shift;
+            temp1L = (mid1 + side1) << shift;
+            temp2L = (mid2 + side2) << shift;
+            temp3L = (mid3 + side3) << shift;
+
+            temp0R = (mid0 - side0) << shift;
+            temp1R = (mid1 - side1) << shift;
+            temp2R = (mid2 - side2) << shift;
+            temp3R = (mid3 - side3) << shift;
+
+            temp0L >>= 16;
+            temp1L >>= 16;
+            temp2L >>= 16;
+            temp3L >>= 16;
+
+            temp0R >>= 16;
+            temp1R >>= 16;
+            temp2R >>= 16;
+            temp3R >>= 16;
+
+            pOutputSamples[i*8+0] = (drflac_int16)temp0L;
+            pOutputSamples[i*8+1] = (drflac_int16)temp0R;
+            pOutputSamples[i*8+2] = (drflac_int16)temp1L;
+            pOutputSamples[i*8+3] = (drflac_int16)temp1R;
+            pOutputSamples[i*8+4] = (drflac_int16)temp2L;
+            pOutputSamples[i*8+5] = (drflac_int16)temp2R;
+            pOutputSamples[i*8+6] = (drflac_int16)temp3L;
+            pOutputSamples[i*8+7] = (drflac_int16)temp3R;
+        }
+    } else {
+        for (i = 0; i < frameCount4; ++i) {
+            drflac_uint32 temp0L;
+            drflac_uint32 temp1L;
+            drflac_uint32 temp2L;
+            drflac_uint32 temp3L;
+            drflac_uint32 temp0R;
+            drflac_uint32 temp1R;
+            drflac_uint32 temp2R;
+            drflac_uint32 temp3R;
+
+            drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+
+            drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+            mid0 = (mid0 << 1) | (side0 & 0x01);
+            mid1 = (mid1 << 1) | (side1 & 0x01);
+            mid2 = (mid2 << 1) | (side2 & 0x01);
+            mid3 = (mid3 << 1) | (side3 & 0x01);
+
+            temp0L = ((drflac_int32)(mid0 + side0) >> 1);
+            temp1L = ((drflac_int32)(mid1 + side1) >> 1);
+            temp2L = ((drflac_int32)(mid2 + side2) >> 1);
+            temp3L = ((drflac_int32)(mid3 + side3) >> 1);
+
+            temp0R = ((drflac_int32)(mid0 - side0) >> 1);
+            temp1R = ((drflac_int32)(mid1 - side1) >> 1);
+            temp2R = ((drflac_int32)(mid2 - side2) >> 1);
+            temp3R = ((drflac_int32)(mid3 - side3) >> 1);
+
+            temp0L >>= 16;
+            temp1L >>= 16;
+            temp2L >>= 16;
+            temp3L >>= 16;
+
+            temp0R >>= 16;
+            temp1R >>= 16;
+            temp2R >>= 16;
+            temp3R >>= 16;
+
+            pOutputSamples[i*8+0] = (drflac_int16)temp0L;
+            pOutputSamples[i*8+1] = (drflac_int16)temp0R;
+            pOutputSamples[i*8+2] = (drflac_int16)temp1L;
+            pOutputSamples[i*8+3] = (drflac_int16)temp1R;
+            pOutputSamples[i*8+4] = (drflac_int16)temp2L;
+            pOutputSamples[i*8+5] = (drflac_int16)temp2R;
+            pOutputSamples[i*8+6] = (drflac_int16)temp3L;
+            pOutputSamples[i*8+7] = (drflac_int16)temp3R;
+        }
     }
     }
-    else if (order == 11)
-    {
-        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
-        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
-        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
-        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
-        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
-        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
-        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
-        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
-        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
-        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
-        prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+        drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+        mid = (mid << 1) | (side & 0x01);
+
+        pOutputSamples[i*2+0] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) >> 16);
+        pOutputSamples[i*2+1] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) >> 16);
     }
     }
-    else
-    {
-        prediction = 0;
-        for (int j = 0; j < (int)order; ++j) {
-            prediction += (long long)coefficients[j] * (long long)pDecodedSamples[-j-1];
+}
+
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift = unusedBitsPerSample;
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    if (shift == 0) {
+        for (i = 0; i < frameCount4; ++i) {
+            __m128i mid;
+            __m128i side;
+            __m128i left;
+            __m128i right;
+
+            mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+            side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
+
+            mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
+
+            left  = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
+            right = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);
+
+            left  = _mm_srai_epi32(left,  16);
+            right = _mm_srai_epi32(right, 16);
+
+            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
+        }
+
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+            mid = (mid << 1) | (side & 0x01);
+
+            pOutputSamples[i*2+0] = (drflac_int16)(((drflac_int32)(mid + side) >> 1) >> 16);
+            pOutputSamples[i*2+1] = (drflac_int16)(((drflac_int32)(mid - side) >> 1) >> 16);
+        }
+    } else {
+        shift -= 1;
+        for (i = 0; i < frameCount4; ++i) {
+            __m128i mid;
+            __m128i side;
+            __m128i left;
+            __m128i right;
+
+            mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+            side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
+
+            mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
+
+            left  = _mm_slli_epi32(_mm_add_epi32(mid, side), shift);
+            right = _mm_slli_epi32(_mm_sub_epi32(mid, side), shift);
+
+            left  = _mm_srai_epi32(left,  16);
+            right = _mm_srai_epi32(right, 16);
+
+            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
+        }
+
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+            mid = (mid << 1) | (side & 0x01);
+
+            pOutputSamples[i*2+0] = (drflac_int16)(((mid + side) << shift) >> 16);
+            pOutputSamples[i*2+1] = (drflac_int16)(((mid - side) << shift) >> 16);
         }
         }
     }
     }
+}
 #endif
 #endif
 
 
-    // Experiment #2. See if we can use a switch and let the compiler optimize it to a single jmp instruction.
-    // Result: VC++ optimizes this to a single jmp on the 64-bit build, but for some reason the 32-bit version compiles to less efficient
-    // code. Thus, we use this version on the 64-bit build and the uglier version above for the 32-bit build. If anyone has an idea on how
-    // I can get VC++ to generate an efficient jump table for the 32-bit build let me know.
-#ifdef DRFLAC_64BIT
-    long long prediction = 0;
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift = unusedBitsPerSample;
+    int32x4_t wbpsShift0_4; /* wbps = Wasted Bits Per Sample */
+    int32x4_t wbpsShift1_4; /* wbps = Wasted Bits Per Sample */
 
 
-    switch (order)
-    {
-    case 32: prediction += (long long)coefficients[31] * (long long)pDecodedSamples[-32];
-    case 31: prediction += (long long)coefficients[30] * (long long)pDecodedSamples[-31];
-    case 30: prediction += (long long)coefficients[29] * (long long)pDecodedSamples[-30];
-    case 29: prediction += (long long)coefficients[28] * (long long)pDecodedSamples[-29];
-    case 28: prediction += (long long)coefficients[27] * (long long)pDecodedSamples[-28];
-    case 27: prediction += (long long)coefficients[26] * (long long)pDecodedSamples[-27];
-    case 26: prediction += (long long)coefficients[25] * (long long)pDecodedSamples[-26];
-    case 25: prediction += (long long)coefficients[24] * (long long)pDecodedSamples[-25];
-    case 24: prediction += (long long)coefficients[23] * (long long)pDecodedSamples[-24];
-    case 23: prediction += (long long)coefficients[22] * (long long)pDecodedSamples[-23];
-    case 22: prediction += (long long)coefficients[21] * (long long)pDecodedSamples[-22];
-    case 21: prediction += (long long)coefficients[20] * (long long)pDecodedSamples[-21];
-    case 20: prediction += (long long)coefficients[19] * (long long)pDecodedSamples[-20];
-    case 19: prediction += (long long)coefficients[18] * (long long)pDecodedSamples[-19];
-    case 18: prediction += (long long)coefficients[17] * (long long)pDecodedSamples[-18];
-    case 17: prediction += (long long)coefficients[16] * (long long)pDecodedSamples[-17];
-    case 16: prediction += (long long)coefficients[15] * (long long)pDecodedSamples[-16];
-    case 15: prediction += (long long)coefficients[14] * (long long)pDecodedSamples[-15];
-    case 14: prediction += (long long)coefficients[13] * (long long)pDecodedSamples[-14];
-    case 13: prediction += (long long)coefficients[12] * (long long)pDecodedSamples[-13];
-    case 12: prediction += (long long)coefficients[11] * (long long)pDecodedSamples[-12];
-    case 11: prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
-    case 10: prediction += (long long)coefficients[ 9] * (long long)pDecodedSamples[-10];
-    case  9: prediction += (long long)coefficients[ 8] * (long long)pDecodedSamples[- 9];
-    case  8: prediction += (long long)coefficients[ 7] * (long long)pDecodedSamples[- 8];
-    case  7: prediction += (long long)coefficients[ 6] * (long long)pDecodedSamples[- 7];
-    case  6: prediction += (long long)coefficients[ 5] * (long long)pDecodedSamples[- 6];
-    case  5: prediction += (long long)coefficients[ 4] * (long long)pDecodedSamples[- 5];
-    case  4: prediction += (long long)coefficients[ 3] * (long long)pDecodedSamples[- 4];
-    case  3: prediction += (long long)coefficients[ 2] * (long long)pDecodedSamples[- 3];
-    case  2: prediction += (long long)coefficients[ 1] * (long long)pDecodedSamples[- 2];
-    case  1: prediction += (long long)coefficients[ 0] * (long long)pDecodedSamples[- 1];
-    }
-#endif
-
-    return (int32_t)(prediction >> shift);
-}
-
-
-// Reads and decodes a string of residual values as Rice codes. The decoder should be sitting on the first bit of the Rice codes.
-//
-// This is the most frequently called function in the library. It does both the Rice decoding and the prediction in a single loop
-// iteration.
-static bool drflac__decode_samples_with_residual__rice(drflac* pFlac, unsigned int count, unsigned char riceParam, unsigned int order, int shift, const short* coefficients, int* pSamplesOut)
-{
-    assert(pFlac != NULL);
-    assert(count > 0);
-    assert(pSamplesOut != NULL);
-
-    static unsigned int bitOffsetTable[] = {
-        0,
-        4,
-        3, 3,
-        2, 2, 2, 2,
-        1, 1, 1, 1, 1, 1, 1, 1
-    };
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
 
 
-    drflac_cache_t riceParamMask = DRFLAC_CACHE_L1_SELECTION_MASK(riceParam);
-    drflac_cache_t resultHiShift = DRFLAC_CACHE_L1_SIZE_BITS - riceParam;
+    wbpsShift0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+    wbpsShift1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
 
 
-    for (int i = 0; i < (int)count; ++i)
-    {
-        unsigned int zeroCounter = 0;
-        while (pFlac->cache == 0) {
-            zeroCounter += (unsigned int)DRFLAC_CACHE_L1_BITS_REMAINING;
-            if (!drflac__reload_cache(pFlac)) {
-                return false;
-            }
-        }
+    if (shift == 0) {
+        for (i = 0; i < frameCount4; ++i) {
+            uint32x4_t mid;
+            uint32x4_t side;
+            int32x4_t left;
+            int32x4_t right;
 
 
-        // At this point the cache should not be zero, in which case we know the first set bit should be somewhere in here. There is
-        // no need for us to perform any cache reloading logic here which should make things much faster.
-        assert(pFlac->cache != 0);
-        unsigned int decodedRice;
+            mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
+            side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
 
 
-        unsigned int setBitOffsetPlus1 = bitOffsetTable[DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4)];
-        if (setBitOffsetPlus1 > 0) {
-            decodedRice = (zeroCounter + (setBitOffsetPlus1-1)) << riceParam;
-        } else {
-            if (pFlac->cache == 1) {
-                setBitOffsetPlus1 = DRFLAC_CACHE_L1_SIZE_BITS;
-                decodedRice = (zeroCounter + (DRFLAC_CACHE_L1_SIZE_BITS-1)) << riceParam;
-            } else {
-                setBitOffsetPlus1 = 5;
-                for (;;)
-                {
-                    if ((pFlac->cache & DRFLAC_CACHE_L1_SELECT(setBitOffsetPlus1))) {
-                        decodedRice = (zeroCounter + (setBitOffsetPlus1-1)) << riceParam;
-                        break;
-                    }
+            mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
 
 
-                    setBitOffsetPlus1 += 1;
-                }
-            }
+            left  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1);
+            right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1);
+
+            left  = vshrq_n_s32(left,  16);
+            right = vshrq_n_s32(right, 16);
+
+            drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
         }
         }
 
 
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-        unsigned int bitsLo = 0;
-        unsigned int riceLength = setBitOffsetPlus1 + riceParam;
-        if (riceLength < DRFLAC_CACHE_L1_BITS_REMAINING)
-        {
-            bitsLo = (unsigned int)((pFlac->cache & (riceParamMask >> setBitOffsetPlus1)) >> (DRFLAC_CACHE_L1_SIZE_BITS - riceLength));
+            mid = (mid << 1) | (side & 0x01);
 
 
-            pFlac->consumedBits += riceLength;
-            pFlac->cache <<= riceLength;
+            pOutputSamples[i*2+0] = (drflac_int16)(((drflac_int32)(mid + side) >> 1) >> 16);
+            pOutputSamples[i*2+1] = (drflac_int16)(((drflac_int32)(mid - side) >> 1) >> 16);
         }
         }
-        else
-        {
-            pFlac->consumedBits += riceLength;
-            pFlac->cache <<= setBitOffsetPlus1;
+    } else {
+        int32x4_t shift4;
 
 
-            // It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them.
-            size_t bitCountLo = pFlac->consumedBits - DRFLAC_CACHE_L1_SIZE_BITS;
-            drflac_cache_t resultHi = pFlac->cache & riceParamMask;    // <-- This mask is OK because all bits after the first bits are always zero.
+        shift -= 1;
+        shift4 = vdupq_n_s32(shift);
 
 
+        for (i = 0; i < frameCount4; ++i) {
+            uint32x4_t mid;
+            uint32x4_t side;
+            int32x4_t left;
+            int32x4_t right;
 
 
-            if (pFlac->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT) {
-                pFlac->cache = drflac__be2host__cache_line(pFlac->cacheL2[pFlac->nextL2Line++]);
-            } else {
-                // Slow path. We need to fetch more data from the client.
-                if (!drflac__reload_cache(pFlac)) {
-                    return false;
-                }
-            }
+            mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
+            side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
 
 
-            bitsLo = (unsigned int)((resultHi >> resultHiShift) | DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountLo));
-            pFlac->consumedBits = bitCountLo;
-            pFlac->cache <<= bitCountLo;
-        }
+            mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
 
 
+            left  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4));
+            right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4));
 
 
-        decodedRice |= bitsLo;
-        if ((decodedRice & 0x01)) {
-            decodedRice = ~(decodedRice >> 1);
-        } else {
-            decodedRice = (decodedRice >> 1);
+            left  = vshrq_n_s32(left,  16);
+            right = vshrq_n_s32(right, 16);
+
+            drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
         }
         }
 
 
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-        // In order to properly calculate the prediction when the bits per sample is >16 we need to do it using 64-bit arithmetic. We can assume this
-        // is probably going to be slower on 32-bit systems so we'll do a more optimized 32-bit version when the bits per sample is low enough.
-        if (pFlac->currentFrame.bitsPerSample > 16) {
-            pSamplesOut[i] = ((int)decodedRice + drflac__calculate_prediction(order, shift, coefficients, pSamplesOut + i));
-        } else {
-            pSamplesOut[i] = ((int)decodedRice + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + i));
+            mid = (mid << 1) | (side & 0x01);
+
+            pOutputSamples[i*2+0] = (drflac_int16)(((mid + side) << shift) >> 16);
+            pOutputSamples[i*2+1] = (drflac_int16)(((mid - side) << shift) >> 16);
         }
         }
     }
     }
+}
+#endif
 
 
-    return true;
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s16__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s16__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_s16__decode_mid_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_s16__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
+    }
 }
 }
 
 
 
 
-// Reads and seeks past a string of residual values as Rice codes. The decoder should be sitting on the first bit of the Rice codes.
-static bool drflac__read_and_seek_residual__rice(drflac* pFlac, unsigned int count, unsigned char riceParam)
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
 {
 {
-    assert(pFlac != NULL);
-    assert(count > 0);
-
-    for (unsigned int i = 0; i < count; ++i) {
-        if (!drflac__read_and_seek_rice(pFlac, riceParam)) {
-            return false;
-        }
+    for (drflac_uint64 i = 0; i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int16)((drflac_int32)((drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)) >> 16);
+        pOutputSamples[i*2+1] = (drflac_int16)((drflac_int32)((drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)) >> 16);
     }
     }
+}
+#endif
 
 
-    return true;
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    for (i = 0; i < frameCount4; ++i) {
+        drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
+        drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
+        drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
+        drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
+
+        drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
+        drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
+        drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
+        drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
+
+        tempL0 >>= 16;
+        tempL1 >>= 16;
+        tempL2 >>= 16;
+        tempL3 >>= 16;
+
+        tempR0 >>= 16;
+        tempR1 >>= 16;
+        tempR2 >>= 16;
+        tempR3 >>= 16;
+
+        pOutputSamples[i*8+0] = (drflac_int16)tempL0;
+        pOutputSamples[i*8+1] = (drflac_int16)tempR0;
+        pOutputSamples[i*8+2] = (drflac_int16)tempL1;
+        pOutputSamples[i*8+3] = (drflac_int16)tempR1;
+        pOutputSamples[i*8+4] = (drflac_int16)tempL2;
+        pOutputSamples[i*8+5] = (drflac_int16)tempR2;
+        pOutputSamples[i*8+6] = (drflac_int16)tempL3;
+        pOutputSamples[i*8+7] = (drflac_int16)tempR3;
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16);
+        pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16);
+    }
 }
 }
 
 
-static bool drflac__decode_samples_with_residual__unencoded(drflac* pFlac, unsigned int count, unsigned char unencodedBitsPerSample, unsigned int order, int shift, const short* coefficients, int* pSamplesOut)
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
 {
 {
-    assert(pFlac != NULL);
-    assert(count > 0);
-    assert(unencodedBitsPerSample > 0 && unencodedBitsPerSample <= 32);
-    assert(pSamplesOut != NULL);
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-    for (unsigned int i = 0; i < count; ++i)
-    {
-        if (!drflac__read_int32(pFlac, unencodedBitsPerSample, pSamplesOut + i)) {
-            return false;
-        }
+    for (i = 0; i < frameCount4; ++i) {
+        __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
+        __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
 
 
-        pSamplesOut[i] += drflac__calculate_prediction(order, shift, coefficients, pSamplesOut + i);
+        left  = _mm_srai_epi32(left,  16);
+        right = _mm_srai_epi32(right, 16);
+
+        /* At this point we have results. We can now pack and interleave these into a single __m128i object and then store the in the output buffer. */
+        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
     }
     }
 
 
-    return true;
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16);
+        pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16);
+    }
 }
 }
+#endif
 
 
-
-// Reads and decodes the residual for the sub-frame the decoder is currently sitting on. This function should be called
-// when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be ignored. The
-// <blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
-static bool drflac__decode_samples_with_residual(drflac* pFlac, unsigned int blockSize, unsigned int order, int shift, const short* coefficients, int* pDecodedSamples)
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
 {
 {
-    assert(pFlac != NULL);
-    assert(blockSize != 0);
-    assert(pDecodedSamples != NULL);       // <-- Should we allow NULL, in which case we just seek past the residual rather than do a full decode?
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    int32x4_t shift0_4 = vdupq_n_s32(shift0);
+    int32x4_t shift1_4 = vdupq_n_s32(shift1);
+
+    for (i = 0; i < frameCount4; ++i) {
+        int32x4_t left;
+        int32x4_t right;
 
 
-    unsigned char residualMethod;
-    if (!drflac__read_uint8(pFlac, 2, &residualMethod)) {
-        return false;
+        left  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4));
+        right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4));
+
+        left  = vshrq_n_s32(left,  16);
+        right = vshrq_n_s32(right, 16);
+
+        drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
     }
     }
 
 
-    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
-        return false;    // Unknown or unsupported residual coding method.
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16);
+        pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16);
     }
     }
+}
+#endif
 
 
-    // Ignore the first <order> values.
-    pDecodedSamples += order;
+static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s16__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_s16__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_s16__decode_independent_stereo__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_s16__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
+    }
+}
 
 
+DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 framesToRead, drflac_int16* pBufferOut)
+{
+    drflac_uint64 framesRead;
+    drflac_uint32 unusedBitsPerSample;
 
 
-    unsigned char partitionOrder;
-    if (!drflac__read_uint8(pFlac, 4, &partitionOrder)) {
-        return false;
+    if (pFlac == NULL || framesToRead == 0) {
+        return 0;
     }
     }
 
 
+    if (pBufferOut == NULL) {
+        return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead);
+    }
 
 
-    unsigned int samplesInPartition = (blockSize / (1 << partitionOrder)) - order;
-    unsigned int partitionsRemaining = (1 << partitionOrder);
-    for (;;)
-    {
-        unsigned char riceParam = 0;
-        if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
-            if (!drflac__read_uint8(pFlac, 4, &riceParam)) {
-                return false;
-            }
-            if (riceParam == 16) {
-                riceParam = 0xFF;
-            }
-        } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
-            if (!drflac__read_uint8(pFlac, 5, &riceParam)) {
-                return false;
-            }
-            if (riceParam == 32) {
-                riceParam = 0xFF;
-            }
-        }
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 32);
+    unusedBitsPerSample = 32 - pFlac->bitsPerSample;
 
 
-        if (riceParam != 0xFF) {
-            if (!drflac__decode_samples_with_residual__rice(pFlac, samplesInPartition, riceParam, order, shift, coefficients, pDecodedSamples)) {
-                return false;
+    framesRead = 0;
+    while (framesToRead > 0) {
+        /* If we've run out of samples in this frame, go to the next. */
+        if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
+            if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
+                break;  /* Couldn't read the next frame, so just break from the loop and return. */
             }
             }
         } else {
         } else {
-            unsigned char unencodedBitsPerSample = 0;
-            if (!drflac__read_uint8(pFlac, 5, &unencodedBitsPerSample)) {
-                return false;
-            }
+            unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
+            drflac_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
+            drflac_uint64 frameCountThisIteration = framesToRead;
 
 
-            if (!drflac__decode_samples_with_residual__unencoded(pFlac, samplesInPartition, unencodedBitsPerSample, order, shift, coefficients, pDecodedSamples)) {
-                return false;
+            if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
+                frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
             }
             }
-        }
 
 
-        pDecodedSamples += samplesInPartition;
+            if (channelCount == 2) {
+                const drflac_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
+                const drflac_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
 
 
+                switch (pFlac->currentFLACFrame.header.channelAssignment)
+                {
+                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                    {
+                        drflac_read_pcm_frames_s16__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
 
 
-        if (partitionsRemaining == 1) {
-            break;
-        }
+                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                    {
+                        drflac_read_pcm_frames_s16__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
 
 
-        partitionsRemaining -= 1;
-        samplesInPartition = blockSize / (1 << partitionOrder);
+                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+                    {
+                        drflac_read_pcm_frames_s16__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+                    default:
+                    {
+                        drflac_read_pcm_frames_s16__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
+                }
+            } else {
+                /* Generic interleaving. */
+                drflac_uint64 i;
+                for (i = 0; i < frameCountThisIteration; ++i) {
+                    unsigned int j;
+                    for (j = 0; j < channelCount; ++j) {
+                        drflac_int32 sampleS32 = (drflac_int32)((drflac_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
+                        pBufferOut[(i*channelCount)+j] = (drflac_int16)(sampleS32 >> 16);
+                    }
+                }
+            }
+
+            framesRead                += frameCountThisIteration;
+            pBufferOut                += frameCountThisIteration * channelCount;
+            framesToRead              -= frameCountThisIteration;
+            pFlac->currentPCMFrame    += frameCountThisIteration;
+            pFlac->currentFLACFrame.pcmFramesRemaining -= (drflac_uint32)frameCountThisIteration;
+        }
     }
     }
 
 
-    return true;
+    return framesRead;
 }
 }
 
 
-// Reads and seeks past the residual for the sub-frame the decoder is currently sitting on. This function should be called
-// when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be set to 0. The
-// <blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
-static bool drflac__read_and_seek_residual(drflac* pFlac, unsigned int blockSize, unsigned int order)
-{
-    assert(pFlac != NULL);
-    assert(blockSize != 0);
 
 
-    unsigned char residualMethod;
-    if (!drflac__read_uint8(pFlac, 2, &residualMethod)) {
-        return false;
-    }
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_uint64 i;
+    for (i = 0; i < frameCount; ++i) {
+        drflac_uint32 left  = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+        drflac_uint32 side  = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
+        drflac_uint32 right = left - side;
 
 
-    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
-        return false;    // Unknown or unsupported residual coding method.
+        pOutputSamples[i*2+0] = (float)((drflac_int32)left  / 2147483648.0);
+        pOutputSamples[i*2+1] = (float)((drflac_int32)right / 2147483648.0);
     }
     }
+}
+#endif
 
 
-    unsigned char partitionOrder;
-    if (!drflac__read_uint8(pFlac, 4, &partitionOrder)) {
-        return false;
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+    float factor = 1 / 2147483648.0;
+
+    for (i = 0; i < frameCount4; ++i) {
+        drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
+        drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
+        drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
+        drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
+
+        drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
+        drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
+        drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
+        drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
+
+        drflac_uint32 right0 = left0 - side0;
+        drflac_uint32 right1 = left1 - side1;
+        drflac_uint32 right2 = left2 - side2;
+        drflac_uint32 right3 = left3 - side3;
+
+        pOutputSamples[i*8+0] = (drflac_int32)left0  * factor;
+        pOutputSamples[i*8+1] = (drflac_int32)right0 * factor;
+        pOutputSamples[i*8+2] = (drflac_int32)left1  * factor;
+        pOutputSamples[i*8+3] = (drflac_int32)right1 * factor;
+        pOutputSamples[i*8+4] = (drflac_int32)left2  * factor;
+        pOutputSamples[i*8+5] = (drflac_int32)right2 * factor;
+        pOutputSamples[i*8+6] = (drflac_int32)left3  * factor;
+        pOutputSamples[i*8+7] = (drflac_int32)right3 * factor;
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 left  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 side  = pInputSamples1U32[i] << shift1;
+        drflac_uint32 right = left - side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left  * factor;
+        pOutputSamples[i*2+1] = (drflac_int32)right * factor;
     }
     }
+}
 
 
-    unsigned int samplesInPartition = (blockSize / (1 << partitionOrder)) - order;
-    unsigned int partitionsRemaining = (1 << partitionOrder);
-    for (;;)
-    {
-        unsigned char riceParam = 0;
-        if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
-            if (!drflac__read_uint8(pFlac, 4, &riceParam)) {
-                return false;
-            }
-            if (riceParam == 16) {
-                riceParam = 0xFF;
-            }
-        } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
-            if (!drflac__read_uint8(pFlac, 5, &riceParam)) {
-                return false;
-            }
-            if (riceParam == 32) {
-                riceParam = 0xFF;
-            }
-        }
-
-        if (riceParam != 0xFF) {
-            if (!drflac__read_and_seek_residual__rice(pFlac, samplesInPartition, riceParam)) {
-                return false;
-            }
-        } else {
-            unsigned char unencodedBitsPerSample = 0;
-            if (!drflac__read_uint8(pFlac, 5, &unencodedBitsPerSample)) {
-                return false;
-            }
-
-            if (!drflac__seek_bits(pFlac, unencodedBitsPerSample * samplesInPartition)) {
-                return false;
-            }
-        }
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
+    drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
+    __m128 factor;
 
 
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
 
 
-        if (partitionsRemaining == 1) {
-            break;
-        }
+    factor = _mm_set1_ps(1.0f / 8388608.0f);
 
 
-        partitionsRemaining -= 1;
-        samplesInPartition = blockSize / (1 << partitionOrder);
+    for (i = 0; i < frameCount4; ++i) {
+        __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
+        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
+        __m128i right = _mm_sub_epi32(left, side);
+        __m128 leftf  = _mm_mul_ps(_mm_cvtepi32_ps(left),  factor);
+        __m128 rightf = _mm_mul_ps(_mm_cvtepi32_ps(right), factor);
+
+        _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
+        _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
     }
     }
 
 
-    return true;
-}
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 left  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 side  = pInputSamples1U32[i] << shift1;
+        drflac_uint32 right = left - side;
 
 
+        pOutputSamples[i*2+0] = (drflac_int32)left  / 8388608.0f;
+        pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f;
+    }
+}
+#endif
 
 
-static bool drflac__decode_samples__constant(drflac* pFlac, drflac_subframe* pSubframe)
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
 {
 {
-    // Only a single sample needs to be decoded here.
-    int sample;
-    if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &sample)) {
-        return false;
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
+    drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
+    float32x4_t factor4;
+    int32x4_t shift0_4;
+    int32x4_t shift1_4;
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    factor4  = vdupq_n_f32(1.0f / 8388608.0f);
+    shift0_4 = vdupq_n_s32(shift0);
+    shift1_4 = vdupq_n_s32(shift1);
+
+    for (i = 0; i < frameCount4; ++i) {
+        uint32x4_t left;
+        uint32x4_t side;
+        uint32x4_t right;
+        float32x4_t leftf;
+        float32x4_t rightf;
+
+        left   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
+        side   = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
+        right  = vsubq_u32(left, side);
+        leftf  = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)),  factor4);
+        rightf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), factor4);
+
+        drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 left  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 side  = pInputSamples1U32[i] << shift1;
+        drflac_uint32 right = left - side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left  / 8388608.0f;
+        pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f;
     }
     }
+}
+#endif
 
 
-    // We don't really need to expand this, but it does simplify the process of reading samples. If this becomes a performance issue (unlikely)
-    // we'll want to look at a more efficient way.
-    for (unsigned int i = 0; i < pFlac->currentFrame.blockSize; ++i) {
-        pSubframe->pDecodedSamples[i] = sample;
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_f32__decode_left_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_f32__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
     }
     }
-
-    return true;
 }
 }
 
 
-static bool drflac__decode_samples__verbatim(drflac* pFlac, drflac_subframe* pSubframe)
+
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
 {
 {
-    for (unsigned int i = 0; i < pFlac->currentFrame.blockSize; ++i) {
-        int sample;
-        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &sample)) {
-            return false;
-        }
+    drflac_uint64 i;
+    for (i = 0; i < frameCount; ++i) {
+        drflac_uint32 side  = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+        drflac_uint32 right = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
+        drflac_uint32 left  = right + side;
 
 
-        pSubframe->pDecodedSamples[i] = sample;
+        pOutputSamples[i*2+0] = (float)((drflac_int32)left  / 2147483648.0);
+        pOutputSamples[i*2+1] = (float)((drflac_int32)right / 2147483648.0);
     }
     }
+}
+#endif
 
 
-    return true;
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+    float factor = 1 / 2147483648.0;
+
+    for (i = 0; i < frameCount4; ++i) {
+        drflac_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
+        drflac_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
+        drflac_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
+        drflac_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
+
+        drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
+        drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
+        drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
+        drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
+
+        drflac_uint32 left0 = right0 + side0;
+        drflac_uint32 left1 = right1 + side1;
+        drflac_uint32 left2 = right2 + side2;
+        drflac_uint32 left3 = right3 + side3;
+
+        pOutputSamples[i*8+0] = (drflac_int32)left0  * factor;
+        pOutputSamples[i*8+1] = (drflac_int32)right0 * factor;
+        pOutputSamples[i*8+2] = (drflac_int32)left1  * factor;
+        pOutputSamples[i*8+3] = (drflac_int32)right1 * factor;
+        pOutputSamples[i*8+4] = (drflac_int32)left2  * factor;
+        pOutputSamples[i*8+5] = (drflac_int32)right2 * factor;
+        pOutputSamples[i*8+6] = (drflac_int32)left3  * factor;
+        pOutputSamples[i*8+7] = (drflac_int32)right3 * factor;
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 side  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 right = pInputSamples1U32[i] << shift1;
+        drflac_uint32 left  = right + side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left  * factor;
+        pOutputSamples[i*2+1] = (drflac_int32)right * factor;
+    }
 }
 }
 
 
-static bool drflac__decode_samples__fixed(drflac* pFlac, drflac_subframe* pSubframe)
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
 {
 {
-    short lpcCoefficientsTable[5][4] = {
-        {0,  0, 0,  0},
-        {1,  0, 0,  0},
-        {2, -1, 0,  0},
-        {3, -3, 1,  0},
-        {4, -6, 4, -1}
-    };
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
+    drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
+    __m128 factor;
 
 
-    // Warm up samples and coefficients.
-    for (unsigned int i = 0; i < pSubframe->lpcOrder; ++i) {
-        int sample;
-        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &sample)) {
-            return false;
-        }
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
 
 
-        pSubframe->pDecodedSamples[i] = sample;
-    }
+    factor = _mm_set1_ps(1.0f / 8388608.0f);
 
 
+    for (i = 0; i < frameCount4; ++i) {
+        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
+        __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
+        __m128i left  = _mm_add_epi32(right, side);
+        __m128 leftf  = _mm_mul_ps(_mm_cvtepi32_ps(left),  factor);
+        __m128 rightf = _mm_mul_ps(_mm_cvtepi32_ps(right), factor);
 
 
-    if (!drflac__decode_samples_with_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder, 0, lpcCoefficientsTable[pSubframe->lpcOrder], pSubframe->pDecodedSamples)) {
-        return false;
+        _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
+        _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
     }
     }
 
 
-    return true;
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 side  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 right = pInputSamples1U32[i] << shift1;
+        drflac_uint32 left  = right + side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left  / 8388608.0f;
+        pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f;
+    }
 }
 }
+#endif
 
 
-static bool drflac__decode_samples__lpc(drflac* pFlac, drflac_subframe* pSubframe)
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
 {
 {
-    // Warm up samples.
-    for (unsigned int i = 0; i < pSubframe->lpcOrder; ++i) {
-        int sample;
-        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &sample)) {
-            return false;
-        }
-
-        pSubframe->pDecodedSamples[i] = sample;
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
+    drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
+    float32x4_t factor4;
+    int32x4_t shift0_4;
+    int32x4_t shift1_4;
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    factor4  = vdupq_n_f32(1.0f / 8388608.0f);
+    shift0_4 = vdupq_n_s32(shift0);
+    shift1_4 = vdupq_n_s32(shift1);
+
+    for (i = 0; i < frameCount4; ++i) {
+        uint32x4_t side;
+        uint32x4_t right;
+        uint32x4_t left;
+        float32x4_t leftf;
+        float32x4_t rightf;
+
+        side   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
+        right  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
+        left   = vaddq_u32(right, side);
+        leftf  = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)),  factor4);
+        rightf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), factor4);
+
+        drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 side  = pInputSamples0U32[i] << shift0;
+        drflac_uint32 right = pInputSamples1U32[i] << shift1;
+        drflac_uint32 left  = right + side;
+
+        pOutputSamples[i*2+0] = (drflac_int32)left  / 8388608.0f;
+        pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f;
     }
     }
+}
+#endif
 
 
-    unsigned char lpcPrecision;
-    if (!drflac__read_uint8(pFlac, 4, &lpcPrecision)) {
-        return false;
-    }
-    if (lpcPrecision == 15) {
-        return false;    // Invalid.
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_f32__decode_right_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_f32__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
     }
     }
-    lpcPrecision += 1;
-
+}
 
 
-    signed char lpcShift;
-    if (!drflac__read_int8(pFlac, 5, &lpcShift)) {
-        return false;
-    }
 
 
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    for (drflac_uint64 i = 0; i < frameCount; ++i) {
+        drflac_uint32 mid  = (drflac_uint32)pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+        drflac_uint32 side = (drflac_uint32)pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-    short coefficients[32];
-    for (unsigned int i = 0; i < pSubframe->lpcOrder; ++i) {
-        if (!drflac__read_int16(pFlac, lpcPrecision, coefficients + i)) {
-            return false;
-        }
-    }
+        mid = (mid << 1) | (side & 0x01);
 
 
-    if (!drflac__decode_samples_with_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder, lpcShift, coefficients, pSubframe->pDecodedSamples)) {
-        return false;
+        pOutputSamples[i*2+0] = (float)((((drflac_int32)(mid + side) >> 1) << (unusedBitsPerSample)) / 2147483648.0);
+        pOutputSamples[i*2+1] = (float)((((drflac_int32)(mid - side) >> 1) << (unusedBitsPerSample)) / 2147483648.0);
     }
     }
-
-    return true;
 }
 }
+#endif
 
 
-
-static bool drflac__read_next_frame_header(drflac* pFlac)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
 {
 {
-    assert(pFlac != NULL);
-    assert(pFlac->onRead != NULL);
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift = unusedBitsPerSample;
+    float factor = 1 / 2147483648.0;
+
+    if (shift > 0) {
+        shift -= 1;
+        for (i = 0; i < frameCount4; ++i) {
+            drflac_uint32 temp0L;
+            drflac_uint32 temp1L;
+            drflac_uint32 temp2L;
+            drflac_uint32 temp3L;
+            drflac_uint32 temp0R;
+            drflac_uint32 temp1R;
+            drflac_uint32 temp2R;
+            drflac_uint32 temp3R;
+
+            drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+
+            drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+            mid0 = (mid0 << 1) | (side0 & 0x01);
+            mid1 = (mid1 << 1) | (side1 & 0x01);
+            mid2 = (mid2 << 1) | (side2 & 0x01);
+            mid3 = (mid3 << 1) | (side3 & 0x01);
+
+            temp0L = (mid0 + side0) << shift;
+            temp1L = (mid1 + side1) << shift;
+            temp2L = (mid2 + side2) << shift;
+            temp3L = (mid3 + side3) << shift;
+
+            temp0R = (mid0 - side0) << shift;
+            temp1R = (mid1 - side1) << shift;
+            temp2R = (mid2 - side2) << shift;
+            temp3R = (mid3 - side3) << shift;
+
+            pOutputSamples[i*8+0] = (drflac_int32)temp0L * factor;
+            pOutputSamples[i*8+1] = (drflac_int32)temp0R * factor;
+            pOutputSamples[i*8+2] = (drflac_int32)temp1L * factor;
+            pOutputSamples[i*8+3] = (drflac_int32)temp1R * factor;
+            pOutputSamples[i*8+4] = (drflac_int32)temp2L * factor;
+            pOutputSamples[i*8+5] = (drflac_int32)temp2R * factor;
+            pOutputSamples[i*8+6] = (drflac_int32)temp3L * factor;
+            pOutputSamples[i*8+7] = (drflac_int32)temp3R * factor;
+        }
+    } else {
+        for (i = 0; i < frameCount4; ++i) {
+            drflac_uint32 temp0L;
+            drflac_uint32 temp1L;
+            drflac_uint32 temp2L;
+            drflac_uint32 temp3L;
+            drflac_uint32 temp0R;
+            drflac_uint32 temp1R;
+            drflac_uint32 temp2R;
+            drflac_uint32 temp3R;
+
+            drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+
+            drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+            drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+            mid0 = (mid0 << 1) | (side0 & 0x01);
+            mid1 = (mid1 << 1) | (side1 & 0x01);
+            mid2 = (mid2 << 1) | (side2 & 0x01);
+            mid3 = (mid3 << 1) | (side3 & 0x01);
+
+            temp0L = (drflac_uint32)((drflac_int32)(mid0 + side0) >> 1);
+            temp1L = (drflac_uint32)((drflac_int32)(mid1 + side1) >> 1);
+            temp2L = (drflac_uint32)((drflac_int32)(mid2 + side2) >> 1);
+            temp3L = (drflac_uint32)((drflac_int32)(mid3 + side3) >> 1);
+
+            temp0R = (drflac_uint32)((drflac_int32)(mid0 - side0) >> 1);
+            temp1R = (drflac_uint32)((drflac_int32)(mid1 - side1) >> 1);
+            temp2R = (drflac_uint32)((drflac_int32)(mid2 - side2) >> 1);
+            temp3R = (drflac_uint32)((drflac_int32)(mid3 - side3) >> 1);
+
+            pOutputSamples[i*8+0] = (drflac_int32)temp0L * factor;
+            pOutputSamples[i*8+1] = (drflac_int32)temp0R * factor;
+            pOutputSamples[i*8+2] = (drflac_int32)temp1L * factor;
+            pOutputSamples[i*8+3] = (drflac_int32)temp1R * factor;
+            pOutputSamples[i*8+4] = (drflac_int32)temp2L * factor;
+            pOutputSamples[i*8+5] = (drflac_int32)temp2R * factor;
+            pOutputSamples[i*8+6] = (drflac_int32)temp3L * factor;
+            pOutputSamples[i*8+7] = (drflac_int32)temp3R * factor;
+        }
+    }
 
 
-    // At the moment the sync code is as a form of basic validation. The CRC is stored, but is unused at the moment. This
-    // should probably be handled better in the future.
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+        drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-    const int sampleRateTable[12]       = {0, 88200, 176400, 192000, 8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000};
-    const uint8_t bitsPerSampleTable[8] = {0, 8, 12, (uint8_t)-1, 16, 20, 24, (uint8_t)-1};   // -1 = reserved.
+        mid = (mid << 1) | (side & 0x01);
 
 
-    unsigned short syncCode = 0;
-    if (!drflac__read_uint16(pFlac, 14, &syncCode)) {
-        return false;
+        pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) * factor;
+        pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) * factor;
     }
     }
+}
 
 
-    if (syncCode != 0x3FFE) {
-        // TODO: Try and recover by attempting to seek to and read the next frame?
-        return false;
-    }
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift = unusedBitsPerSample - 8;
+    float factor;
+    __m128 factor128;
 
 
-    unsigned char reserved;
-    if (!drflac__read_uint8(pFlac, 1, &reserved)) {
-        return false;
-    }
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
 
 
-    unsigned char blockingStrategy = 0;
-    if (!drflac__read_uint8(pFlac, 1, &blockingStrategy)) {
-        return false;
-    }
+    factor = 1.0f / 8388608.0f;
+    factor128 = _mm_set1_ps(factor);
 
 
+    if (shift == 0) {
+        for (i = 0; i < frameCount4; ++i) {
+            __m128i mid;
+            __m128i side;
+            __m128i tempL;
+            __m128i tempR;
+            __m128  leftf;
+            __m128  rightf;
 
 
+            mid    = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+            side   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
 
 
-    unsigned char blockSize = 0;
-    if (!drflac__read_uint8(pFlac, 4, &blockSize)) {
-        return false;
-    }
+            mid    = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
 
 
-    unsigned char sampleRate = 0;
-    if (!drflac__read_uint8(pFlac, 4, &sampleRate)) {
-        return false;
-    }
+            tempL  = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
+            tempR  = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);
 
 
-    unsigned char channelAssignment = 0;
-    if (!drflac__read_uint8(pFlac, 4, &channelAssignment)) {
-        return false;
-    }
+            leftf  = _mm_mul_ps(_mm_cvtepi32_ps(tempL), factor128);
+            rightf = _mm_mul_ps(_mm_cvtepi32_ps(tempR), factor128);
 
 
-    unsigned char bitsPerSample = 0;
-    if (!drflac__read_uint8(pFlac, 3, &bitsPerSample)) {
-        return false;
-    }
+            _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
+            _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
+        }
 
 
-    if (!drflac__read_uint8(pFlac, 1, &reserved)) {
-        return false;
-    }
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
+            mid = (mid << 1) | (side & 0x01);
 
 
-    unsigned char isVariableBlockSize = blockingStrategy == 1;
-    if (isVariableBlockSize) {
-        pFlac->currentFrame.frameNumber = 0;
-        if (!drflac__read_utf8_coded_number(pFlac, &pFlac->currentFrame.sampleNumber)) {
-            return false;
+            pOutputSamples[i*2+0] = ((drflac_int32)(mid + side) >> 1) * factor;
+            pOutputSamples[i*2+1] = ((drflac_int32)(mid - side) >> 1) * factor;
         }
         }
     } else {
     } else {
-        unsigned long long frameNumber = 0;
-        if (!drflac__read_utf8_coded_number(pFlac, &frameNumber)) {
-            return false;
-        }
-        pFlac->currentFrame.frameNumber  = (unsigned int)frameNumber;   // <-- Safe cast.
-        pFlac->currentFrame.sampleNumber = 0;
-    }
+        shift -= 1;
+        for (i = 0; i < frameCount4; ++i) {
+            __m128i mid;
+            __m128i side;
+            __m128i tempL;
+            __m128i tempR;
+            __m128 leftf;
+            __m128 rightf;
 
 
+            mid    = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+            side   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
 
 
-    if (blockSize == 1) {
-        pFlac->currentFrame.blockSize = 192;
-    } else if (blockSize >= 2 && blockSize <= 5) {
-        pFlac->currentFrame.blockSize = 576 * (1 << (blockSize - 2));
-    } else if (blockSize == 6) {
-        if (!drflac__read_uint16(pFlac, 8, &pFlac->currentFrame.blockSize)) {
-            return false;
-        }
-        pFlac->currentFrame.blockSize += 1;
-    } else if (blockSize == 7) {
-        if (!drflac__read_uint16(pFlac, 16, &pFlac->currentFrame.blockSize)) {
-            return false;
+            mid    = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
+
+            tempL  = _mm_slli_epi32(_mm_add_epi32(mid, side), shift);
+            tempR  = _mm_slli_epi32(_mm_sub_epi32(mid, side), shift);
+
+            leftf  = _mm_mul_ps(_mm_cvtepi32_ps(tempL), factor128);
+            rightf = _mm_mul_ps(_mm_cvtepi32_ps(tempR), factor128);
+
+            _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
+            _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
         }
         }
-        pFlac->currentFrame.blockSize += 1;
-    } else {
-        pFlac->currentFrame.blockSize = 256 * (1 << (blockSize - 8));
-    }
 
 
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-    if (sampleRate <= 11) {
-        pFlac->currentFrame.sampleRate = sampleRateTable[sampleRate];
-    } else if (sampleRate == 12) {
-        if (!drflac__read_uint32(pFlac, 8, &pFlac->currentFrame.sampleRate)) {
-            return false;
+            mid = (mid << 1) | (side & 0x01);
+
+            pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift) * factor;
+            pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift) * factor;
         }
         }
-        pFlac->currentFrame.sampleRate *= 1000;
-    } else if (sampleRate == 13) {
-        if (!drflac__read_uint32(pFlac, 16, &pFlac->currentFrame.sampleRate)) {
-            return false;
+    }
+}
+#endif
+
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift = unusedBitsPerSample - 8;
+    float factor;
+    float32x4_t factor4;
+    int32x4_t shift4;
+    int32x4_t wbps0_4;  /* Wasted Bits Per Sample */
+    int32x4_t wbps1_4;  /* Wasted Bits Per Sample */
+
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
+
+    factor  = 1.0f / 8388608.0f;
+    factor4 = vdupq_n_f32(factor);
+    wbps0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
+    wbps1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
+
+    if (shift == 0) {
+        for (i = 0; i < frameCount4; ++i) {
+            int32x4_t lefti;
+            int32x4_t righti;
+            float32x4_t leftf;
+            float32x4_t rightf;
+
+            uint32x4_t mid  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbps0_4);
+            uint32x4_t side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbps1_4);
+
+            mid    = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
+
+            lefti  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1);
+            righti = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1);
+
+            leftf  = vmulq_f32(vcvtq_f32_s32(lefti),  factor4);
+            rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4);
+
+            drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
         }
         }
-    } else if (sampleRate == 14) {
-        if (!drflac__read_uint32(pFlac, 16, &pFlac->currentFrame.sampleRate)) {
-            return false;
+
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+
+            mid = (mid << 1) | (side & 0x01);
+
+            pOutputSamples[i*2+0] = ((drflac_int32)(mid + side) >> 1) * factor;
+            pOutputSamples[i*2+1] = ((drflac_int32)(mid - side) >> 1) * factor;
         }
         }
-        pFlac->currentFrame.sampleRate *= 10;
     } else {
     } else {
-        return false;  // Invalid.
-    }
+        shift -= 1;
+        shift4 = vdupq_n_s32(shift);
+        for (i = 0; i < frameCount4; ++i) {
+            uint32x4_t mid;
+            uint32x4_t side;
+            int32x4_t lefti;
+            int32x4_t righti;
+            float32x4_t leftf;
+            float32x4_t rightf;
 
 
+            mid    = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbps0_4);
+            side   = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbps1_4);
 
 
-    pFlac->currentFrame.channelAssignment = channelAssignment;
+            mid    = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
 
 
-    pFlac->currentFrame.bitsPerSample = bitsPerSampleTable[bitsPerSample];
-    if (pFlac->currentFrame.bitsPerSample == 0) {
-        pFlac->currentFrame.bitsPerSample = pFlac->bitsPerSample;
-    }
-
-    if (drflac__read_uint8(pFlac, 8, &pFlac->currentFrame.crc8) != 1) {
-        return false;
-    }
+            lefti  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4));
+            righti = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4));
 
 
-    memset(pFlac->currentFrame.subframes, 0, sizeof(pFlac->currentFrame.subframes));
+            leftf  = vmulq_f32(vcvtq_f32_s32(lefti),  factor4);
+            rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4);
 
 
-    return true;
-}
+            drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
+        }
 
 
-static bool drflac__read_subframe_header(drflac* pFlac, drflac_subframe* pSubframe)
-{
-    unsigned char header;
-    if (!drflac__read_uint8(pFlac, 8, &header)) {
-        return false;
-    }
+        for (i = (frameCount4 << 2); i < frameCount; ++i) {
+            drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+            drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
 
 
-    // First bit should always be 0.
-    if ((header & 0x80) != 0) {
-        return false;
-    }
+            mid = (mid << 1) | (side & 0x01);
 
 
-    int type = (header & 0x7E) >> 1;
-    if (type == 0) {
-        pSubframe->subframeType = DRFLAC_SUBFRAME_CONSTANT;
-    } else if (type == 1) {
-        pSubframe->subframeType = DRFLAC_SUBFRAME_VERBATIM;
-    } else {
-        if ((type & 0x20) != 0) {
-            pSubframe->subframeType = DRFLAC_SUBFRAME_LPC;
-            pSubframe->lpcOrder = (type & 0x1F) + 1;
-        } else if ((type & 0x08) != 0) {
-            pSubframe->subframeType = DRFLAC_SUBFRAME_FIXED;
-            pSubframe->lpcOrder = (type & 0x07);
-            if (pSubframe->lpcOrder > 4) {
-                pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
-                pSubframe->lpcOrder = 0;
-            }
-        } else {
-            pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
+            pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift) * factor;
+            pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift) * factor;
         }
         }
     }
     }
+}
+#endif
 
 
-    if (pSubframe->subframeType == DRFLAC_SUBFRAME_RESERVED) {
-        return false;
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_f32__decode_mid_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_f32__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
     }
     }
+}
 
 
-    // Wasted bits per sample.
-    pSubframe->wastedBitsPerSample = 0;
-    if ((header & 0x01) == 1) {
-        unsigned int wastedBitsPerSample;
-        if (!drflac__seek_past_next_set_bit(pFlac, &wastedBitsPerSample)) {
-            return false;
-        }
-        pSubframe->wastedBitsPerSample = (unsigned char)wastedBitsPerSample + 1;
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    for (drflac_uint64 i = 0; i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (float)((drflac_int32)((drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)) / 2147483648.0);
+        pOutputSamples[i*2+1] = (float)((drflac_int32)((drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)) / 2147483648.0);
     }
     }
+}
+#endif
 
 
-    return true;
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
+    drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
+    float factor = 1 / 2147483648.0;
+
+    for (i = 0; i < frameCount4; ++i) {
+        drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
+        drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
+        drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
+        drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
+
+        drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
+        drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
+        drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
+        drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
+
+        pOutputSamples[i*8+0] = (drflac_int32)tempL0 * factor;
+        pOutputSamples[i*8+1] = (drflac_int32)tempR0 * factor;
+        pOutputSamples[i*8+2] = (drflac_int32)tempL1 * factor;
+        pOutputSamples[i*8+3] = (drflac_int32)tempR1 * factor;
+        pOutputSamples[i*8+4] = (drflac_int32)tempL2 * factor;
+        pOutputSamples[i*8+5] = (drflac_int32)tempR2 * factor;
+        pOutputSamples[i*8+6] = (drflac_int32)tempL3 * factor;
+        pOutputSamples[i*8+7] = (drflac_int32)tempR3 * factor;
+    }
+
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0) * factor;
+        pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1) * factor;
+    }
 }
 }
 
 
-static bool drflac__decode_subframe(drflac* pFlac, int subframeIndex)
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
 {
 {
-    assert(pFlac != NULL);
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
+    drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
+
+    float factor = 1.0f / 8388608.0f;
+    __m128 factor128 = _mm_set1_ps(factor);
+
+    for (i = 0; i < frameCount4; ++i) {
+        __m128i lefti;
+        __m128i righti;
+        __m128 leftf;
+        __m128 rightf;
+
+        lefti  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
+        righti = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
+
+        leftf  = _mm_mul_ps(_mm_cvtepi32_ps(lefti),  factor128);
+        rightf = _mm_mul_ps(_mm_cvtepi32_ps(righti), factor128);
 
 
-    drflac_subframe* pSubframe = pFlac->currentFrame.subframes + subframeIndex;
-    if (!drflac__read_subframe_header(pFlac, pSubframe)) {
-        return false;
+        _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
+        _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
     }
     }
 
 
-    // Side channels require an extra bit per sample. Took a while to figure that one out...
-    pSubframe->bitsPerSample = pFlac->currentFrame.bitsPerSample;
-    if ((pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
-        pSubframe->bitsPerSample += 1;
-    } else if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
-        pSubframe->bitsPerSample += 1;
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0) * factor;
+        pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1) * factor;
     }
     }
+}
+#endif
 
 
-    // Need to handle wasted bits per sample.
-    pSubframe->bitsPerSample -= pSubframe->wastedBitsPerSample;
-    pSubframe->pDecodedSamples = pFlac->pDecodedSamples + (pFlac->currentFrame.blockSize * subframeIndex);
+#if defined(DRFLAC_SUPPORT_NEON)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_uint64 i;
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
+    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
+    drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
+    drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
 
 
-    switch (pSubframe->subframeType)
-    {
-        case DRFLAC_SUBFRAME_CONSTANT:
-        {
-            drflac__decode_samples__constant(pFlac, pSubframe);
-        } break;
+    float factor = 1.0f / 8388608.0f;
+    float32x4_t factor4 = vdupq_n_f32(factor);
+    int32x4_t shift0_4  = vdupq_n_s32(shift0);
+    int32x4_t shift1_4  = vdupq_n_s32(shift1);
 
 
-        case DRFLAC_SUBFRAME_VERBATIM:
-        {
-            drflac__decode_samples__verbatim(pFlac, pSubframe);
-        } break;
+    for (i = 0; i < frameCount4; ++i) {
+        int32x4_t lefti;
+        int32x4_t righti;
+        float32x4_t leftf;
+        float32x4_t rightf;
 
 
-        case DRFLAC_SUBFRAME_FIXED:
-        {
-            drflac__decode_samples__fixed(pFlac, pSubframe);
-        } break;
+        lefti  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4));
+        righti = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4));
 
 
-        case DRFLAC_SUBFRAME_LPC:
-        {
-            drflac__decode_samples__lpc(pFlac, pSubframe);
-        } break;
+        leftf  = vmulq_f32(vcvtq_f32_s32(lefti),  factor4);
+        rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4);
 
 
-        default: return false;
+        drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
     }
     }
 
 
-    return true;
+    for (i = (frameCount4 << 2); i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0) * factor;
+        pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1) * factor;
+    }
 }
 }
+#endif
 
 
-static bool drflac__seek_subframe(drflac* pFlac, int subframeIndex)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
 {
 {
-    assert(pFlac != NULL);
-
-    drflac_subframe* pSubframe = pFlac->currentFrame.subframes + subframeIndex;
-    if (!drflac__read_subframe_header(pFlac, pSubframe)) {
-        return false;
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#elif defined(DRFLAC_SUPPORT_NEON)
+    if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        /* Scalar fallback. */
+#if 0
+        drflac_read_pcm_frames_f32__decode_independent_stereo__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_f32__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
     }
     }
+}
 
 
-    // Side channels require an extra bit per sample. Took a while to figure that one out...
-    pSubframe->bitsPerSample = pFlac->currentFrame.bitsPerSample;
-    if ((pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
-        pSubframe->bitsPerSample += 1;
-    } else if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
-        pSubframe->bitsPerSample += 1;
-    }
+DRFLAC_API drflac_uint64 drflac_read_pcm_frames_f32(drflac* pFlac, drflac_uint64 framesToRead, float* pBufferOut)
+{
+    drflac_uint64 framesRead;
+    drflac_uint32 unusedBitsPerSample;
 
 
-    // Need to handle wasted bits per sample.
-    pSubframe->bitsPerSample -= pSubframe->wastedBitsPerSample;
-    pSubframe->pDecodedSamples = pFlac->pDecodedSamples + (pFlac->currentFrame.blockSize * subframeIndex);
+    if (pFlac == NULL || framesToRead == 0) {
+        return 0;
+    }
 
 
-    switch (pSubframe->subframeType)
-    {
-        case DRFLAC_SUBFRAME_CONSTANT:
-        {
-            if (!drflac__seek_bits(pFlac, pSubframe->bitsPerSample)) {
-                return false;
-            }
-        } break;
+    if (pBufferOut == NULL) {
+        return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead);
+    }
 
 
-        case DRFLAC_SUBFRAME_VERBATIM:
-        {
-            unsigned int bitsToSeek = pFlac->currentFrame.blockSize * pSubframe->bitsPerSample;
-            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
-                return false;
-            }
-        } break;
+    DRFLAC_ASSERT(pFlac->bitsPerSample <= 32);
+    unusedBitsPerSample = 32 - pFlac->bitsPerSample;
 
 
-        case DRFLAC_SUBFRAME_FIXED:
-        {
-            unsigned int bitsToSeek = pSubframe->lpcOrder * pSubframe->bitsPerSample;
-            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
-                return false;
+    framesRead = 0;
+    while (framesToRead > 0) {
+        /* If we've run out of samples in this frame, go to the next. */
+        if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
+            if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
+                break;  /* Couldn't read the next frame, so just break from the loop and return. */
             }
             }
+        } else {
+            unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
+            drflac_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
+            drflac_uint64 frameCountThisIteration = framesToRead;
 
 
-            if (!drflac__read_and_seek_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder)) {
-                return false;
+            if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
+                frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
             }
             }
-        } break;
 
 
-        case DRFLAC_SUBFRAME_LPC:
-        {
-            unsigned int bitsToSeek = pSubframe->lpcOrder * pSubframe->bitsPerSample;
-            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
-                return false;
-            }
+            if (channelCount == 2) {
+                const drflac_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
+                const drflac_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
 
 
-            unsigned char lpcPrecision;
-            if (!drflac__read_uint8(pFlac, 4, &lpcPrecision)) {
-                return false;
-            }
-            if (lpcPrecision == 15) {
-                return false;    // Invalid.
-            }
-            lpcPrecision += 1;
+                switch (pFlac->currentFLACFrame.header.channelAssignment)
+                {
+                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                    {
+                        drflac_read_pcm_frames_f32__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
 
 
+                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                    {
+                        drflac_read_pcm_frames_f32__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
 
 
-            bitsToSeek = (pSubframe->lpcOrder * lpcPrecision) + 5;    // +5 for shift.
-            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
-                return false;
-            }
+                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+                    {
+                        drflac_read_pcm_frames_f32__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
 
 
-            if (!drflac__read_and_seek_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder)) {
-                return false;
+                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+                    default:
+                    {
+                        drflac_read_pcm_frames_f32__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
+                }
+            } else {
+                /* Generic interleaving. */
+                drflac_uint64 i;
+                for (i = 0; i < frameCountThisIteration; ++i) {
+                    unsigned int j;
+                    for (j = 0; j < channelCount; ++j) {
+                        drflac_int32 sampleS32 = (drflac_int32)((drflac_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
+                        pBufferOut[(i*channelCount)+j] = (float)(sampleS32 / 2147483648.0);
+                    }
+                }
             }
             }
-        } break;
 
 
-        default: return false;
+            framesRead                += frameCountThisIteration;
+            pBufferOut                += frameCountThisIteration * channelCount;
+            framesToRead              -= frameCountThisIteration;
+            pFlac->currentPCMFrame    += frameCountThisIteration;
+            pFlac->currentFLACFrame.pcmFramesRemaining -= (unsigned int)frameCountThisIteration;
+        }
     }
     }
 
 
-    return true;
+    return framesRead;
 }
 }
 
 
 
 
-static DRFLAC_INLINE int drflac__get_channel_count_from_channel_assignment(int channelAssignment)
-{
-    assert(channelAssignment <= 10);
-
-    int lookup[] = {1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2};
-    return lookup[channelAssignment];
-}
-
-static bool drflac__decode_frame(drflac* pFlac)
+DRFLAC_API drflac_bool32 drflac_seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex)
 {
 {
-    // This function should be called while the stream is sitting on the first byte after the frame header.
-
-    int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
-    for (int i = 0; i < channelCount; ++i)
-    {
-        if (!drflac__decode_subframe(pFlac, i)) {
-            return false;
-        }
+    if (pFlac == NULL) {
+        return DRFLAC_FALSE;
     }
     }
 
 
-    // At the end of the frame sits the padding and CRC. We don't use these so we can just seek past.
-    if (!drflac__seek_bits(pFlac, (DRFLAC_CACHE_L1_BITS_REMAINING & 7) + 16)) {
-        return false;
+    /* Don't do anything if we're already on the seek point. */
+    if (pFlac->currentPCMFrame == pcmFrameIndex) {
+        return DRFLAC_TRUE;
     }
     }
 
 
-
-    pFlac->currentFrame.samplesRemaining = pFlac->currentFrame.blockSize * channelCount;
-
-    return true;
-}
-
-static bool drflac__seek_frame(drflac* pFlac)
-{
-    int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
-    for (int i = 0; i < channelCount; ++i)
-    {
-        if (!drflac__seek_subframe(pFlac, i)) {
-            return false;
-        }
+    /*
+    If we don't know where the first frame begins then we can't seek. This will happen when the STREAMINFO block was not present
+    when the decoder was opened.
+    */
+    if (pFlac->firstFLACFramePosInBytes == 0) {
+        return DRFLAC_FALSE;
     }
     }
 
 
-    // Padding and CRC.
-    return drflac__seek_bits(pFlac, (DRFLAC_CACHE_L1_BITS_REMAINING & 7) + 16);
-}
-
-static bool drflac__read_and_decode_next_frame(drflac* pFlac)
-{
-    assert(pFlac != NULL);
+    if (pcmFrameIndex == 0) {
+        pFlac->currentPCMFrame = 0;
+        return drflac__seek_to_first_frame(pFlac);
+    } else {
+        drflac_bool32 wasSuccessful = DRFLAC_FALSE;
+        drflac_uint64 originalPCMFrame = pFlac->currentPCMFrame;
 
 
-    if (!drflac__read_next_frame_header(pFlac)) {
-        return false;
-    }
+        /* Clamp the sample to the end. */
+        if (pcmFrameIndex > pFlac->totalPCMFrameCount) {
+            pcmFrameIndex = pFlac->totalPCMFrameCount;
+        }
 
 
-    return drflac__decode_frame(pFlac);
-}
+        /* If the target sample and the current sample are in the same frame we just move the position forward. */
+        if (pcmFrameIndex > pFlac->currentPCMFrame) {
+            /* Forward. */
+            drflac_uint32 offset = (drflac_uint32)(pcmFrameIndex - pFlac->currentPCMFrame);
+            if (pFlac->currentFLACFrame.pcmFramesRemaining >  offset) {
+                pFlac->currentFLACFrame.pcmFramesRemaining -= offset;
+                pFlac->currentPCMFrame = pcmFrameIndex;
+                return DRFLAC_TRUE;
+            }
+        } else {
+            /* Backward. */
+            drflac_uint32 offsetAbs = (drflac_uint32)(pFlac->currentPCMFrame - pcmFrameIndex);
+            drflac_uint32 currentFLACFramePCMFrameCount = pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
+            drflac_uint32 currentFLACFramePCMFramesConsumed = currentFLACFramePCMFrameCount - pFlac->currentFLACFrame.pcmFramesRemaining;
+            if (currentFLACFramePCMFramesConsumed > offsetAbs) {
+                pFlac->currentFLACFrame.pcmFramesRemaining += offsetAbs;
+                pFlac->currentPCMFrame = pcmFrameIndex;
+                return DRFLAC_TRUE;
+            }
+        }
 
 
-static unsigned int drflac__read_block_header(drflac* pFlac, unsigned int* pBlockSizeOut, bool* pIsLastBlockOut)    // Returns the block type.
-{
-    assert(pFlac != NULL);
+        /*
+        Different techniques depending on encapsulation. Using the native FLAC seektable with Ogg encapsulation is a bit awkward so
+        we'll instead use Ogg's natural seeking facility.
+        */
+#ifndef DR_FLAC_NO_OGG
+        if (pFlac->container == drflac_container_ogg)
+        {
+            wasSuccessful = drflac_ogg__seek_to_pcm_frame(pFlac, pcmFrameIndex);
+        }
+        else
+#endif
+        {
+            /* First try seeking via the seek table. If this fails, fall back to a brute force seek which is much slower. */
+            if (/*!wasSuccessful && */!pFlac->_noSeekTableSeek) {
+                wasSuccessful = drflac__seek_to_pcm_frame__seek_table(pFlac, pcmFrameIndex);
+            }
 
 
-    unsigned char isLastBlock = 1;
-    unsigned char blockType = DRFLAC_BLOCK_TYPE_INVALID;
-    unsigned int blockSize = 0;
+#if !defined(DR_FLAC_NO_CRC)
+            /* Fall back to binary search if seek table seeking fails. This requires the length of the stream to be known. */
+            if (!wasSuccessful && !pFlac->_noBinarySearchSeek && pFlac->totalPCMFrameCount > 0) {
+                wasSuccessful = drflac__seek_to_pcm_frame__binary_search(pFlac, pcmFrameIndex);
+            }
+#endif
 
 
-    if (!drflac__read_uint8(pFlac, 1, &isLastBlock)) {
-        goto done_reading_block_header;
-    }
+            /* Fall back to brute force if all else fails. */
+            if (!wasSuccessful && !pFlac->_noBruteForceSeek) {
+                wasSuccessful = drflac__seek_to_pcm_frame__brute_force(pFlac, pcmFrameIndex);
+            }
+        }
 
 
-    if (!drflac__read_uint8(pFlac, 7, &blockType)) {
-        goto done_reading_block_header;
-    }
+        if (wasSuccessful) {
+            pFlac->currentPCMFrame = pcmFrameIndex;
+        } else {
+            /* Seek failed. Try putting the decoder back to it's original state. */
+            if (drflac_seek_to_pcm_frame(pFlac, originalPCMFrame) == DRFLAC_FALSE) {
+                /* Failed to seek back to the original PCM frame. Fall back to 0. */
+                drflac_seek_to_pcm_frame(pFlac, 0);
+            }
+        }
 
 
-    if (!drflac__read_uint32(pFlac, 24, &blockSize)) {
-        goto done_reading_block_header;
+        return wasSuccessful;
     }
     }
+}
 
 
 
 
-done_reading_block_header:
-    if (pBlockSizeOut) {
-        *pBlockSizeOut = blockSize;
-    }
 
 
-    if (pIsLastBlockOut) {
-        *pIsLastBlockOut = (isLastBlock != 0);
-    }
+/* High Level APIs */
 
 
-    return blockType;
+/* SIZE_MAX */
+#if defined(SIZE_MAX)
+    #define DRFLAC_SIZE_MAX  SIZE_MAX
+#else
+    #if defined(DRFLAC_64BIT)
+        #define DRFLAC_SIZE_MAX  ((drflac_uint64)0xFFFFFFFFFFFFFFFF)
+    #else
+        #define DRFLAC_SIZE_MAX  0xFFFFFFFF
+    #endif
+#endif
+/* End SIZE_MAX */
+
+
+/* Using a macro as the definition of the drflac__full_decode_and_close_*() API family. Sue me. */
+#define DRFLAC_DEFINE_FULL_READ_AND_CLOSE(extension, type) \
+static type* drflac__full_read_and_close_ ## extension (drflac* pFlac, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut)\
+{                                                                                                                                                                   \
+    type* pSampleData = NULL;                                                                                                                                       \
+    drflac_uint64 totalPCMFrameCount;                                                                                                                               \
+    type buffer[4096];                                                                                                                                              \
+    drflac_uint64 pcmFramesRead;                                                                                                                                    \
+    size_t sampleDataBufferSize = sizeof(buffer);                                                                                                                   \
+                                                                                                                                                                    \
+    DRFLAC_ASSERT(pFlac != NULL);                                                                                                                                   \
+                                                                                                                                                                    \
+    totalPCMFrameCount = 0;                                                                                                                                         \
+                                                                                                                                                                    \
+    pSampleData = (type*)drflac__malloc_from_callbacks(sampleDataBufferSize, &pFlac->allocationCallbacks);                                                          \
+    if (pSampleData == NULL) {                                                                                                                                      \
+        goto on_error;                                                                                                                                              \
+    }                                                                                                                                                               \
+                                                                                                                                                                    \
+    while ((pcmFramesRead = (drflac_uint64)drflac_read_pcm_frames_##extension(pFlac, sizeof(buffer)/sizeof(buffer[0])/pFlac->channels, buffer)) > 0) {              \
+        if (((totalPCMFrameCount + pcmFramesRead) * pFlac->channels * sizeof(type)) > sampleDataBufferSize) {                                                       \
+            type* pNewSampleData;                                                                                                                                   \
+            size_t newSampleDataBufferSize;                                                                                                                         \
+                                                                                                                                                                    \
+            newSampleDataBufferSize = sampleDataBufferSize * 2;                                                                                                     \
+            pNewSampleData = (type*)drflac__realloc_from_callbacks(pSampleData, newSampleDataBufferSize, sampleDataBufferSize, &pFlac->allocationCallbacks);        \
+            if (pNewSampleData == NULL) {                                                                                                                           \
+                drflac__free_from_callbacks(pSampleData, &pFlac->allocationCallbacks);                                                                              \
+                goto on_error;                                                                                                                                      \
+            }                                                                                                                                                       \
+                                                                                                                                                                    \
+            sampleDataBufferSize = newSampleDataBufferSize;                                                                                                         \
+            pSampleData = pNewSampleData;                                                                                                                           \
+        }                                                                                                                                                           \
+                                                                                                                                                                    \
+        DRFLAC_COPY_MEMORY(pSampleData + (totalPCMFrameCount*pFlac->channels), buffer, (size_t)(pcmFramesRead*pFlac->channels*sizeof(type)));                       \
+        totalPCMFrameCount += pcmFramesRead;                                                                                                                        \
+    }                                                                                                                                                               \
+                                                                                                                                                                    \
+    /* At this point everything should be decoded, but we just want to fill the unused part buffer with silence - need to                                           \
+       protect those ears from random noise! */                                                                                                                     \
+    DRFLAC_ZERO_MEMORY(pSampleData + (totalPCMFrameCount*pFlac->channels), (size_t)(sampleDataBufferSize - totalPCMFrameCount*pFlac->channels*sizeof(type)));       \
+                                                                                                                                                                    \
+    if (sampleRateOut) *sampleRateOut = pFlac->sampleRate;                                                                                                          \
+    if (channelsOut) *channelsOut = pFlac->channels;                                                                                                                \
+    if (totalPCMFrameCountOut) *totalPCMFrameCountOut = totalPCMFrameCount;                                                                                         \
+                                                                                                                                                                    \
+    drflac_close(pFlac);                                                                                                                                            \
+    return pSampleData;                                                                                                                                             \
+                                                                                                                                                                    \
+on_error:                                                                                                                                                           \
+    drflac_close(pFlac);                                                                                                                                            \
+    return NULL;                                                                                                                                                    \
 }
 }
 
 
+DRFLAC_DEFINE_FULL_READ_AND_CLOSE(s32, drflac_int32)
+DRFLAC_DEFINE_FULL_READ_AND_CLOSE(s16, drflac_int16)
+DRFLAC_DEFINE_FULL_READ_AND_CLOSE(f32, float)
 
 
-static void drflac__get_current_frame_sample_range(drflac* pFlac, uint64_t* pFirstSampleInFrameOut, uint64_t* pLastSampleInFrameOut)
+DRFLAC_API drflac_int32* drflac_open_and_read_pcm_frames_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut, const drflac_allocation_callbacks* pAllocationCallbacks)
 {
 {
-    assert(pFlac != NULL);
+    drflac* pFlac;
 
 
-    unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
-
-    uint64_t firstSampleInFrame = pFlac->currentFrame.sampleNumber;
-    if (firstSampleInFrame == 0) {
-        firstSampleInFrame = pFlac->currentFrame.frameNumber * pFlac->maxBlockSize*channelCount;
+    if (channelsOut) {
+        *channelsOut = 0;
     }
     }
-
-    uint64_t lastSampleInFrame = firstSampleInFrame + (pFlac->currentFrame.blockSize*channelCount);
-    if (lastSampleInFrame > 0) {
-        lastSampleInFrame -= 1; // Needs to be zero based.
+    if (sampleRateOut) {
+        *sampleRateOut = 0;
     }
     }
-
-
-    if (pFirstSampleInFrameOut) {
-        *pFirstSampleInFrameOut = firstSampleInFrame;
+    if (totalPCMFrameCountOut) {
+        *totalPCMFrameCountOut = 0;
     }
     }
-    if (pLastSampleInFrameOut) {
-        *pLastSampleInFrameOut = lastSampleInFrame;
+
+    pFlac = drflac_open(onRead, onSeek, onTell, pUserData, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        return NULL;
     }
     }
+
+    return drflac__full_read_and_close_s32(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
 }
 }
 
 
-static bool drflac__seek_to_first_frame(drflac* pFlac)
+DRFLAC_API drflac_int16* drflac_open_and_read_pcm_frames_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut, const drflac_allocation_callbacks* pAllocationCallbacks)
 {
 {
-    assert(pFlac != NULL);
-
-    bool result = drflac__seek_to_byte(pFlac, (long long)pFlac->firstFramePos);
-    pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
-    pFlac->cache = 0;
-
-    memset(&pFlac->currentFrame, 0, sizeof(pFlac->currentFrame));
+    drflac* pFlac;
 
 
+    if (channelsOut) {
+        *channelsOut = 0;
+    }
+    if (sampleRateOut) {
+        *sampleRateOut = 0;
+    }
+    if (totalPCMFrameCountOut) {
+        *totalPCMFrameCountOut = 0;
+    }
 
 
-    return result;
-}
+    pFlac = drflac_open(onRead, onSeek, onTell, pUserData, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        return NULL;
+    }
 
 
-static DRFLAC_INLINE bool drflac__seek_to_next_frame(drflac* pFlac)
-{
-    // This function should only ever be called while the decoder is sitting on the first byte past the FRAME_HEADER section.
-    assert(pFlac != NULL);
-    return drflac__seek_frame(pFlac);
+    return drflac__full_read_and_close_s16(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
 }
 }
 
 
-static bool drflac__seek_to_frame_containing_sample(drflac* pFlac, uint64_t sampleIndex)
+DRFLAC_API float* drflac_open_and_read_pcm_frames_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_tell_proc onTell, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut, const drflac_allocation_callbacks* pAllocationCallbacks)
 {
 {
-    assert(pFlac != NULL);
+    drflac* pFlac;
 
 
-    if (!drflac__seek_to_first_frame(pFlac)) {
-        return false;
+    if (channelsOut) {
+        *channelsOut = 0;
+    }
+    if (sampleRateOut) {
+        *sampleRateOut = 0;
+    }
+    if (totalPCMFrameCountOut) {
+        *totalPCMFrameCountOut = 0;
     }
     }
 
 
-    uint64_t firstSampleInFrame = 0;
-    uint64_t lastSampleInFrame = 0;
-    for (;;)
-    {
-        // We need to read the frame's header in order to determine the range of samples it contains.
-        if (!drflac__read_next_frame_header(pFlac)) {
-            return false;
-        }
-
-        drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, &lastSampleInFrame);
-        if (sampleIndex >= firstSampleInFrame && sampleIndex <= lastSampleInFrame) {
-            break;  // The sample is in this frame.
-        }
-
-        if (!drflac__seek_to_next_frame(pFlac)) {
-            return false;
-        }
+    pFlac = drflac_open(onRead, onSeek, onTell, pUserData, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        return NULL;
     }
     }
 
 
-    // If we get here we should be right at the start of the frame containing the sample.
-    return true;
+    return drflac__full_read_and_close_f32(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
 }
 }
 
 
-static bool drflac__seek_to_sample__brute_force(drflac* pFlac, uint64_t sampleIndex)
+#ifndef DR_FLAC_NO_STDIO
+DRFLAC_API drflac_int32* drflac_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
 {
 {
-    if (!drflac__seek_to_frame_containing_sample(pFlac, sampleIndex)) {
-        return false;
-    }
+    drflac* pFlac;
 
 
-    // At this point we should be sitting on the first byte of the frame containing the sample. We need to decode every sample up to (but
-    // not including) the sample we're seeking to.
-    uint64_t firstSampleInFrame = 0;
-    drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, NULL);
-
-    assert(firstSampleInFrame <= sampleIndex);
-    size_t samplesToDecode = (size_t)(sampleIndex - firstSampleInFrame);    // <-- Safe cast because the maximum number of samples in a frame is 65535.
-    if (samplesToDecode == 0) {
-        return true;
+    if (sampleRate) {
+        *sampleRate = 0;
+    }
+    if (channels) {
+        *channels = 0;
+    }
+    if (totalPCMFrameCount) {
+        *totalPCMFrameCount = 0;
     }
     }
 
 
-    // At this point we are just sitting on the byte after the frame header. We need to decode the frame before reading anything from it.
-    if (!drflac__decode_frame(pFlac)) {
-        return false;
+    pFlac = drflac_open_file(filename, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        return NULL;
     }
     }
 
 
-    return (drflac_read_s16(pFlac, samplesToDecode, NULL) != 0);
+    return drflac__full_read_and_close_s32(pFlac, channels, sampleRate, totalPCMFrameCount);
 }
 }
 
 
-static bool drflac__seek_to_sample__seek_table(drflac* pFlac, uint64_t sampleIndex)
+DRFLAC_API drflac_int16* drflac_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
 {
 {
-    assert(pFlac != NULL);
+    drflac* pFlac;
 
 
-    if (pFlac->seektableBlock.pos == 0) {
-        return false;
+    if (sampleRate) {
+        *sampleRate = 0;
     }
     }
-
-    if (!drflac__seek_to_byte(pFlac, pFlac->seektableBlock.pos)) {
-        return false;
+    if (channels) {
+        *channels = 0;
     }
     }
-
-    // The number of seek points is derived from the size of the SEEKTABLE block.
-    unsigned int seekpointCount = pFlac->seektableBlock.sizeInBytes / 18;   // 18 = the size of each seek point.
-    if (seekpointCount == 0) {
-        return false;   // Would this ever happen?
+    if (totalPCMFrameCount) {
+        *totalPCMFrameCount = 0;
     }
     }
 
 
-
-    drflac_seekpoint closestSeekpoint = {0};
-
-    unsigned int seekpointsRemaining = seekpointCount;
-    while (seekpointsRemaining > 0)
-    {
-        drflac_seekpoint seekpoint;
-        if (!drflac__read_uint64(pFlac, 64, &seekpoint.firstSample)) {
-            break;
-        }
-        if (!drflac__read_uint64(pFlac, 64, &seekpoint.frameOffset)) {
-            break;
-        }
-        if (!drflac__read_uint16(pFlac, 16, &seekpoint.sampleCount)) {
-            break;
-        }
-
-        if (seekpoint.firstSample * pFlac->channels > sampleIndex) {
-            break;
-        }
-
-        closestSeekpoint = seekpoint;
-        seekpointsRemaining -= 1;
+    pFlac = drflac_open_file(filename, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        return NULL;
     }
     }
 
 
-    // At this point we should have found the seekpoint closest to our sample. We need to seek to it using basically the same
-    // technique as we use with the brute force method.
-    drflac__seek_to_byte(pFlac, pFlac->firstFramePos + closestSeekpoint.frameOffset);
-
-    uint64_t firstSampleInFrame = 0;
-    uint64_t lastSampleInFrame = 0;
-    for (;;)
-    {
-        // We need to read the frame's header in order to determine the range of samples it contains.
-        if (!drflac__read_next_frame_header(pFlac)) {
-            return false;
-        }
+    return drflac__full_read_and_close_s16(pFlac, channels, sampleRate, totalPCMFrameCount);
+}
 
 
-        drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, &lastSampleInFrame);
-        if (sampleIndex >= firstSampleInFrame && sampleIndex <= lastSampleInFrame) {
-            break;  // The sample is in this frame.
-        }
+DRFLAC_API float* drflac_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    drflac* pFlac;
 
 
-        if (!drflac__seek_to_next_frame(pFlac)) {
-            return false;
-        }
+    if (sampleRate) {
+        *sampleRate = 0;
+    }
+    if (channels) {
+        *channels = 0;
+    }
+    if (totalPCMFrameCount) {
+        *totalPCMFrameCount = 0;
     }
     }
 
 
-    assert(firstSampleInFrame <= sampleIndex);
-
-    // At this point we are just sitting on the byte after the frame header. We need to decode the frame before reading anything from it.
-    if (!drflac__decode_frame(pFlac)) {
-        return false;
+    pFlac = drflac_open_file(filename, pAllocationCallbacks);
+    if (pFlac == NULL) {
+        return NULL;
     }
     }
 
 
-    size_t samplesToDecode = (size_t)(sampleIndex - firstSampleInFrame);    // <-- Safe cast because the maximum number of samples in a frame is 65535.
-    return drflac_read_s16(pFlac, samplesToDecode, NULL) == samplesToDecode;
+    return drflac__full_read_and_close_f32(pFlac, channels, sampleRate, totalPCMFrameCount);
 }
 }
+#endif
 
 
-
-static drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData)
+DRFLAC_API drflac_int32* drflac_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
 {
 {
-    if (onRead == NULL || onSeek == NULL) {
-        return NULL;
-    }
+    drflac* pFlac;
 
 
-    unsigned char id[4];
-    if (onRead(pUserData, id, 4) != 4 || id[0] != 'f' || id[1] != 'L' || id[2] != 'a' || id[3] != 'C') {
-        return NULL;    // Not a FLAC stream.
+    if (sampleRate) {
+        *sampleRate = 0;
+    }
+    if (channels) {
+        *channels = 0;
+    }
+    if (totalPCMFrameCount) {
+        *totalPCMFrameCount = 0;
     }
     }
 
 
-    drflac tempFlac;
-    memset(&tempFlac, 0, sizeof(tempFlac));
-    tempFlac.onRead         = onRead;
-    tempFlac.onSeek         = onSeek;
-    tempFlac.pUserData      = pUserData;
-    tempFlac.currentBytePos = 4;
-    tempFlac.nextL2Line     = sizeof(tempFlac.cacheL2) / sizeof(tempFlac.cacheL2[0]); // <-- Initialize to this to force a client-side data retrieval right from the start.
-    tempFlac.consumedBits   = sizeof(tempFlac.cache)*8;
-
-    // The first metadata block should be the STREAMINFO block. We don't care about everything in here.
-    unsigned int blockSize;
-    bool isLastBlock;
-    int blockType = drflac__read_block_header(&tempFlac, &blockSize, &isLastBlock);
-    if (blockType != DRFLAC_BLOCK_TYPE_STREAMINFO && blockSize != 34) {
+    pFlac = drflac_open_memory(data, dataSize, pAllocationCallbacks);
+    if (pFlac == NULL) {
         return NULL;
         return NULL;
     }
     }
 
 
-    if (!drflac__seek_bits(&tempFlac, 16)) {   // minBlockSize
-        return NULL;
+    return drflac__full_read_and_close_s32(pFlac, channels, sampleRate, totalPCMFrameCount);
+}
+
+DRFLAC_API drflac_int16* drflac_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    drflac* pFlac;
+
+    if (sampleRate) {
+        *sampleRate = 0;
     }
     }
-    if (!drflac__read_uint16(&tempFlac, 16, &tempFlac.maxBlockSize)) {
-        return NULL;
+    if (channels) {
+        *channels = 0;
     }
     }
-    if (!drflac__seek_bits(&tempFlac, 48)) {   // minFrameSize + maxFrameSize
-        return NULL;
+    if (totalPCMFrameCount) {
+        *totalPCMFrameCount = 0;
     }
     }
-    if (!drflac__read_uint32(&tempFlac, 20, &tempFlac.sampleRate)) {
+
+    pFlac = drflac_open_memory(data, dataSize, pAllocationCallbacks);
+    if (pFlac == NULL) {
         return NULL;
         return NULL;
     }
     }
-    if (!drflac__read_uint8(&tempFlac, 3, &tempFlac.channels)) {
-        return NULL;
+
+    return drflac__full_read_and_close_s16(pFlac, channels, sampleRate, totalPCMFrameCount);
+}
+
+DRFLAC_API float* drflac_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    drflac* pFlac;
+
+    if (sampleRate) {
+        *sampleRate = 0;
     }
     }
-    if (!drflac__read_uint8(&tempFlac, 5, &tempFlac.bitsPerSample)) {
-        return NULL;
+    if (channels) {
+        *channels = 0;
     }
     }
-    if (!drflac__read_uint64(&tempFlac, 36, &tempFlac.totalSampleCount)) {
-        return NULL;
+    if (totalPCMFrameCount) {
+        *totalPCMFrameCount = 0;
     }
     }
-    if (!drflac__seek_bits(&tempFlac, 128)) {  // MD5
+
+    pFlac = drflac_open_memory(data, dataSize, pAllocationCallbacks);
+    if (pFlac == NULL) {
         return NULL;
         return NULL;
     }
     }
 
 
-    tempFlac.channels += 1;
-    tempFlac.bitsPerSample += 1;
-    tempFlac.totalSampleCount *= tempFlac.channels;
+    return drflac__full_read_and_close_f32(pFlac, channels, sampleRate, totalPCMFrameCount);
+}
 
 
-    while (!isLastBlock)
-    {
-        blockType = drflac__read_block_header(&tempFlac, &blockSize, &isLastBlock);
 
 
-        switch (blockType)
-        {
-            case DRFLAC_BLOCK_TYPE_APPLICATION:
-            {
-                tempFlac.applicationBlock.pos = drflac__tell(&tempFlac);
-                tempFlac.applicationBlock.sizeInBytes = blockSize;
-            } break;
+DRFLAC_API void drflac_free(void* p, const drflac_allocation_callbacks* pAllocationCallbacks)
+{
+    if (pAllocationCallbacks != NULL) {
+        drflac__free_from_callbacks(p, pAllocationCallbacks);
+    } else {
+        drflac__free_default(p, NULL);
+    }
+}
 
 
-            case DRFLAC_BLOCK_TYPE_SEEKTABLE:
-            {
-                tempFlac.seektableBlock.pos = drflac__tell(&tempFlac);
-                tempFlac.seektableBlock.sizeInBytes = blockSize;
-            } break;
 
 
-            case DRFLAC_BLOCK_TYPE_VORBIS_COMMENT:
-            {
-                tempFlac.vorbisCommentBlock.pos = drflac__tell(&tempFlac);
-                tempFlac.vorbisCommentBlock.sizeInBytes = blockSize;
-            } break;
 
 
-            case DRFLAC_BLOCK_TYPE_CUESHEET:
-            {
-                tempFlac.cuesheetBlock.pos = drflac__tell(&tempFlac);
-                tempFlac.cuesheetBlock.sizeInBytes = blockSize;
-            } break;
 
 
-            case DRFLAC_BLOCK_TYPE_PICTURE:
-            {
-                tempFlac.pictureBlock.pos = drflac__tell(&tempFlac);
-                tempFlac.pictureBlock.sizeInBytes = blockSize;
-            } break;
+DRFLAC_API void drflac_init_vorbis_comment_iterator(drflac_vorbis_comment_iterator* pIter, drflac_uint32 commentCount, const void* pComments)
+{
+    if (pIter == NULL) {
+        return;
+    }
 
 
+    pIter->countRemaining = commentCount;
+    pIter->pRunningData   = (const char*)pComments;
+}
 
 
-            // These blocks we either don't care about or aren't supporting.
-            case DRFLAC_BLOCK_TYPE_PADDING:
-            case DRFLAC_BLOCK_TYPE_INVALID:
-            default: break;
-        }
+DRFLAC_API const char* drflac_next_vorbis_comment(drflac_vorbis_comment_iterator* pIter, drflac_uint32* pCommentLengthOut)
+{
+    drflac_int32 length;
+    const char* pComment;
 
 
-        if (!drflac__seek_bits(&tempFlac, blockSize*8)) {
-            return NULL;
-        }
+    /* Safety. */
+    if (pCommentLengthOut) {
+        *pCommentLengthOut = 0;
+    }
+
+    if (pIter == NULL || pIter->countRemaining == 0 || pIter->pRunningData == NULL) {
+        return NULL;
     }
     }
 
 
+    length = drflac__le2host_32_ptr_unaligned(pIter->pRunningData);
+    pIter->pRunningData += 4;
 
 
-    // At this point we should be sitting right at the start of the very first frame.
-    tempFlac.firstFramePos = drflac__tell(&tempFlac);
+    pComment = pIter->pRunningData;
+    pIter->pRunningData += length;
+    pIter->countRemaining -= 1;
 
 
-    drflac* pFlac = (drflac*)malloc(sizeof(*pFlac) - sizeof(pFlac->pExtraData) + (tempFlac.maxBlockSize * tempFlac.channels * sizeof(int32_t)));
-    memcpy(pFlac, &tempFlac, sizeof(tempFlac) - sizeof(pFlac->pExtraData));
-    pFlac->pDecodedSamples = (int32_t*)pFlac->pExtraData;
+    if (pCommentLengthOut) {
+        *pCommentLengthOut = length;
+    }
 
 
-    return pFlac;
+    return pComment;
 }
 }
 
 
-static void drflac_close(drflac* pFlac)
-{
-    if (pFlac == NULL) {
-        return;
-    }
 
 
-#ifndef DR_FLAC_NO_STDIO
-    // If we opened the file with drflac_open_file() we will want to close the file handle. We can know whether or not drflac_open_file()
-    // was used by looking at the callbacks.
-    if (pFlac->onRead == drflac__on_read_stdio) {
-#if defined(DR_OPUS_NO_WIN32_IO) || !defined(_WIN32)
-        fclose((FILE*)pFlac->pUserData);
-#else
-        CloseHandle((HANDLE)pFlac->pUserData);
-#endif
-    }
-#endif
 
 
-    // If we opened the file with drflac_open_memory() we will want to free() the user data.
-    if (pFlac->onRead == drflac__on_read_memory) {
-        free(pFlac->pUserData);
+
+DRFLAC_API void drflac_init_cuesheet_track_iterator(drflac_cuesheet_track_iterator* pIter, drflac_uint32 trackCount, const void* pTrackData)
+{
+    if (pIter == NULL) {
+        return;
     }
     }
 
 
-    free(pFlac);
+    pIter->countRemaining = trackCount;
+    pIter->pRunningData   = (const char*)pTrackData;
 }
 }
 
 
-static uint64_t drflac__read_s16__misaligned(drflac* pFlac, uint64_t samplesToRead, int16_t* bufferOut)
+DRFLAC_API drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter, drflac_cuesheet_track* pCuesheetTrack)
 {
 {
-    unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
-
-    // We should never be calling this when the number of samples to read is >= the sample count.
-    assert(samplesToRead < channelCount);
-    assert(pFlac->currentFrame.samplesRemaining > 0 && samplesToRead <= pFlac->currentFrame.samplesRemaining);
+    drflac_cuesheet_track cuesheetTrack;
+    const char* pRunningData;
+    drflac_uint64 offsetHi;
+    drflac_uint64 offsetLo;
 
 
+    if (pIter == NULL || pIter->countRemaining == 0 || pIter->pRunningData == NULL) {
+        return DRFLAC_FALSE;
+    }
 
 
-    uint64_t samplesRead = 0;
-    while (samplesToRead > 0)
-    {
-        uint64_t totalSamplesInFrame = pFlac->currentFrame.blockSize * channelCount;
-        uint64_t samplesReadFromFrameSoFar = totalSamplesInFrame - pFlac->currentFrame.samplesRemaining;
-        unsigned int channelIndex = samplesReadFromFrameSoFar % channelCount;
+    pRunningData = pIter->pRunningData;
 
 
-        unsigned long long nextSampleInFrame = samplesReadFromFrameSoFar / channelCount;
+    offsetHi                   = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
+    offsetLo                   = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
+    cuesheetTrack.offset       = offsetLo | (offsetHi << 32);
+    cuesheetTrack.trackNumber  = pRunningData[0];                                         pRunningData += 1;
+    DRFLAC_COPY_MEMORY(cuesheetTrack.ISRC, pRunningData, sizeof(cuesheetTrack.ISRC));     pRunningData += 12;
+    cuesheetTrack.isAudio      = (pRunningData[0] & 0x80) != 0;
+    cuesheetTrack.preEmphasis  = (pRunningData[0] & 0x40) != 0;                           pRunningData += 14;
+    cuesheetTrack.indexCount   = pRunningData[0];                                         pRunningData += 1;
+    cuesheetTrack.pIndexPoints = (const drflac_cuesheet_track_index*)pRunningData;        pRunningData += cuesheetTrack.indexCount * sizeof(drflac_cuesheet_track_index);
 
 
-        int decodedSample = 0;
-        switch (pFlac->currentFrame.channelAssignment)
-        {
-            case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
-            {
-                if (channelIndex == 0) {
-                    decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
-                } else {
-                    int side = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
-                    int left = pFlac->currentFrame.subframes[channelIndex - 1].pDecodedSamples[nextSampleInFrame];
-                    decodedSample = left - side;
-                }
+    pIter->pRunningData = pRunningData;
+    pIter->countRemaining -= 1;
 
 
-            } break;
+    if (pCuesheetTrack) {
+        *pCuesheetTrack = cuesheetTrack;
+    }
 
 
-            case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
-            {
-                if (channelIndex == 0) {
-                    int side  = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
-                    int right = pFlac->currentFrame.subframes[channelIndex + 1].pDecodedSamples[nextSampleInFrame];
-                    decodedSample = side + right;
-                } else {
-                    decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
-                }
+    return DRFLAC_TRUE;
+}
 
 
-            } break;
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+    #pragma GCC diagnostic pop
+#endif
+#endif  /* dr_flac_c */
+#endif  /* DR_FLAC_IMPLEMENTATION */
 
 
-            case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
-            {
-                int mid;
-                int side;
-                if (channelIndex == 0) {
-                    mid  = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
-                    side = pFlac->currentFrame.subframes[channelIndex + 1].pDecodedSamples[nextSampleInFrame];
-
-                    mid = (((unsigned int)mid) << 1) | (side & 0x01);
-                    decodedSample = (mid + side) >> 1;
-                } else {
-                    mid  = pFlac->currentFrame.subframes[channelIndex - 1].pDecodedSamples[nextSampleInFrame];
-                    side = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
 
 
-                    mid = (((unsigned int)mid) << 1) | (side & 0x01);
-                    decodedSample = (mid - side) >> 1;
-                }
+/*
+REVISION HISTORY
+================
+v0.13.2 - 2025-12-02
+  - Improve robustness of the parsing of picture metadata to improve support for memory constrained embedded devices.
+  - Fix a warning about an assigned by unused variable.
+  - Improvements to drflac_open_and_read_pcm_frames_*() and family to avoid excessively large memory allocations from malformed files.
 
 
-            } break;
+v0.13.1 - 2025-09-10
+  - Fix an error with the NXDK build.
 
 
-            case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
-            default:
-            {
-                decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
-            } break;
-        }
+v0.13.0 - 2025-07-23
+  - API CHANGE: Seek origin enums have been renamed to match the naming convention used by other dr_libs libraries:
+    - drflac_seek_origin_start   -> DRFLAC_SEEK_SET
+    - drflac_seek_origin_current -> DRFLAC_SEEK_CUR
+    - DRFLAC_SEEK_END (new)
+  - API CHANGE: A new seek origin has been added to allow seeking from the end of the file. If you implement your own `onSeek` callback, you should now detect and handle `DRFLAC_SEEK_END`. If seeking to the end is not supported, return `DRFLAC_FALSE`. If you only use `*_open_file()` or `*_open_memory()`, you need not change anything.
+  - API CHANGE: An `onTell` callback has been added to the following functions:
+    - drflac_open()
+    - drflac_open_relaxed()
+    - drflac_open_with_metadata()
+    - drflac_open_with_metadata_relaxed()
+    - drflac_open_and_read_pcm_frames_s32()
+    - drflac_open_and_read_pcm_frames_s16()
+    - drflac_open_and_read_pcm_frames_f32()
+  - Fix compilation for AIX OS.
 
 
-        int shift = (16 - pFlac->bitsPerSample) + pFlac->currentFrame.subframes[channelIndex].wastedBitsPerSample;
-        if (shift >= 0) {
-            decodedSample <<= shift;
-        } else {
-            decodedSample >>= -shift;
-        }
+v0.12.43 - 2024-12-17
+  - Fix a possible buffer overflow during decoding.
+  - Improve detection of ARM64EC
 
 
-        if (bufferOut) {
-            *bufferOut++ = decodedSample;
-        }
+v0.12.42 - 2023-11-02
+  - Fix build for ARMv6-M.
+  - Fix a compilation warning with GCC.
 
 
-        samplesRead += 1;
-        pFlac->currentFrame.samplesRemaining -= 1;
-        samplesToRead -= 1;
-    }
+v0.12.41 - 2023-06-17
+  - Fix an incorrect date in revision history. No functional change.
 
 
-    return samplesRead;
-}
+v0.12.40 - 2023-05-22
+  - Minor code restructure. No functional change.
 
 
-static uint64_t drflac__seek_forward_by_samples(drflac* pFlac, uint64_t samplesToRead)
-{
-    uint64_t samplesRead = 0;
-    while (samplesToRead > 0)
-    {
-        if (pFlac->currentFrame.samplesRemaining == 0)
-        {
-            if (!drflac__read_and_decode_next_frame(pFlac)) {
-                break;  // Couldn't read the next frame, so just break from the loop and return.
-            }
-        }
-        else
-        {
-            samplesRead += 1;
-            pFlac->currentFrame.samplesRemaining -= 1;
-            samplesToRead -= 1;
-        }
-    }
+v0.12.39 - 2022-09-17
+  - Fix compilation with DJGPP.
+  - Fix compilation error with Visual Studio 2019 and the ARM build.
+  - Fix an error with SSE 4.1 detection.
+  - Add support for disabling wchar_t with DR_WAV_NO_WCHAR.
+  - Improve compatibility with compilers which lack support for explicit struct packing.
+  - Improve compatibility with low-end and embedded hardware by reducing the amount of stack
+    allocation when loading an Ogg encapsulated file.
 
 
-    return samplesRead;
-}
+v0.12.38 - 2022-04-10
+  - Fix compilation error on older versions of GCC.
 
 
-static uint64_t drflac_read_s16(drflac* pFlac, uint64_t samplesToRead, int16_t* bufferOut)
-{
-    // Note that <bufferOut> is allowed to be null, in which case this will be treated as something like a seek.
-    if (pFlac == NULL || samplesToRead == 0) {
-        return 0;
-    }
+v0.12.37 - 2022-02-12
+  - Improve ARM detection.
 
 
-    if (bufferOut == NULL) {
-        return drflac__seek_forward_by_samples(pFlac, samplesToRead);
-    }
+v0.12.36 - 2022-02-07
+  - Fix a compilation error with the ARM build.
 
 
+v0.12.35 - 2022-02-06
+  - Fix a bug due to underestimating the amount of precision required for the prediction stage.
+  - Fix some bugs found from fuzz testing.
 
 
-    uint64_t samplesRead = 0;
-    while (samplesToRead > 0)
-    {
-        // If we've run out of samples in this frame, go to the next.
-        if (pFlac->currentFrame.samplesRemaining == 0)
-        {
-            if (!drflac__read_and_decode_next_frame(pFlac)) {
-                break;  // Couldn't read the next frame, so just break from the loop and return.
-            }
-        }
-        else
-        {
-            // Here is where we grab the samples and interleave them.
+v0.12.34 - 2022-01-07
+  - Fix some misalignment bugs when reading metadata.
 
 
-            unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
-            uint64_t totalSamplesInFrame = pFlac->currentFrame.blockSize * channelCount;
-            uint64_t samplesReadFromFrameSoFar = totalSamplesInFrame - pFlac->currentFrame.samplesRemaining;
+v0.12.33 - 2021-12-22
+  - Fix a bug with seeking when the seek table does not start at PCM frame 0.
 
 
-            int misalignedSampleCount = samplesReadFromFrameSoFar % channelCount;
-            if (misalignedSampleCount > 0) {
-                uint64_t misalignedSamplesRead = drflac__read_s16__misaligned(pFlac, misalignedSampleCount, bufferOut);
-                samplesRead   += misalignedSamplesRead;
-                samplesReadFromFrameSoFar += misalignedSamplesRead;
-                bufferOut     += misalignedSamplesRead;
-                samplesToRead -= misalignedSamplesRead;
-            }
+v0.12.32 - 2021-12-11
+  - Fix a warning with Clang.
 
 
+v0.12.31 - 2021-08-16
+  - Silence some warnings.
 
 
-            uint64_t alignedSampleCountPerChannel = samplesToRead / channelCount;
-            if (alignedSampleCountPerChannel > pFlac->currentFrame.samplesRemaining / channelCount) {
-                alignedSampleCountPerChannel = pFlac->currentFrame.samplesRemaining / channelCount;
-            }
+v0.12.30 - 2021-07-31
+  - Fix platform detection for ARM64.
 
 
-            uint64_t firstAlignedSampleInFrame = samplesReadFromFrameSoFar / channelCount;
-            int unusedBitsPerSample = 16 - pFlac->bitsPerSample;
+v0.12.29 - 2021-04-02
+  - Fix a bug where the running PCM frame index is set to an invalid value when over-seeking.
+  - Fix a decoding error due to an incorrect validation check.
 
 
-            if (unusedBitsPerSample >= 0) {
-                int lshift0 = unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
-                int lshift1 = unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+v0.12.28 - 2021-02-21
+  - Fix a warning due to referencing _MSC_VER when it is undefined.
 
 
-                switch (pFlac->currentFrame.channelAssignment)
-                {
-                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
-                    {
-                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
-                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+v0.12.27 - 2021-01-31
+  - Fix a static analysis warning.
 
 
-                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
-                            int left  = pDecodedSamples0[i];
-                            int side  = pDecodedSamples1[i];
-                            int right = left - side;
+v0.12.26 - 2021-01-17
+  - Fix a compilation warning due to _BSD_SOURCE being deprecated.
+
+v0.12.25 - 2020-12-26
+  - Update documentation.
+
+v0.12.24 - 2020-11-29
+  - Fix ARM64/NEON detection when compiling with MSVC.
+
+v0.12.23 - 2020-11-21
+  - Fix compilation with OpenWatcom.
+
+v0.12.22 - 2020-11-01
+  - Fix an error with the previous release.
+
+v0.12.21 - 2020-11-01
+  - Fix a possible deadlock when seeking.
+  - Improve compiler support for older versions of GCC.
+
+v0.12.20 - 2020-09-08
+  - Fix a compilation error on older compilers.
+
+v0.12.19 - 2020-08-30
+  - Fix a bug due to an undefined 32-bit shift.
+
+v0.12.18 - 2020-08-14
+  - Fix a crash when compiling with clang-cl.
+
+v0.12.17 - 2020-08-02
+  - Simplify sized types.
+
+v0.12.16 - 2020-07-25
+  - Fix a compilation warning.
+
+v0.12.15 - 2020-07-06
+  - Check for negative LPC shifts and return an error.
+
+v0.12.14 - 2020-06-23
+  - Add include guard for the implementation section.
+
+v0.12.13 - 2020-05-16
+  - Add compile-time and run-time version querying.
+    - DRFLAC_VERSION_MINOR
+    - DRFLAC_VERSION_MAJOR
+    - DRFLAC_VERSION_REVISION
+    - DRFLAC_VERSION_STRING
+    - drflac_version()
+    - drflac_version_string()
+
+v0.12.12 - 2020-04-30
+  - Fix compilation errors with VC6.
+
+v0.12.11 - 2020-04-19
+  - Fix some pedantic warnings.
+  - Fix some undefined behaviour warnings.
+
+v0.12.10 - 2020-04-10
+  - Fix some bugs when trying to seek with an invalid seek table.
+
+v0.12.9 - 2020-04-05
+  - Fix warnings.
+
+v0.12.8 - 2020-04-04
+  - Add drflac_open_file_w() and drflac_open_file_with_metadata_w().
+  - Fix some static analysis warnings.
+  - Minor documentation updates.
+
+v0.12.7 - 2020-03-14
+  - Fix compilation errors with VC6.
+
+v0.12.6 - 2020-03-07
+  - Fix compilation error with Visual Studio .NET 2003.
+
+v0.12.5 - 2020-01-30
+  - Silence some static analysis warnings.
+
+v0.12.4 - 2020-01-29
+  - Silence some static analysis warnings.
+
+v0.12.3 - 2019-12-02
+  - Fix some warnings when compiling with GCC and the -Og flag.
+  - Fix a crash in out-of-memory situations.
+  - Fix potential integer overflow bug.
+  - Fix some static analysis warnings.
+  - Fix a possible crash when using custom memory allocators without a custom realloc() implementation.
+  - Fix a bug with binary search seeking where the bits per sample is not a multiple of 8.
+
+v0.12.2 - 2019-10-07
+  - Internal code clean up.
+
+v0.12.1 - 2019-09-29
+  - Fix some Clang Static Analyzer warnings.
+  - Fix an unused variable warning.
+
+v0.12.0 - 2019-09-23
+  - API CHANGE: Add support for user defined memory allocation routines. This system allows the program to specify their own memory allocation
+    routines with a user data pointer for client-specific contextual data. This adds an extra parameter to the end of the following APIs:
+    - drflac_open()
+    - drflac_open_relaxed()
+    - drflac_open_with_metadata()
+    - drflac_open_with_metadata_relaxed()
+    - drflac_open_file()
+    - drflac_open_file_with_metadata()
+    - drflac_open_memory()
+    - drflac_open_memory_with_metadata()
+    - drflac_open_and_read_pcm_frames_s32()
+    - drflac_open_and_read_pcm_frames_s16()
+    - drflac_open_and_read_pcm_frames_f32()
+    - drflac_open_file_and_read_pcm_frames_s32()
+    - drflac_open_file_and_read_pcm_frames_s16()
+    - drflac_open_file_and_read_pcm_frames_f32()
+    - drflac_open_memory_and_read_pcm_frames_s32()
+    - drflac_open_memory_and_read_pcm_frames_s16()
+    - drflac_open_memory_and_read_pcm_frames_f32()
+    Set this extra parameter to NULL to use defaults which is the same as the previous behaviour. Setting this NULL will use
+    DRFLAC_MALLOC, DRFLAC_REALLOC and DRFLAC_FREE.
+  - Remove deprecated APIs:
+    - drflac_read_s32()
+    - drflac_read_s16()
+    - drflac_read_f32()
+    - drflac_seek_to_sample()
+    - drflac_open_and_decode_s32()
+    - drflac_open_and_decode_s16()
+    - drflac_open_and_decode_f32()
+    - drflac_open_and_decode_file_s32()
+    - drflac_open_and_decode_file_s16()
+    - drflac_open_and_decode_file_f32()
+    - drflac_open_and_decode_memory_s32()
+    - drflac_open_and_decode_memory_s16()
+    - drflac_open_and_decode_memory_f32()
+  - Remove drflac.totalSampleCount which is now replaced with drflac.totalPCMFrameCount. You can emulate drflac.totalSampleCount
+    by doing pFlac->totalPCMFrameCount*pFlac->channels.
+  - Rename drflac.currentFrame to drflac.currentFLACFrame to remove ambiguity with PCM frames.
+  - Fix errors when seeking to the end of a stream.
+  - Optimizations to seeking.
+  - SSE improvements and optimizations.
+  - ARM NEON optimizations.
+  - Optimizations to drflac_read_pcm_frames_s16().
+  - Optimizations to drflac_read_pcm_frames_s32().
+
+v0.11.10 - 2019-06-26
+  - Fix a compiler error.
+
+v0.11.9 - 2019-06-16
+  - Silence some ThreadSanitizer warnings.
+
+v0.11.8 - 2019-05-21
+  - Fix warnings.
+
+v0.11.7 - 2019-05-06
+  - C89 fixes.
+
+v0.11.6 - 2019-05-05
+  - Add support for C89.
+  - Fix a compiler warning when CRC is disabled.
+  - Change license to choice of public domain or MIT-0.
+
+v0.11.5 - 2019-04-19
+  - Fix a compiler error with GCC.
+
+v0.11.4 - 2019-04-17
+  - Fix some warnings with GCC when compiling with -std=c99.
+
+v0.11.3 - 2019-04-07
+  - Silence warnings with GCC.
+
+v0.11.2 - 2019-03-10
+  - Fix a warning.
+
+v0.11.1 - 2019-02-17
+  - Fix a potential bug with seeking.
+
+v0.11.0 - 2018-12-16
+  - API CHANGE: Deprecated drflac_read_s32(), drflac_read_s16() and drflac_read_f32() and replaced them with
+    drflac_read_pcm_frames_s32(), drflac_read_pcm_frames_s16() and drflac_read_pcm_frames_f32(). The new APIs take
+    and return PCM frame counts instead of sample counts. To upgrade you will need to change the input count by
+    dividing it by the channel count, and then do the same with the return value.
+  - API_CHANGE: Deprecated drflac_seek_to_sample() and replaced with drflac_seek_to_pcm_frame(). Same rules as
+    the changes to drflac_read_*() apply.
+  - API CHANGE: Deprecated drflac_open_and_decode_*() and replaced with drflac_open_*_and_read_*(). Same rules as
+    the changes to drflac_read_*() apply.
+  - Optimizations.
+
+v0.10.0 - 2018-09-11
+  - Remove the DR_FLAC_NO_WIN32_IO option and the Win32 file IO functionality. If you need to use Win32 file IO you
+    need to do it yourself via the callback API.
+  - Fix the clang build.
+  - Fix undefined behavior.
+  - Fix errors with CUESHEET metdata blocks.
+  - Add an API for iterating over each cuesheet track in the CUESHEET metadata block. This works the same way as the
+    Vorbis comment API.
+  - Other miscellaneous bug fixes, mostly relating to invalid FLAC streams.
+  - Minor optimizations.
+
+v0.9.11 - 2018-08-29
+  - Fix a bug with sample reconstruction.
+
+v0.9.10 - 2018-08-07
+  - Improve 64-bit detection.
+
+v0.9.9 - 2018-08-05
+  - Fix C++ build on older versions of GCC.
+
+v0.9.8 - 2018-07-24
+  - Fix compilation errors.
+
+v0.9.7 - 2018-07-05
+  - Fix a warning.
+
+v0.9.6 - 2018-06-29
+  - Fix some typos.
+
+v0.9.5 - 2018-06-23
+  - Fix some warnings.
+
+v0.9.4 - 2018-06-14
+  - Optimizations to seeking.
+  - Clean up.
+
+v0.9.3 - 2018-05-22
+  - Bug fix.
+
+v0.9.2 - 2018-05-12
+  - Fix a compilation error due to a missing break statement.
 
 
-                            bufferOut[i*2+0] = left  << lshift0;
-                            bufferOut[i*2+1] = right << lshift1;
-                        }
-                    } break;
+v0.9.1 - 2018-04-29
+  - Fix compilation error with Clang.
 
 
-                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
-                    {
-                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
-                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+v0.9 - 2018-04-24
+  - Fix Clang build.
+  - Start using major.minor.revision versioning.
 
 
-                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
-                            int side  = pDecodedSamples0[i];
-                            int right = pDecodedSamples1[i];
-                            int left  = right + side;
+v0.8g - 2018-04-19
+  - Fix build on non-x86/x64 architectures.
 
 
-                            bufferOut[i*2+0] = left  << lshift0;
-                            bufferOut[i*2+1] = right << lshift1;
-                        }
-                    } break;
+v0.8f - 2018-02-02
+  - Stop pretending to support changing rate/channels mid stream.
 
 
-                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
-                    {
-                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
-                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+v0.8e - 2018-02-01
+  - Fix a crash when the block size of a frame is larger than the maximum block size defined by the FLAC stream.
+  - Fix a crash the the Rice partition order is invalid.
 
 
-                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
-                            int side = pDecodedSamples1[i];
-                            int mid  = (((uint32_t)pDecodedSamples0[i]) << 1) | (side & 0x01);
+v0.8d - 2017-09-22
+  - Add support for decoding streams with ID3 tags. ID3 tags are just skipped.
 
 
-                            bufferOut[i*2+0] = ((mid + side) >> 1) << lshift0;
-                            bufferOut[i*2+1] = ((mid - side) >> 1) << lshift1;
-                        }
-                    } break;
+v0.8c - 2017-09-07
+  - Fix warning on non-x86/x64 architectures.
 
 
-                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
-                    default:
-                    {
-                        if (pFlac->currentFrame.channelAssignment == 1) // 1 = Stereo
-                        {
-                            // Stereo optimized inner loop unroll.
-                            const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
-                            const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
-
-                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
-                                bufferOut[i*2+0] = pDecodedSamples0[i] << lshift0;
-                                bufferOut[i*2+1] = pDecodedSamples1[i] << lshift1;
-                            }
-                        }
-                        else
-                        {
-                            // Generic interleaving.
-                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
-                                for (unsigned int j = 0; j < channelCount; ++j) {
-                                    bufferOut[(i*channelCount)+j] = (pFlac->currentFrame.subframes[j].pDecodedSamples[firstAlignedSampleInFrame + i]) << (unusedBitsPerSample + pFlac->currentFrame.subframes[j].wastedBitsPerSample);
-                                }
-                            }
-                        }
-                    } break;
-                }
-            } else {
-                int rshift0 = -unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
-                int rshift1 = -unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+v0.8b - 2017-08-19
+  - Fix build on non-x86/x64 architectures.
 
 
-                switch (pFlac->currentFrame.channelAssignment)
-                {
-                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
-                    {
-                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
-                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+v0.8a - 2017-08-13
+  - A small optimization for the Clang build.
 
 
-                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
-                            int left  = pDecodedSamples0[i];
-                            int side  = pDecodedSamples1[i];
-                            int right = left - side;
+v0.8 - 2017-08-12
+  - API CHANGE: Rename dr_* types to drflac_*.
+  - Optimizations. This brings dr_flac back to about the same class of efficiency as the reference implementation.
+  - Add support for custom implementations of malloc(), realloc(), etc.
+  - Add CRC checking to Ogg encapsulated streams.
+  - Fix VC++ 6 build. This is only for the C++ compiler. The C compiler is not currently supported.
+  - Bug fixes.
 
 
-                            bufferOut[i*2+0] = left  >> rshift0;
-                            bufferOut[i*2+1] = right >> rshift1;
-                        }
-                    } break;
+v0.7 - 2017-07-23
+  - Add support for opening a stream without a header block. To do this, use drflac_open_relaxed() / drflac_open_with_metadata_relaxed().
 
 
-                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
-                    {
-                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
-                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+v0.6 - 2017-07-22
+  - Add support for recovering from invalid frames. With this change, dr_flac will simply skip over invalid frames as if they
+    never existed. Frames are checked against their sync code, the CRC-8 of the frame header and the CRC-16 of the whole frame.
 
 
-                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
-                            int side  = pDecodedSamples0[i];
-                            int right = pDecodedSamples1[i];
-                            int left  = right + side;
+v0.5 - 2017-07-16
+  - Fix typos.
+  - Change drflac_bool* types to unsigned.
+  - Add CRC checking. This makes dr_flac slower, but can be disabled with #define DR_FLAC_NO_CRC.
 
 
-                            bufferOut[i*2+0] = left  >> rshift0;
-                            bufferOut[i*2+1] = right >> rshift1;
-                        }
-                    } break;
+v0.4f - 2017-03-10
+  - Fix a couple of bugs with the bitstreaming code.
 
 
-                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
-                    {
-                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
-                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+v0.4e - 2017-02-17
+  - Fix some warnings.
 
 
-                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
-                            int side = pDecodedSamples1[i];
-                            int mid  = (((uint32_t)pDecodedSamples0[i]) << 1) | (side & 0x01);
+v0.4d - 2016-12-26
+  - Add support for 32-bit floating-point PCM decoding.
+  - Use drflac_int* and drflac_uint* sized types to improve compiler support.
+  - Minor improvements to documentation.
 
 
-                            bufferOut[i*2+0] = ((mid + side) >> 1) >> rshift0;
-                            bufferOut[i*2+1] = ((mid - side) >> 1) >> rshift1;
-                        }
-                    } break;
+v0.4c - 2016-12-26
+  - Add support for signed 16-bit integer PCM decoding.
 
 
-                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
-                    default:
-                    {
-                        if (pFlac->currentFrame.channelAssignment == 1) // 1 = Stereo
-                        {
-                            // Stereo optimized inner loop unroll.
-                            const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
-                            const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
-
-                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
-                                bufferOut[i*2+0] = pDecodedSamples0[i] >> rshift0;
-                                bufferOut[i*2+1] = pDecodedSamples1[i] >> rshift1;
-                            }
-                        }
-                        else
-                        {
-                            // Generic interleaving.
-                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
-                                for (unsigned int j = 0; j < channelCount; ++j) {
-                                    bufferOut[(i*channelCount)+j] = (pFlac->currentFrame.subframes[j].pDecodedSamples[firstAlignedSampleInFrame + i]) >> (pFlac->currentFrame.subframes[j].wastedBitsPerSample - unusedBitsPerSample);
-                                }
-                            }
-                        }
-                    } break;
-                }
-            }
+v0.4b - 2016-10-23
+  - A minor change to drflac_bool8 and drflac_bool32 types.
 
 
-            uint64_t alignedSamplesRead = alignedSampleCountPerChannel * channelCount;
-            samplesRead   += alignedSamplesRead;
-            samplesReadFromFrameSoFar += alignedSamplesRead;
-            bufferOut     += alignedSamplesRead;
-            samplesToRead -= alignedSamplesRead;
-            pFlac->currentFrame.samplesRemaining -= (unsigned int)alignedSamplesRead;
+v0.4a - 2016-10-11
+  - Rename drBool32 to drflac_bool32 for styling consistency.
 
 
+v0.4 - 2016-09-29
+  - API/ABI CHANGE: Use fixed size 32-bit booleans instead of the built-in bool type.
+  - API CHANGE: Rename drflac_open_and_decode*() to drflac_open_and_decode*_s32().
+  - API CHANGE: Swap the order of "channels" and "sampleRate" parameters in drflac_open_and_decode*(). Rationale for this is to
+    keep it consistent with drflac_audio.
 
 
+v0.3f - 2016-09-21
+  - Fix a warning with GCC.
 
 
-            // At this point we may still have some excess samples left to read.
-            if (samplesToRead > 0 && pFlac->currentFrame.samplesRemaining > 0)
-            {
-                uint64_t excessSamplesRead = 0;
-                if (samplesToRead < pFlac->currentFrame.samplesRemaining) {
-                    excessSamplesRead = drflac__read_s16__misaligned(pFlac, samplesToRead, bufferOut);
-                } else {
-                    excessSamplesRead = drflac__read_s16__misaligned(pFlac, pFlac->currentFrame.samplesRemaining, bufferOut);
-                }
+v0.3e - 2016-09-18
+  - Fixed a bug where GCC 4.3+ was not getting properly identified.
+  - Fixed a few typos.
+  - Changed date formats to ISO 8601 (YYYY-MM-DD).
 
 
-                samplesRead   += excessSamplesRead;
-                samplesReadFromFrameSoFar += excessSamplesRead;
-                bufferOut     += excessSamplesRead;
-                samplesToRead -= excessSamplesRead;
-            }
-        }
-    }
+v0.3d - 2016-06-11
+  - Minor clean up.
 
 
-    return samplesRead;
-}
+v0.3c - 2016-05-28
+  - Fixed compilation error.
 
 
-static bool drflac_seek_to_sample(drflac* pFlac, uint64_t sampleIndex)
-{
-    if (pFlac == NULL) {
-        return false;
-    }
+v0.3b - 2016-05-16
+  - Fixed Linux/GCC build.
+  - Updated documentation.
 
 
-    if (sampleIndex == 0) {
-        return drflac__seek_to_first_frame(pFlac);
-    }
+v0.3a - 2016-05-15
+  - Minor fixes to documentation.
 
 
-    // Clamp the sample to the end.
-    if (sampleIndex >= pFlac->totalSampleCount) {
-        sampleIndex  = pFlac->totalSampleCount - 1;
-    }
+v0.3 - 2016-05-11
+  - Optimizations. Now at about parity with the reference implementation on 32-bit builds.
+  - Lots of clean up.
 
 
+v0.2b - 2016-05-10
+  - Bug fixes.
 
 
-    // First try seeking via the seek table. If this fails, fall back to a brute force seek which is much slower.
-    if (!drflac__seek_to_sample__seek_table(pFlac, sampleIndex)) {
-        return drflac__seek_to_sample__brute_force(pFlac, sampleIndex);
-    }
+v0.2a - 2016-05-10
+  - Made drflac_open_and_decode() more robust.
+  - Removed an unused debugging variable
 
 
-    return true;
-}
+v0.2 - 2016-05-09
+  - Added support for Ogg encapsulation.
+  - API CHANGE. Have the onSeek callback take a third argument which specifies whether or not the seek
+    should be relative to the start or the current position. Also changes the seeking rules such that
+    seeking offsets will never be negative.
+  - Have drflac_open_and_decode() fail gracefully if the stream has an unknown total sample count.
 
 
+v0.1b - 2016-05-07
+  - Properly close the file handle in drflac_open_file() and family when the decoder fails to initialize.
+  - Removed a stale comment.
 
 
-#endif  //DR_FLAC_IMPLEMENTATION
+v0.1a - 2016-05-05
+  - Minor formatting changes.
+  - Fixed a warning on the GCC build.
 
 
+v0.1 - 2016-05-03
+  - Initial versioned release.
+*/
 
 
 /*
 /*
+This software is available as a choice of the following licenses. Choose
+whichever you prefer.
+
+===============================================================================
+ALTERNATIVE 1 - Public Domain (www.unlicense.org)
+===============================================================================
 This is free and unencumbered software released into the public domain.
 This is free and unencumbered software released into the public domain.
 
 
-Anyone is free to copy, modify, publish, use, compile, sell, or
-distribute this software, either in source code form or as a compiled
-binary, for any purpose, commercial or non-commercial, and by any
-means.
-
-In jurisdictions that recognize copyright laws, the author or authors
-of this software dedicate any and all copyright interest in the
-software to the public domain. We make this dedication for the benefit
-of the public at large and to the detriment of our heirs and
-successors. We intend this dedication to be an overt act of
-relinquishment in perpetuity of all present and future rights to this
-software under copyright law.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 
 For more information, please refer to <http://unlicense.org/>
 For more information, please refer to <http://unlicense.org/>
+
+===============================================================================
+ALTERNATIVE 2 - MIT No Attribution
+===============================================================================
+Copyright 2023 David Reid
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
 */
 */

+ 1 - 1
panda/src/movies/flacAudio.h

@@ -20,7 +20,7 @@
 class FlacAudioCursor;
 class FlacAudioCursor;
 
 
 /**
 /**
- * Reads FLAC audio files.  Ogg-encapsulated FLAC files are not supported.
+ * Reads FLAC audio files.  Ogg-encapsulated FLAC files are now supported.
  *
  *
  * @since 1.10.0
  * @since 1.10.0
  */
  */

+ 62 - 15
panda/src/movies/flacAudioCursor.cxx

@@ -24,9 +24,9 @@ extern "C" {
 /**
 /**
  * Callback passed to dr_flac to implement file I/O via the VirtualFileSystem.
  * Callback passed to dr_flac to implement file I/O via the VirtualFileSystem.
  */
  */
-static size_t cb_read_proc(void *user, void *buffer, size_t size) {
-  std::istream *stream = (std::istream *)user;
-  nassertr(stream != nullptr, false);
+size_t cb_read_proc(void *user, void *buffer, size_t size) {
+  std::istream *stream = static_cast<FlacAudioCursor*>(user)->_stream;
+  nassertr(stream != nullptr, 0);
 
 
   stream->read((char *)buffer, size);
   stream->read((char *)buffer, size);
 
 
@@ -39,14 +39,53 @@ static size_t cb_read_proc(void *user, void *buffer, size_t size) {
 }
 }
 
 
 /**
 /**
- * Callback passed to dr_flac to implement file I/O via the VirtualFileSystem.
+ * Callback passed to dr_flac to implement file seeking via the VirtualFileSystem.
  */
  */
-static bool cb_seek_proc(void *user, int offset) {
-  std::istream *stream = (std::istream *)user;
-  nassertr(stream != nullptr, false);
+drflac_bool32 cb_seek_proc(void* user, int offset, drflac_seek_origin origin) {
+  std::istream* stream = static_cast<FlacAudioCursor*>(user)->_stream;
+  nassertr(stream != nullptr, DRFLAC_FALSE);
+
+  std::ios_base::seekdir dir;
+  switch (origin) {
+    case DRFLAC_SEEK_SET:
+      dir = std::ios_base::beg;
+      break;
+    case DRFLAC_SEEK_CUR:
+      dir = std::ios_base::cur;
+      break;
+    default:
+      return DRFLAC_FALSE;
+  }
 
 
-  stream->seekg(offset, std::ios::cur);
-  return !stream->fail();
+  stream->seekg(offset, dir);
+  return !stream->fail() ? DRFLAC_TRUE : DRFLAC_FALSE;
+}
+
+/**
+ * Callback passed to dr_flac to report the current stream position.
+ */
+drflac_bool32 cb_tell_proc(void *user, drflac_int64 *pCursor) {
+  std::istream *stream = static_cast<FlacAudioCursor*>(user)->_stream;
+  nassertr(stream != nullptr, DRFLAC_FALSE);
+
+  *pCursor = (drflac_int64)stream->tellg();
+  return !stream->fail() ? DRFLAC_TRUE : DRFLAC_FALSE;
+}
+
+/**
+ * Callback passed to dr_flac to process metadata found inside flac files.
+ */
+void cb_meta_proc(void* pUserData, drflac_metadata* pMetadata) {
+  FlacAudioCursor* cursor = static_cast<FlacAudioCursor*>(pUserData);
+  if (pMetadata->type == DRFLAC_METADATA_BLOCK_TYPE_VORBIS_COMMENT) {
+    drflac_vorbis_comment_iterator iter;
+    drflac_init_vorbis_comment_iterator(&iter, pMetadata->data.vorbis_comment.commentCount, pMetadata->data.vorbis_comment.pComments);
+    const char* next_comment;
+    drflac_uint32 comment_length;
+    while ((next_comment = drflac_next_vorbis_comment(&iter, &comment_length)) != nullptr) {
+      cursor->raw_comment.push_back({ next_comment, comment_length });
+    }
+  }
 }
 }
 
 
 TypeHandle FlacAudioCursor::_type_handle;
 TypeHandle FlacAudioCursor::_type_handle;
@@ -60,12 +99,13 @@ FlacAudioCursor(FlacAudio *src, std::istream *stream) :
   MovieAudioCursor(src),
   MovieAudioCursor(src),
   _is_valid(false),
   _is_valid(false),
   _drflac(nullptr),
   _drflac(nullptr),
-  _stream(stream)
+  _stream(stream),
+  raw_comment()
 {
 {
   nassertv(stream != nullptr);
   nassertv(stream != nullptr);
   nassertv(stream->good());
   nassertv(stream->good());
 
 
-  _drflac = drflac_open(&cb_read_proc, &cb_seek_proc, (void *)stream);
+  _drflac = drflac_open_with_metadata(&cb_read_proc, &cb_seek_proc, &cb_tell_proc, &cb_meta_proc, (void *)this, nullptr);
 
 
   if (_drflac == nullptr) {
   if (_drflac == nullptr) {
     movies_cat.error()
     movies_cat.error()
@@ -73,7 +113,7 @@ FlacAudioCursor(FlacAudio *src, std::istream *stream) :
     _is_valid = false;
     _is_valid = false;
   }
   }
 
 
-  _length = (_drflac->totalSampleCount / _drflac->channels) / (double)_drflac->sampleRate;
+  _length = _drflac->totalPCMFrameCount / (double)_drflac->sampleRate;
 
 
   _audio_channels = _drflac->channels;
   _audio_channels = _drflac->channels;
   _audio_rate = _drflac->sampleRate;
   _audio_rate = _drflac->sampleRate;
@@ -107,7 +147,7 @@ seek(double t) {
 
 
   uint64_t sample = t * _drflac->sampleRate;
   uint64_t sample = t * _drflac->sampleRate;
 
 
-  if (drflac_seek_to_sample(_drflac, sample * _drflac->channels)) {
+  if (drflac_seek_to_pcm_frame(_drflac, sample)) {
     _last_seek = sample / (double)_drflac->sampleRate;
     _last_seek = sample / (double)_drflac->sampleRate;
     _samples_read = 0;
     _samples_read = 0;
   }
   }
@@ -120,8 +160,15 @@ seek(double t) {
  */
  */
 int FlacAudioCursor::
 int FlacAudioCursor::
 read_samples(int n, int16_t *data) {
 read_samples(int n, int16_t *data) {
-  int desired = n * _audio_channels;
-  n = drflac_read_s16(_drflac, desired, data) / _audio_channels;
+  n = (int)drflac_read_pcm_frames_s16(_drflac, n, data);
   _samples_read += n;
   _samples_read += n;
   return n;
   return n;
 }
 }
+
+/**
+ *
+ */
+vector_string FlacAudioCursor::
+get_raw_comment() const {
+  return raw_comment;
+}

+ 8 - 0
panda/src/movies/flacAudioCursor.h

@@ -16,6 +16,7 @@
 
 
 #include "pandabase.h"
 #include "pandabase.h"
 #include "movieAudioCursor.h"
 #include "movieAudioCursor.h"
+#include "vector_string.h"
 
 
 #define DR_FLAC_NO_STDIO
 #define DR_FLAC_NO_STDIO
 extern "C" {
 extern "C" {
@@ -31,6 +32,11 @@ class FlacAudio;
  * @since 1.10.0
  * @since 1.10.0
  */
  */
 class EXPCL_PANDA_MOVIES FlacAudioCursor : public MovieAudioCursor {
 class EXPCL_PANDA_MOVIES FlacAudioCursor : public MovieAudioCursor {
+  friend drflac_bool32 cb_tell_proc(void* pUserData, drflac_int64 *pCursor);
+  friend drflac_bool32 cb_seek_proc(void* pUserData, int offset, drflac_seek_origin origin);
+  friend size_t cb_read_proc(void* pUserData, void *buffer, size_t size);
+  friend void cb_meta_proc(void* pUserData, drflac_metadata* pMetadata);
+
 PUBLISHED:
 PUBLISHED:
   explicit FlacAudioCursor(FlacAudio *src, std::istream *stream);
   explicit FlacAudioCursor(FlacAudio *src, std::istream *stream);
   virtual ~FlacAudioCursor();
   virtual ~FlacAudioCursor();
@@ -58,9 +64,11 @@ public:
     return get_class_type();
     return get_class_type();
   }
   }
   virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
   virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+  vector_string get_raw_comment() const;
 
 
 private:
 private:
   static TypeHandle _type_handle;
   static TypeHandle _type_handle;
+  vector_string raw_comment;
 };
 };
 
 
 #include "flacAudioCursor.I"
 #include "flacAudioCursor.I"

+ 9 - 0
panda/src/movies/movieAudioCursor.cxx

@@ -12,6 +12,7 @@
  */
  */
 
 
 #include "movieAudioCursor.h"
 #include "movieAudioCursor.h"
+#include "vector_string.h"
 
 
 TypeHandle MovieAudioCursor::_type_handle;
 TypeHandle MovieAudioCursor::_type_handle;
 
 
@@ -167,3 +168,11 @@ int MovieAudioCursor::
 ready() const {
 ready() const {
   return 0x40000000;
   return 0x40000000;
 }
 }
+
+/**
+ *
+ */
+vector_string MovieAudioCursor::
+get_raw_comment() const {
+  return vector_string();
+}

+ 2 - 0
panda/src/movies/movieAudioCursor.h

@@ -18,6 +18,7 @@
 #include "namable.h"
 #include "namable.h"
 #include "texture.h"
 #include "texture.h"
 #include "pointerTo.h"
 #include "pointerTo.h"
+#include "vector_string.h"
 class MovieAudio;
 class MovieAudio;
 
 
 /**
 /**
@@ -49,6 +50,7 @@ PUBLISHED:
   virtual void seek(double offset);
   virtual void seek(double offset);
   void read_samples(int n, Datagram *dg);
   void read_samples(int n, Datagram *dg);
   vector_uchar read_samples(int n);
   vector_uchar read_samples(int n);
+  virtual vector_string get_raw_comment() const;
 
 
 public:
 public:
   virtual int read_samples(int n, int16_t *data);
   virtual int read_samples(int n, int16_t *data);

+ 26 - 0
panda/src/movies/opusAudioCursor.cxx

@@ -17,6 +17,7 @@
 
 
 #include "opusAudio.h"
 #include "opusAudio.h"
 #include "virtualFileSystem.h"
 #include "virtualFileSystem.h"
+#include "vector_string.h"
 
 
 #ifdef HAVE_OPUS
 #ifdef HAVE_OPUS
 
 
@@ -121,6 +122,20 @@ static const OpusFileCallbacks callbacks = {cb_read, cb_seek, cb_tell, nullptr};
 
 
 TypeHandle OpusAudioCursor::_type_handle;
 TypeHandle OpusAudioCursor::_type_handle;
 
 
+/**
+ *
+ */
+vector_string
+parse_opus_comments(const OpusTags *com) {
+  // Blatant copy from vorbis code because they're the same struct
+  vector_string comments;
+  for (int cnum = 0; cnum < com->comments; ++cnum) {
+    std::string tag(com->user_comments[cnum], com->comment_lengths[cnum]);
+    comments.push_back(std::move(tag));
+  }
+  return comments;
+}
+
 /**
 /**
  * Reads the .wav header from the indicated stream.  This leaves the read
  * Reads the .wav header from the indicated stream.  This leaves the read
  * pointer positioned at the start of the data.
  * pointer positioned at the start of the data.
@@ -156,6 +171,9 @@ OpusAudioCursor(OpusAudio *src, istream *stream) :
   _can_seek_fast = _can_seek;
   _can_seek_fast = _can_seek;
 
 
   _is_valid = true;
   _is_valid = true;
+
+  const OpusTags *tag = op_tags(_op, -1);
+  _comment = parse_opus_comments(tag);
 }
 }
 
 
 /**
 /**
@@ -270,4 +288,12 @@ read_samples(int n, int16_t *data) {
   return n;
   return n;
 }
 }
 
 
+/**
+ *
+ */
+vector_string OpusAudioCursor::
+get_raw_comment() const {
+  return _comment;
+}
+
 #endif // HAVE_OPUS
 #endif // HAVE_OPUS

+ 3 - 0
panda/src/movies/opusAudioCursor.h

@@ -16,6 +16,7 @@
 
 
 #include "pandabase.h"
 #include "pandabase.h"
 #include "movieAudioCursor.h"
 #include "movieAudioCursor.h"
+#include "vector_string.h"
 
 
 #ifdef HAVE_OPUS
 #ifdef HAVE_OPUS
 
 
@@ -37,6 +38,7 @@ PUBLISHED:
   explicit OpusAudioCursor(OpusAudio *src, std::istream *stream);
   explicit OpusAudioCursor(OpusAudio *src, std::istream *stream);
   virtual ~OpusAudioCursor();
   virtual ~OpusAudioCursor();
   virtual void seek(double offset);
   virtual void seek(double offset);
+  vector_string get_raw_comment() const;
 
 
 public:
 public:
   virtual int read_samples(int n, int16_t *data);
   virtual int read_samples(int n, int16_t *data);
@@ -47,6 +49,7 @@ protected:
   OggOpusFile *_op;
   OggOpusFile *_op;
   std::istream *_stream;
   std::istream *_stream;
   int _link;
   int _link;
+  vector_string _comment;
 
 
 public:
 public:
   static TypeHandle get_class_type() {
   static TypeHandle get_class_type() {

+ 25 - 0
panda/src/movies/vorbisAudioCursor.cxx

@@ -17,6 +17,7 @@
 
 
 #include "vorbisAudio.h"
 #include "vorbisAudio.h"
 #include "virtualFileSystem.h"
 #include "virtualFileSystem.h"
+#include "vector_string.h"
 
 
 #ifdef HAVE_VORBIS
 #ifdef HAVE_VORBIS
 
 
@@ -24,6 +25,19 @@ using std::istream;
 
 
 TypeHandle VorbisAudioCursor::_type_handle;
 TypeHandle VorbisAudioCursor::_type_handle;
 
 
+/**
+ *
+ */
+vector_string
+parse_vorbis_comments(vorbis_comment *com) {
+  vector_string comments;
+  for (int cnum = 0; cnum < com->comments; ++cnum) {
+    std::string tag(com->user_comments[cnum], com->comment_lengths[cnum]);
+    comments.push_back(std::move(tag));
+  }
+  return comments;
+}
+
 /**
 /**
  * Reads the .wav header from the indicated stream.  This leaves the read
  * Reads the .wav header from the indicated stream.  This leaves the read
  * pointer positioned at the start of the data.
  * pointer positioned at the start of the data.
@@ -67,6 +81,9 @@ VorbisAudioCursor(VorbisAudio *src, istream *stream) :
   _can_seek = vorbis_enable_seek && (ov_seekable(&_ov) != 0);
   _can_seek = vorbis_enable_seek && (ov_seekable(&_ov) != 0);
   _can_seek_fast = _can_seek;
   _can_seek_fast = _can_seek;
 
 
+  vorbis_comment *com = ov_comment(&_ov, -1);
+  _comment = parse_vorbis_comments(com);
+
   _is_valid = true;
   _is_valid = true;
 }
 }
 
 
@@ -313,4 +330,12 @@ cb_tell_func(void *datasource) {
   return stream->tellg();
   return stream->tellg();
 }
 }
 
 
+/**
+ *
+ */
+vector_string VorbisAudioCursor::
+get_raw_comment() const {
+  return _comment;
+}
+
 #endif // HAVE_VORBIS
 #endif // HAVE_VORBIS

+ 3 - 0
panda/src/movies/vorbisAudioCursor.h

@@ -16,6 +16,7 @@
 
 
 #include "pandabase.h"
 #include "pandabase.h"
 #include "movieAudioCursor.h"
 #include "movieAudioCursor.h"
+#include "vector_string.h"
 
 
 #ifdef HAVE_VORBIS
 #ifdef HAVE_VORBIS
 
 
@@ -33,6 +34,7 @@ PUBLISHED:
   explicit VorbisAudioCursor(VorbisAudio *src, std::istream *stream);
   explicit VorbisAudioCursor(VorbisAudio *src, std::istream *stream);
   virtual ~VorbisAudioCursor();
   virtual ~VorbisAudioCursor();
   virtual void seek(double offset);
   virtual void seek(double offset);
+  vector_string get_raw_comment() const;
 
 
 public:
 public:
   virtual int read_samples(int n, int16_t *data);
   virtual int read_samples(int n, int16_t *data);
@@ -51,6 +53,7 @@ protected:
   OggVorbis_File _ov;
   OggVorbis_File _ov;
 #endif
 #endif
   int _bitstream;
   int _bitstream;
+  vector_string _comment;
 
 
 public:
 public:
   static TypeHandle get_class_type() {
   static TypeHandle get_class_type() {

+ 18 - 13
panda/src/pgraph/pythonLoaderFileType.cxx

@@ -305,19 +305,19 @@ load_file(const Filename &path, const LoaderOptions &options,
 #endif
 #endif
 
 
   // Wrap the arguments.
   // Wrap the arguments.
-  PyObject *args = PyTuple_New(3);
-  PyTuple_SET_ITEM(args, 0, DTool_CreatePyInstance((void *)&path, Dtool_Filename, false, true));
-  PyTuple_SET_ITEM(args, 1, DTool_CreatePyInstance((void *)&options, Dtool_LoaderOptions, false, true));
+  PyObject *args[4];
+  args[1] = DTool_CreatePyInstance((void *)&path, Dtool_Filename, false, true);
+  args[2] = DTool_CreatePyInstance((void *)&options, Dtool_LoaderOptions, false, true);
   if (record != nullptr) {
   if (record != nullptr) {
     record->ref();
     record->ref();
-    PyTuple_SET_ITEM(args, 2, DTool_CreatePyInstanceTyped((void *)record, Dtool_BamCacheRecord, true, false, record->get_type_index()));
+    args[3] = DTool_CreatePyInstanceTyped((void *)record, Dtool_BamCacheRecord, true, false, record->get_type_index());
   } else {
   } else {
-    PyTuple_SET_ITEM(args, 2, Py_NewRef(Py_None));
+    args[3] = Py_NewRef(Py_None);
   }
   }
 
 
   PT(PandaNode) node;
   PT(PandaNode) node;
 
 
-  PyObject *result = PythonThread::call_python_func(_load_func, args);
+  PyObject *result = PythonThread::call_python_func(_load_func, args + 1, 3 | PY_VECTORCALL_ARGUMENTS_OFFSET);
   if (result != nullptr) {
   if (result != nullptr) {
     if (DtoolInstance_Check(result)) {
     if (DtoolInstance_Check(result)) {
       node = (PandaNode *)DtoolInstance_UPCAST(result, Dtool_PandaNode);
       node = (PandaNode *)DtoolInstance_UPCAST(result, Dtool_PandaNode);
@@ -325,7 +325,9 @@ load_file(const Filename &path, const LoaderOptions &options,
     Py_DECREF(result);
     Py_DECREF(result);
   }
   }
 
 
-  Py_DECREF(args);
+  Py_DECREF(args[1]);
+  Py_DECREF(args[2]);
+  Py_DECREF(args[3]);
 
 
   if (node == nullptr) {
   if (node == nullptr) {
     PyObject *exc_type = PyErr_Occurred();
     PyObject *exc_type = PyErr_Occurred();
@@ -372,13 +374,16 @@ save_file(const Filename &path, const LoaderOptions &options,
 #endif
 #endif
 
 
   // Wrap the arguments.
   // Wrap the arguments.
-  PyObject *args = PyTuple_New(3);
-  PyTuple_SET_ITEM(args, 0, DTool_CreatePyInstance((void *)&path, Dtool_Filename, false, true));
-  PyTuple_SET_ITEM(args, 1, DTool_CreatePyInstance((void *)&options, Dtool_LoaderOptions, false, true));
-  PyTuple_SET_ITEM(args, 2, DTool_CreatePyInstanceTyped((void *)node, Dtool_PandaNode, true, false, node->get_type_index()));
+  PyObject *args[4];
+  args[1] = DTool_CreatePyInstance((void *)&path, Dtool_Filename, false, true);
+  args[2] = DTool_CreatePyInstance((void *)&options, Dtool_LoaderOptions, false, true);
+  args[3] = DTool_CreatePyInstanceTyped((void *)node, Dtool_PandaNode, true, false, node->get_type_index());
+
+  PyObject *result = PythonThread::call_python_func(_save_func, args + 1, 3 | PY_VECTORCALL_ARGUMENTS_OFFSET);
+  Py_DECREF(args[1]);
+  Py_DECREF(args[2]);
+  Py_DECREF(args[3]);
 
 
-  PyObject *result = PythonThread::call_python_func(_load_func, args);
-  Py_DECREF(args);
   if (result != nullptr) {
   if (result != nullptr) {
     Py_DECREF(result);
     Py_DECREF(result);
   } else {
   } else {

+ 7 - 5
panda/src/pipeline/pythonThread.cxx

@@ -123,7 +123,7 @@ set_args(PyObject *args) {
  * exception.
  * exception.
  */
  */
 PyObject *PythonThread::
 PyObject *PythonThread::
-call_python_func(PyObject *function, PyObject *args) {
+call_python_func(PyObject *function, PyObject **args, size_t nargsf) {
   Thread *current_thread = get_current_thread();
   Thread *current_thread = get_current_thread();
 
 
   // Create a new Python thread state data structure, so Python can properly
   // Create a new Python thread state data structure, so Python can properly
@@ -132,7 +132,7 @@ call_python_func(PyObject *function, PyObject *args) {
 
 
   if (current_thread == get_main_thread()) {
   if (current_thread == get_main_thread()) {
     // In the main thread, just call the function.
     // In the main thread, just call the function.
-    result = PyObject_Call(function, args, nullptr);
+    result = _PyObject_Vectorcall(function, args, nargsf, nullptr);
 
 
     if (result == nullptr) {
     if (result == nullptr) {
       if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_SystemExit)) {
       if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_SystemExit)) {
@@ -189,7 +189,7 @@ call_python_func(PyObject *function, PyObject *args) {
     PyThreadState_Swap(new_thread_state);
     PyThreadState_Swap(new_thread_state);
 
 
     // Call the user's function.
     // Call the user's function.
-    result = PyObject_Call(function, args, nullptr);
+    result = _PyObject_Vectorcall(function, args, nargsf, nullptr);
     if (result == nullptr && PyErr_Occurred()) {
     if (result == nullptr && PyErr_Occurred()) {
       // We got an exception.  Move the exception from the current thread into
       // We got an exception.  Move the exception from the current thread into
       // the main thread, so it can be handled there.
       // the main thread, so it can be handled there.
@@ -230,7 +230,7 @@ call_python_func(PyObject *function, PyObject *args) {
     gstate = PyGILState_Ensure();
     gstate = PyGILState_Ensure();
 
 
     // Call the user's function.
     // Call the user's function.
-    result = PyObject_Call(function, args, nullptr);
+    result = _PyObject_Vectorcall(function, args, nargsf, nullptr);
     if (result == nullptr && PyErr_Occurred()) {
     if (result == nullptr && PyErr_Occurred()) {
       // We got an exception.  Move the exception from the current thread into
       // We got an exception.  Move the exception from the current thread into
       // the main thread, so it can be handled there.
       // the main thread, so it can be handled there.
@@ -275,7 +275,9 @@ call_python_func(PyObject *function, PyObject *args) {
  */
  */
 void PythonThread::
 void PythonThread::
 thread_main() {
 thread_main() {
-  _result = call_python_func(_function, _args);
+  PyObject **args = &PyTuple_GET_ITEM(_args, 0);
+  Py_ssize_t nargs = PyTuple_GET_SIZE(_args);
+  _result = call_python_func(_function, args, (size_t)nargs);
 }
 }
 
 
 #endif  // HAVE_PYTHON
 #endif  // HAVE_PYTHON

+ 2 - 1
panda/src/pipeline/pythonThread.h

@@ -36,7 +36,8 @@ public:
   PyObject *get_args() const;
   PyObject *get_args() const;
   void set_args(PyObject *);
   void set_args(PyObject *);
 
 
-  static PyObject *call_python_func(PyObject *function, PyObject *args);
+  static PyObject *call_python_func(PyObject *function, PyObject **args,
+                                    size_t nargsf);
 
 
 PUBLISHED:
 PUBLISHED:
   MAKE_PROPERTY(args, get_args, set_args);
   MAKE_PROPERTY(args, get_args, set_args);

+ 6 - 7
panda/src/putil/bamReader_ext.cxx

@@ -41,15 +41,14 @@ static TypedWritable *factory_callback(const FactoryParams &params){
   BamReader *manager;
   BamReader *manager;
   parse_params(params, scan, manager);
   parse_params(params, scan, manager);
 
 
-  PyObject *py_scan = DTool_CreatePyInstance(&scan, Dtool_DatagramIterator, false, false);
-  PyObject *py_manager = DTool_CreatePyInstance(manager, Dtool_BamReader, false, false);
-  PyObject *args = PyTuple_Pack(2, py_scan, py_manager);
+  PyObject *args[3];
+  args[1] = DTool_CreatePyInstance(&scan, Dtool_DatagramIterator, false, false);
+  args[2] = DTool_CreatePyInstance(manager, Dtool_BamReader, false, false);
 
 
   // Now call the Python function.
   // Now call the Python function.
-  PyObject *result = PythonThread::call_python_func(func, args);
-  Py_DECREF(args);
-  Py_DECREF(py_scan);
-  Py_DECREF(py_manager);
+  PyObject *result = PythonThread::call_python_func(func, args + 1, 2 | PY_VECTORCALL_ARGUMENTS_OFFSET);
+  Py_DECREF(args[1]);
+  Py_DECREF(args[2]);
 
 
   if (result == nullptr) {
   if (result == nullptr) {
     util_cat.error()
     util_cat.error()

+ 39 - 0
panda/src/putil/bitArray_ext.cxx

@@ -20,6 +20,29 @@
  */
  */
 void Extension<BitArray>::
 void Extension<BitArray>::
 __init__(PyObject *init_value) {
 __init__(PyObject *init_value) {
+#if PY_VERSION_HEX >= 0x030e0000  // Python 3.14
+  BitArray::WordType word = 0;
+  Py_ssize_t result = PyLong_AsNativeBytes(init_value, &word, sizeof(word), Py_ASNATIVEBYTES_LITTLE_ENDIAN | Py_ASNATIVEBYTES_UNSIGNED_BUFFER | Py_ASNATIVEBYTES_REJECT_NEGATIVE);
+  if (result < 0) {
+    return;
+  }
+
+  if ((size_t)result <= sizeof(BitArray::WordType)) {
+    // It fit in a single word
+    _this->_array.clear();
+    if (word != 0) {
+      _this->_array.push_back(word);
+    }
+  } else {
+    // Requires multiple words.
+    size_t num_words = ((size_t)result + sizeof(BitArray::WordType) - 1) / sizeof(BitArray::WordType);
+    _this->_array.resize(num_words);
+
+    if (PyLong_AsNativeBytes(init_value, &_this->_array[0], num_words * sizeof(BitArray::WordType), Py_ASNATIVEBYTES_LITTLE_ENDIAN | Py_ASNATIVEBYTES_UNSIGNED_BUFFER | Py_ASNATIVEBYTES_REJECT_NEGATIVE) <= 0) {
+      return;
+    }
+  }
+#else
   if (!PyLong_Check(init_value) || !PyLong_IsNonNegative(init_value)) {
   if (!PyLong_Check(init_value) || !PyLong_IsNonNegative(init_value)) {
     PyErr_SetString(PyExc_ValueError, "BitArray constructor requires a positive integer");
     PyErr_SetString(PyExc_ValueError, "BitArray constructor requires a positive integer");
     return;
     return;
@@ -38,6 +61,7 @@ __init__(PyObject *init_value) {
 #endif
 #endif
       );
       );
   }
   }
+#endif
 }
 }
 
 
 /**
 /**
@@ -55,19 +79,34 @@ __getstate__() const {
   }
   }
 
 
   if (_this->_highest_bits == 0) {
   if (_this->_highest_bits == 0) {
+#if PY_VERSION_HEX >= 0x030e0000  // Python 3.14
+    return PyLong_FromUnsignedNativeBytes(
+      &_this->_array[0], _this->_array.size() * sizeof(BitArray::WordType),
+      Py_ASNATIVEBYTES_LITTLE_ENDIAN);
+#else
     return _PyLong_FromByteArray(
     return _PyLong_FromByteArray(
       (const unsigned char *)&_this->_array[0],
       (const unsigned char *)&_this->_array[0],
       _this->_array.size() * sizeof(BitArray::WordType),
       _this->_array.size() * sizeof(BitArray::WordType),
       1, 0);
       1, 0);
+#endif
   } else {
   } else {
     // This is an infinite array, so we invert it to make it a finite array and
     // This is an infinite array, so we invert it to make it a finite array and
     // store it as an inverted long.
     // store it as an inverted long.
     BitArray copy(*_this);
     BitArray copy(*_this);
     copy.invert_in_place();
     copy.invert_in_place();
+#if PY_VERSION_HEX >= 0x030e0000  // Python 3.14
+    PyObject *state = PyLong_FromUnsignedNativeBytes(
+      &copy._array[0], copy._array.size() * sizeof(BitArray::WordType),
+      Py_ASNATIVEBYTES_LITTLE_ENDIAN);
+#else
     PyObject *state = _PyLong_FromByteArray(
     PyObject *state = _PyLong_FromByteArray(
       (const unsigned char *)&copy._array[0],
       (const unsigned char *)&copy._array[0],
       copy._array.size() * sizeof(BitArray::WordType),
       copy._array.size() * sizeof(BitArray::WordType),
       1, 0);
       1, 0);
+#endif
+    if (state == nullptr) {
+      return nullptr;
+    }
     PyObject *inverted = PyNumber_Invert(state);
     PyObject *inverted = PyNumber_Invert(state);
     Py_DECREF(state);
     Py_DECREF(state);
     return inverted;
     return inverted;

+ 1 - 0
panda/src/putil/callbackObject.h

@@ -16,6 +16,7 @@
 
 
 #include "pandabase.h"
 #include "pandabase.h"
 #include "typedReferenceCount.h"
 #include "typedReferenceCount.h"
+#include "pointerTo.h"
 
 
 class CallbackData;
 class CallbackData;
 
 

+ 1 - 0
panda/src/putil/config_putil.cxx

@@ -212,6 +212,7 @@ init_libputil() {
   FactoryParam::init_type();
   FactoryParam::init_type();
   Namable::init_type();
   Namable::init_type();
   NodeCachedReferenceCount::init_type();
   NodeCachedReferenceCount::init_type();
+  ParamBytes::init_type("ParamBytes");
   ParamMatrix3d::init_type("ParamMatrix3d");
   ParamMatrix3d::init_type("ParamMatrix3d");
   ParamMatrix3f::init_type("ParamMatrix3f");
   ParamMatrix3f::init_type("ParamMatrix3f");
   ParamMatrix4d::init_type("ParamMatrix4d");
   ParamMatrix4d::init_type("ParamMatrix4d");

+ 1 - 0
panda/src/putil/paramValue.cxx

@@ -16,6 +16,7 @@
 
 
 template class ParamValue<std::string>;
 template class ParamValue<std::string>;
 template class ParamValue<std::wstring>;
 template class ParamValue<std::wstring>;
+template class ParamValue<vector_uchar>;
 
 
 template class ParamValue<LVecBase2d>;
 template class ParamValue<LVecBase2d>;
 template class ParamValue<LVecBase2f>;
 template class ParamValue<LVecBase2f>;

+ 3 - 0
panda/src/putil/paramValue.h

@@ -22,6 +22,7 @@
 #include "bamReader.h"
 #include "bamReader.h"
 #include "bamWriter.h"
 #include "bamWriter.h"
 #include "luse.h"
 #include "luse.h"
+#include "vector_uchar.h"
 
 
 /**
 /**
  * A non-template base class of ParamValue (below), which serves mainly to
  * A non-template base class of ParamValue (below), which serves mainly to
@@ -152,6 +153,7 @@ private:
 
 
 EXPORT_TEMPLATE_CLASS(EXPCL_PANDA_PUTIL, EXPTP_PANDA_PUTIL, ParamValue<std::string>);
 EXPORT_TEMPLATE_CLASS(EXPCL_PANDA_PUTIL, EXPTP_PANDA_PUTIL, ParamValue<std::string>);
 EXPORT_TEMPLATE_CLASS(EXPCL_PANDA_PUTIL, EXPTP_PANDA_PUTIL, ParamValue<std::wstring>);
 EXPORT_TEMPLATE_CLASS(EXPCL_PANDA_PUTIL, EXPTP_PANDA_PUTIL, ParamValue<std::wstring>);
+EXPORT_TEMPLATE_CLASS(EXPCL_PANDA_PUTIL, EXPTP_PANDA_PUTIL, ParamValue<vector_uchar>);
 
 
 EXPORT_TEMPLATE_CLASS(EXPCL_PANDA_PUTIL, EXPTP_PANDA_PUTIL, ParamValue<LVecBase2d>);
 EXPORT_TEMPLATE_CLASS(EXPCL_PANDA_PUTIL, EXPTP_PANDA_PUTIL, ParamValue<LVecBase2d>);
 EXPORT_TEMPLATE_CLASS(EXPCL_PANDA_PUTIL, EXPTP_PANDA_PUTIL, ParamValue<LVecBase2f>);
 EXPORT_TEMPLATE_CLASS(EXPCL_PANDA_PUTIL, EXPTP_PANDA_PUTIL, ParamValue<LVecBase2f>);
@@ -173,6 +175,7 @@ EXPORT_TEMPLATE_CLASS(EXPCL_PANDA_PUTIL, EXPTP_PANDA_PUTIL, ParamValue<LMatrix4f
 
 
 typedef ParamValue<std::string> ParamString;
 typedef ParamValue<std::string> ParamString;
 typedef ParamValue<std::wstring> ParamWstring;
 typedef ParamValue<std::wstring> ParamWstring;
+typedef ParamValue<vector_uchar> ParamBytes;
 
 
 typedef ParamValue<LVecBase2d> ParamVecBase2d;
 typedef ParamValue<LVecBase2d> ParamVecBase2d;
 typedef ParamValue<LVecBase2f> ParamVecBase2f;
 typedef ParamValue<LVecBase2f> ParamVecBase2f;

+ 7 - 10
panda/src/putil/pythonCallbackObject.cxx

@@ -129,16 +129,13 @@ void PythonCallbackObject::
 do_python_callback(CallbackData *cbdata) {
 do_python_callback(CallbackData *cbdata) {
   nassertv(cbdata != nullptr);
   nassertv(cbdata != nullptr);
 
 
-  // Wrap the cbdata up in a Python object, then put it in a tuple, for the
-  // argument list.
-  PyObject *pycbdata =
-    DTool_CreatePyInstanceTyped(cbdata, Dtool_TypedObject,
-                                false, false, cbdata->get_type_index());
-  PyObject *args = Py_BuildValue("(O)", pycbdata);
-  Py_DECREF(pycbdata);
-
-  PyObject *result = PythonThread::call_python_func(_function, args);
-  Py_DECREF(args);
+  // Wrap the cbdata up in a Python object.
+  PyObject *args[2];
+  args[1] = DTool_CreatePyInstanceTyped(cbdata, Dtool_TypedObject,
+                                        false, false, cbdata->get_type_index());
+
+  PyObject *result = PythonThread::call_python_func(_function, args + 1, 1 | PY_VECTORCALL_ARGUMENTS_OFFSET);
+  Py_DECREF(args[1]);
 
 
   if (result == nullptr) {
   if (result == nullptr) {
     if (PyErr_Occurred() != PyExc_SystemExit) {
     if (PyErr_Occurred() != PyExc_SystemExit) {

+ 4 - 0
panda/src/text/textProperties.cxx

@@ -489,4 +489,8 @@ load_default_font() {
 
 
 #endif  // HAVE_FREETYPE
 #endif  // HAVE_FREETYPE
 #endif  // COMPILE_IN_DEFAULT_FONT
 #endif  // COMPILE_IN_DEFAULT_FONT
+
+  if (_default_font != nullptr && !_default_font->has_name()) {
+    _default_font->set_name("default_font");
+  }
 }
 }

+ 1 - 0
setup.cfg

@@ -20,6 +20,7 @@ classifiers =
     Programming Language :: Python :: 3.11
     Programming Language :: Python :: 3.11
     Programming Language :: Python :: 3.12
     Programming Language :: Python :: 3.12
     Programming Language :: Python :: 3.13
     Programming Language :: Python :: 3.13
+    Programming Language :: Python :: 3.14
     Programming Language :: Python :: Implementation :: CPython
     Programming Language :: Python :: Implementation :: CPython
     Topic :: Games/Entertainment
     Topic :: Games/Entertainment
     Topic :: Multimedia
     Topic :: Multimedia

BIN
tests/audio/openclose_with_comments.flac


BIN
tests/audio/openclose_with_comments.mp3


BIN
tests/audio/openclose_with_comments.ogg


BIN
tests/audio/openclose_with_comments.opus


+ 33 - 2
tests/audio/test_loading.py

@@ -1,5 +1,36 @@
+import os
+
 import pytest
 import pytest
+from panda3d.core import Filename
+
 
 
 def test_missing_file(audiomgr):
 def test_missing_file(audiomgr):
-    sound = audiomgr.get_sound('/not/a/valid/file.ogg')
-    assert str(sound).startswith('NullAudioSound')
+    sound = audiomgr.get_sound("/not/a/valid/file.ogg")
+    assert str(sound).startswith("NullAudioSound")
+
+
[email protected]("extension", ["ogg", "opus", "mp3", "flac"])
+def test_comments(audiomgr, extension):
+    if "openal" not in str(audiomgr).lower():
+        # NULL audio manager (as well as fmod) don't support comment reading
+        pytest.skip("Comment reading is only supported on OpenAL")
+    # ogg should be loaded with libvorbis, opus with libopus, mp3 with ffmpeg
+    # we cannot test this though because after loading the loader information is gone
+    sound_path = os.path.join(
+        os.path.dirname(__file__), f"openclose_with_comments.{extension}"
+    )
+    sound_path = Filename.from_os_specific(sound_path)
+    sound = audiomgr.get_sound(sound_path)
+    if extension == "mp3":  # FFMPEG encodes/decodes tags differently
+        tags = ["artist", "comment", "genre"]
+    else:
+        tags = ["ARTIST", "COMMENTS", "GENRE"]
+    assert sorted(sound.raw_comments) == [
+        f"{tags[0]}=Example Artist",
+        f"{tags[1]}=This is an example OGG comment",
+        f"{tags[2]}=Blues",
+    ]
+    assert sound.comments[tags[-1]] == "Blues"
+    assert sound.get_comment(tags[0]) == "Example Artist"
+    assert sound.get_comment("DOES NOT EXIST") == ""
+    assert sound.has_comment(tags[0])

+ 18 - 6
tests/display/test_glsl_shader.py

@@ -323,9 +323,9 @@ def test_glsl_ssbo(env):
     from struct import pack, unpack
     from struct import pack, unpack
     num1 = pack('<i', 1234567)
     num1 = pack('<i', 1234567)
     num2 = pack('<ii', -1234567, 0)
     num2 = pack('<ii', -1234567, 0)
-    buffer1 = core.ShaderBuffer("buffer1", num1, core.GeomEnums.UH_static)
-    buffer2 = core.ShaderBuffer("buffer2", num2, core.GeomEnums.UH_static)
-    buffer3 = core.ShaderBuffer("buffer3", 8, core.GeomEnums.UH_static)
+    buffer1 = core.ShaderBuffer("buffer1", num1, core.GeomEnums.UH_dynamic)
+    buffer2 = core.ShaderBuffer("buffer2", num2, core.GeomEnums.UH_dynamic)
+    buffer3 = core.ShaderBuffer("buffer3", 8, core.GeomEnums.UH_dynamic)
 
 
     preamble = """
     preamble = """
     layout(std430, binding=0) readonly buffer buffer1 {
     layout(std430, binding=0) readonly buffer buffer1 {
@@ -348,9 +348,8 @@ def test_glsl_ssbo(env):
     value4 = 5343525;
     value4 = 5343525;
     value5 = 999;
     value5 = 999;
     """
     """
-    env.run_glsl(code, preamble,
-                  {'buffer1': buffer1, 'buffer2': buffer2, 'buffer3': buffer3},
-                  version=430)
+    inputs = {'buffer1': buffer1, 'buffer2': buffer2, 'buffer3': buffer3}
+    env.run_glsl(code, preamble, inputs, version=430)
 
 
     data1 = env.engine.extract_shader_buffer_data(buffer2, env.gsg)
     data1 = env.engine.extract_shader_buffer_data(buffer2, env.gsg)
     assert unpack('<ii', data1[:8]) == (-1234567, 98765)
     assert unpack('<ii', data1[:8]) == (-1234567, 98765)
@@ -358,6 +357,19 @@ def test_glsl_ssbo(env):
     data2 = env.engine.extract_shader_buffer_data(buffer3, env.gsg)
     data2 = env.engine.extract_shader_buffer_data(buffer3, env.gsg)
     assert unpack('<ii', data2[:8]) == (5343525, 999)
     assert unpack('<ii', data2[:8]) == (5343525, 999)
 
 
+    data3 = env.engine.extract_shader_buffer_data(buffer2, env.gsg, 4, 8)
+    assert unpack('<i', data3[:4]) == (98765, )
+
+    env.engine.update_shader_buffer_data(buffer1, env.gsg, pack('<i', 92573))
+    env.engine.update_shader_buffer_data(buffer2, env.gsg, pack('<i', 64515), 4)
+
+    code = """
+    assert(value1 == 92573);
+    assert(value2 == -1234567);
+    assert(value3 == 64515);
+    """
+    env.run_glsl(code, preamble, inputs, version=430)
+
 
 
 def test_glsl_ssbo_array(env):
 def test_glsl_ssbo_array(env):
     from struct import pack
     from struct import pack

+ 3 - 0
tests/event/test_futures.py

@@ -37,6 +37,9 @@ def check_result(fut, expected):
     if fut.result() != expected:
     if fut.result() != expected:
         return False
         return False
 
 
+    if sys.version_info < (3, 5):
+        return True
+
     # Make sure that await also returns the values properly
     # Make sure that await also returns the values properly
     with pytest.raises(StopIteration) as e:
     with pytest.raises(StopIteration) as e:
         next(fut.__await__())
         next(fut.__await__())

+ 14 - 0
tests/putil/test_bitarray.py

@@ -11,6 +11,20 @@ def test_bitarray_constructor():
     assert BitArray().is_zero()
     assert BitArray().is_zero()
     assert BitArray(0).is_zero()
     assert BitArray(0).is_zero()
 
 
+    one = BitArray(1)
+    assert not one.is_zero()
+    assert one.get_lowest_on_bit() == 0
+    assert one.get_highest_on_bit() == 0
+    assert one.get_lowest_off_bit() == 1
+    assert one.get_highest_off_bit() == -1
+
+    two = BitArray(2)
+    assert not two.is_zero()
+    assert two.get_lowest_on_bit() == 1
+    assert two.get_highest_on_bit() == 1
+    assert two.get_lowest_off_bit() == 0
+    assert two.get_highest_off_bit() == -1
+
     ba = BitArray(0x10000000000000000000000000)
     ba = BitArray(0x10000000000000000000000000)
     assert ba.get_lowest_on_bit() == 100
     assert ba.get_lowest_on_bit() == 100
     assert ba.get_highest_on_bit() == 100
     assert ba.get_highest_on_bit() == 100