1 year ago · 925978ffde
--- a/src/external/qoa.h
+++ b/src/external/qoa.h
@@ -8,71 +8,96 @@ QOA - The "Quite OK Audio" format for fast, lossy audio compression
 
				 
			
 
				 -- Data Format
			
 
				 
			
 
				-A QOA file has an 8 byte file header, followed by a number of frames. Each frame 
			
 
				-consists of an 8 byte frame header, the current 8 byte en-/decoder state per
			
 
				-channel and 256 slices per channel. Each slice is 8 bytes wide and encodes 20 
			
 
				-samples of audio data.
			
 
				+QOA encodes pulse-code modulated (PCM) audio data with up to 255 channels, 
			
 
				+sample rates from 1 up to 16777215 hertz and a bit depth of 16 bits.
			
 
				 
			
 
				-Note that the last frame of a file may contain less than 256 slices per channel.
			
 
				-The last slice (per channel) in the last frame may contain less 20 samples, but
			
 
				-the slice will still be 8 bytes wide, with the unused samples zeroed out.
			
 
				+The compression method employed in QOA is lossy; it discards some information
			
 
				+from the uncompressed PCM data. For many types of audio signals this compression
			
 
				+is "transparent", i.e. the difference from the original file is often not
			
 
				+audible.
			
 
				 
			
 
				-The samplerate and number of channels is only stated in the frame headers, but
			
 
				-not in the file header. A decoder may peek into the first frame of the file to 
			
 
				-find these values.
			
 
				+QOA encodes 20 samples of 16 bit PCM data into slices of 64 bits. A single
			
 
				+sample therefore requires 3.2 bits of storage space, resulting in a 5x
			
 
				+compression (16 / 3.2).
			
 
				 
			
 
				-In a valid QOA file all frames have the same number of channels and the same
			
 
				-samplerate. These restrictions may be relaxed for streaming. This remains to 
			
 
				-be decided.
			
 
				+A QOA file consists of an 8 byte file header, followed by a number of frames.
			
 
				+Each frame contains an 8 byte frame header, the current 16 byte en-/decoder
			
 
				+state per channel and 256 slices per channel. Each slice is 8 bytes wide and
			
 
				+encodes 20 samples of audio data.
			
 
				 
			
 
				-All values in a QOA file are BIG ENDIAN. Luckily, EVERYTHING in a QOA file,
			
 
				-including the headers, is 64 bit aligned, so it's possible to read files with 
			
 
				-just a read_u64() that does the byte swapping if necessary.
			
 
				-
			
 
				-In pseudocode, the file layout is as follows:
			
 
				+All values, including the slices, are big endian. The file layout is as follows:
			
 
				 
			
 
				 struct {
			
 
				 	struct {
			
 
				-		char     magic[4];         // magic bytes 'qoaf'
			
 
				-		uint32_t samples;          // number of samples per channel in this file
			
 
				-	} file_header;                 // = 64 bits
			
 
				+		char     magic[4];         // magic bytes "qoaf"
			
 
				+		uint32_t samples;          // samples per channel in this file
			
 
				+	} file_header;             
			
 
				 
			
 
				 	struct {
			
 
				 		struct {
			
 
				-			uint8_t  num_channels; // number of channels
			
 
				+			uint8_t  num_channels; // no. of channels
			
 
				 			uint24_t samplerate;   // samplerate in hz
			
 
				-			uint16_t fsamples;     // sample count per channel in this frame
			
 
				-			uint16_t fsize;        // frame size (including the frame header)
			
 
				-		} frame_header;            // = 64 bits
			
 
				+			uint16_t fsamples;     // samples per channel in this frame
			
 
				+			uint16_t fsize;        // frame size (includes this header)
			
 
				+		} frame_header;          
			
 
				 
			
 
				 		struct {
			
 
				-			int16_t history[4];    // = 64 bits
			
 
				-			int16_t weights[4];    // = 64 bits
			
 
				+			int16_t history[4];    // most recent last
			
 
				+			int16_t weights[4];    // most recent last
			
 
				 		} lms_state[num_channels]; 
			
 
				 
			
 
				-		qoa_slice_t slices[256][num_channels]; // = 64 bits each
			
 
				-	} frames[samples * channels / qoa_max_framesize()];
			
 
				-} qoa_file;
			
 
				+		qoa_slice_t slices[256][num_channels];
			
 
				+
			
 
				+	} frames[ceil(samples / (256 * 20))];
			
 
				+} qoa_file_t;
			
 
				 
			
 
				-Wheras the 64bit qoa_slice_t is defined as follows:
			
 
				+Each `qoa_slice_t` contains a quantized scalefactor `sf_quant` and 20 quantized
			
 
				+residuals `qrNN`:
			
 
				 
			
 
				 .- QOA_SLICE -- 64 bits, 20 samples --------------------------/  /------------.
			
 
				 |        Byte[0]         |        Byte[1]         |  Byte[2]  \  \  Byte[7]   |
			
 
				 | 7  6  5  4  3  2  1  0 | 7  6  5  4  3  2  1  0 | 7  6  5   /  /    2  1  0 |
			
 
				 |------------+--------+--------+--------+---------+---------+-\  \--+---------|
			
 
				-|  sf_index  |  r00   |   r01  |   r02  |  r03    |   r04   | /  /  |   r19   |
			
 
				+|  sf_quant  |  qr00  |  qr01  |  qr02  |  qr03   |  qr04   | /  /  |  qr19   |
			
 
				 `-------------------------------------------------------------\  \------------`
			
 
				 
			
 
				-`sf_index` defines the scalefactor to use for this slice as an index into the
			
 
				-qoa_scalefactor_tab[16]
			
 
				+Each frame except the last must contain exactly 256 slices per channel. The last
			
 
				+frame may contain between 1 .. 256 (inclusive) slices per channel. The last
			
 
				+slice (for each channel) in the last frame may contain less than 20 samples; the
			
 
				+slice still must be 8 bytes wide, with the unused samples zeroed out.
			
 
				+
			
 
				+Channels are interleaved per slice. E.g. for 2 channel stereo: 
			
 
				+slice[0] = L, slice[1] = R, slice[2] = L, slice[3] = R ...
			
 
				+
			
 
				+A valid QOA file or stream must have at least one frame. Each frame must contain
			
 
				+at least one channel and one sample with a samplerate between 1 .. 16777215
			
 
				+(inclusive).
			
 
				+
			
 
				+If the total number of samples is not known by the encoder, the samples in the
			
 
				+file header may be set to 0x00000000 to indicate that the encoder is 
			
 
				+"streaming". In a streaming context, the samplerate and number of channels may
			
 
				+differ from frame to frame. For static files (those with samples set to a
			
 
				+non-zero value), each frame must have the same number of channels and same
			
 
				+samplerate.
			
 
				 
			
 
				-`r00`--`r19` are the residuals for the individual samples, divided by the
			
 
				-scalefactor and quantized by the qoa_quant_tab[].
			
 
				+Note that this implementation of QOA only handles files with a known total
			
 
				+number of samples.
			
 
				 
			
 
				-In the decoder, a prediction of the next sample is computed by multiplying the 
			
 
				-state (the last four output samples) with the predictor. The residual from the 
			
 
				-slice is then dequantized using the qoa_dequant_tab[] and added to the 
			
 
				-prediction. The result is clamped to int16 to form the final output sample.
			
 
				+A decoder should support at least 8 channels. The channel layout for channel
			
 
				+counts 1 .. 8 is:
			
 
				+
			
 
				+	1. Mono
			
 
				+	2. L, R
			
 
				+	3. L, R, C 
			
 
				+	4. FL, FR, B/SL, B/SR 
			
 
				+	5. FL, FR, C, B/SL, B/SR 
			
 
				+	6. FL, FR, C, LFE, B/SL, B/SR
			
 
				+	7. FL, FR, C, LFE, B, SL, SR 
			
 
				+	8. FL, FR, C, LFE, BL, BR, SL, SR
			
 
				+
			
 
				+QOA predicts each audio sample based on the previously decoded ones using a
			
 
				+"Sign-Sign Least Mean Squares Filter" (LMS). This prediction plus the 
			
 
				+dequantized residual forms the final output sample.
			
 
				 
			
 
				 */
			
 
				 
			
@@ -158,7 +183,7 @@ the higher end. Note that the residual zero is identical to the lowest positive
 
				 value. This is mostly fine, since the qoa_div() function always rounds away 
			
 
				 from zero. */
			
 
				 
			
 
				-static int qoa_quant_tab[17] = {
			
 
				+static const int qoa_quant_tab[17] = {
			
 
				 	7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
			
 
				 	0,                      /*  0     */
			
 
				 	0, 2, 2, 4, 4, 6, 6, 6  /*  1.. 8 */
			
@@ -169,13 +194,13 @@ static int qoa_quant_tab[17] = {
 
				 less accurate at the higher end. In theory, the highest scalefactor that we
			
 
				 would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
			
 
				 rely on the LMS filter to predict samples accurately enough that a maximum 
			
 
				-residual of one quarter of the 16 bit range is high sufficient. I.e. with the 
			
 
				+residual of one quarter of the 16 bit range is sufficient. I.e. with the 
			
 
				 scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
			
 
				 
			
 
				 The scalefactor values are computed as:
			
 
				 scalefactor_tab[s] <- round(pow(s + 1, 2.75)) */
			
 
				 
			
 
				-static int qoa_scalefactor_tab[16] = {
			
 
				+static const int qoa_scalefactor_tab[16] = {
			
 
				 	1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
			
 
				 };
			
 
				 
			
@@ -188,7 +213,7 @@ do this in .16 fixed point with integers, instead of floats.
 
				 The reciprocal_tab is computed as:
			
 
				 reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s] */
			
 
				 
			
 
				-static int qoa_reciprocal_tab[16] = {
			
 
				+static const int qoa_reciprocal_tab[16] = {
			
 
				 	65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
			
 
				 };
			
 
				 
			
@@ -200,9 +225,13 @@ Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4
 
				 instead of 1. The dequant_tab assumes the following dequantized values for each 
			
 
				 of the quant_tab indices and is computed as:
			
 
				 float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
			
 
				-dequant_tab[s][q] <- round(scalefactor_tab[s] * dqt[q]) */
			
 
				+dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
			
 
				+
			
 
				+The rounding employed here is "to nearest, ties away from zero",  i.e. positive
			
 
				+and negative values are treated symmetrically.
			
 
				+*/
			
 
				 
			
 
				-static int qoa_dequant_tab[16][8] = {
			
 
				+static const int qoa_dequant_tab[16][8] = {
			
 
				 	{   1,    -1,    3,    -3,    5,    -5,     7,     -7},
			
 
				 	{   5,    -5,   18,   -18,   32,   -32,    49,    -49},
			
 
				 	{  16,   -16,   53,   -53,   95,   -95,   147,   -147},
			
@@ -270,7 +299,21 @@ static inline int qoa_div(int v, int scalefactor) {
 
				 }
			
 
				 
			
 
				 static inline int qoa_clamp(int v, int min, int max) {
			
 
				-	return (v < min) ? min : (v > max) ? max : v;
			
 
				+	if (v < min) { return min; }
			
 
				+	if (v > max) { return max; }
			
 
				+	return v;
			
 
				+}
			
 
				+
			
 
				+/* This specialized clamp function for the signed 16 bit range improves decode
			
 
				+performance quite a bit. The extra if() statement works nicely with the CPUs
			
 
				+branch prediction as this branch is rarely taken. */
			
 
				+
			
 
				+static inline int qoa_clamp_s16(int v) {
			
 
				+	if ((unsigned int)(v + 32768) > 65535) {
			
 
				+		if (v < -32768) { return -32768; }
			
 
				+		if (v >  32767) { return  32767; }
			
 
				+	}
			
 
				+	return v;
			
 
				 }
			
 
				 
			
 
				 static inline qoa_uint64_t qoa_read_u64(const unsigned char *bytes, unsigned int *p) {
			
@@ -312,6 +355,7 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
 
				 	unsigned int p = 0;
			
 
				 	unsigned int slices = (frame_len + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN;
			
 
				 	unsigned int frame_size = QOA_FRAME_SIZE(channels, slices);
			
 
				+	int prev_scalefactor[QOA_MAX_CHANNELS] = {0};
			
 
				 
			
 
				 	/* Write the frame header */
			
 
				 	qoa_write_u64((
			
@@ -321,8 +365,24 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
 
				 		(qoa_uint64_t)frame_size
			
 
				 	), bytes, &p);
			
 
				 
			
 
				-	/* Write the current LMS state */
			
 
				+	
			
 
				 	for (int c = 0; c < channels; c++) {
			
 
				+		/* If the weights have grown too large, reset them to 0. This may happen
			
 
				+		with certain high-frequency sounds. This is a last resort and will 
			
 
				+		introduce quite a bit of noise, but should at least prevent pops/clicks */
			
 
				+		int weights_sum = 
			
 
				+			qoa->lms[c].weights[0] * qoa->lms[c].weights[0] + 
			
 
				+			qoa->lms[c].weights[1] * qoa->lms[c].weights[1] + 
			
 
				+			qoa->lms[c].weights[2] * qoa->lms[c].weights[2] + 
			
 
				+			qoa->lms[c].weights[3] * qoa->lms[c].weights[3];
			
 
				+		if (weights_sum > 0x2fffffff) {
			
 
				+			qoa->lms[c].weights[0] = 0;
			
 
				+			qoa->lms[c].weights[1] = 0;
			
 
				+			qoa->lms[c].weights[2] = 0;
			
 
				+			qoa->lms[c].weights[3] = 0;
			
 
				+		}
			
 
				+
			
 
				+		/* Write the current LMS state */
			
 
				 		qoa_uint64_t weights = 0;
			
 
				 		qoa_uint64_t history = 0;
			
 
				 		for (int i = 0; i < QOA_LMS_LEN; i++) {
			
@@ -348,8 +408,13 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
 
				 			qoa_uint64_t best_error = -1;
			
 
				 			qoa_uint64_t best_slice;
			
 
				 			qoa_lms_t best_lms;
			
 
				+			int best_scalefactor;
			
 
				 
			
 
				-			for (int scalefactor = 0; scalefactor < 16; scalefactor++) {
			
 
				+			for (int sfi = 0; sfi < 16; sfi++) {
			
 
				+				/* There is a strong correlation between the scalefactors of
			
 
				+				neighboring slices. As an optimization, start testing
			
 
				+				the best scalefactor of the previous slice first. */
			
 
				+				int scalefactor = (sfi + prev_scalefactor[c]) % 16;
			
 
				 
			
 
				 				/* We have to reset the LMS state to the last known good one
			
 
				 				before trying each scalefactor, as each pass updates the LMS
			
@@ -367,7 +432,7 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
 
				 					int clamped = qoa_clamp(scaled, -8, 8);
			
 
				 					int quantized = qoa_quant_tab[clamped + 8];
			
 
				 					int dequantized = qoa_dequant_tab[scalefactor][quantized];
			
 
				-					int reconstructed = qoa_clamp(predicted + dequantized, -32768, 32767);
			
 
				+					int reconstructed = qoa_clamp_s16(predicted + dequantized);
			
 
				 
			
 
				 					long long error = (sample - reconstructed);
			
 
				 					current_error += error * error;
			
@@ -383,9 +448,12 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
 
				 					best_error = current_error;
			
 
				 					best_slice = slice;
			
 
				 					best_lms = lms;
			
 
				+					best_scalefactor = scalefactor;
			
 
				 				}
			
 
				 			}
			
 
				 
			
 
				+			prev_scalefactor[c] = best_scalefactor;
			
 
				+
			
 
				 			qoa->lms[c] = best_lms;
			
 
				 			#ifdef QOA_RECORD_TOTAL_ERROR
			
 
				 				qoa->error += best_error;
			
@@ -553,7 +621,7 @@ unsigned int qoa_decode_frame(const unsigned char *bytes, unsigned int size, qoa
 
				 				int predicted = qoa_lms_predict(&qoa->lms[c]);
			
 
				 				int quantized = (slice >> 57) & 0x7;
			
 
				 				int dequantized = qoa_dequant_tab[scalefactor][quantized];
			
 
				-				int reconstructed = qoa_clamp(predicted + dequantized, -32768, 32767);
			
 
				+				int reconstructed = qoa_clamp_s16(predicted + dequantized);
			
 
				 				
			
 
				 				sample_data[si] = reconstructed;
			
 
				 				slice <<= 3;