Browse Source

Update qoa.h

Ray 1 year ago
parent
commit
925978ffde
1 changed files with 119 additions and 51 deletions
  1. 119 51
      src/external/qoa.h

+ 119 - 51
src/external/qoa.h

@@ -8,71 +8,96 @@ QOA - The "Quite OK Audio" format for fast, lossy audio compression
 
 
 -- Data Format
 -- Data Format
 
 
-A QOA file has an 8 byte file header, followed by a number of frames. Each frame 
-consists of an 8 byte frame header, the current 8 byte en-/decoder state per
-channel and 256 slices per channel. Each slice is 8 bytes wide and encodes 20 
-samples of audio data.
+QOA encodes pulse-code modulated (PCM) audio data with up to 255 channels, 
+sample rates from 1 up to 16777215 hertz and a bit depth of 16 bits.
 
 
-Note that the last frame of a file may contain less than 256 slices per channel.
-The last slice (per channel) in the last frame may contain less 20 samples, but
-the slice will still be 8 bytes wide, with the unused samples zeroed out.
+The compression method employed in QOA is lossy; it discards some information
+from the uncompressed PCM data. For many types of audio signals this compression
+is "transparent", i.e. the difference from the original file is often not
+audible.
 
 
-The samplerate and number of channels is only stated in the frame headers, but
-not in the file header. A decoder may peek into the first frame of the file to 
-find these values.
+QOA encodes 20 samples of 16 bit PCM data into slices of 64 bits. A single
+sample therefore requires 3.2 bits of storage space, resulting in a 5x
+compression (16 / 3.2).
 
 
-In a valid QOA file all frames have the same number of channels and the same
-samplerate. These restrictions may be relaxed for streaming. This remains to 
-be decided.
+A QOA file consists of an 8 byte file header, followed by a number of frames.
+Each frame contains an 8 byte frame header, the current 16 byte en-/decoder
+state per channel and 256 slices per channel. Each slice is 8 bytes wide and
+encodes 20 samples of audio data.
 
 
-All values in a QOA file are BIG ENDIAN. Luckily, EVERYTHING in a QOA file,
-including the headers, is 64 bit aligned, so it's possible to read files with 
-just a read_u64() that does the byte swapping if necessary.
-
-In pseudocode, the file layout is as follows:
+All values, including the slices, are big endian. The file layout is as follows:
 
 
 struct {
 struct {
 	struct {
 	struct {
-		char     magic[4];         // magic bytes 'qoaf'
-		uint32_t samples;          // number of samples per channel in this file
-	} file_header;                 // = 64 bits
+		char     magic[4];         // magic bytes "qoaf"
+		uint32_t samples;          // samples per channel in this file
+	} file_header;             
 
 
 	struct {
 	struct {
 		struct {
 		struct {
-			uint8_t  num_channels; // number of channels
+			uint8_t  num_channels; // no. of channels
 			uint24_t samplerate;   // samplerate in hz
 			uint24_t samplerate;   // samplerate in hz
-			uint16_t fsamples;     // sample count per channel in this frame
-			uint16_t fsize;        // frame size (including the frame header)
-		} frame_header;            // = 64 bits
+			uint16_t fsamples;     // samples per channel in this frame
+			uint16_t fsize;        // frame size (includes this header)
+		} frame_header;          
 
 
 		struct {
 		struct {
-			int16_t history[4];    // = 64 bits
-			int16_t weights[4];    // = 64 bits
+			int16_t history[4];    // most recent last
+			int16_t weights[4];    // most recent last
 		} lms_state[num_channels]; 
 		} lms_state[num_channels]; 
 
 
-		qoa_slice_t slices[256][num_channels]; // = 64 bits each
-	} frames[samples * channels / qoa_max_framesize()];
-} qoa_file;
+		qoa_slice_t slices[256][num_channels];
+
+	} frames[ceil(samples / (256 * 20))];
+} qoa_file_t;
 
 
-Wheras the 64bit qoa_slice_t is defined as follows:
+Each `qoa_slice_t` contains a quantized scalefactor `sf_quant` and 20 quantized
+residuals `qrNN`:
 
 
 .- QOA_SLICE -- 64 bits, 20 samples --------------------------/  /------------.
 .- QOA_SLICE -- 64 bits, 20 samples --------------------------/  /------------.
 |        Byte[0]         |        Byte[1]         |  Byte[2]  \  \  Byte[7]   |
 |        Byte[0]         |        Byte[1]         |  Byte[2]  \  \  Byte[7]   |
 | 7  6  5  4  3  2  1  0 | 7  6  5  4  3  2  1  0 | 7  6  5   /  /    2  1  0 |
 | 7  6  5  4  3  2  1  0 | 7  6  5  4  3  2  1  0 | 7  6  5   /  /    2  1  0 |
 |------------+--------+--------+--------+---------+---------+-\  \--+---------|
 |------------+--------+--------+--------+---------+---------+-\  \--+---------|
-|  sf_index  |  r00   |   r01  |   r02  |  r03    |   r04   | /  /  |   r19   |
+|  sf_quant  |  qr00  |  qr01  |  qr02  |  qr03   |  qr04   | /  /  |  qr19   |
 `-------------------------------------------------------------\  \------------`
 `-------------------------------------------------------------\  \------------`
 
 
-`sf_index` defines the scalefactor to use for this slice as an index into the
-qoa_scalefactor_tab[16]
+Each frame except the last must contain exactly 256 slices per channel. The last
+frame may contain between 1 .. 256 (inclusive) slices per channel. The last
+slice (for each channel) in the last frame may contain fewer than 20 samples; the
+slice still must be 8 bytes wide, with the unused samples zeroed out.
+
+Channels are interleaved per slice. E.g. for 2 channel stereo: 
+slice[0] = L, slice[1] = R, slice[2] = L, slice[3] = R ...
+
+A valid QOA file or stream must have at least one frame. Each frame must contain
+at least one channel and one sample with a samplerate between 1 .. 16777215
+(inclusive).
+
+If the total number of samples is not known by the encoder, the samples in the
+file header may be set to 0x00000000 to indicate that the encoder is 
+"streaming". In a streaming context, the samplerate and number of channels may
+differ from frame to frame. For static files (those with samples set to a
+non-zero value), each frame must have the same number of channels and same
+samplerate.
 
 
-`r00`--`r19` are the residuals for the individual samples, divided by the
-scalefactor and quantized by the qoa_quant_tab[].
+Note that this implementation of QOA only handles files with a known total
+number of samples.
 
 
-In the decoder, a prediction of the next sample is computed by multiplying the 
-state (the last four output samples) with the predictor. The residual from the 
-slice is then dequantized using the qoa_dequant_tab[] and added to the 
-prediction. The result is clamped to int16 to form the final output sample.
+A decoder should support at least 8 channels. The channel layout for channel
+counts 1 .. 8 is:
+
+	1. Mono
+	2. L, R
+	3. L, R, C 
+	4. FL, FR, B/SL, B/SR 
+	5. FL, FR, C, B/SL, B/SR 
+	6. FL, FR, C, LFE, B/SL, B/SR
+	7. FL, FR, C, LFE, B, SL, SR 
+	8. FL, FR, C, LFE, BL, BR, SL, SR
+
+QOA predicts each audio sample based on the previously decoded ones using a
+"Sign-Sign Least Mean Squares Filter" (LMS). This prediction plus the 
+dequantized residual forms the final output sample.
 
 
 */
 */
 
 
@@ -158,7 +183,7 @@ the higher end. Note that the residual zero is identical to the lowest positive
 value. This is mostly fine, since the qoa_div() function always rounds away 
 value. This is mostly fine, since the qoa_div() function always rounds away 
 from zero. */
 from zero. */
 
 
-static int qoa_quant_tab[17] = {
+static const int qoa_quant_tab[17] = {
 	7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
 	7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
 	0,                      /*  0     */
 	0,                      /*  0     */
 	0, 2, 2, 4, 4, 6, 6, 6  /*  1.. 8 */
 	0, 2, 2, 4, 4, 6, 6, 6  /*  1.. 8 */
@@ -169,13 +194,13 @@ static int qoa_quant_tab[17] = {
 less accurate at the higher end. In theory, the highest scalefactor that we
 less accurate at the higher end. In theory, the highest scalefactor that we
 would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
 would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
 rely on the LMS filter to predict samples accurately enough that a maximum 
 rely on the LMS filter to predict samples accurately enough that a maximum 
-residual of one quarter of the 16 bit range is high sufficient. I.e. with the 
+residual of one quarter of the 16 bit range is sufficient. I.e. with the 
 scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
 scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
 
 
 The scalefactor values are computed as:
 The scalefactor values are computed as:
 scalefactor_tab[s] <- round(pow(s + 1, 2.75)) */
 scalefactor_tab[s] <- round(pow(s + 1, 2.75)) */
 
 
-static int qoa_scalefactor_tab[16] = {
+static const int qoa_scalefactor_tab[16] = {
 	1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
 	1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
 };
 };
 
 
@@ -188,7 +213,7 @@ do this in .16 fixed point with integers, instead of floats.
 The reciprocal_tab is computed as:
 The reciprocal_tab is computed as:
 reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s] */
 reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s] */
 
 
-static int qoa_reciprocal_tab[16] = {
+static const int qoa_reciprocal_tab[16] = {
 	65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
 	65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
 };
 };
 
 
@@ -200,9 +225,13 @@ Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4
 instead of 1. The dequant_tab assumes the following dequantized values for each 
 instead of 1. The dequant_tab assumes the following dequantized values for each 
 of the quant_tab indices and is computed as:
 of the quant_tab indices and is computed as:
 float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
 float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
-dequant_tab[s][q] <- round(scalefactor_tab[s] * dqt[q]) */
+dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
+
+The rounding employed here is "to nearest, ties away from zero", i.e. positive
+and negative values are treated symmetrically.
+*/
 
 
-static int qoa_dequant_tab[16][8] = {
+static const int qoa_dequant_tab[16][8] = {
 	{   1,    -1,    3,    -3,    5,    -5,     7,     -7},
 	{   1,    -1,    3,    -3,    5,    -5,     7,     -7},
 	{   5,    -5,   18,   -18,   32,   -32,    49,    -49},
 	{   5,    -5,   18,   -18,   32,   -32,    49,    -49},
 	{  16,   -16,   53,   -53,   95,   -95,   147,   -147},
 	{  16,   -16,   53,   -53,   95,   -95,   147,   -147},
@@ -270,7 +299,21 @@ static inline int qoa_div(int v, int scalefactor) {
 }
 }
 
 
 static inline int qoa_clamp(int v, int min, int max) {
 static inline int qoa_clamp(int v, int min, int max) {
-	return (v < min) ? min : (v > max) ? max : v;
+	if (v < min) { return min; }
+	if (v > max) { return max; }
+	return v;
+}
+
+/* This specialized clamp function for the signed 16 bit range improves decode
+performance quite a bit. The extra if() statement works nicely with the CPU's
+branch prediction as this branch is rarely taken. */
+
+static inline int qoa_clamp_s16(int v) {
+	if ((unsigned int)(v + 32768) > 65535) {
+		if (v < -32768) { return -32768; }
+		if (v >  32767) { return  32767; }
+	}
+	return v;
 }
 }
 
 
 static inline qoa_uint64_t qoa_read_u64(const unsigned char *bytes, unsigned int *p) {
 static inline qoa_uint64_t qoa_read_u64(const unsigned char *bytes, unsigned int *p) {
@@ -312,6 +355,7 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
 	unsigned int p = 0;
 	unsigned int p = 0;
 	unsigned int slices = (frame_len + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN;
 	unsigned int slices = (frame_len + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN;
 	unsigned int frame_size = QOA_FRAME_SIZE(channels, slices);
 	unsigned int frame_size = QOA_FRAME_SIZE(channels, slices);
+	int prev_scalefactor[QOA_MAX_CHANNELS] = {0};
 
 
 	/* Write the frame header */
 	/* Write the frame header */
 	qoa_write_u64((
 	qoa_write_u64((
@@ -321,8 +365,24 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
 		(qoa_uint64_t)frame_size
 		(qoa_uint64_t)frame_size
 	), bytes, &p);
 	), bytes, &p);
 
 
-	/* Write the current LMS state */
+	
 	for (int c = 0; c < channels; c++) {
 	for (int c = 0; c < channels; c++) {
+		/* If the weights have grown too large, reset them to 0. This may happen
+		with certain high-frequency sounds. This is a last resort and will 
+		introduce quite a bit of noise, but should at least prevent pops/clicks */
+		int weights_sum = 
+			qoa->lms[c].weights[0] * qoa->lms[c].weights[0] + 
+			qoa->lms[c].weights[1] * qoa->lms[c].weights[1] + 
+			qoa->lms[c].weights[2] * qoa->lms[c].weights[2] + 
+			qoa->lms[c].weights[3] * qoa->lms[c].weights[3];
+		if (weights_sum > 0x2fffffff) {
+			qoa->lms[c].weights[0] = 0;
+			qoa->lms[c].weights[1] = 0;
+			qoa->lms[c].weights[2] = 0;
+			qoa->lms[c].weights[3] = 0;
+		}
+
+		/* Write the current LMS state */
 		qoa_uint64_t weights = 0;
 		qoa_uint64_t weights = 0;
 		qoa_uint64_t history = 0;
 		qoa_uint64_t history = 0;
 		for (int i = 0; i < QOA_LMS_LEN; i++) {
 		for (int i = 0; i < QOA_LMS_LEN; i++) {
@@ -348,8 +408,13 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
 			qoa_uint64_t best_error = -1;
 			qoa_uint64_t best_error = -1;
 			qoa_uint64_t best_slice;
 			qoa_uint64_t best_slice;
 			qoa_lms_t best_lms;
 			qoa_lms_t best_lms;
+			int best_scalefactor;
 
 
-			for (int scalefactor = 0; scalefactor < 16; scalefactor++) {
+			for (int sfi = 0; sfi < 16; sfi++) {
+				/* There is a strong correlation between the scalefactors of
+				neighboring slices. As an optimization, start testing
+				the best scalefactor of the previous slice first. */
+				int scalefactor = (sfi + prev_scalefactor[c]) % 16;
 
 
 				/* We have to reset the LMS state to the last known good one
 				/* We have to reset the LMS state to the last known good one
 				before trying each scalefactor, as each pass updates the LMS
 				before trying each scalefactor, as each pass updates the LMS
@@ -367,7 +432,7 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
 					int clamped = qoa_clamp(scaled, -8, 8);
 					int clamped = qoa_clamp(scaled, -8, 8);
 					int quantized = qoa_quant_tab[clamped + 8];
 					int quantized = qoa_quant_tab[clamped + 8];
 					int dequantized = qoa_dequant_tab[scalefactor][quantized];
 					int dequantized = qoa_dequant_tab[scalefactor][quantized];
-					int reconstructed = qoa_clamp(predicted + dequantized, -32768, 32767);
+					int reconstructed = qoa_clamp_s16(predicted + dequantized);
 
 
 					long long error = (sample - reconstructed);
 					long long error = (sample - reconstructed);
 					current_error += error * error;
 					current_error += error * error;
@@ -383,9 +448,12 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
 					best_error = current_error;
 					best_error = current_error;
 					best_slice = slice;
 					best_slice = slice;
 					best_lms = lms;
 					best_lms = lms;
+					best_scalefactor = scalefactor;
 				}
 				}
 			}
 			}
 
 
+			prev_scalefactor[c] = best_scalefactor;
+
 			qoa->lms[c] = best_lms;
 			qoa->lms[c] = best_lms;
 			#ifdef QOA_RECORD_TOTAL_ERROR
 			#ifdef QOA_RECORD_TOTAL_ERROR
 				qoa->error += best_error;
 				qoa->error += best_error;
@@ -553,7 +621,7 @@ unsigned int qoa_decode_frame(const unsigned char *bytes, unsigned int size, qoa
 				int predicted = qoa_lms_predict(&qoa->lms[c]);
 				int predicted = qoa_lms_predict(&qoa->lms[c]);
 				int quantized = (slice >> 57) & 0x7;
 				int quantized = (slice >> 57) & 0x7;
 				int dequantized = qoa_dequant_tab[scalefactor][quantized];
 				int dequantized = qoa_dequant_tab[scalefactor][quantized];
-				int reconstructed = qoa_clamp(predicted + dequantized, -32768, 32767);
+				int reconstructed = qoa_clamp_s16(predicted + dequantized);
 				
 				
 				sample_data[si] = reconstructed;
 				sample_data[si] = reconstructed;
 				slice <<= 3;
 				slice <<= 3;