
Revert "opus: Packaging fixups after #33311"

This reverts commit 0387657fa4c3c71c6cb427ce7ed8bbcdf17ba7e1.
Rémi Verschelde, 5 years ago
parent commit 974646309b

+ 0 - 555
thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S

@@ -1,555 +0,0 @@
-    .syntax unified
-@ Copyright (c) 2007-2008 CSIRO
-@ Copyright (c) 2007-2009 Xiph.Org Foundation
-@ Copyright (c) 2013      Parrot
-@ Written by Aurélien Zanelli
-@
-@ Redistribution and use in source and binary forms, with or without
-@ modification, are permitted provided that the following conditions
-@ are met:
-@
-@ - Redistributions of source code must retain the above copyright
-@ notice, this list of conditions and the following disclaimer.
-@
-@ - Redistributions in binary form must reproduce the above copyright
-@ notice, this list of conditions and the following disclaimer in the
-@ documentation and/or other materials provided with the distribution.
-@
-@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    .text;   .p2align 2;   .arch armv7-a
-   .fpu neon
-   .object_arch armv4t
-
-  .include "celt/arm/armopts-gnu.S"
-
- .if OPUS_ARM_MAY_HAVE_EDSP
-  .global celt_pitch_xcorr_edsp
- .endif
-
- .if OPUS_ARM_MAY_HAVE_NEON
-  .global celt_pitch_xcorr_neon
- .endif
-
- .if OPUS_ARM_MAY_HAVE_NEON
-
-@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
-	.type	xcorr_kernel_neon, %function; xcorr_kernel_neon: @ PROC
-xcorr_kernel_neon_start:
-  @ input:
-  @   r3     = int         len
-  @   r4     = opus_val16 *x
-  @   r5     = opus_val16 *y
-  @   q0     = opus_val32  sum[4]
-  @ output:
-  @   q0     = opus_val32  sum[4]
-  @ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
-  @ internal usage:
-  @   r12 = int j
-  @   d3  = y_3|y_2|y_1|y_0
-  @   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
-  @   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
-  @   q8  = scratch
-  @
-  @ Load y[0...3]
-  @ This requires len>0 to always be valid (which we assert in the C code).
-  VLD1.16      {d5}, [r5]!
-  SUBS         r12, r3, #8
-  BLE xcorr_kernel_neon_process4
-@ Process 8 samples at a time.
-@ This loop loads one y value more than we actually need. Therefore we have to
-@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
-@ reading past the end of the array.
-xcorr_kernel_neon_process8:
-  @ This loop has 19 total instructions (10 cycles to issue, minimum), with
-  @ - 2 cycles of ARM instructions,
-  @ - 10 cycles of load/store/byte permute instructions, and
-  @ - 9 cycles of data processing instructions.
-  @ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
-  @ latter two categories, meaning the whole loop should run in 10 cycles per
-  @ iteration, barring cache misses.
-  @
-  @ Load x[0...7]
-  VLD1.16      {d6, d7}, [r4]!
-  @ Unlike VMOV, VAND is a data processing instruction (and doesn't get
-  @ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
-  VAND         d3, d5, d5
-  SUBS         r12, r12, #8
-  @ Load y[4...11]
-  VLD1.16      {d4, d5}, [r5]!
-  VMLAL.S16    q0, d3, d6[0]
-  VEXT.16      d16, d3, d4, #1
-  VMLAL.S16    q0, d4, d7[0]
-  VEXT.16      d17, d4, d5, #1
-  VMLAL.S16    q0, d16, d6[1]
-  VEXT.16      d16, d3, d4, #2
-  VMLAL.S16    q0, d17, d7[1]
-  VEXT.16      d17, d4, d5, #2
-  VMLAL.S16    q0, d16, d6[2]
-  VEXT.16      d16, d3, d4, #3
-  VMLAL.S16    q0, d17, d7[2]
-  VEXT.16      d17, d4, d5, #3
-  VMLAL.S16    q0, d16, d6[3]
-  VMLAL.S16    q0, d17, d7[3]
-  BGT xcorr_kernel_neon_process8
-@ Process 4 samples here if we have > 4 left (still reading one extra y value).
-xcorr_kernel_neon_process4:
-  ADDS         r12, r12, #4
-  BLE xcorr_kernel_neon_process2
-  @ Load x[0...3]
-  VLD1.16      d6, [r4]!
-  @ Use VAND since it's a data processing instruction again.
-  VAND         d4, d5, d5
-  SUB          r12, r12, #4
-  @ Load y[4...7]
-  VLD1.16      d5, [r5]!
-  VMLAL.S16    q0, d4, d6[0]
-  VEXT.16      d16, d4, d5, #1
-  VMLAL.S16    q0, d16, d6[1]
-  VEXT.16      d16, d4, d5, #2
-  VMLAL.S16    q0, d16, d6[2]
-  VEXT.16      d16, d4, d5, #3
-  VMLAL.S16    q0, d16, d6[3]
-@ Process 2 samples here if we have > 2 left (still reading one extra y value).
-xcorr_kernel_neon_process2:
-  ADDS         r12, r12, #2
-  BLE xcorr_kernel_neon_process1
-  @ Load x[0...1]
-  VLD2.16      {d6[],d7[]}, [r4]!
-  @ Use VAND since it's a data processing instruction again.
-  VAND         d4, d5, d5
-  SUB          r12, r12, #2
-  @ Load y[4...5]
-  VLD1.32      {d5[]}, [r5]!
-  VMLAL.S16    q0, d4, d6
-  VEXT.16      d16, d4, d5, #1
-  @ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
-  @ instead of VEXT, since it's a data-processing instruction.
-  VSRI.64      d5, d4, #32
-  VMLAL.S16    q0, d16, d7
-@ Process 1 sample using the extra y value we loaded above.
-xcorr_kernel_neon_process1:
-  @ Load next *x
-  VLD1.16      {d6[]}, [r4]!
-  ADDS         r12, r12, #1
-  @ y[0...3] are left in d5 from prior iteration(s) (if any)
-  VMLAL.S16    q0, d5, d6
-  MOVLE        pc, lr
-@ Now process 1 last sample, not reading ahead.
-  @ Load last *y
-  VLD1.16      {d4[]}, [r5]!
-  VSRI.64      d4, d5, #16
-  @ Load last *x
-  VLD1.16      {d6[]}, [r4]!
-  VMLAL.S16    q0, d4, d6
-  MOV          pc, lr
-	.size xcorr_kernel_neon, .-xcorr_kernel_neon  @ ENDP
-
-@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
-@  opus_val32 *xcorr, int len, int max_pitch, int arch)
-	.type	celt_pitch_xcorr_neon, %function; celt_pitch_xcorr_neon: @ PROC
-  @ input:
-  @   r0  = opus_val16 *_x
-  @   r1  = opus_val16 *_y
-  @   r2  = opus_val32 *xcorr
-  @   r3  = int         len
-  @ output:
-  @   r0  = int         maxcorr
-  @ internal usage:
-  @   r4  = opus_val16 *x (for xcorr_kernel_neon())
-  @   r5  = opus_val16 *y (for xcorr_kernel_neon())
-  @   r6  = int         max_pitch
-  @   r12 = int         j
-  @   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
-  @ ignored:
-  @         int         arch
-  STMFD        sp!, {r4-r6, lr}
-  LDR          r6, [sp, #16]
-  VMOV.S32     q15, #1
-  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
-  SUBS         r6, r6, #4
-  BLT celt_pitch_xcorr_neon_process4_done
-celt_pitch_xcorr_neon_process4:
-  @ xcorr_kernel_neon parameters:
-  @ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
-  MOV          r4, r0
-  MOV          r5, r1
-  VEOR         q0, q0, q0
-  @ xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
-  @ So we don't save/restore any other registers.
-  BL xcorr_kernel_neon_start
-  SUBS         r6, r6, #4
-  VST1.32      {q0}, [r2]!
-  @ _y += 4
-  ADD          r1, r1, #8
-  VMAX.S32     q15, q15, q0
-  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
-  BGE celt_pitch_xcorr_neon_process4
-@ We have less than 4 sums left to compute.
-celt_pitch_xcorr_neon_process4_done:
-  ADDS         r6, r6, #4
-  @ Reduce maxcorr to a single value
-  VMAX.S32     d30, d30, d31
-  VPMAX.S32    d30, d30, d30
-  @ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
-  BLE celt_pitch_xcorr_neon_done
-@ Now compute each remaining sum one at a time.
-celt_pitch_xcorr_neon_process_remaining:
-  MOV          r4, r0
-  MOV          r5, r1
-  VMOV.I32     q0, #0
-  SUBS         r12, r3, #8
-  BLT celt_pitch_xcorr_neon_process_remaining4
-@ Sum terms 8 at a time.
-celt_pitch_xcorr_neon_process_remaining_loop8:
-  @ Load x[0...7]
-  VLD1.16      {q1}, [r4]!
-  @ Load y[0...7]
-  VLD1.16      {q2}, [r5]!
-  SUBS         r12, r12, #8
-  VMLAL.S16    q0, d4, d2
-  VMLAL.S16    q0, d5, d3
-  BGE celt_pitch_xcorr_neon_process_remaining_loop8
-@ Sum terms 4 at a time.
-celt_pitch_xcorr_neon_process_remaining4:
-  ADDS         r12, r12, #4
-  BLT celt_pitch_xcorr_neon_process_remaining4_done
-  @ Load x[0...3]
-  VLD1.16      {d2}, [r4]!
-  @ Load y[0...3]
-  VLD1.16      {d3}, [r5]!
-  SUB          r12, r12, #4
-  VMLAL.S16    q0, d3, d2
-celt_pitch_xcorr_neon_process_remaining4_done:
-  @ Reduce the sum to a single value.
-  VADD.S32     d0, d0, d1
-  VPADDL.S32   d0, d0
-  ADDS         r12, r12, #4
-  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
-@ Sum terms 1 at a time.
-celt_pitch_xcorr_neon_process_remaining_loop1:
-  VLD1.16      {d2[]}, [r4]!
-  VLD1.16      {d3[]}, [r5]!
-  SUBS         r12, r12, #1
-  VMLAL.S16    q0, d2, d3
-  BGT celt_pitch_xcorr_neon_process_remaining_loop1
-celt_pitch_xcorr_neon_process_remaining_loop_done:
-  VST1.32      {d0[0]}, [r2]!
-  VMAX.S32     d30, d30, d0
-  SUBS         r6, r6, #1
-  @ _y++
-  ADD          r1, r1, #2
-  @ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
-  BGT celt_pitch_xcorr_neon_process_remaining
-celt_pitch_xcorr_neon_done:
-  VMOV.32      r0, d30[0]
-  LDMFD        sp!, {r4-r6, pc}
-	.size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon  @ ENDP
-
- .endif
-
- .if OPUS_ARM_MAY_HAVE_EDSP
-
-@ This will get used on ARMv7 devices without NEON, so it has been optimized
-@ to take advantage of dual-issuing where possible.
-	.type	xcorr_kernel_edsp, %function; xcorr_kernel_edsp: @ PROC
-xcorr_kernel_edsp_start:
-  @ input:
-  @   r3      = int         len
-  @   r4      = opus_val16 *_x (must be 32-bit aligned)
-  @   r5      = opus_val16 *_y (must be 32-bit aligned)
-  @   r6...r9 = opus_val32  sum[4]
-  @ output:
-  @   r6...r9 = opus_val32  sum[4]
-  @ preserved: r0-r5
-  @ internal usage
-  @   r2      = int         j
-  @   r12,r14 = opus_val16  x[4]
-  @   r10,r11 = opus_val16  y[4]
-  STMFD        sp!, {r2,r4,r5,lr}
-  LDR          r10, [r5], #4      @ Load y[0...1]
-  SUBS         r2, r3, #4         @ j = len-4
-  LDR          r11, [r5], #4      @ Load y[2...3]
-  BLE xcorr_kernel_edsp_process4_done
-  LDR          r12, [r4], #4      @ Load x[0...1]
-  @ Stall
-xcorr_kernel_edsp_process4:
-  @ The multiplies must issue from pipeline 0, and can't dual-issue with each
-  @ other. Every other instruction here dual-issues with a multiply, and is
-  @ thus "free". There should be no stalls in the body of the loop.
-  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_0,y_0)
-  LDR          r14, [r4], #4      @ Load x[2...3]
-  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x_0,y_1)
-  SUBS         r2, r2, #4         @ j-=4
-  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_0,y_2)
-  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x_0,y_3)
-  SMLATT       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_1,y_1)
-  LDR          r10, [r5], #4      @ Load y[4...5]
-  SMLATB       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],x_1,y_2)
-  SMLATT       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_1,y_3)
-  SMLATB       r9, r12, r10, r9   @ sum[3] = MAC16_16(sum[3],x_1,y_4)
-  LDRGT        r12, [r4], #4      @ Load x[0...1]
-  SMLABB       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_2,y_2)
-  SMLABT       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x_2,y_3)
-  SMLABB       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_2,y_4)
-  SMLABT       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x_2,y_5)
-  SMLATT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_3,y_3)
-  LDR          r11, [r5], #4      @ Load y[6...7]
-  SMLATB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],x_3,y_4)
-  SMLATT       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_3,y_5)
-  SMLATB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],x_3,y_6)
-  BGT xcorr_kernel_edsp_process4
-xcorr_kernel_edsp_process4_done:
-  ADDS         r2, r2, #4
-  BLE xcorr_kernel_edsp_done
-  LDRH         r12, [r4], #2      @ r12 = *x++
-  SUBS         r2, r2, #1         @ j--
-  @ Stall
-  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_0)
-  LDRHGT       r14, [r4], #2      @ r14 = *x++
-  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x,y_1)
-  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_2)
-  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x,y_3)
-  BLE xcorr_kernel_edsp_done
-  SMLABT       r6, r14, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_1)
-  SUBS         r2, r2, #1         @ j--
-  SMLABB       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x,y_2)
-  LDRH         r10, [r5], #2      @ r10 = y_4 = *y++
-  SMLABT       r8, r14, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_3)
-  LDRHGT       r12, [r4], #2      @ r12 = *x++
-  SMLABB       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x,y_4)
-  BLE xcorr_kernel_edsp_done
-  SMLABB       r6, r12, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_2)
-  CMP          r2, #1             @ j--
-  SMLABT       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_3)
-  LDRH         r2, [r5], #2       @ r2 = y_5 = *y++
-  SMLABB       r8, r12, r10, r8   @ sum[2] = MAC16_16(sum[2],tmp,y_4)
-  LDRHGT       r14, [r4]          @ r14 = *x
-  SMLABB       r9, r12, r2, r9    @ sum[3] = MAC16_16(sum[3],tmp,y_5)
-  BLE xcorr_kernel_edsp_done
-  SMLABT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_3)
-  LDRH         r11, [r5]          @ r11 = y_6 = *y
-  SMLABB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_4)
-  SMLABB       r8, r14, r2, r8    @ sum[2] = MAC16_16(sum[2],tmp,y_5)
-  SMLABB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],tmp,y_6)
-xcorr_kernel_edsp_done:
-  LDMFD        sp!, {r2,r4,r5,pc}
-	.size xcorr_kernel_edsp, .-xcorr_kernel_edsp  @ ENDP
-
-	.type	celt_pitch_xcorr_edsp, %function; celt_pitch_xcorr_edsp: @ PROC
-  @ input:
-  @   r0  = opus_val16 *_x (must be 32-bit aligned)
-  @   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
-  @   r2  = opus_val32 *xcorr
-  @   r3  = int         len
-  @ output:
-  @   r0  = maxcorr
-  @ internal usage
-  @   r4  = opus_val16 *x
-  @   r5  = opus_val16 *y
-  @   r6  = opus_val32  sum0
-  @   r7  = opus_val32  sum1
-  @   r8  = opus_val32  sum2
-  @   r9  = opus_val32  sum3
-  @   r1  = int         max_pitch
-  @   r12 = int         j
-  @ ignored:
-  @         int         arch
-  STMFD        sp!, {r4-r11, lr}
-  MOV          r5, r1
-  LDR          r1, [sp, #36]
-  MOV          r4, r0
-  TST          r5, #3
-  @ maxcorr = 1
-  MOV          r0, #1
-  BEQ          celt_pitch_xcorr_edsp_process1u_done
-@ Compute one sum at the start to make y 32-bit aligned.
-  SUBS         r12, r3, #4
-  @ r14 = sum = 0
-  MOV          r14, #0
-  LDRH         r8, [r5], #2
-  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
-  LDR          r6, [r4], #4
-  MOV          r8, r8, LSL #16
-celt_pitch_xcorr_edsp_process1u_loop4:
-  LDR          r9, [r5], #4
-  SMLABT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
-  LDR          r7, [r4], #4
-  SMLATB       r14, r6, r9, r14     @ sum = MAC16_16(sum, x_1, y_1)
-  LDR          r8, [r5], #4
-  SMLABT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
-  SUBS         r12, r12, #4         @ j-=4
-  SMLATB       r14, r7, r8, r14     @ sum = MAC16_16(sum, x_3, y_3)
-  LDRGT        r6, [r4], #4
-  BGT celt_pitch_xcorr_edsp_process1u_loop4
-  MOV          r8, r8, LSR #16
-celt_pitch_xcorr_edsp_process1u_loop4_done:
-  ADDS         r12, r12, #4
-celt_pitch_xcorr_edsp_process1u_loop1:
-  LDRHGE       r6, [r4], #2
-  @ Stall
-  SMLABBGE     r14, r6, r8, r14    @ sum = MAC16_16(sum, *x, *y)
-  SUBSGE       r12, r12, #1
-  LDRHGT       r8, [r5], #2
-  BGT celt_pitch_xcorr_edsp_process1u_loop1
-  @ Restore _x
-  SUB          r4, r4, r3, LSL #1
-  @ Restore and advance _y
-  SUB          r5, r5, r3, LSL #1
-  @ maxcorr = max(maxcorr, sum)
-  CMP          r0, r14
-  ADD          r5, r5, #2
-  MOVLT        r0, r14
-  SUBS         r1, r1, #1
-  @ xcorr[i] = sum
-  STR          r14, [r2], #4
-  BLE celt_pitch_xcorr_edsp_done
-celt_pitch_xcorr_edsp_process1u_done:
-  @ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
-  SUBS         r1, r1, #4
-  BLT celt_pitch_xcorr_edsp_process2
-celt_pitch_xcorr_edsp_process4:
-  @ xcorr_kernel_edsp parameters:
-  @ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
-  MOV          r6, #0
-  MOV          r7, #0
-  MOV          r8, #0
-  MOV          r9, #0
-  BL xcorr_kernel_edsp_start  @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
-  @ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
-  CMP          r0, r6
-  @ _y+=4
-  ADD          r5, r5, #8
-  MOVLT        r0, r6
-  CMP          r0, r7
-  MOVLT        r0, r7
-  CMP          r0, r8
-  MOVLT        r0, r8
-  CMP          r0, r9
-  MOVLT        r0, r9
-  STMIA        r2!, {r6-r9}
-  SUBS         r1, r1, #4
-  BGE celt_pitch_xcorr_edsp_process4
-celt_pitch_xcorr_edsp_process2:
-  ADDS         r1, r1, #2
-  BLT celt_pitch_xcorr_edsp_process1a
-  SUBS         r12, r3, #4
-  @ {r10, r11} = {sum0, sum1} = {0, 0}
-  MOV          r10, #0
-  MOV          r11, #0
-  LDR          r8, [r5], #4
-  BLE celt_pitch_xcorr_edsp_process2_loop_done
-  LDR          r6, [r4], #4
-  LDR          r9, [r5], #4
-celt_pitch_xcorr_edsp_process2_loop4:
-  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
-  LDR          r7, [r4], #4
-  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
-  SUBS         r12, r12, #4         @ j-=4
-  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
-  LDR          r8, [r5], #4
-  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
-  LDRGT        r6, [r4], #4
-  SMLABB       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_2, y_2)
-  SMLABT       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_2, y_3)
-  SMLATT       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_3, y_3)
-  LDRGT        r9, [r5], #4
-  SMLATB       r11, r7, r8, r11     @ sum1 = MAC16_16(sum1, x_3, y_4)
-  BGT celt_pitch_xcorr_edsp_process2_loop4
-celt_pitch_xcorr_edsp_process2_loop_done:
-  ADDS         r12, r12, #2
-  BLE  celt_pitch_xcorr_edsp_process2_1
-  LDR          r6, [r4], #4
-  @ Stall
-  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
-  LDR          r9, [r5], #4
-  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
-  SUB          r12, r12, #2
-  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
-  MOV          r8, r9
-  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
-celt_pitch_xcorr_edsp_process2_1:
-  LDRH         r6, [r4], #2
-  ADDS         r12, r12, #1
-  @ Stall
-  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
-  LDRHGT       r7, [r4], #2
-  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
-  BLE celt_pitch_xcorr_edsp_process2_done
-  LDRH         r9, [r5], #2
-  SMLABT       r10, r7, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_1)
-  SMLABB       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_0, y_2)
-celt_pitch_xcorr_edsp_process2_done:
-  @ Restore _x
-  SUB          r4, r4, r3, LSL #1
-  @ Restore and advance _y
-  SUB          r5, r5, r3, LSL #1
-  @ maxcorr = max(maxcorr, sum0)
-  CMP          r0, r10
-  ADD          r5, r5, #2
-  MOVLT        r0, r10
-  SUB          r1, r1, #2
-  @ maxcorr = max(maxcorr, sum1)
-  CMP          r0, r11
-  @ xcorr[i] = sum
-  STR          r10, [r2], #4
-  MOVLT        r0, r11
-  STR          r11, [r2], #4
-celt_pitch_xcorr_edsp_process1a:
-  ADDS         r1, r1, #1
-  BLT celt_pitch_xcorr_edsp_done
-  SUBS         r12, r3, #4
-  @ r14 = sum = 0
-  MOV          r14, #0
-  BLT celt_pitch_xcorr_edsp_process1a_loop_done
-  LDR          r6, [r4], #4
-  LDR          r8, [r5], #4
-  LDR          r7, [r4], #4
-  LDR          r9, [r5], #4
-celt_pitch_xcorr_edsp_process1a_loop4:
-  SMLABB       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
-  SUBS         r12, r12, #4         @ j-=4
-  SMLATT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
-  LDRGE        r6, [r4], #4
-  SMLABB       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
-  LDRGE        r8, [r5], #4
-  SMLATT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_3, y_3)
-  LDRGE        r7, [r4], #4
-  LDRGE        r9, [r5], #4
-  BGE celt_pitch_xcorr_edsp_process1a_loop4
-celt_pitch_xcorr_edsp_process1a_loop_done:
-  ADDS         r12, r12, #2
-  LDRGE        r6, [r4], #4
-  LDRGE        r8, [r5], #4
-  @ Stall
-  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
-  SUBGE        r12, r12, #2
-  SMLATTGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
-  ADDS         r12, r12, #1
-  LDRHGE       r6, [r4], #2
-  LDRHGE       r8, [r5], #2
-  @ Stall
-  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, *x, *y)
-  @ maxcorr = max(maxcorr, sum)
-  CMP          r0, r14
-  @ xcorr[i] = sum
-  STR          r14, [r2], #4
-  MOVLT        r0, r14
-celt_pitch_xcorr_edsp_done:
-  LDMFD        sp!, {r4-r11, pc}
-	.size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp  @ ENDP
-
- .endif
-
-@ END:
-    .section	.note.GNU-stack,"",%progbits
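For reference, both kernels removed above compute the same pitch cross-correlation. A minimal scalar sketch in C, assuming the fixed-point build where opus_val16/opus_val32 are 16/32-bit integers (the real scalar fallback in celt/pitch.c computes four lags per pass through xcorr_kernel(), but the arithmetic is the same):

/* xcorr[i] = sum(x[j]*y[j+i], j = 0..len-1) for each lag i, plus the
   running maximum that both asm paths keep (both initialize it to 1). */
opus_val32 celt_pitch_xcorr_sketch(const opus_val16 *_x, const opus_val16 *_y,
                                   opus_val32 *xcorr, int len, int max_pitch)
{
   int i, j;
   opus_val32 maxcorr = 1;
   for (i = 0; i < max_pitch; i++) {
      opus_val32 sum = 0;
      for (j = 0; j < len; j++)
         sum += (opus_val32)_x[j] * _y[j + i];  /* MAC16_16(sum, x, y) */
      xcorr[i] = sum;
      if (sum > maxcorr)
         maxcorr = sum;
   }
   return maxcorr;
}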

+ 79 - 0
thirdparty/opus/celt/fixed_c5x.h

@@ -0,0 +1,79 @@
+/* Copyright (C) 2003 Jean-Marc Valin */
+/**
+   @file fixed_c5x.h
+   @brief Fixed-point operations for the TI C5x DSP family
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_C5X_H
+#define FIXED_C5X_H
+
+#include "dsplib.h"
+
+#undef IMUL32
+static OPUS_INLINE long IMUL32(long i, long j)
+{
+   long ac0, ac1;
+   ac0 = _lmpy(i>>16,j);
+   ac1 = ac0 + _lmpy(i,j>>16);
+   return _lmpyu(i,j) + (ac1<<16);
+}
+
+#undef MAX16
+#define MAX16(a,b) _max(a,b)
+
+#undef MIN16
+#define MIN16(a,b) _min(a,b)
+
+#undef MAX32
+#define MAX32(a,b) _lmax(a,b)
+
+#undef MIN32
+#define MIN32(a,b) _lmin(a,b)
+
+#undef VSHR32
+#define VSHR32(a, shift) _lshl(a,-(shift))
+
+#undef MULT16_16_Q15
+#define MULT16_16_Q15(a,b) (_smpy(a,b))
+
+#undef MULT16_16SU
+#define MULT16_16SU(a,b) _lmpysu(a,b)
+
+#undef MULT_16_16
+#define MULT_16_16(a,b) _lmpy(a,b)
+
+/* FIXME: This is technically incorrect and is bound to cause problems. Is there any cleaner solution? */
+#undef MULT16_32_Q15
+#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),(b)),15))
+
+#define celt_ilog2(x) (30 - _lnorm(x))
+#define OVERRIDE_CELT_ILOG2
+
+#define celt_maxabs16(x, len) MAX32(EXTEND32(maxval((DATA *)x, len)),-EXTEND32(minval((DATA *)x, len)))
+#define OVERRIDE_CELT_MAXABS16
+
+#endif /* FIXED_C5X_H */
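The IMUL32 override above rebuilds a full 32x32-bit multiply from the C55x 16x16 multiply intrinsics. A portable sketch of the same partial-product decomposition, with the intrinsics replaced by plain C (an assumption about _lmpy/_lmpyu, which are TI's signed and unsigned 16x16 multiplies):

#include <stdint.h>

/* i*j (mod 2^32) = lo(i)*lo(j) + ((hi(i)*lo(j) + lo(i)*hi(j)) << 16).
   Signedness differences in the cross terms fall off the top after the
   shift, so the result matches a plain 32-bit multiply. */
static int32_t imul32_sketch(int32_t i, int32_t j)
{
   uint32_t hi_lo = (uint32_t)(i >> 16) * (uint16_t)j;   /* _lmpy(i>>16, j) */
   uint32_t lo_hi = (uint16_t)i * (uint32_t)(j >> 16);   /* _lmpy(i, j>>16) */
   uint32_t lo_lo = (uint32_t)(uint16_t)i * (uint16_t)j; /* _lmpyu(i, j)    */
   return (int32_t)(lo_lo + ((hi_lo + lo_hi) << 16));
}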

+ 42 - 9
thirdparty/opus/celt/arm/armopts.s → thirdparty/opus/celt/fixed_c6x.h

@@ -1,4 +1,8 @@
-/* Copyright (C) 2013 Mozilla Corporation */
+/* Copyright (C) 2008 CSIRO */
+/**
+   @file fixed_c6x.h
+   @brief Fixed-point operations for the TI C6x DSP family
+*/
 /*
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -24,14 +28,43 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-; Set the following to 1 if we have EDSP instructions
-;  (LDRD/STRD, etc., ARMv5E and later).
-OPUS_ARM_MAY_HAVE_EDSP  * 
+#ifndef FIXED_C6X_H
+#define FIXED_C6X_H
+
+#undef MULT16_16SU
+#define MULT16_16SU(a,b) _mpysu(a,b)
+
+#undef MULT_16_16
+#define MULT_16_16(a,b) _mpy(a,b)
+
+#define celt_ilog2(x) (30 - _norm(x))
+#define OVERRIDE_CELT_ILOG2
+
+#undef MULT16_32_Q15
+#define MULT16_32_Q15(a,b) (_mpylill(a, b) >> 15)
+
+#if 0
+#include "dsplib.h"
+
+#undef MAX16
+#define MAX16(a,b) _max(a,b)
+
+#undef MIN16
+#define MIN16(a,b) _min(a,b)
+
+#undef MAX32
+#define MAX32(a,b) _lmax(a,b)
+
+#undef MIN32
+#define MIN32(a,b) _lmin(a,b)
+
+#undef VSHR32
+#define VSHR32(a, shift) _lshl(a,-(shift))
 
-; Set the following to 1 if we have ARMv6 media instructions.
-OPUS_ARM_MAY_HAVE_MEDIA * 
+#undef MULT16_16_Q15
+#define MULT16_16_Q15(a,b) (_smpy(a,b))
 
-; Set the following to 1 if we have NEON (some ARMv7)
-OPUS_ARM_MAY_HAVE_NEON  * 
+#define celt_maxabs16(x, len) MAX32(EXTEND32(maxval((DATA *)x, len)),-EXTEND32(minval((DATA *)x, len)))
+#define OVERRIDE_CELT_MAXABS16
 
-END
+#endif /* FIXED_C6X_H */
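Assuming _mpylill is TI's 16x32 signed multiply with a 64-bit result, the MULT16_32_Q15 override above is simply the exact Q15 product. A portable statement of the same semantics, hedged under that assumption:

#include <stdint.h>

/* A Q15 16-bit value times a 32-bit value, keeping everything above
   bit 15 — what _mpylill(a, b) >> 15 computes above. */
static int32_t mult16_32_q15_sketch(int16_t a, int32_t b)
{
   return (int32_t)(((int64_t)a * b) >> 15);
}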

+ 13 - 4
thirdparty/opus/config.h

@@ -35,7 +35,7 @@
 /* #undef FUZZING */
 
 /* Define to 1 if you have the <alloca.h> header file. */
-/* #undef HAVE_ALLOCA_H */
+/*  #undef HAVE_ALLOCA_H  */
 
 /* NE10 library is installed on host. Make sure it is on target! */
 /* #undef HAVE_ARM_NE10 */
@@ -46,12 +46,16 @@
 /* Define to 1 if you have the <inttypes.h> header file. */
 #define HAVE_INTTYPES_H 1
 
+#if (!defined( _MSC_VER ) || ( _MSC_VER >= 1800 ))
+
 /* Define to 1 if you have the `lrint' function. */
 #define HAVE_LRINT 1
 
 /* Define to 1 if you have the `lrintf' function. */
 #define HAVE_LRINTF 1
 
+#endif
+
 /* Define to 1 if you have the <memory.h> header file. */
 #define HAVE_MEMORY_H 1
 
@@ -79,7 +83,8 @@
 /* Define to 1 if you have the `__malloc_hook' function. */
 #define HAVE___MALLOC_HOOK 1
 
-/* Define to the sub-directory where libtool stores uninstalled libraries. */
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
 #define LT_OBJDIR ".libs/"
 
 #ifdef OPUS_ARM_OPT
@@ -186,7 +191,7 @@
 #define PACKAGE_NAME "opus"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "opus 1.3.1"
+#define PACKAGE_STRING "opus unknown"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "opus"
@@ -195,7 +200,7 @@
 #define PACKAGE_URL ""
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.3.1"
+#define PACKAGE_VERSION "unknown"
 
 /* Define to 1 if you have the ANSI C header files. */
 #define STDC_HEADERS 1
@@ -227,7 +232,11 @@
 /* Define to the equivalent of the C99 'restrict' keyword, or to
    nothing if this is not supported.  Do not define if restrict is
    supported directly.  */
+#if (!defined( _MSC_VER ) || ( _MSC_VER >= 1800 ))
 #define restrict __restrict
+#else
+#undef restrict
+#endif
 /* Work around a bug in Sun C++: it does not support _Restrict or
    __restrict__, even though the corresponding Sun C compiler ends up with
    "#define restrict _Restrict" or "#define restrict __restrict__" in the

+ 184 - 0
thirdparty/opus/silk/fixed/mips/prefilter_FIX_mipsr1.h

@@ -0,0 +1,184 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#ifndef __PREFILTER_FIX_MIPSR1_H__
+#define __PREFILTER_FIX_MIPSR1_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main_FIX.h"
+#include "stack_alloc.h"
+#include "tuning_parameters.h"
+
+#define OVERRIDE_silk_warped_LPC_analysis_filter_FIX
+void silk_warped_LPC_analysis_filter_FIX(
+          opus_int32            state[],                    /* I/O  State [order + 1]                   */
+          opus_int32            res_Q2[],                   /* O    Residual signal [length]            */
+    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
+    const opus_int16            input[],                    /* I    Input signal [length]               */
+    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
+    const opus_int              length,                     /* I    Length of input signal              */
+    const opus_int              order,                      /* I    Filter order (even)                 */
+               int              arch
+)
+{
+    opus_int     n, i;
+    opus_int32   acc_Q11, acc_Q22, tmp1, tmp2, tmp3, tmp4;
+    opus_int32   state_cur, state_next;
+
+    (void)arch;
+
+    /* Order must be even */
+    /* Length must be even */
+
+    silk_assert( ( order & 1 ) == 0 );
+    silk_assert( ( length & 1 ) == 0 );
+
+    for( n = 0; n < length; n+=2 ) {
+        /* Output of lowpass section */
+        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
+        state_cur = silk_LSHIFT( input[ n ], 14 );
+        /* Output of allpass section */
+        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
+        state_next = tmp2;
+        acc_Q11 = silk_RSHIFT( order, 1 );
+        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
+
+
+        /* Output of lowpass section */
+        tmp4 = silk_SMLAWB( state_cur, state_next, lambda_Q16 );
+        state[ 0 ] = silk_LSHIFT( input[ n+1 ], 14 );
+        /* Output of allpass section */
+        tmp3 = silk_SMLAWB( state_next, tmp1 - tmp4, lambda_Q16 );
+        state[ 1 ] = tmp4;
+        acc_Q22 = silk_RSHIFT( order, 1 );
+        acc_Q22 = silk_SMLAWB( acc_Q22, tmp4, coef_Q13[ 0 ] );
+
+        /* Loop over allpass sections */
+        for( i = 2; i < order; i += 2 ) {
+            /* Output of allpass section */
+            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
+            state_cur = tmp1;
+            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
+            /* Output of allpass section */
+            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
+            state_next = tmp2;
+            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
+
+
+            /* Output of allpass section */
+            tmp4 = silk_SMLAWB( state_cur, state_next - tmp3, lambda_Q16 );
+            state[ i ] = tmp3;
+            acc_Q22 = silk_SMLAWB( acc_Q22, tmp3, coef_Q13[ i - 1 ] );
+            /* Output of allpass section */
+            tmp3 = silk_SMLAWB( state_next, tmp1 - tmp4, lambda_Q16 );
+            state[ i + 1 ] = tmp4;
+            acc_Q22 = silk_SMLAWB( acc_Q22, tmp4, coef_Q13[ i ] );
+        }
+        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
+        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
+
+        state[ order ] = tmp3;
+        acc_Q22 = silk_SMLAWB( acc_Q22, tmp3, coef_Q13[ order - 1 ] );
+        res_Q2[ n+1 ] = silk_LSHIFT( (opus_int32)input[ n+1 ], 2 ) - silk_RSHIFT_ROUND( acc_Q22, 9 );
+    }
+}
+
+
+
+/* Prefilter for finding Quantizer input signal */
+#define OVERRIDE_silk_prefilt_FIX
+static inline void silk_prefilt_FIX(
+    silk_prefilter_state_FIX    *P,                         /* I/O  state                               */
+    opus_int32                  st_res_Q12[],               /* I    short term residual signal          */
+    opus_int32                  xw_Q3[],                    /* O    prefiltered signal                  */
+    opus_int32                  HarmShapeFIRPacked_Q12,     /* I    Harmonic shaping coefficients       */
+    opus_int                    Tilt_Q14,                   /* I    Tilt shaping coefficient            */
+    opus_int32                  LF_shp_Q14,                 /* I    Low-frequency shaping coefficients  */
+    opus_int                    lag,                        /* I    Lag for harmonic shaping            */
+    opus_int                    length                      /* I    Length of signals                   */
+)
+{
+    opus_int   i, idx, LTP_shp_buf_idx;
+    opus_int32 n_LTP_Q12, n_Tilt_Q10, n_LF_Q10;
+    opus_int32 sLF_MA_shp_Q12, sLF_AR_shp_Q12;
+    opus_int16 *LTP_shp_buf;
+
+    /* To speed up use temp variables instead of using the struct */
+    LTP_shp_buf     = P->sLTP_shp;
+    LTP_shp_buf_idx = P->sLTP_shp_buf_idx;
+    sLF_AR_shp_Q12  = P->sLF_AR_shp_Q12;
+    sLF_MA_shp_Q12  = P->sLF_MA_shp_Q12;
+
+    if( lag > 0 ) {
+        for( i = 0; i < length; i++ ) {
+            /* unrolled loop */
+            silk_assert( HARM_SHAPE_FIR_TAPS == 3 );
+            idx = lag + LTP_shp_buf_idx;
+            n_LTP_Q12 = silk_SMULBB(            LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 - 1) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
+            n_LTP_Q12 = silk_SMLABT( n_LTP_Q12, LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2    ) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
+            n_LTP_Q12 = silk_SMLABB( n_LTP_Q12, LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 + 1) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
+
+            n_Tilt_Q10 = silk_SMULWB( sLF_AR_shp_Q12, Tilt_Q14 );
+            n_LF_Q10   = silk_SMLAWB( silk_SMULWT( sLF_AR_shp_Q12, LF_shp_Q14 ), sLF_MA_shp_Q12, LF_shp_Q14 );
+
+            sLF_AR_shp_Q12 = silk_SUB32( st_res_Q12[ i ], silk_LSHIFT( n_Tilt_Q10, 2 ) );
+            sLF_MA_shp_Q12 = silk_SUB32( sLF_AR_shp_Q12,  silk_LSHIFT( n_LF_Q10,   2 ) );
+
+            LTP_shp_buf_idx = ( LTP_shp_buf_idx - 1 ) & LTP_MASK;
+            LTP_shp_buf[ LTP_shp_buf_idx ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 12 ) );
+
+            xw_Q3[i] = silk_RSHIFT_ROUND( silk_SUB32( sLF_MA_shp_Q12, n_LTP_Q12 ), 9 );
+        }
+    }
+    else
+    {
+        for( i = 0; i < length; i++ ) {
+
+            n_LTP_Q12 = 0;
+
+            n_Tilt_Q10 = silk_SMULWB( sLF_AR_shp_Q12, Tilt_Q14 );
+            n_LF_Q10   = silk_SMLAWB( silk_SMULWT( sLF_AR_shp_Q12, LF_shp_Q14 ), sLF_MA_shp_Q12, LF_shp_Q14 );
+
+            sLF_AR_shp_Q12 = silk_SUB32( st_res_Q12[ i ], silk_LSHIFT( n_Tilt_Q10, 2 ) );
+            sLF_MA_shp_Q12 = silk_SUB32( sLF_AR_shp_Q12,  silk_LSHIFT( n_LF_Q10,   2 ) );
+
+            LTP_shp_buf_idx = ( LTP_shp_buf_idx - 1 ) & LTP_MASK;
+            LTP_shp_buf[ LTP_shp_buf_idx ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 12 ) );
+
+            xw_Q3[i] = silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 9 );
+        }
+    }
+
+    /* Copy temp variable back to state */
+    P->sLF_AR_shp_Q12   = sLF_AR_shp_Q12;
+    P->sLF_MA_shp_Q12   = sLF_MA_shp_Q12;
+    P->sLTP_shp_buf_idx = LTP_shp_buf_idx;
+}
+
+#endif /* __PREFILTER_FIX_MIPSR1_H__ */
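Both filters above are built almost entirely from silk_SMLAWB, SILK's fractional multiply-accumulate. A portable sketch of its semantics (the real macro in silk/macros.h splits the multiply into 16-bit halves so it can stay in 32-bit arithmetic; this 64-bit form is bit-exact with it given an arithmetic right shift):

#include <stdint.h>

/* silk_SMLAWB(a, b, c): accumulate the top 32 bits of the 32x16 product
   of b and the bottom 16 bits of c. */
static int32_t smlawb_sketch(int32_t a, int32_t b, int32_t c)
{
   return a + (int32_t)(((int64_t)b * (int16_t)c) >> 16);
}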

+ 160 - 0
thirdparty/opus/silk/fixed/x86/prefilter_FIX_sse.c

@@ -0,0 +1,160 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+#include "celt/x86/x86cpu.h"
+
+void silk_warped_LPC_analysis_filter_FIX_sse4_1(
+    opus_int32                  state[],                    /* I/O  State [order + 1]                   */
+    opus_int32                  res_Q2[],                   /* O    Residual signal [length]            */
+    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
+    const opus_int16            input[],                    /* I    Input signal [length]               */
+    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
+    const opus_int              length,                     /* I    Length of input signal              */
+    const opus_int              order                       /* I    Filter order (even)                 */
+)
+{
+    opus_int     n, i;
+    opus_int32   acc_Q11, tmp1, tmp2;
+
+    /* Order must be even */
+    celt_assert( ( order & 1 ) == 0 );
+
+    if (order == 10)
+    {
+        if (0 == lambda_Q16)
+        {
+            __m128i coef_Q13_3210, coef_Q13_7654;
+            __m128i coef_Q13_0123, coef_Q13_4567;
+            __m128i state_0123, state_4567;
+            __m128i xmm_product1, xmm_product2;
+            __m128i xmm_tempa, xmm_tempb;
+
+            register opus_int32 sum;
+            register opus_int32 state_8, state_9, state_a;
+            register opus_int64 coef_Q13_8, coef_Q13_9;
+
+            celt_assert( length > 0 );
+
+            coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
+            coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
+
+            coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+            coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+            coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
+            coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
+
+            state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
+            state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
+
+            state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+            state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+            state_8 = state[ 8 ];
+            state_9 = state[ 9 ];
+            state_a = 0;
+
+            for( n = 0; n < length; n++ )
+            {
+                xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
+                xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
+
+                xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+                xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+                xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
+                xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
+
+                xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
+                xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
+
+                xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
+                xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
+
+                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
+                xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
+                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
+
+                sum  = (opus_int32)((coef_Q13_8 * state_8) >> 16);
+                sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);
+
+                xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
+                sum += _mm_cvtsi128_si32( xmm_tempa);
+                res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
+
+                /* move right */
+                state_a = state_9;
+                state_9 = state_8;
+                state_8 = _mm_cvtsi128_si32( state_4567 );
+                state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
+
+                state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
+            }
+
+            _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
+            _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
+            state[ 8 ] = state_8;
+            state[ 9 ] = state_9;
+            state[ 10 ] = state_a;
+
+            return;
+        }
+    }
+
+    for( n = 0; n < length; n++ ) {
+        /* Output of lowpass section */
+        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
+        state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
+        /* Output of allpass section */
+        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
+        state[ 1 ] = tmp2;
+        acc_Q11 = silk_RSHIFT( order, 1 );
+        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
+        /* Loop over allpass sections */
+        for( i = 2; i < order; i += 2 ) {
+            /* Output of allpass section */
+            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
+            state[ i ] = tmp1;
+            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
+            /* Output of allpass section */
+            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
+            state[ i + 1 ] = tmp2;
+            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
+        }
+        state[ order ] = tmp1;
+        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
+        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
+    }
+}
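A note on the pair of _MM_SHUFFLE(0, 1, 2, 3) reversals above: SSE4.1's _mm_mul_epi32 multiplies only elements 0 and 2 of each operand, so keeping copies of the state and coefficients in both element orders yields all four 32x32->64-bit products from two multiplies. A stripped-down sketch of just that trick (hypothetical helper, not part of the patch):

#include <smmintrin.h>
#include <stdint.h>

/* Four full-width products from two _mm_mul_epi32 calls. Output order is
   jumbled: out = { a0*b0, a2*b2, a3*b3, a1*b1 }. */
static void four_products_sketch(const int32_t a[4], const int32_t b[4],
                                 int64_t out[4])
{
   __m128i va   = _mm_loadu_si128((const __m128i *)a);
   __m128i vb   = _mm_loadu_si128((const __m128i *)b);
   __m128i va_r = _mm_shuffle_epi32(va, _MM_SHUFFLE(0, 1, 2, 3)); /* reversed */
   __m128i vb_r = _mm_shuffle_epi32(vb, _MM_SHUFFLE(0, 1, 2, 3));
   _mm_storeu_si128((__m128i *)&out[0], _mm_mul_epi32(va, vb));     /* a0*b0, a2*b2 */
   _mm_storeu_si128((__m128i *)&out[2], _mm_mul_epi32(va_r, vb_r)); /* a3*b3, a1*b1 */
}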