/*******************************************************************************
* Function Name  : fir_16by16
* Description    : FIR filter on signed 16-bit samples and signed 16-bit
*                  coefficients, producing 32-bit results:
*                      y[n] = c[0]*x[n] + c[1]*x[n+1] + ... + c[T-1]*x[n+T-1]
*                  Four outputs are computed per outer iteration and four
*                  coefficients are consumed per inner iteration, so both
*                  counts are effectively rounded up to a multiple of 4.
*                  Products use the low 32 bits (mul) and the sums wrap
*                  modulo 2^32; there is no saturation.
* Input          : - a0: Output array (y), written as 32-bit words
*                  - a1: Input array (x), signed 16-bit samples
*                  - a2: the number of output samples
*                  - a3: coefficients array (c), signed 16-bit values
*                  - a4: coefficients array length
* Output         : None
* Return         : None
* Notes          : - Returns immediately, touching no memory, when a2 <= 0
*                    or a4 <= 0.
*                  - With N' and T' being a2 and a4 rounded up to a multiple
*                    of 4, the routine writes N' words to y, reads T'
*                    halfwords from c and reads N' + T' halfwords from x
*                    (this includes a read-ahead of the next input group).
*                  - Pointers are spilled with sw, i.e. a 32-bit (RV32)
*                    target is assumed.
*                  - Clobbers a0-a5 and t0-t2; s0/s1 are saved and restored.
*                  - Uses a 32-byte stack frame (keeps sp 16-byte aligned).
*******************************************************************************/

/* Stack frame layout (byte offsets from sp after the prologue) */
.equ FIR16_FRAME,        32   /* total frame size, multiple of 16           */
.equ FIR16_SAVE_S0,       0   /* caller's s0                                */
.equ FIR16_SAVE_S1,       4   /* caller's s1                                */
.equ FIR16_OUT,           8   /* output pointer, +16 bytes per block        */
.equ FIR16_IN,           12   /* input pointer of current block, +8 bytes   */
.equ FIR16_COEFFS,       16   /* coefficient array base                     */
.equ FIR16_TAPS,         20   /* coefficient count                          */
.equ FIR16_SAMPLES_LEFT, 24   /* output samples still to produce            */
.equ FIR16_TAPS_LEFT,    28   /* coefficients still to apply in this block  */

.global  fir_16by16
.type   fir_16by16, %function
fir_16by16:
	/* Nothing to do for an empty output or an empty filter. Checked before
	   the frame is built so the early exit needs no epilogue. */
	blez a2, .Lfir16_return
	blez a4, .Lfir16_return

	addi sp, sp, -FIR16_FRAME
	sw   s0, FIR16_SAVE_S0(sp)
	sw   s1, FIR16_SAVE_S1(sp)
	sw   a0, FIR16_OUT(sp)
	sw   a1, FIR16_IN(sp)
	sw   a3, FIR16_COEFFS(sp)
	sw   a4, FIR16_TAPS(sp)
	sw   a2, FIR16_SAMPLES_LEFT(sp)

/*
	Register roles inside the loops:
	a0           | current coefficient (loop counter scratch at loop edges)
	a1           | input ptr
	a2 a3 a4 a5  | sliding window of four input samples
	s0 s1 t0 t1  | accumulators y0..y3
	t2           | coeff ptr
*/
.Lfir16_next_block:
	lw   a0, FIR16_TAPS(sp)       /* a0 = taps to apply for this block */
	lw   t2, FIR16_COEFFS(sp)     /* restart at c[0] */
	lw   a1, FIR16_IN(sp)         /* first input sample of this block */

	/* preload x0-x3 */
	lh   a2, 0(a1)
	lh   a3, 2(a1)
	lh   a4, 4(a1)
	lh   a5, 6(a1)

	/* clear y0-y3 */
	li   s0, 0
	li   s1, 0
	li   t0, 0
	li   t1, 0

.Lfir16_next_taps:
	/* a0 is about to be reused for coefficients: park the tap counter */
	sw   a0, FIR16_TAPS_LEFT(sp)

	/* Tap 1: window a2 a3 a4 a5 = x0 x1 x2 x3.
	   x0 is only needed by the first product, so a2 doubles as the
	   product scratch register afterwards. */
	lh   a0, 0(t2)                /* c0 */
	mul  a2, a0, a2               /* y0 += c0*x0 */
	add  s0, s0, a2
	mul  a2, a0, a3               /* y1 += c0*x1 */
	add  s1, s1, a2
	mul  a2, a0, a4               /* y2 += c0*x2 */
	add  t0, t0, a2
	mul  a2, a0, a5               /* y3 += c0*x3 */
	add  t1, t1, a2

	/* Tap 2: window a2 a3 a4 a5 = x4 x1 x2 x3; a3 becomes scratch */
	lh   a0, 2(t2)                /* c1 */
	lh   a2, 8(a1)                /* x4 */
	mul  a3, a0, a3               /* y0 += c1*x1 */
	add  s0, s0, a3
	mul  a3, a0, a4               /* y1 += c1*x2 */
	add  s1, s1, a3
	mul  a3, a0, a5               /* y2 += c1*x3 */
	add  t0, t0, a3
	mul  a3, a0, a2               /* y3 += c1*x4 */
	add  t1, t1, a3

	/* Tap 3: window a2 a3 a4 a5 = x4 x5 x2 x3; a4 becomes scratch */
	lh   a0, 4(t2)                /* c2 */
	lh   a3, 10(a1)               /* x5 */
	mul  a4, a0, a4               /* y0 += c2*x2 */
	add  s0, s0, a4
	mul  a4, a0, a5               /* y1 += c2*x3 */
	add  s1, s1, a4
	mul  a4, a0, a2               /* y2 += c2*x4 */
	add  t0, t0, a4
	mul  a4, a0, a3               /* y3 += c2*x5 */
	add  t1, t1, a4

	/* Tap 4: window a2 a3 a4 a5 = x4 x5 x6 x3; a5 becomes scratch */
	lh   a0, 6(t2)                /* c3 */
	lh   a4, 12(a1)               /* x6 */
	mul  a5, a0, a5               /* y0 += c3*x3 */
	add  s0, s0, a5
	mul  a5, a0, a2               /* y1 += c3*x4 */
	add  s1, s1, a5
	mul  a5, a0, a3               /* y2 += c3*x5 */
	add  t0, t0, a5
	mul  a5, a0, a4               /* y3 += c3*x6 */
	add  t1, t1, a5

	/* Complete the window for the next group: a2 a3 a4 a5 = x4 x5 x6 x7.
	   On the last group this x7 is loaded but never used. */
	lh   a5, 14(a1)

	addi a1, a1, 8                /* advance 4 input samples */
	addi t2, t2, 8                /* advance 4 coefficients */

	lw   a0, FIR16_TAPS_LEFT(sp)
	addi a0, a0, -4
	bgtz a0, .Lfir16_next_taps

	/* store y0-y3 as 32-bit words and advance the output pointer */
	lw   a0, FIR16_OUT(sp)
	sw   s0, 0(a0)
	sw   s1, 4(a0)
	sw   t0, 8(a0)
	sw   t1, 12(a0)
	addi a0, a0, 16
	sw   a0, FIR16_OUT(sp)

	/* the next block starts 4 input samples further on */
	lw   a0, FIR16_IN(sp)
	addi a0, a0, 8
	sw   a0, FIR16_IN(sp)

	/* four more outputs done; loop while any remain */
	lw   a0, FIR16_SAMPLES_LEFT(sp)
	addi a0, a0, -4
	sw   a0, FIR16_SAMPLES_LEFT(sp)
	bgtz a0, .Lfir16_next_block

	lw   s0, FIR16_SAVE_S0(sp)
	lw   s1, FIR16_SAVE_S1(sp)
	addi sp, sp, FIR16_FRAME
.Lfir16_return:
	ret
.size fir_16by16, .-fir_16by16
	