/*******************************************************************************
* Function Name  : cfft_r4_256
* Description    : 256-point radix-4 complex FFT on 16-bit data
*                  (each point is a pair of halfwords: real, imaginary)
* Input          : - a0: Output freq data (f)
*                  - a1: Input time data (s)
*                  - a2: the number of input samples (saved on the stack but
*                        not read by this routine; the length is fixed by NPT)
* Output         : None
* Return         : None
*******************************************************************************/

/* NPT: transform length in complex points. One point occupies 4 bytes (two
   halfwords), so a quarter of the data buffer is NPT/4 points = NPT bytes;
   that is why NPT also appears below as a byte displacement in the loads. */
.equ NPT, 256

/* Coefficient table defined in another object file. butterloop reads it
   sequentially as pairs of signed halfwords (4 bytes per pair). */
.extern FFT_TABLE

/*
 * cfft_r4_256(a0 = output buffer, a1 = input buffer, a2 = sample count)
 *
 * Radix-4 complex FFT of NPT points on interleaved signed 16-bit data
 * (real halfword, then imaginary halfword).  The first stage (preloop) reads
 * the input in BN_TABLE order and writes the output buffer; the remaining
 * stages (passloop) run in place on the output buffer using FFT_TABLE.
 * Every stage scales its results by 1/4.
 *
 * Only x1 and x4..x15 are used.  s0, s1, ra and x4 are saved and restored;
 * a0-a5 and t0-t2 are clobbered.  a2 is stored but never read.
 *
 * Stack frame (48 bytes; the original 40 left sp only 8-byte aligned, while
 * the standard RISC-V calling convention requires 16-byte alignment; 48 is
 * also valid for the embedded ABI):
 *    0  s0            16  a1 (input ptr)     32  data ptr at start of pass
 *    4  s1            20  a2                 36  group count at start of pass
 *    8  ra            24  x4                 40  padding
 *   12  a0 (out ptr)  28  packed loop count  44  padding
 */
.global cfft_r4_256
.type   cfft_r4_256, %function
cfft_r4_256:
	add	sp,sp,-48
	sw	s0, 0(sp)
	sw	s1, 4(sp)
	sw	ra, 8(sp)
	sw	x4, 24(sp)
	sw	a0, 12(sp)		/* a0..a2 are x10..x12, which the loops overwrite */
	sw	a1, 16(sp)
	sw	a2, 20(sp)

	/*
	 * First stage register use:
	 *   x5..x12  A, B, C, D as (re, im) pairs: x5/x6, x7/x8, x9/x10, x11/x12
	 *   x13      butterfly index, 0 .. NPT/4-1
	 *   x14      scratch
	 *   x15      output write pointer
	 */
	mv	x13, zero
	mv	x15, a0

preloop:
	/* x14 = input + 4 * BN_TABLE[x13]  (bit-reversed point, 4 bytes each) */
	lw	x14, 16(sp)
	la	x5,BN_TABLE
	add	x5,x5,x13
	lbu	x6,(x5)

	slli	x6,x6,2
	add	x14,x14,x6

	/* The four legs are a quarter buffer (NPT bytes) apart. */
	lh	x5,0(x14)		/* Ar */
	lh	x6,2(x14)		/* Ai */
	lh	x9,0+NPT(x14)		/* Cr */
	lh	x10,2+NPT(x14)		/* Ci */
	lh	x7,0+NPT*2(x14)		/* Br */
	lh	x8,2+NPT*2(x14)		/* Bi */
	lh	x11,0+NPT*3(x14)	/* Dr */
	lh	x12,2+NPT*3(x14)	/* Di */

	/* (C,D) = (C+D, C-D); the difference is formed as (C+D) - 2*D */
	add	x9, x9, x11
	add	x10, x10, x12
	slli	x14,x11,1
	sub	x11, x9, x14
	slli	x14,x12,1
	sub	x12, x10,x14

	/* (A,B) = (A+B)/4, (A-B)/4; the difference is (A+B)/4 - B/2 */
	srai	x5,x5,2
	srai	x6,x6,2
	srai	x14,x7,2
	add	x5,x5,x14
	srai	x14,x8,2
	add	x6,x6,x14
	srai	x14,x7,1
	sub	x7,x5,x14
	srai	x14,x8,1
	sub	x8,x6,x14

	/* (A,C) = A + C/4, A - C/4 */
	srai	x14,x9,2
	add	x5,x5,x14
	srai	x14,x10,2
	add	x6,x6,x14
	srai	x14,x9,1
	sub	x9,x5,x14
	srai	x14,x10,1
	sub	x10,x6,x14

	/* (B,D) = B - i*D/4, B + i*D/4; afterwards D is held as x12 = re, x11 = im */
	srai	x14,x12,2
	add	x7,x7,x14
	srai	x14,x11,2
	sub	x8,x8,x14
	srai	x14,x12,1
	sub	x12,x7,x14
	srai	x14,x11,1
	add	x11,x8,x14

	/* Write the four results to consecutive output points. */
	sh	x5,0(x15)
	sh	x6,2(x15)
	sh	x7,4(x15)
	sh	x8,6(x15)
	sh	x9,8(x15)
	sh	x10,10(x15)
	sh	x12,12(x15)		/* inversion here: x12 is the real part */
	sh	x11,14(x15)
	add	x15,x15,16

	add	x13,x13,1
	li	x14,NPT/4		/* one butterfly per four points */
	blt	x13,x14,preloop

	/* Set up the in-place stages on the output buffer. */
	lw	x15, 12(sp)		/* back to the start of the output data */
	li	x13,16			/* byte distance between butterfly legs */
	li	x1,NPT/16		/* number of groups in the first pass */
	sw	x1,28(sp)
	la	x4,FFT_TABLE		/* coefficient read pointer */

	/*
	 * Remaining stages register use:
	 *   x5..x12  data / coefficients
	 *   x13      leg distance in bytes (16, 64, 256)
	 *   x14      scratch
	 *   x15      data pointer
	 *   x1       packed counter: bits 15..0 = groups left in this pass,
	 *            bits 31..16 = butterflies left in this group minus 1
	 *            (copy kept in 28(sp) because x1 doubles as scratch)
	 *   x4       FFT_TABLE read pointer
	 */
passloop:
	lw	x1,28(sp)
	sw	x15, 32(sp)		/* remember data base for the next pass */
	sw	x1, 36(sp)		/* remember group count for the next pass */
	slli	x14,x13,1
	add	x14,x14,x13		/* x14 = 3 * leg distance */
	add	x15,x15,x14		/* start at the fourth leg of the first butterfly */
	li	x14,1<<16
	sub	x1,x1,x14		/* bias so the upper half counts down to -1 */

grouploop:
	/* upper half += x13/4: each group holds x13/4 butterflies */
	slli	x14,x13,16-2
	add	x1,x1,x14
	sw	x1,28(sp)

butterloop:
	/*
	 * XMUL1..3: multiply the point (re, im) at x15 by the next table pair
	 * (w0, w1) using three multiplies:
	 *     t    = (im - re) * w1
	 *     im'  = im * w0 + t
	 *     re'  = re * (w0 + 2*w1) + t
	 * The products are left unscaled; XADD shifts them down.
	 */

	/* XMUL1: fourth leg -> x11 (re'), x12 (im') */
	lh	x5,0(x15)
	lh	x6,2(x15)
	sub	x15,x15,x13

	lh	x11,0(x4)
	lh	x12,2(x4)
	add	x4,x4,4

	sub	x1,x6,x5
	mul	x14,x1,x12
	slli	x12,x12,1
	add	x1,x11,x12
	mul	x12,x6,x11
	add	x12,x12,x14
	mul	x11,x5,x1
	add	x11,x11,x14

	/* XMUL2: third leg -> x9 (re'), x10 (im') */
	lh	x5,0(x15)
	lh	x6,2(x15)
	sub	x15,x15,x13

	lh	x9,0(x4)
	lh	x10,2(x4)
	add	x4,x4,4

	sub	x1,x6,x5
	mul	x14,x1,x10
	slli	x10,x10,1
	add	x1,x9,x10
	mul	x10,x6,x9
	add	x10,x10,x14
	mul	x9,x5,x1
	add	x9,x9,x14

	/* XMUL3: second leg -> x7 (re'), x8 (im') */
	lh	x5,(x15)
	lh	x6,2(x15)
	sub	x15,x15,x13

	lh	x7,0(x4)
	lh	x8,2(x4)
	add	x4,x4,4

	sub	x1,x6,x5
	mul	x14,x1,x8
	slli	x8,x8,1
	add	x1,x7,x8
	mul	x8,x6,x7
	add	x8,x8,x14
	mul	x7,x5,x1
	add	x7,x7,x14

	/*
	 * XADD: same butterfly as the first stage.  The first leg (x5, x6) is
	 * used as loaded; the three products carry 14 extra fraction bits
	 * (presumably Q14 coefficients), hence the (n+14) shifts.
	 */
	lh	x5,(x15)
	lh	x6,2(x15)

	add	x9,x9,x11
	add	x10,x10,x12
	slli	x11,x11,1
	sub	x11,x9,x11
	slli	x12,x12,1
	sub	x12,x10,x12

	srai	x5,x5,2
	srai	x6,x6,2
	srai	x1,x7,(2+14)
	add	x5,x5,x1
	srai	x1,x8,(2+14)
	add	x6,x6,x1
	srai	x1,x7,(1+14)
	sub	x7,x5,x1
	srai	x1,x8,(1+14)
	sub	x8,x6,x1
	srai	x1,x9,(2+14)
	add	x5,x5,x1
	srai	x1,x10,(2+14)
	add	x6,x6,x1
	srai	x1,x9,(1+14)
	sub	x9,x5,x1
	srai	x1,x10,(1+14)
	sub	x10,x6,x1
	srai	x1,x12,(2+14)
	add	x7,x7,x1
	srai	x1,x11,(2+14)
	sub	x8,x8,x1
	srai	x1,x12,(1+14)
	sub	x12,x7,x1
	srai	x1,x11,(1+14)
	add	x11,x8,x1

	/* Store the four legs back, walking x15 up again. */
	sh	x5,(x15)
	sh	x6,2(x15)
	add	x15,x15,x13
	sh	x7,(x15)
	sh	x8,2(x15)
	add	x15,x15,x13
	sh	x9,(x15)
	sh	x10,2(x15)
	add	x15,x15,x13
	sh	x12,(x15)		/* same re/im register swap as the first stage */
	sh	x11,2(x15)
	add	x15,x15,4		/* fourth leg of the next butterfly */

	/* One butterfly done: decrement the upper half of the packed counter. */
	lw	x1,28(sp)
	li	x14,1<<16
	sub	x1,x1,x14
	sw	x1,28(sp)
	bge	x1,zero,butterloop

	/* Group done: skip the three legs already processed. */
	slli	x14,x13,1
	add	x14,x14,x13
	add	x15,x15,x14

	/* Decrement the group count (low half); x5 is free here as scratch. */
	add	x1,x1,-1
	slli	x5,x1,16
	beq	x5,zero,next
	/* More groups in this pass: rewind the table by the 3*x13 bytes the
	   group consumed so the next group reuses the same coefficients. */
	sub	x4,x4,x14
	j	grouploop
next:

	/* Pass done: four times the leg distance, a quarter of the groups.
	   x4 is not rewound, so the next pass continues further into the table. */
	lw	x15,32(sp)
	lw	x1,36(sp)
	slli	x13,x13,2
	srai	x1,x1,2
	sw	x1,28(sp)
	bne	x1,zero,passloop

	/* return */
	lw	s0, 0(sp)
	lw	s1, 4(sp)
	lw	ra, 8(sp)
	lw	x4,24(sp)
	add	sp,sp,48
	ret
.size   cfft_r4_256, .-cfft_r4_256
	
/*
 * BN_TABLE: 64 one-byte entries read by preloop.  Entry i is i with its
 * 6 bits reversed (e.g. 1 -> 32, 2 -> 16, 3 -> 48).  preloop multiplies the
 * entry by 4 to get the byte offset of the input point for butterfly i.
 */
BN_TABLE:
		.byte 0,32,16,48,8,40,24,56,4,36,20,52,12,44,28,60
		.byte 2,34,18,50,10,42,26,58,6,38,22,54,14,46,30,62
		.byte 1,33,17,49,9,41,25,57,5,37,21,53,13,45,29,61
		.byte 3,35,19,51,11,43,27,59,7,39,23,55,15,47,31,63
.end
