
;	optimized mdct() for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI

;		99/09/01 based on lamemdct.c
%include "nasm.h"

	globaldef mdct_FPU

	externdef win_mdct
	externdef cos_s_mdct
	externdef cos_l_017_2_3
	externdef cos_l_5_6_8_9
	externdef cos_l11121415
	externdef cos_l_1_71016
	externdef cos_l_4_0
	externdef cos_l_4_6

	segment_data
	segment_code

; mdct tuned for the P55C FPU
; by K.SAKAI
; 99/09/01	850clk@PII, 1700clk@P55C
; 99/09/02	850clk@PII, 1370clk@P55C
;
;void mdct_FPU( float *in, float *out, int block_type )

		align	16
;-----------------------------------------------------------------------
; void mdct_FPU( float *in, float *out, int block_type )
;
; x87 implementation of the MDCT for one block of input samples.
;
; In   (32-bit stack arguments, read relative to esp on entry):
;        [esp+ 4] = in          pointer to input floats
;        [esp+ 8] = out         pointer to output floats
;        [esp+12] = block_type  only the low two bits are used
; Out:   block_type != 2 : out[0..17] are written (this part of the routine)
;        block_type == 2 : handled at .block_type2
; Regs:  eax, ecx, edx are used as scratch; ebx is saved and restored.
; Stack: 18 dwords of scratch (fin[0..17]) are reserved below the saved ebx.
; FPU:   the 4-accumulator loops keep 8 values on the x87 register stack
;        at once, so the x87 stack has to be empty on entry.
;
; The register-stack comments of the form "a, b, c" list the x87 stack
; from the deepest entry to the top (the last name is st0).
;-----------------------------------------------------------------------
mdct_FPU:
		xor		edx,edx
		mov		dl,[esp+12]		; block_type
		and		dl,3			; keep block_type in 0..3
		cmp		dl,2
		mov		eax,[esp+4]			; in (mov leaves the flags of the cmp intact)
		je		near .block_type2	; block_type == 2 has its own code path, eax = in
		imul		edx,4*36		; byte offset of row block_type (36 floats per row)
		add		edx,win_mdct	; edx = winp = &win_mdct[block_type][0]

		push	ebx
		sub		esp,18*4		; reserve fin[18] on the stack
; _P = distance from the current esp up to the return address
; (18 dwords of fin[] + the saved ebx), so [esp+_P+4] = in, [esp+_P+8] = out
%assign	_P	18*4+4

;	for (k=0;k<9;k++){
;		fin[k] = winp[k] * in[k] - winp[17-k] * in[17-k];
;		fin[9+k] = winp[18+k] * in[18+k] + winp[35-k] * in[35-k];
;	}
; winp is edx
; fin is esp
; in is eax
		xor		ecx,ecx						; k
		xor		ebx,ebx						; -k
		jmp		short .lp0					; jump over the alignment padding

		align	16
.lp0:
		fld		dword [edx+ecx*4]			; = win[k]
		fmul	dword [eax+ecx*4]			; *= in[k]
		fld		dword [edx+17*4+ebx*4]		; = win[17-k]
		fmul	dword [eax+17*4+ebx*4]		; *= in[17-k]
		fld		dword [edx+18*4+ecx*4]		; = win[18+k]
		fmul	dword [eax+18*4+ecx*4]		; *= in[18+k]
		fld		dword [edx+35*4+ebx*4]		; = win[35-k]
		fmul	dword [eax+35*4+ebx*4]		; *= in[35-k]

		faddp	st1,st0						; win[18+k]*in[18+k] + win[35-k]*in[35-k]
		fstp	dword [esp+ecx*4+9*4]		; -> fin[9+k]
		fsubp	st1,st0						; win[k]*in[k] - win[17-k]*in[17-k]
		fstp	dword [esp+ecx*4]			; -> fin[k]

		inc		ecx
		dec		ebx
		cmp		cl,9
		jl		.lp0

; m = 0, 17, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15
;	sum  = ( fin[ 0] ) * cos_l[m][ 0]; /* 17 */
;	sum += ( fin[ 1] ) * cos_l[m][ 1]; /* 15 */
;	sum += ( fin[ 2] ) * cos_l[m][ 2]; /* 13 */
;	sum += ( fin[ 3] ) * cos_l[m][ 3]; /* 11 */
;	sum += ( fin[ 4] ) * cos_l[m][ 4]; /*  9 */
;	sum += ( fin[ 5] ) * cos_l[m][ 5]; /*  7  */
;	sum += ( fin[ 6] ) * cos_l[m][ 6]; /*  5  */
;	sum += ( fin[ 7] ) * cos_l[m][ 7]; /*  3 */
;	sum += ( fin[ 8] ) * cos_l[m][ 8]; /*  1  */
;	sum += ( fin[ 9] ) * cos_l[m][ 9]; /* 19*/
;	sum += ( fin[10] ) * cos_l[m][10]; /* 21 */
;	sum += ( fin[11] ) * cos_l[m][11]; /* 23 */
;	sum += ( fin[12] ) * cos_l[m][12]; /* 25 */
;	sum += ( fin[13] ) * cos_l[m][13]; /* 27 */
;	sum += ( fin[14] ) * cos_l[m][14]; /* 29 */
;	sum += ( fin[15] ) * cos_l[m][15]; /* 31*/
;	sum += ( fin[16] ) * cos_l[m][16]; /* 33 */
;	sum += ( fin[17] ) * cos_l[m][17]; /* 35 */
;	out[m]=sum;
;
; Each of the three loops below computes four of these sums at once.
; The coefficient tables are read as groups of four consecutive floats,
; one group per k (one float for each of the four m values).

		mov		ebx,[esp+_P+8]		; = out

; m = 0, 17, 2, 3
		mov		edx,cos_l_017_2_3+4*4	; edx -> coefficient group for k=1
		mov		ecx,17					; 17 remaining terms (k = 1..17)
		lea		eax,[esp+1*4]			; eax -> fin[1]

; k = 0 term initialises the four accumulators
		fld		dword [edx- 4*4]	; cos_l[ 0][0]
		fmul	dword [esp+ 0*4]	; fin[0]
		fld		dword [edx- 3*4]	; cos_l[17][0]
		fmul	dword [esp+ 0*4]	; fin[0]
		fld		dword [edx- 2*4]	; cos_l[ 2][0]
		fmul	dword [esp+ 0*4]	; fin[0]
		fld		dword [edx- 1*4]	; cos_l[ 3][0]
		fmul	dword [esp+ 0*4]	; fin[0]

		jmp		short .lp1

		align	16
.lp1:
		fld		dword [edx+ 0*4]	; cos_l[ 0][k]
		fmul	dword [eax]			; fin[k]
		fld		dword [edx+ 1*4]	; cos_l[17][k]
		fmul	dword [eax]			; fin[k]
		fld		dword [edx+ 2*4]	; cos_l[ 2][k]
		fmul	dword [eax]			; fin[k]
		fld		dword [edx+ 3*4]	; cos_l[ 3][k]
		fmul	dword [eax]			; fin[k]
		add		edx,4*4				; next coefficient group
		add		eax,1*4				; next fin[]
; four products sit above four accumulators; each faddp folds the top
; product into the accumulator four slots below it and pops
		faddp	st4,st0
		faddp	st4,st0
		faddp	st4,st0
		faddp	st4,st0

		dec		ecx
		jnz		.lp1

; accumulators pop in reverse load order
		fstp	dword [ebx+ 3*4]
		fstp	dword [ebx+ 2*4]
		fstp	dword [ebx+17*4]
		fstp	dword [ebx+ 0*4]

; m = 5, 6, 8, 9
		mov		edx,cos_l_5_6_8_9+4*4	; edx -> coefficient group for k=1
		mov		ecx,17					; 17 remaining terms (k = 1..17)
		lea		eax,[esp+1*4]			; eax -> fin[1]

; k = 0 term initialises the four accumulators
		fld		dword [edx- 4*4]	; cos_l[ 5][0]
		fmul	dword [esp+ 0*4]	; fin[0]
		fld		dword [edx- 3*4]	; cos_l[ 6][0]
		fmul	dword [esp+ 0*4]	; fin[0]
		fld		dword [edx- 2*4]	; cos_l[ 8][0]
		fmul	dword [esp+ 0*4]	; fin[0]
		fld		dword [edx- 1*4]	; cos_l[ 9][0]
		fmul	dword [esp+ 0*4]	; fin[0]

		jmp		short .lp2

		align	16
.lp2:
		fld		dword [edx+ 0*4]	; cos_l[ 5][k]
		fmul	dword [eax]			; fin[k]
		fld		dword [edx+ 1*4]	; cos_l[ 6][k]
		fmul	dword [eax]			; fin[k]
		fld		dword [edx+ 2*4]	; cos_l[ 8][k]
		fmul	dword [eax]			; fin[k]
		fld		dword [edx+ 3*4]	; cos_l[ 9][k]
		fmul	dword [eax]			; fin[k]
		add		edx,4*4				; next coefficient group
		add		eax,1*4				; next fin[]
		faddp	st4,st0				; fold the four products into the accumulators
		faddp	st4,st0
		faddp	st4,st0
		faddp	st4,st0

		dec		ecx
		jnz		.lp2

; accumulators pop in reverse load order
		fstp	dword [ebx+ 9*4]
		fstp	dword [ebx+ 8*4]
		fstp	dword [ebx+ 6*4]
		fstp	dword [ebx+ 5*4]

; m = 11, 12, 14, 15
		mov		edx,cos_l11121415+4*4	; edx -> coefficient group for k=1
		mov		ecx,17					; 17 remaining terms (k = 1..17)
		lea		eax,[esp+1*4]			; eax -> fin[1]

; k = 0 term initialises the four accumulators
		fld		dword [edx- 4*4]	; cos_l[11][0]
		fmul	dword [esp+ 0*4]	; fin[0]
		fld		dword [edx- 3*4]	; cos_l[12][0]
		fmul	dword [esp+ 0*4]	; fin[0]
		fld		dword [edx- 2*4]	; cos_l[14][0]
		fmul	dword [esp+ 0*4]	; fin[0]
		fld		dword [edx- 1*4]	; cos_l[15][0]
		fmul	dword [esp+ 0*4]	; fin[0]

		jmp		short .lp3

		align	16
.lp3:
		fld		dword [edx+ 0*4]	; cos_l[11][k]
		fmul	dword [eax]			; fin[k]
		fld		dword [edx+ 1*4]	; cos_l[12][k]
		fmul	dword [eax]			; fin[k]
		fld		dword [edx+ 2*4]	; cos_l[14][k]
		fmul	dword [eax]			; fin[k]
		fld		dword [edx+ 3*4]	; cos_l[15][k]
		fmul	dword [eax]			; fin[k]
		add		edx,4*4				; next coefficient group
		add		eax,1*4				; next fin[]
		faddp	st4,st0				; fold the four products into the accumulators
		faddp	st4,st0
		faddp	st4,st0
		faddp	st4,st0

		dec		ecx
		jnz		.lp3

; accumulators pop in reverse load order
		fstp	dword [ebx+15*4]
		fstp	dword [ebx+14*4]
		fstp	dword [ebx+12*4]
		fstp	dword [ebx+11*4]

; From here on fin[0..5] are overwritten in place with combined terms.
; fin[0..2] are stored first; their sources fin[3..5] have already been
; read by then, and the second group only reads fin[6..14].
;
; fin[ 0] = fin[ 0] + fin[ 5] + fin[15]
; fin[ 1] = fin[ 1] + fin[ 4] + fin[16]
; fin[ 2] = fin[ 2] + fin[ 3] + fin[17]
; (the three sums are interleaved with fxch so that consecutive fadds
;  work on different values)
		fld		dword [esp+0*4]
		fld		dword [esp+1*4]
		fld		dword [esp+2*4]
		fxch	st2						; fin[ 2], fin[ 1], fin[ 0]
		fadd	dword [esp+5*4]
		fxch	st1						; fin[ 2], fin[ 0], fin[ 1]
		fadd	dword [esp+4*4]
		fxch	st2						; fin[ 1], fin[ 0], fin[ 2]
		fadd	dword [esp+3*4]
		fxch	st1						; fin[ 1], fin[ 2], fin[ 0]
		fadd	dword [esp+15*4]
		fxch	st2						; fin[ 0], fin[ 2], fin[ 1]
		fadd	dword [esp+16*4]
		fxch	st1						; fin[ 0], fin[ 1], fin[ 2]
		fadd	dword [esp+17*4]
		fxch	st2						; fin[ 2], fin[ 1], fin[ 0]
		fstp	dword [esp+ 0*4]
		fstp	dword [esp+ 1*4]
		fstp	dword [esp+ 2*4]

; fin[ 3] = fin[ 6] - fin[ 9] + fin[14]
; fin[ 4] = fin[ 7] - fin[10] + fin[13]
; fin[ 5] = fin[ 8] - fin[11] + fin[12]
		fld		dword [esp+ 6*4]
		fld		dword [esp+ 7*4]
		fld		dword [esp+ 8*4]
		fxch	st2						; fin[ 8], fin[ 7], fin[ 6]
		fsub	dword [esp+ 9*4]
		fxch	st1						; fin[ 8], fin[ 6], fin[ 7]
		fsub	dword [esp+10*4]
		fxch	st2						; fin[ 7], fin[ 6], fin[ 8]
		fsub	dword [esp+11*4]
		fxch	st1						; fin[ 7], fin[ 8], fin[ 6]
		fadd	dword [esp+14*4]
		fxch	st2						; fin[ 6], fin[ 8], fin[ 7]
		fadd	dword [esp+13*4]
		fxch	st1						; fin[ 6], fin[ 7], fin[ 8]
		fadd	dword [esp+12*4]
		fxch	st2						; fin[ 8], fin[ 7], fin[ 6]
		fstp	dword [esp+ 3*4]
		fstp	dword [esp+ 4*4]
		fstp	dword [esp+ 5*4]

; m = 1, 7, 10, 16
; (fin[] on the right-hand side below are the values from before the
;  in-place update; the six bracketed terms are now in fin[0..5])
;	sum  = ( fin[ 0]+fin[ 5]+fin[15] ) * cos_l[m][0];  /* mfc=15 0*/
;	sum += ( fin[ 1]+fin[ 4]+fin[16] ) * cos_l[m][1];  /* mfc=9 1*/
;	sum += ( fin[ 2]+fin[ 3]+fin[17] ) * cos_l[m][2];  /* mfc=3 2*/
;	sum += ( fin[ 6]-fin[ 9]+fin[14] ) * cos_l[m][6]; /* mfc=21 6*/
;	sum += ( fin[ 7]-fin[10]+fin[13] ) * cos_l[m][7]; /* mfc=27 7*/
;	sum += ( fin[ 8]-fin[11]+fin[12] ) * cos_l[m][8]; /* mfc = 28 8*/
;	out[m]=sum;
; The loop reads six consecutive groups of four floats from cos_l_1_71016,
; one group per combined term.
		
		mov		edx,cos_l_1_71016+4*4	; edx -> second coefficient group
		mov		ecx,5					; 5 remaining terms (combined fin[1..5])
		lea		eax,[esp+1*4]			; eax -> combined fin[1]
; first term initialises the four accumulators
		fld		dword [edx- 4*4]	; coefficient 0 for m= 1
		fmul	dword [esp+ 0*4]	; combined fin[0]
		fld		dword [edx- 3*4]	; coefficient 0 for m= 7
		fmul	dword [esp+ 0*4]	; combined fin[0]
		fld		dword [edx- 2*4]	; coefficient 0 for m=10
		fmul	dword [esp+ 0*4]	; combined fin[0]
		fld		dword [edx- 1*4]	; coefficient 0 for m=16
		fmul	dword [esp+ 0*4]	; combined fin[0]
		jmp		short .lp4

		align	16
.lp4:
		fld		dword [edx+ 0*4]	; coefficient k for m= 1
		fmul	dword [eax]			; combined fin[k]
		fld		dword [edx+ 1*4]	; coefficient k for m= 7
		fmul	dword [eax]			; combined fin[k]
		fld		dword [edx+ 2*4]	; coefficient k for m=10
		fmul	dword [eax]			; combined fin[k]
		fld		dword [edx+ 3*4]	; coefficient k for m=16
		fmul	dword [eax]			; combined fin[k]
		add		edx,4*4				; next coefficient group
		add		eax,1*4				; next combined fin[]
		faddp	st4,st0				; fold the four products into the accumulators
		faddp	st4,st0
		faddp	st4,st0
		faddp	st4,st0

		dec		ecx
		jnz		.lp4

; accumulators pop in reverse load order
		fstp	dword [ebx+16*4]
		fstp	dword [ebx+10*4]
		fstp	dword [ebx+ 7*4]
		fstp	dword [ebx+ 1*4]

; The two values below are kept on the x87 stack only (nothing is written
; back to fin[]); "fin[0]" and "fin[6]" are just the names used for them.
; In terms of the values from before the in-place update:
; fin[0] = (fin[0]+fin[5]+fin[15])-(fin[1]+fin[4]+fin[16])+(fin[8]-fin[11]+fin[12]);
; fin[6] = (fin[6]-fin[9]+fin[14])-(fin[2]+fin[3]+fin[17])+(fin[7]-fin[10]+fin[13]);
; i.e. combined fin[0]-fin[1]+fin[5] and combined fin[3]-fin[2]+fin[4].
		fld		dword [esp+0*4]
		fld		dword [esp+3*4]
		fxch
		fsub	dword [esp+1*4]
		fxch
		fsub	dword [esp+2*4]
		fxch
		fadd	dword [esp+5*4]
		fxch
		fadd	dword [esp+4*4]			; fin[0], fin[6]
		fld		dword [cos_l_4_0]
		fld		dword [cos_l_4_6]		; fin[0], fin[6], cos_l_4_0, cos_l_4_6

;		/* 4 */
; sum  = ((fin[0]+fin[5]+fin[15])-(fin[1]+fin[4]+fin[16])+(fin[8]-fin[11]+fin[12]))*cos_l[4][0];
; sum += ((fin[6]-fin[9]+fin[14])-(fin[2]+fin[3]+fin[17])+(fin[7]-fin[10]+fin[13]))*cos_l[4][6];
		fld		st3						; fin[0], fin[6], cos_l_4_0, cos_l_4_6, fin[0]
		fmul	st0,st2					; fin[0]*cos_l_4_0
		fld		st3						; copy of fin[6]
		fmul	st0,st2					; fin[6]*cos_l_4_6
		faddp	st1,st0
;		out[4]=sum;
		fstp	dword [ebx+4*4]			; fin[0], fin[6], cos_l_4_0, cos_l_4_6


;		/* 13 */
; cos_l13_0 = -cos_l_4_6
; cos_l13_6 = +cos_l_4_0
; sum  = ((fin[0]+fin[5]+fin[15])-(fin[1]+fin[4]+fin[16])+(fin[8]-fin[11]+fin[12]))*cos_l[13][0];
; sum += ((fin[6]-fin[9]+fin[14])-(fin[2]+fin[3]+fin[17])+(fin[7]-fin[10]+fin[13]))*cos_l[13][6];
		fmulp	st3,st0					; fin[0]*cos_l_4_6, fin[6], cos_l_4_0
		fmulp	st1,st0					; fin[0]*cos_l_4_6, fin[6]*cos_l_4_0
		fsubrp	st1,st0					; fin[6]*cos_l_4_0 - fin[0]*cos_l_4_6
;		out[13]=sum;
		fstp		dword [ebx+13*4]

		add		esp,18*4			; release fin[]
		pop		ebx
		ret

;-----------------------------------------------------------------------
; block_type == 2 path of mdct_FPU
;
; Entered by a jump from mdct_FPU with
;	eax = in, and the stack still as at function entry
;	([esp+4] = in, [esp+8] = out).
; Writes out[0..17]:
;
;		for ( m = 0; m < 6; m++ ){
;			sum = 0;
;			for ( k = 0; k < 12; k++ ){
;				sum += in[k + 6] * cos_s_mdct[m][k];
;			}
;			out[ 3 * m + 0] = sum;
;
;			sum = 0;
;			for ( k = 0; k < 12; k++ ){
;				sum += in[k + 12] * cos_s_mdct[m][k];
;			}
;			out[ 3 * m + 1] = sum;
;
;			sum = 0;
;			for ( k = 0; k < 12; k++ ){
;				sum += in[k + 18] * cos_s_mdct[m][k];
;			}
;			out[ 3 * m + 2] = sum;
;		}
;
; Regs:	eax = in (constant), ebx = &out[3*m] (saved/restored),
;	ecx = &cos_s_mdct[m][0], edx = k
; FPU:	three accumulators plus three products = 6 x87 stack slots.
;-----------------------------------------------------------------------
		align	16
.block_type2:
		push	ebx
		mov		ebx,[esp+4+8]		; = out (4 = saved ebx, 8 = offset of out)
		mov		ecx,cos_s_mdct			; = cos_s_mdct[m]

.lp8:
		xor		edx,edx				; k

; k = 0 term initialises the three accumulators
		fld		dword [eax+ 6*4]
		fmul	dword [ecx]	; = cos_s_mdct[m][0]
		fld		dword [eax+12*4]
		fmul	dword [ecx]	; = cos_s_mdct[m][0]
		fld		dword [eax+18*4]			; out[3*m+0], out[3*m+1], out[3*m+2]
		fmul	dword [ecx]	; = cos_s_mdct[m][0]
		inc		edx
		jmp		short .lp9			; jump over the alignment padding

		align	16
.lp9:
		fld		dword [eax+edx*4+ 6*4]	; = in[k+ 6]
		fmul	dword [ecx+edx*4]	; = cos_s_mdct[m][k]
		fld		dword [eax+edx*4+12*4]	; = in[k+12]
		fmul	dword [ecx+edx*4]	; = cos_s_mdct[m][k]
		fld		dword [eax+edx*4+18*4]	; = in[k+18]
		fmul	dword [ecx+edx*4]	; = cos_s_mdct[m][k]
; three products sit above three accumulators; each faddp folds the top
; product into the accumulator three slots below it and pops
		faddp	st3,st0
		faddp	st3,st0
		faddp	st3,st0

		inc		edx
		cmp		edx,12
		jl		.lp9				; k is a small non-negative counter

; accumulators pop in reverse load order
		fstp	dword [ebx+2*4]
		fstp	dword [ebx+1*4]
		fstp	dword [ebx+0*4]
		add		ecx,12*4			; next row of cos_s_mdct
		add		ebx,3*4				; next three outputs
		cmp		ecx,cos_s_mdct+12*6*4	; past the sixth row?
		jb		.lp8				; unsigned: ecx is an address, not a signed integer

		pop		ebx
		ret

		end

