/* ------------------------------ fftg.c ------------------------------------ */
/*                                                                            */
/* Author:      Eyal Lebedinsky                                               */
/* Date:        May 1990                                                      */
/* Version:     9 June 1991                                                   */
/*                                                                            */
/* Program generator for integer DFFT. This program was developed for a very  */
/* specific need and no attempt was made to make it general. Still, the basic */
/* algorithm is a standard one from the literature. For maximum performance   */
/* of the integer calculation a special mechanism was employed to track shift */
/* operations that could be delayed. Coefficients are kept as 16 bits with 1  */
/* sign bit, 1 integer bit and 14 fractional bits. So a typicall multiply     */
/* will do 16bit*16bits=32bits, then shift down one bit and use the HIGH 16   */
/* bits as the result. Before adding two numbers they must be scaled down one */
/* bit to avoid overflow. Adding more numbers calls for more scaling. The     */
/* delayed scaling allows the intermediate array to hold numbers that are     */
/* scaled to a varying degree until a clash occurs. Well, this space is too   */
/* short to elaborate any further and if you did similar kind of work then    */
/* you should know what I am talking about.                                   */
/*                                                                            */
/* usage: fftgc File Size EpName                                              */
/*      File - output file name.                                              */
/*      Size - sample order: 8 for 256 points, 10 for 1024...                 */
/*      EpName - name of the generated function ('_' added)                   */
/* Now assemble the output file and then use as:                              */
/*                                                                            */
/* #define M 8                  [or whatever size]                            */
/* #define N (1 << M)                                                         */
/* short x[N];                  [input/output array]                          */
/* short qf[(N/2)+1];           [power spectrum]                              */
/* extern void far fft ()                                                     */
/*                                                                            */
/* [then call it as: fft();]                                                  */
/*                                                                            */
/* This is a modified version of Sorensen et al, IEEE ASSP-35 no 6.           */
/*                                                                            */
/* - uses 3+3 complex mult.                                                   */
/* - no pre scrambling. descrambling done when power spectrum calculated.     */
/* - moved to 0-origin as proper for c.                                       */
/* - trig values in table.                                                    */
/* - the case for sin==cos treated separately.                                */
/* - uses delayed scaling down.                                               */
/* - does length 256 fft with 642 mults.                                      */
/*                                                                            */
/*                                                                            */
/* This program is released into the public domain.                           */
/*                                                                            */
/*----------------------------------------------------------------------------*/

#include <stdio.h>
#include <math.h>

#define	MAXPOINTS	1024
#define SC15 32768				/* scaling factor 2^15 */

/***
About SC15:
	Fractions are kept as 16 bit quantities with one sign bit, no integer
	bits and 15 fractional bits. SC15 is used to convert a floating point
	number (in the range -1 .. 1) into such an integer.

	There is no integer part because all fractions are really kept already
	scaled down by a factor of 2. This is a result of the fact that all
	fractions are constants (sin() and cos() values) and whenever we
	multiply a point by a fraction we process to scale the result down by
	one bit anywat - so the reduce constants save us some processing time.
***/

static short mm[MAXPOINTS+1];		/* digit reverse indices */

/***
About mm[]:
	The values in the sample points array x[] are not scramled at any
	stage. Instead, the array mm[] indicated where a point should be
	if we DID do the scrambling. All indices into x[] use mm[] and
	this is how the power spectrum extracts the data. This means that
	you CANNOT use the x[] array after calling fft() because the points
	are not in place. If you want to change the output of fft() then just
	modify the last part of this program (the power spectrum logic) to
	calculate whatever you need.
***/

static int cntx[9];		/* stats counters */

/***
About cntx[]:
	This array gathers statistics:
		0 memory loads
		1 memory stores
		2 register shifts
		3 adds
		4 mults
		5 memory shifts due to smad() correction
***/

static char *file_name, *ep_name;

extern void	start_fft (), end_fft ();
extern void	fft1 (),				/* scaling */
		fft2 (), fft3 (), fft4 (), fft5 (),	/* SRFFT */
		fft7 (), fft8 ();			/* power spectrum */

/***
about ad[], mad and smad():
	Because we do integer arithmetic, adding two numbers can overflow the
	integer magnitude. For this reason the integers are scaled down (by one
	bit) before the addition. Sometimes we add more that two numbers and it
	is necessary to scale further.

	The FFT algorithm does not use all of the points in the same way in one
	pass, which means that some data needs to be scaled and some does not
	need to. Actualy, in the first pass some points are not used at all.
	This means that SOME of the points would have been scaled down. If we
	add (in a later stage) such a point with an un-scaled point the result
	is invalid. For this purpose the array ad[] keeps track of how far each
	point was scaled, smad() checks that a point is properly scaled and mad
	is the needed scaling at this stage. As the FFT progresses, some points
	are kept at a different scaling untill they are used in an expression
	with other points. If smad() finds a point out of adjustment the it
	issues a scaling instruction using the fft1() function.

	The program was adjusted to optimize the scaling (the various
	fft2() ... fft5() have internal scaling rules) and you may notice that
	smad() hardly ever needs to do any correction (usually there is only one
	final shift). This makes the whole ad[] almost redundant but it was left
	in for future use.

	If your smaple points are less that 16 bits then you still need to keep
	scaling you data. You need to have enough spare bits to avoid the
	scaling, and the number depends on the size of the sample - the larger
	the sample the more stages the FFT has and the more spare bits you
	need.
***/

static short ad[MAXPOINTS+1];	/* how far each point is adjusted */
static int mad; 		/* maximum adjust at this level */

static void
smad (i, j)		 	       /* set proper adjust	       */
int	i, j;
{
	while (ad[i] < (mad-j)) {
		fprintf (stderr, "x[%u] >>= %d (%d-%d)\n", i, 1, mad, j);
		fft1 (mm[i]);
		++ad[i];
		cntx[0] += 1;
		cntx[1] += 1;
		cntx[2] += 1;
		cntx[5] += 1;
	}
}

main (argc, argv)
int	argc;
char	*argv[];
{
	int n, m, j, i, k, is, id, n1, n2, n4, n8, ind,
		i0, i1, i2, i3, i4, i5, i6, i7, i8,
		cc1, ss1, cc3, ss3;
	float e, a, a3;

/* ---------- Handle arguments ------------------------------------------- */

	if (argc < 4) {
		printf ("usage: fftg File PowerOf2 EpName [stderrFile]\n");
		exit (0);
	}

	file_name = argv[1];
	sscanf (argv[2], "%u", &m);
	ep_name = argv[3];
	if (argc > 4)
		freopen (argv[4], "wt", stderr);

/* ---------- Initialise  ------------------------------------------------ */

	start_fft (file_name, ep_name);

	n = 1 << m;

	for (i = 0; i < 9; ++i)
		cntx[i] = 0;

	mad = 0;
	for (i = 1; i <= n; ++i)
		ad[i] = 0;

/* ---------- Digit reverse counter -------------------------------------- */

	for (i = 1; i <= n; ++i) {
		mm[i] = i-1;		/* '-1' for shifting to 0-origin    */
	}

	j = 1;
	n1 = n - 1;
	for (i = 1; i <= n1; ++i) {
		if (i < j) {
			mm[i] = j-1;	/* '-1' for shifting to 0-origin    */
			mm[j] = i-1;	/* '-1' for shifting to 0-origin    */
		}

		for (k = n / 2; k < j; k /= 2)
			j -= k;
		j += k;
	}

/* ---------- Length two butterflies ------------------------------------- */

	for (is = 1, id = 4; is < n; is = 2 * id - 1, id = 4 * id) {
		for (i0 = is; i0 <= n; i0 += id) {
			i1 = i0 + 1;

			smad (i0, 0);
			smad (i1, 0);
			fft2 (mm[i0], mm[i1]);
			++ad[i0];
			++ad[i1];
			cntx[0] += 2;
			cntx[1] += 2;
			cntx[2] += 2;
			cntx[3] += 2;
		}
	}

/* ---------- L shaped butterflies --------------------------------------- */

	n2 = 2;
	for (k = 2; k <= m; ++k) {
		++mad;
		n2 = n2 * 2;
		n4 = n2 / 4;
		n8 = n2 / 8;
		e = 6.2831853071719586 / n2;		/* 2 * pi */
		cc1 = (SC15/2) * 0.7071067811865475;	/* sqrt (0.5) */

		for (is = 0, id = n2 * 2; is < n; is = 2 * id - n2, id *= 4) {
			for (i = is; i <= n - 1; i += id) {
				i1 = i + 1;
				i2 = i1 + n4;
				i3 = i2 + n4;
				i4 = i3 + n4;

				smad (i1, 0);
				smad (i3, 1);
				smad (i4, 1);
				fft3 (mm[i1], mm[i3], mm[i4]);
				ad[i1] += 1;
				ad[i3] += 2;
				ad[i4] += 2;
				cntx[0] += 3;
				cntx[1] += 3;
				cntx[2] += 5;
				cntx[3] += 4;

				if (n4 != 1) {
					i1 = i1 + n8;
					i2 = i2 + n8;
					i3 = i3 + n8;
					i4 = i4 + n8;

					smad (i1, 1);
					smad (i2, 0);
					smad (i3, 1);
					smad (i4, 1);
					fft4 (mm[i1], mm[i2], mm[i3], mm[i4],
						cc1);
					ad[i1] += 2;
					ad[i2] += 1;
					ad[i3] += 2;
					ad[i4] += 2;
					cntx[0] += 4;
					cntx[1] += 4;
					cntx[2] += 2;
					cntx[3] += 6;
					cntx[4] += 2;
				}
			}
		}

		for (j = 2, a = e; j <= n8; ++j, a += e) {
			a3 = 3 * a;
			cc1 = (SC15/2) * cos (a);
			ss1 = (SC15/2) * sin (a);
			cc3 = (SC15/2) * cos (a3);
			ss3 = (SC15/2) * sin (a3);

			for (is = 0, id = n2 * 2; is < n;
						is = 2 * id - n2, id *= 4) {
				for (i = is; i <= n - 1; i += id) {
					i1 = i + j;
					i2 = i1 + n4;
					i3 = i2 + n4;
					i4 = i3 + n4;
					i5 = i  + n4 - j + 2;
					i6 = i5 + n4;
					i7 = i6 + n4;
					i8 = i7 + n4;

					smad (i1, 0);
					smad (i2, 0);
					smad (i3, 2);
					smad (i4, 2);
					smad (i5, 0);
					smad (i6, 0);
					smad (i7, 1);
					smad (i8, 1);
					if (ind = (ad[i3] < (mad-1))) {
						++ad[i3];
						++ad[i4];
						cntx[2] += 1;
					}
					fft5 (mm[i1], mm[i2], mm[i3], mm[i4],
						mm[i5], mm[i6], mm[i7], mm[i8],
						ss1-cc1, -ss1-cc1, cc1,
						ss3-cc3, -ss3-cc3, cc3, ind);
					ad[i1] += 1;
					ad[i2] += 1;
					ad[i3] += 2;
					ad[i4] += 2;
					ad[i5] += 1;
					ad[i6] += 1;
					ad[i7] += 2;
					ad[i8] += 2;
					cntx[0] += 8;
					cntx[1] += 8;
					cntx[2] += 4;
					cntx[3] += 18;
					cntx[4] += 6;
				}
			}
		}
	}

/* ---------- force final adjustment ------------------------------------- */

	++mad;
	for (i = 1; i <= n; ++i)
		smad (i, 0);

/* ---------- Power spectrum (no sqrt) ----------------------------------- */

	fft7 (0, mm[1]);

	for (i = 1; i < n/2; ++i)
		fft8 (i, mm[i+1], mm[n-i+1]);

	fft7 ((n/2), mm[n/2+1]);

/* ---------- end and show stats ----------------------------------------- */

	end_fft (file_name, ep_name);

	for (i = 0; i < 9; ++i)
		fprintf (stderr, " %5u", i);
	fprintf (stderr, "\n  Load Store Shift   Add  Mult   Adj\n");
	for (i = 0; i < 9; ++i)
		fprintf (stderr, " %5u", cntx[i]);
	fprintf (stderr, "\n");

	exit (0);
}
