/*
The contents of this file contain text and code describing and 
implementing the 'DES' encryption algorithm. Despite the fact 
that this information is freely available overseas, it remains 
a violation of ITAR and/or EAR to export this information 
from inside the US or Canada to outside the US or Canada, or 
to pass it to a non-US or non-Canadian citizen within the US 
or Canada. The US Government evidently defines 'Export' to 
include placing this information on a non-restricted FTP server 
or Web site. Please do not do so, and be sure that any person you
pass this on to is made aware of this restriction.
									Peter Trei
									ptrei@acm.org

 * THIS SOFTWARE IS PROVIDED BY PETER TREI ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.

This software is copyright (c) 1997 Peter Trei (ptrei@acm.org), except for
those portions written by Phil Karn, which retain their
original ownership.

This software may be redistributed freely for use in the RSA DES Challenge,
but please obey the restrictions imposed by the US Government, and make
sure that anyone you pass it to is also aware of them.

This software may not be used for commercial purposes without the written
permission of Peter Trei and the other owners.

Please redistribute only as a complete, unmodified package, including 
source code, and ptrei@acm.org's PGP signature file and key.

 */


/* Microsoft Visual C++ inline assembler version of des() function */
#if 0
#pragma inline
#endif
#include "deskr.h"
#include "desspa.h"

/* Primitive function F. Input is r, output is XORed into l.
 * Subkeys are taken from the array pointed to by es:si.
 * eax and ebx are scratch, ebx is assumed to be initialized to 0.
 *
 * There's an important difference between this version and the portable C
 * version: here, each subkey is LEFT JUSTIFIED within a byte, i.e., left
 * shifted 2 bits. This lets us avoid the two left shifts we'd otherwise have
 * to do here to get double word indexes for addressing each S-box (an array
 * of 4-byte longs). Several other places in the code take this into account:
 *
 * the initial permutation left-shifts r and l by two additional bits
 * (for a total of three);
 *
 * the values in Spboxa[] generated by gensp.c are also shifted left by
 * two additional bits (for a total of three);
 *
 * deskey() left shifts each key schedule byte left by two places.
 *
 * Since this code is executed 16 times in every DES encrypt or decrypt
 * operation, it is very time-critical. It has been very heavily
 * optimized for the 486. 32-bit operations are done whenever possible,
 * and indexed addressing is used instead of incremented pointers.
 */
/* Gstd(l, r, key) is Phil Karn's original round code, translated from
 * Borland to MS C++ inline assembler. It is not used.
 *
 *   l   - register operand that the eight S-box lookups are XORed into
 *   r   - register operand holding the round input; it is only read
 *   key - byte offset of this round's subkey pair; the two subkey words
 *         are read from [esi+key] and [esi+key+4]
 *
 * eax is overwritten and ebx is used as the table index. Only bl is ever
 * written, so the upper three bytes of ebx must already be zero on entry.
 * The u / v / cyc columns on the right are the original author's pairing
 * and cycle annotations.
 */
#define	Gstd(l,r,key)\
	{\
	 /* eax = ((r >>> 4) ^ key[0]) & 0xfcfcfcfc */\
									/*	u	v	cyc	*/\
	__asm mov eax,r					/*	+	+	1	*/\
	__asm ror eax,4					/*  +  	-	1	*/\
	__asm xor eax,[esi+key]			/*	+	+	2	*/\
	__asm and eax,0fcfcfcfch		/*  +	+	1	*/\
\
	/* eax now contains four S-box offsets all ready for use*/\
	__asm mov bl,al					/*	+	+	1	*/\
	__asm xor l, Spboxa[ebx+0600h]	/*  +	+	2	*/\
	__asm mov bl,ah					/*	+	+	1	*/\
	__asm ror eax,16				/*  +  	-	1	*/\
	__asm xor l, Spboxa[ebx+0400h]	/*  +	+	2	*/\
	__asm mov bl,al					/*	+	+	1	*/\
	__asm xor l, Spboxa[ebx+0200h]	/*  +	+	2	*/\
	__asm mov bl,ah					/*	+	+	1	*/\
	__asm xor l, Spboxa[ebx]		/*  +	+	2	*/\
\
	/* eax = (r ^ key[1]) & 0xfcfcfcfc */\
	__asm mov eax,[esi+key+4]		/*  +	+	2	*/\
	__asm xor eax,r					/*	+	+	1	*/\
	__asm and eax,0fcfcfcfch		/*	+	+	1	*/\
\
	/* eax now contains four S-box offsets all ready for use*/\
	__asm mov bl,al					/*	+	+	1	*/\
 	__asm xor l, Spboxa[ebx+0700h]  /*  +	+	2	*/\
	__asm mov bl,ah					/*	+	+	1	*/\
	__asm ror eax,16 				/*  +  	-	1	*/\
 	__asm xor l, Spboxa[ebx+0500h]  /*  +	+	2	*/\
	__asm mov bl,al					/*	+	+	1	*/\
 	__asm xor l, Spboxa[ebx+0300h]  /*  +	+	2	*/\
	__asm mov bl,ah					/*	+	+	1	*/\
 	__asm xor l, Spboxa[ebx+0100h]  /*  +	+	2	*/\
	}

/* RDTSC(): the Read Time Stamp Counter instruction, which dumps a 64 bit
 * clock count into edx:eax. The two opcode bytes (0F 31) are emitted
 * directly with _emit instead of using a mnemonic, so both eax and edx are
 * overwritten wherever this macro is expanded.
 * The original author notes: "I have not managed to use it successfully."
 */
#define RDTSC()\
	{\
	__asm _emit 0xF\
	__asm _emit 0x31\
	}

/* STARTCLOCK(tics): begin a timing interval.
 * Emits the RDTSC opcode bytes (0F 31) and stores eax, the low 32 bits of
 * the counter, into tics. eax and edx are overwritten by the read.
 * The cld and the eight nops that follow are part of the measured sequence.
 * NOTE(review): their purpose is not stated; presumably they are padding
 * that the "sub eax,15" in STOPCLOCK is calibrated against - confirm before
 * changing either macro.
 */
#define STARTCLOCK(tics)\
	{\
	_asm _emit 0xF /* this is the RDTSC command */\
	_asm _emit 0x31\
	__asm mov [tics], eax\
	__asm cld\
	__asm nop\
	__asm nop\
	__asm nop\
	__asm nop\
	__asm nop\
	__asm nop\
	__asm nop\
	__asm nop\
	}

/* STOPCLOCK(tics): end a timing interval.
 * Emits the RDTSC opcode bytes (0F 31) again, subtracts the value saved in
 * tics, subtracts a further 15, and writes the result back to tics.
 * Only the low 32 bits of the counter (eax) take part, so the result is a
 * 32-bit difference. eax and edx are overwritten.
 * NOTE(review): the constant 15 is presumably the measured overhead of the
 * STARTCLOCK/STOPCLOCK sequence itself - confirm.
 * tics must already hold the value written by STARTCLOCK; otherwise the
 * result is meaningless.
 */
#define STOPCLOCK(tics)\
	{\
	__asm clc\
	_asm _emit 0xF /* this is the RDTSC command */\
	_asm _emit 0x31\
    __asm sub eax,[tics]\
    __asm sub eax,15\
	__asm mov tics, eax\
	}

/* K(l, r, key): my version of the DES round. It interleaves
   the odd and even byte operations of the original, with the goal
   of making them pair better in the Pentium U and V pipes.

   There are two main differences from the original. The odd and
   even indices into the SPbox tables are handled by separate sets
   of registers, eliminating register contention, and the two sets
   of operations are interleaved in a way which maximizes the
   pairing of instructions of similar duration. I suspect that it
   may be possible to squeeze 3 more cycles out of this, but I have
   not yet succeeded in doing so. It appears to operate in 25 clock
   cycles on a Pentium.

   Operands:
     l   - register that the eight S-box lookups are XORed into
     r   - register holding the round input; it is only read
     key - byte offset of this round's subkey pair within the schedule

   Register use, as visible in the macro body:
     edx - loaded with the key schedule address from the static `sks`,
           then cleared and used (via dl) as the odd-box table index
     eax / ecx - even / odd S-box index words
     ebx - even-box table index; only bl is written, so the upper three
           bytes must be zero on entry
     ebp - accumulator for the odd-box lookups. It is overwritten and NOT
           restored here, so the expansion site must save and restore it.
*/
/* The rounds in this file are done with the F macro below; K is not
   expanded anywhere in this file. */
#define	K(l,r,key)\
	{\
\
	__asm mov edx, dword ptr [sks]  /* 1   uv   */\
	__asm mov eax,r					/* 1   uv A */\
	__asm ror eax,4					/* 1   np A */\
\
	__asm mov ecx,[edx+key+4]		/* 1   uv B */\
	__asm xor eax,[edx+key]			/* 1   uv A */\
	__asm xor ecx,r					/* 1   uv B */\
	__asm and eax,0fcfcfcfch		/* 1   uv A */\
	__asm and ecx,0fcfcfcfch		/* 1   uv B */\
	__asm xor edx,edx				/* 1   uv B */\
	__asm mov bl,al					/* 1   uv A */\
  	__asm mov dl,cl					/* 2   uv B */\
	__asm xor l, Spboxa[ebx+0600h]	/* 2   uv A */\
	__asm mov ebp, Spboxa[edx+0700h]/* 1   uv B */\
  	__asm mov bl,ah					/* 1   uv A */\
	__asm mov dl,ch					/* 1   uv B */\
	__asm ror eax,16				/* 1   np A */\
	__asm ror ecx,16 				/* 1   np B */\
	__asm xor l, Spboxa[ebx+0400h]	/* 2   uv A */\
	__asm xor ebp, Spboxa[edx+0500h]/* 2   uv B */\
  	__asm mov bl,al					/* 1   uv A */\
	__asm mov dl,cl					/* 1   uv B */\
	__asm xor l, Spboxa[ebx+0200h]	/* 2   uv A */\
	__asm xor ebp, Spboxa[edx+0300h]/* 2   uv B */\
	__asm mov bl,ah					/* 1   uv A */\
	__asm mov dl,ch					/* 1   uv B */\
	__asm xor l, Spboxa[ebx]		/* 2   uv A */\
	__asm xor ebp, Spboxa[edx+0100h]/* 2   uv B */\
	__asm xor l, ebp				/* 1   uv B */\
	}
/*
Notes on a round: l is the half being modified; it is XORed into many times.
r is accessed only twice, read-only, and must be preserved on the way out.
For each half:
(the original note breaks off here)
*/
/* F(l, r, key): this is my round, with changes inspired by sven mikklesen's
 * version. It is the round macro expanded by des(), des1(), des11() and the
 * do_first_round / do_middle_rounds / do_last_round helpers.
 *
 *   l   - register that the eight S-box lookups are XORed into
 *   r   - register holding the round input; read twice, never written
 *   key - byte offset of this round's subkey pair; the two words are read
 *         from [ebp+key] and [ebp+key+4]
 *
 * Entry requirements:
 *   - edx clear: only dl is written here, so the upper three bytes of edx
 *     must already be zero (they are still zero on exit).
 *   - ebp pointing to the key structure.
 *
 * ebp is reused as a scratch register for S-box values once the subkeys
 * have been fetched, and is reloaded from the static `sks` before the macro
 * ends. A static named `sks` holding the key schedule address must therefore
 * be in scope at every expansion site, and the caller must save and restore
 * its own ebp around a run of rounds.
 *
 * ebx is zeroed at the top of the macro, so it needs no setup by the caller.
 * On exit eax, ebx, ecx and dl hold leftovers and the flags are modified.
 * The numbers in the right-hand comments are the author's step numbering.
 */
#define	F(l,r,key)\
	{\
\
	__asm xor ebx,ebx				/* 1   uv B */\
	__asm mov eax,r					/* 1   uv A */\
	__asm mov ecx,[ebp+key+4]		/* 1   uv B */\
	__asm ror eax,4					/* 1   np A */\
	__asm xor eax,[ebp+key]			/* 1   uv A */\
	__asm xor ecx,r					/* 1   uv B */\
	__asm and eax,0fcfcfcfch		/* 1   uv A */\
	__asm and ecx,0fcfcfcfch		/* 1   uv B */\
/* eax now contains even sbox indices: b0i, b2i, b4i, b6i */\
/* ecx now contains odd  sbox indices: b1i, b3i, b5i, b7i */\
	__asm mov bl, al					/* 1     move b6i to ebx */\
	__asm mov dl, ah					/* 2     move b4i to edx */\
	__asm mov ebp, Spboxa[ebx+0600h]	/* 3     move b6 to ebp  */\
	__asm mov bl, cl					/* 4     move b7i to ebx */\
	__asm xor l, ebp					/* 5     xor b6 into l   */\
	__asm mov ebp, Spboxa[edx+0400h]	/* 6     move b4 to ebp  */\
	__asm xor l, ebp					/* 7     xor b4 into l   */\
	__asm mov dl, ch					/* 8     move b5i to edx */\
	__asm shr eax, 16					/* 9     shift eax. It now contains 0 0 b0i, b2i */\
	__asm mov ebp, Spboxa[ebx+0700h]	/* 10    move b7 to ebp  */\
	__asm xor l, ebp					/* 11    xor b7 into l   */\
	__asm mov bl, ah					/* 12    move b0i to ebx */\
	__asm shr ecx, 16					/* 13    shift ecx. It now contains 0 0 b1i, b3i */\
	__asm mov ebp, Spboxa[edx+0500h]	/* 14,15 move b5 to ebp  */\
	__asm xor l, ebp					/* 16    xor b5 into l   */\
	__asm mov ebp, dword ptr [sks]		/* 17    restore key schedule pointer */\
    __asm mov dl, ch					/* 18    move b1i to edx */\
	__asm and ecx, 0FFh					/* 20    ecx now contains b3i */\
	__asm and eax, 0FFh					/* 19    eax now contains b2i */\
	__asm mov ebx, Spboxa[ebx]			/* 21    mov b0 to ebx   */\
	__asm xor l, ebx					/* 22    xor b0 into l   */\
	__asm mov ebx, Spboxa[ecx+0300h]	/* 23    mov b3 to ebx   */\
	__asm xor l, ebx                    /* 24    xor b3 into l   */\
	__asm mov ebx, Spboxa[eax+0200h]    /* 25    mov b2 to ebx   */\
	__asm xor l, ebx                    /* 26    xor b2 into l   */\
	__asm mov ebx, Spboxa[edx+0100h]    /* 27    mov b1 to ebx   */\
	__asm xor l, ebx                    /* 28    xor b1 into l   */\
	}


/* Non-zero in this (assembler) build. The round macros in this file index
 * the S-box tables with key bytes that are already shifted left by two bits,
 * so the key schedule has to be generated in that shifted form. */
int Asmversion = 1;	/* Let deskey() know we need shifted keys */

/* Encrypt or decrypt one 8-byte block of data, in place, in ECB mode with
 * the key schedule provided (encryption/decryption is selected by the key
 * schedule).
 *
 *   ks    - key schedule, 16 rounds x 2 words; round n reads the words at
 *           byte offsets 8*n and 8*n+4
 *   block - 8 bytes; read in big-endian order on entry and overwritten with
 *           the result on exit
 *
 * The repeat count that this routine once took is commented out of the
 * parameter list; the internal count is fixed at 1, so the jump back to
 * label `a` at the bottom is never taken.
 *
 * All of the work is MSVC inline assembler guarded by WIN32. Without WIN32
 * the function only sets its statics and returns with block untouched.
 *
 * The F() round macro overwrites ebp and reloads it from the static `sks`,
 * which is why the key schedule address is parked in a static. scount and
 * sks are shared by every call, so concurrent calls would interfere with
 * each other.
 */ 
void
des(ks,block/*,count*/)
unsigned long ks[16][2];	/* Key schedule */
unsigned char block[8];		/* Data block */


{
	extern unsigned long Spboxa[8][64];
	static unsigned long stktop;	/* only referenced by the disabled (#if 0) code */
	static unsigned long scount;	/* loop counter, decremented inside the asm */
	static unsigned long sks;		/* key schedule address, reloaded into ebp by F() */
	static unsigned long ticks;		/* only referenced by the disabled timing code */
	unsigned long count;
count = 1;
	scount = count;
	sks = (unsigned long)&ks[0][0];
#ifdef WIN32
#if 0
	STARTCLOCK(ticks);
#endif
	__asm {


	/* Fetch 8 bytes from user's buffer in "block" and place in ecx and edx,
	 * in big-endian order. esi is used as the pointer.
	 * BSWAP does the byte reversal; it is not available on a 386.
	 */
    mov esi, dword ptr [block];	 
	mov ecx,[esi];				 
	bswap ecx;					 
	mov edx, [esi+4];			 
	bswap edx;
	
	/* Hoey's clever initial permutation algorithm, translated to assembler
	 * (see Schneier p 478)	
	 *
	 * The convention here is *different* from the C version. The permuted
	 * values of "left" and "right" are rotated left by two additional
	 * bits so we can avoid the two shifts that would otherwise be
	 * required in each round to convert a S-box input to a memory offset
	 * for Spboxa[].
	 */
	/* work = ((left >> 4) ^ right) & 0x0f0f0f0f */
a:	mov eax,ecx;
	shr eax,4;
	xor eax,edx;
	and eax,0f0f0f0fh;

	xor edx,eax;	/* right ^= work */

	/* left ^= work << 4 */
	shl eax,4;
	xor ecx,eax;

	/* work = ((left >> 16) ^ right) & 0xffff */
	mov eax,ecx;
	shr eax,16;
	xor eax,edx;
	and eax,0ffffh;

	xor edx,eax;	/* right ^= work */

	/* left ^= work << 16 */
	shl eax,16;
	xor ecx,eax;

	/* work = ((right >> 2) ^ left) & 0x33333333 */
	mov eax,edx;
	shr eax,2;
	xor eax,ecx;
	and eax,033333333h;

	/* left ^= work */
	xor ecx,eax;
	shl eax,2;

	xor edx,eax;	/* right ^= (work << 2) */

	/* work = ((right >> 8) ^ left) & 0xff00ff */
	mov eax,edx;
	shr eax,8;
	xor eax,ecx;
	and eax,0ff00ffh;

	xor ecx,eax;	/* left ^= work */

	/* right ^= (work << 8) */
	shl eax,8;
	xor edx,eax;

	rol edx,1;	/* right <<<= 1 */

	/* work = (left ^ right) & 0xaaaaaaaa */
	mov eax,ecx;
	xor eax,edx;
	and eax,0aaaaaaaah;

	xor ecx,eax;	/* left ^= work */
	xor edx,eax;	/* right ^= work */

	rol ecx,3;	/* left <<<= 3 np */
	rol edx,2;	/* right <<<= 2 np */

	/* Set up for the rounds */
	// les si,ss:ks;	/* es:si = key schedule rm */
	
	mov ebx,0;	/* u 1 cyc 1 Upper 3 bytes must be zero */

	mov edi, edx; /* v 1 cyc 1 CHANGE */
	mov esi, ecx; /* u 1 cyc 2 CHANGE */
#if 0
	mov edx, dword ptr [ks]; /* es:si = key schedule pm */  /* CHANGE */
	mov stktop,esp; /* v 1 cyc 2 */
#endif
	/* F() uses ebp as a scratch register, so the caller's ebp is saved here
	 * and restored after the last round. */
	push ebp;
	xor edx, edx;           /* F() requires the upper bytes of edx to be zero */
	mov ebp, dword ptr [ks]; /* ebp = key schedule, as F() expects on entry */
	/* Do the rounds */
	F(esi,edi,0);  /*round 0  */
	F(edi,esi,8);  /*round 1  */
	F(esi,edi,16); /*round 2  */
	F(edi,esi,24); /*round 3  */
	F(esi,edi,32); /*round 4  */
	F(edi,esi,40); /*round 5  */
	F(esi,edi,48); /*round 6  */
	F(edi,esi,56); /*round 7  */
	F(esi,edi,64); /*round 8  */
	F(edi,esi,72); /*round 9  */
	F(esi,edi,80); /*round 10 */
	F(edi,esi,88); /*round 11 */
	F(esi,edi,96); /*round 12 */
	F(edi,esi,104);/*round 13 */
	F(esi,edi,112);/*round 14 */
	F(edi,esi,120);/*round 15 */
	//mov esp, stktop;
	pop ebp;


	mov ecx,esi; /* CHANGE */
	mov edx,edi; /* CHANGE */

	/* Inverse permutation */
	ror ecx,2;	/* left >>>= 2 */
	ror edx,3;	/* right >>>= 3 */

	/* work = (left ^ right) & 0xaaaaaaaa */
	mov eax,ecx;
	xor eax,edx;
	and eax,0aaaaaaaah;

	xor ecx,eax;	/* left ^= work */
	xor edx,eax;	/* right ^= work */
	ror ecx,1;	/* left >>>= 1 */

	/* work = ((left >> 8) ^ right) & 0xff00ff */
	mov eax,ecx;
	shr eax,8;
	xor eax,edx;
	and eax,0ff00ffh;

	xor edx,eax;	/* right ^= work */

	/* left ^= work << 8 */
	shl eax,8;
	xor ecx,eax;

	/* work = ((left >> 2) ^ right) & 0x33333333 */
	mov eax,ecx;
	shr eax,2;
	xor eax,edx;
	and eax,33333333h;

	xor edx,eax;	/* right ^= work */

	/* left ^= work << 2 */
	shl eax,2;
	xor ecx,eax;

	/* work = ((right >> 16) ^ left) & 0xffff */
	mov eax,edx;
	shr eax,16;
	xor eax,ecx;
	and eax,0ffffh;

	xor ecx,eax;	/* left ^= work */

	/* right ^= work << 16 */
	shl eax,16;
	xor edx,eax;

	/* work = ((right >> 4) ^ left) & 0x0f0f0f0f */
	mov eax,edx;
	shr eax,4;
	xor eax,ecx;
	and eax,0f0f0f0fh;

	xor ecx,eax;	/* left ^= work */

	/* right ^= work << 4 */
	shl eax,4;
	xor edx,eax;

	/* Write ecx and edx into user's buffer "block" in big-endian order
	 * after final swap (edx goes to bytes 0-3, ecx to bytes 4-7).
	 * esi is used as the pointer.
	 */
    mov esi, dword ptr [block];	 
	bswap edx;					 
	mov [esi],edx;				 
	bswap ecx;					 
	mov [esi+4],ecx;

	dec scount;
 	jnz a;
	}
#if 0
	/* Disabled together with STARTCLOCK above. With the start half compiled
	 * out, running this would only execute a stray RDTSC on every call and
	 * store a meaningless difference in ticks. */
	STOPCLOCK(ticks);
#endif
#endif /* WIN32 */
}

/* The 1 and 11 round timetests are used when timing variations in
the DES round. Each can be called with a count (typically a million),
and the difference between the clock times the two take to run is equal
to the time to run 10*count rounds.
*/


/**********  1 round timetest ********/

/* Timing harness: run a single F() round `count` times.
 *
 *   ks    - key schedule; only the first subkey pair (offset 0) is used
 *   block - 8 bytes, read once in big-endian order; it is never written
 *   count - number of loop iterations
 *
 * This is not a DES encryption: there is no initial or inverse permutation
 * and nothing is stored back to block. The two halves are carried from one
 * iteration to the next in ecx/edx.
 *
 * STARTCLOCK/STOPCLOCK bracket the loop and leave the measurement in the
 * function-local static `ticks`, which is not returned to the caller.
 *
 * NOTE: the loop is bottom-tested (dec scount / jnz b), so count == 0 does
 * not mean "no iterations": scount wraps around and the loop keeps running.
 *
 * All of the work is inside #ifdef WIN32; otherwise only the statics are set.
 */
void
des1(ks,block,count)
unsigned long ks[16][2];	/* Key schedule */
unsigned char block[8];		/* Data block */
unsigned long count;
{
	extern unsigned long Spboxa[8][64];
	static unsigned long stktop;	/* only referenced by the disabled (#if 0) code */
	static unsigned long scount;	/* loop counter, decremented inside the asm */
	static unsigned long sks;		/* key schedule address, reloaded into ebp by F() */
	static unsigned long ticks;		/* timing result written by STOPCLOCK */

	scount = count;
	sks = (unsigned long)&ks[0][0];

#ifdef WIN32
	STARTCLOCK(ticks);
	__asm {
	/* Fetch 8 bytes from user's buffer in "block" and place in ecx and edx,
	 * in big-endian order. esi is used as the pointer.
	 * BSWAP does the byte reversal; it is not available on a 386.
	 */
    mov esi, dword ptr [block];	 
	mov ecx,[esi];				 
	bswap ecx;					 
	mov edx, [esi+4];			 
	bswap edx;
	
	
b:	mov ebx,0;	/* u 1 cyc 1 Upper 3 bytes must be zero */

	mov edi, edx; /* v 1 cyc 1 CHANGE */
	mov esi, ecx; /* u 1 cyc 2 CHANGE */
#if 0
	mov edx, dword ptr [ks]; /* es:si = key schedule pm */  /* CHANGE */
	mov stktop,esp; /* v 1 cyc 2 */
#endif
	/* F() uses ebp as scratch; save the caller's ebp around the round */
	push ebp;
	xor edx, edx;            /* F() requires the upper bytes of edx to be zero */
	mov ebp, dword ptr [ks]; /* ebp = key schedule, as F() expects on entry */

	/* Do the rounds */
	F(esi,edi,0);  /*round 0  */
	//mov esp, stktop;
	pop ebp;


	mov ecx,esi; /* CHANGE */
	mov edx,edi; /* CHANGE */


	dec scount;
 	jnz b;
	}
	STOPCLOCK(ticks);
#endif /* WIN32 */
}


/**********  11 round timetest ********/
/* Timing harness: run eleven F() rounds (rounds 0 to 10) `count` times.
 *
 *   ks    - key schedule; subkey pairs at byte offsets 0 through 80 are used
 *   block - 8 bytes, read once in big-endian order; it is never written
 *   count - number of loop iterations
 *
 * This is not a DES encryption: there is no initial or inverse permutation
 * and nothing is stored back to block. The two halves are carried from one
 * iteration to the next in ecx/edx.
 *
 * Both STARTCLOCK and STOPCLOCK are compiled out here (#if 0), so this
 * function records no timing itself.
 *
 * NOTE: the loop is bottom-tested (dec scount / jnz c), so count == 0 does
 * not mean "no iterations": scount wraps around and the loop keeps running.
 *
 * All of the work is inside #ifdef WIN32; otherwise only the statics are set.
 */
void
des11(ks,block,count)
unsigned long ks[16][2];	/* Key schedule */
unsigned char block[8];		/* Data block */
unsigned long count;
{
	extern unsigned long Spboxa[8][64];
	static unsigned long stktop;	/* only referenced by the disabled (#if 0) code */
	static unsigned long scount;	/* loop counter, decremented inside the asm */
	static unsigned long sks;		/* key schedule address, reloaded into ebp by F() */
	static unsigned long ticks;		/* only referenced by the disabled timing code */

	scount = count;
	sks = (unsigned long)&ks[0][0];
#ifdef WIN32
#if 0
	STARTCLOCK(ticks);
#endif
	__asm {
	/* Fetch 8 bytes from user's buffer in "block" and place in ecx and edx,
	 * in big-endian order. esi is used as the pointer.
	 * BSWAP does the byte reversal; it is not available on a 386.
	 */
    mov esi, dword ptr [block];	 
	mov ecx,[esi];				 
	bswap ecx;					 
	mov edx, [esi+4];			 
	bswap edx;
	
c:	mov ebx,0;	/* u 1 cyc 1 Upper 3 bytes must be zero */

	mov edi, edx; /* v 1 cyc 1 CHANGE */
	mov esi, ecx; /* u 1 cyc 2 CHANGE */
#if 0
	mov edx, dword ptr [ks]; /* es:si = key schedule pm */  /* CHANGE */
	mov stktop,esp; /* v 1 cyc 2 */
#endif
	/* F() uses ebp as scratch; save the caller's ebp around the rounds */
	push ebp;
	xor edx, edx;            /* F() requires the upper bytes of edx to be zero */
	mov ebp, dword ptr [ks]; /* ebp = key schedule, as F() expects on entry */

	/* Do the rounds */
	F(esi,edi,0);  /*round 0  */
	F(edi,esi,8);  /*round 1  */
	F(esi,edi,16); /*round 2  */
	F(edi,esi,24); /*round 3  */
	F(esi,edi,32); /*round 4  */
	F(edi,esi,40); /*round 5  */
	F(esi,edi,48); /*round 6  */
	F(edi,esi,56); /*round 7  */
	F(esi,edi,64); /*round 8  */
	F(edi,esi,72); /*round 9  */
	F(esi,edi,80); /*round 10 */
	//mov esp, stktop;
	pop ebp;


	mov ecx,esi; /* CHANGE */
	mov edx,edi; /* CHANGE */

	dec scount;
 	jnz c;
	}
#if 0
	STOPCLOCK(ticks);
#endif
#endif /* WIN32 */
}
/*************************************/
/* First stage of the split-up cipher: load the block, apply the initial
 * permutation, and store the two permuted halves in `left` and `right`.
 *
 *   block - 8 bytes, read in big-endian order; it is not modified
 *
 * `left` and `right` are not declared in this function.
 * NOTE(review): presumably they are file-scope unsigned longs declared in
 * one of the included headers - confirm.
 *
 * The stored halves use the rotated convention described below (left rotated
 * by 3, right by 2 after the final mask step), which is what the F() round
 * macro expects.
 *
 * All of the work is inside #ifdef WIN32; otherwise the function is empty.
 */
void initial_perm(block)
	unsigned char block[8];
	/* perform the initial perm on block, and put the
	   result in left and right */
{
#ifdef WIN32
	__asm{

	/* Fetch 8 bytes from user's buffer in "block" and place in ecx and edx,
	 * in big-endian order. esi is used as the pointer.
	 * BSWAP does the byte reversal; it is not available on a 386.
	 */
    mov esi, dword ptr [block];	 
	mov ecx,[esi];				 
	bswap ecx;		/* wont work on a 386 */			 
	mov edx, [esi+4];			 
	bswap edx;
	
	/* Hoey's clever initial permutation algorithm, translated to assembler
	 * (see Schneier p 478)	
	 *
	 * The convention here is *different* from the C version. The permuted
	 * values of "left" and "right" are rotated left by two additional
	 * bits so we can avoid the two shifts that would otherwise be
	 * required in each round to convert a S-box input to a memory offset
	 * for Spboxa[].
	 */
	/* work = ((left >> 4) ^ right) & 0x0f0f0f0f */
	mov eax,ecx;
	shr eax,4;
	xor eax,edx;
	and eax,0f0f0f0fh;

	xor edx,eax;	/* right ^= work */

	/* left ^= work << 4 */
	shl eax,4;
	xor ecx,eax;

	/* work = ((left >> 16) ^ right) & 0xffff */
	mov eax,ecx;
	shr eax,16;
	xor eax,edx;
	and eax,0ffffh;

	xor edx,eax;	/* right ^= work */

	/* left ^= work << 16 */
	shl eax,16;
	xor ecx,eax;

	/* work = ((right >> 2) ^ left) & 0x33333333 */
	mov eax,edx;
	shr eax,2;
	xor eax,ecx;
	and eax,033333333h;

	/* left ^= work */
	xor ecx,eax;
	shl eax,2;

	xor edx,eax;	/* right ^= (work << 2) */

	/* work = ((right >> 8) ^ left) & 0xff00ff */
	mov eax,edx;
	shr eax,8;
	xor eax,ecx;
	and eax,0ff00ffh;

	xor ecx,eax;	/* left ^= work */

	/* right ^= (work << 8) */
	shl eax,8;
	xor edx,eax;

	rol edx,1;	/* right <<<= 1 */

	/* work = (left ^ right) & 0xaaaaaaaa */
	mov eax,ecx;
	xor eax,edx;
	and eax,0aaaaaaaah;

	xor ecx,eax;	/* left ^= work */
	xor edx,eax;	/* right ^= work */

	rol ecx,3;	/* left <<<= 3 np */
	rol edx,2;	/* right <<<= 2 np */

	/* hand the permuted halves to the round functions */
	mov left, ecx;
	mov right, edx;
	}
#endif /* WIN32 */
}

/* Last stage of the split-up cipher: take the halves from `left` and
 * `right`, apply the inverse permutation, and write the 8 result bytes to
 * block in big-endian order.
 *
 *   block - 8-byte output buffer; its previous contents are not read
 *
 * `left` and `right` are not declared in this function.
 * NOTE(review): presumably they are file-scope unsigned longs declared in
 * one of the included headers - confirm.
 *
 * All of the work is inside #ifdef WIN32; otherwise the function is empty.
 */
void final_perm(block)
unsigned char block[8];		/* Data block */
{
#ifdef WIN32
	__asm {
	mov ecx, left;
	mov edx, right;
	
	/* Inverse permutation */
	ror ecx,2;	/* left >>>= 2 */
	ror edx,3;	/* right >>>= 3 */

	/* work = (left ^ right) & 0xaaaaaaaa */
	mov eax,ecx;
	xor eax,edx;
	and eax,0aaaaaaaah;

	xor ecx,eax;	/* left ^= work */
	xor edx,eax;	/* right ^= work */
	ror ecx,1;	/* left >>>= 1 */

	/* work = ((left >> 8) ^ right) & 0xff00ff */
	mov eax,ecx;
	shr eax,8;
	xor eax,edx;
	and eax,0ff00ffh;

	xor edx,eax;	/* right ^= work */

	/* left ^= work << 8 */
	shl eax,8;
	xor ecx,eax;

	/* work = ((left >> 2) ^ right) & 0x33333333 */
	mov eax,ecx;
	shr eax,2;
	xor eax,edx;
	and eax,33333333h;

	xor edx,eax;	/* right ^= work */

	/* left ^= work << 2 */
	shl eax,2;
	xor ecx,eax;

	/* work = ((right >> 16) ^ left) & 0xffff */
	mov eax,edx;
	shr eax,16;
	xor eax,ecx;
	and eax,0ffffh;

	xor ecx,eax;	/* left ^= work */

	/* right ^= work << 16 */
	shl eax,16;
	xor edx,eax;

	/* work = ((right >> 4) ^ left) & 0x0f0f0f0f */
	mov eax,edx;
	shr eax,4;
	xor eax,ecx;
	and eax,0f0f0f0fh;

	xor ecx,eax;	/* left ^= work */

	/* right ^= work << 4 */
	shl eax,4;
	xor edx,eax;

	/* Write ecx and edx into user's buffer "block" in big-endian order
	 * after final swap (edx goes to bytes 0-3, ecx to bytes 4-7).
	 * esi is used as the pointer.
	 */
    mov esi, dword ptr [block];	 
	bswap edx;				/* this wont work on a 386 */	 
	mov [esi],edx;				 
	bswap ecx;					 
	mov [esi+4],ecx;
	}
#endif /* WIN32 */
}

/* Run DES round 0 only, on the halves held in `left` and `right`.
 *
 *   ks - key schedule; only the subkey pair at byte offset 0 is used
 *
 * Reads `left` into esi and `right` into edi, applies F(esi,edi,0), and
 * writes both registers back. `left` and `right` are not declared in this
 * function.
 * NOTE(review): presumably they are file-scope variables filled in by
 * initial_perm() - confirm.
 *
 * The key schedule address is copied into the static `sks` because F()
 * overwrites ebp and reloads it from `sks`.
 *
 * All of the asm is inside #ifdef WIN32; otherwise only `sks` is set.
 */
void do_first_round(ks)
unsigned long ks[16][2];	/* Key schedule */
{
	static unsigned long sks;
	sks = (unsigned long)&ks[0][0];

#ifdef WIN32
	__asm{
	mov ebx,0;	/* u 1 cyc 1 Upper 3 bytes must be zero */

	mov edi, right; /* v 1 cyc 1 CHANGE */
	mov esi, left; /* u 1 cyc 2 CHANGE */
	push ebp;					/* F() uses ebp as scratch; save the caller's */
	xor edx, edx;            /* F() requires the upper bytes of edx to be zero */
	mov ebp, dword ptr [sks]; /* ebp = key schedule, as F() expects on entry */

	/* Do the round */
	F(esi,edi,0);  /*round 0  */

	pop ebp;
	mov left, esi;
	mov right, edi;
	}
#endif /* WIN32 */
}

/* Run DES round 15 only, on the halves held in `left` and `right`.
 *
 *   ks - key schedule; only the subkey pair at byte offset 120 is used
 *
 * Reads `left` into esi and `right` into edi, applies F(edi,esi,120) (so
 * this round XORs into the `right` half), and writes both registers back.
 * `left` and `right` are not declared in this function.
 * NOTE(review): presumably they are the same file-scope variables used by
 * initial_perm() and final_perm() - confirm.
 *
 * The key schedule address is copied into the static `sks` because F()
 * overwrites ebp and reloads it from `sks`.
 *
 * All of the asm is inside #ifdef WIN32; otherwise only `sks` is set.
 */
void do_last_round(ks)
unsigned long ks[16][2];	/* Key schedule */
{
	static unsigned long sks;
	sks = (unsigned long)&ks[0][0];

#ifdef WIN32
	__asm{
	mov ebx,0;	/* u 1 cyc 1 Upper 3 bytes must be zero */

	mov edi, right; /* v 1 cyc 1 CHANGE */
	mov esi, left; /* u 1 cyc 2 CHANGE */
	push ebp;					/* F() uses ebp as scratch; save the caller's */
	xor edx, edx;            /* F() requires the upper bytes of edx to be zero */
	mov ebp, dword ptr [sks]; /* ebp = key schedule, as F() expects on entry */

	/* Do the round */
	F(edi,esi,120);/*round 15 */

	pop ebp;
	mov left, esi;
	mov right, edi;
	}
#endif /* WIN32 */

}

/* Run DES rounds 1 through 14 on the halves held in `left` and `right`.
 *
 *   ks - key schedule; the subkey pairs at byte offsets 8 through 112 are
 *        used
 *
 * Reads `left` into esi and `right` into edi, applies the fourteen rounds
 * (alternating which half is XORed into, starting with edi), and writes both
 * registers back. `left` and `right` are not declared in this function.
 * NOTE(review): presumably they are the same file-scope variables used by
 * do_first_round() and do_last_round() - confirm.
 *
 * The key schedule address is copied into the static `sks` because F()
 * overwrites ebp and reloads it from `sks`.
 *
 * All of the asm is inside #ifdef WIN32; otherwise only `sks` is set.
 */
void do_middle_rounds(ks)
unsigned long ks[16][2];	/* Key schedule */
{
	static unsigned long sks;
	sks = (unsigned long)&ks[0][0];

#ifdef WIN32
	__asm{
	mov ebx,0;	/* u 1 cyc 1 Upper 3 bytes must be zero */

	mov edi, right; /* v 1 cyc 1 CHANGE */
	mov esi, left; /* u 1 cyc 2 CHANGE */
	push ebp;					/* F() uses ebp as scratch; save the caller's */
	/* Do the rounds */
	xor edx, edx;            /* F() requires the upper bytes of edx to be zero */
	mov ebp, dword ptr [sks]; /* ebp = key schedule, as F() expects on entry */

	F(edi,esi,8);  /*round 1  */
	F(esi,edi,16); /*round 2  */
	F(edi,esi,24); /*round 3  */
	F(esi,edi,32); /*round 4  */
	F(edi,esi,40); /*round 5  */
	F(esi,edi,48); /*round 6  */
	F(edi,esi,56); /*round 7  */
	F(esi,edi,64); /*round 8  */
	F(edi,esi,72); /*round 9  */
	F(esi,edi,80); /*round 10 */
	F(edi,esi,88); /*round 11 */
	F(esi,edi,96); /*round 12 */
	F(edi,esi,104);/*round 13 */
	F(esi,edi,112);/*round 14 */

	pop ebp;
	mov left, esi;
	mov right, edi;
	}
#endif /* WIN32 */

}



