/* 386 assembler version of triple DES in GNU 'as' format
   index register loads and uses are interleaved to avoid address
   generation interlocks (AGIs) as much as possible
 */

/* F(l,r,key): one DES round, computing  l ^= f(r, subkey).
 *
 *   l, r   32-bit registers holding the two half-blocks (this file always
 *          passes %edi and %edx, in alternating order). Only l is modified.
 *   key    byte offset from %esi of the 8-byte subkey used by this round.
 *
 * Requirements on entry:
 *   %esi   base address of the key schedule.
 *   %ebx,  upper three bytes must be zero. Only %bl and %cl are written
 *   %ecx   here, but the full registers are used as table offsets.
 *
 * Clobbers %eax, %bl, %cl and the flags.
 *
 * The work is done in two similar halves:
 *   1. eax = (r rotated right by 4) ^ subkey word 0, masked with 0xfcfcfcfc.
 *      Bytes 0,1,2,3 of the result select entries in the tables at
 *      _Spboxa+6*256, +4*256, +2*256 and +0 respectively.
 *   2. eax = r ^ subkey word 1, masked with 0xfcfcfcfc.
 *      Bytes 0,1,2,3 select entries in the tables at
 *      _Spboxa+7*256, +5*256, +3*256 and +256 respectively.
 * All eight table entries are XORed into l.
 *
 * The 0xfcfcfcfc mask clears the low two bits of every byte, so each byte
 * is used directly as a byte offset to a 32-bit table entry with no
 * further shifting. The two upper bytes of eax are reached by rotating
 * eax by 16 and reading %al/%ah again.
 *
 * Instruction order is deliberate: each load of %bl/%cl is separated from
 * the lookup that uses it, and the second subkey word is fetched before
 * the last lookup of the first half, to avoid address generation
 * interlocks. Do not reorder without re-checking the schedule.
 */
#define	F(l,r,key)\
	movl r,%eax;\
	rorl $4,%eax;\
	xorl key(%esi),%eax;\
	andl $0xfcfcfcfc,%eax;\
	movb %al,%bl;\
	movb %ah,%cl;\
	rorl $16,%eax;\
	xorl _Spboxa+6*256(%ebx),l;\
	movb %al,%bl;\
	xorl _Spboxa+4*256(%ecx),l;\
	movb %ah,%cl;\
	xorl _Spboxa+2*256(%ebx),l;\
	movl 4+key(%esi),%eax;\
	xorl _Spboxa(%ecx),l;\
	xorl r,%eax;\
	andl $0xfcfcfcfc,%eax;\
	movb %al,%bl;\
	movb %ah,%cl;\
	rorl $16,%eax;\
	xorl _Spboxa+7*256(%ebx),l;\
	movb %al,%bl;\
	xorl _Spboxa+5*256(%ecx),l;\
	movb %ah,%cl;\
	xorl _Spboxa+3*256(%ebx),l;\
	xorl _Spboxa+256(%ecx),l

/* _Asmversion: exported 32-bit word in .data, initialised to 1.
 * It tells deskey() that it is linked with the assembler version.
 *
 * .p2align is used instead of .align because the meaning of a bare
 * .align argument depends on the target object format (byte count on
 * some, power of two on others). .p2align 2 always means a 4-byte
 * boundary, which is what a .long needs.
 */

.globl _Asmversion
.data
	.p2align 2
_Asmversion:
	.long 1

/* _des3: run a 48-round (3 x 16) DES transform on one 8-byte block,
 * in place.
 *
 * Arguments (on the stack, read through the %ebp frame):
 *    8(%ebp)  pointer to the key schedule. 48 subkeys of 8 bytes each
 *             are read, at byte offsets 0 through 383.
 *   12(%ebp)  pointer to the 8-byte block. It is read as two big-endian
 *             32-bit words and overwritten with the result.
 *
 * Whether the call encrypts or decrypts is decided entirely by the
 * contents of the key schedule. This routine only applies the subkeys
 * in the order they are stored.
 *
 * Register use:
 *   %edi  left half        %edx  right half
 *   %esi  block pointer, then key schedule pointer during the rounds
 *   %eax  scratch          %ebx, %ecx  table indexes for F()
 *
 * %esi, %edi, %ebx and %ebp are saved and restored. %eax, %ecx, %edx and
 * the flags are clobbered. No return value is set up.
 *
 * External data: the _Spboxa lookup tables referenced by F().
 */
.text
	.p2align 2		/* 4-byte boundary on every target (bare .align is target-dependent) */
.globl _des3
_des3:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %ebx
	/* 8(%ebp) is key schedule pointer, 12(%ebp) is buffer pointer */

	/* Fetch 8 bytes from user's buffer in block and place in edi and edx,
	 * in big-endian order. Uses esi.
	 * There's a very nice BSWAP instruction that executes in only
	 * 1 cycle, but it is only available on the 486. :-(
	 * So each swap is done as: exchange the low two bytes, rotate by 16,
	 * exchange the (new) low two bytes.
	 */
	movl 12(%ebp),%esi	/* esi = block */
	movl (%esi),%eax	/* eax = ((long *)block)[0] */
	xchgb %al,%ah		/* bswap eax */
	roll $16,%eax
	xchgb %al,%ah
	movl %eax,%edi		/* edi = left */
	movl 4(%esi),%edx	/* edx = ((long *)block)[1] */
	xchgb %dl,%dh		/* bswap edx */
	roll $16,%edx
	xchgb %dl,%dh		/* edx = right */

	/* Hoey's clever initial permutation algorithm, translated to assembler
	 * (see Schneier p 478)
	 *
	 * The convention here is *different* from the C version. The permuted
	 * values of left and right are rotated left by two additional
	 * bits so we can avoid the two shifts that would otherwise be
	 * required in each round to convert a S-box input to a memory offset
	 * for Spbox[].
	 */
	/* work = ((left >> 4) ^ right) & 0x0f0f0f0f */
	movl %edi,%eax
	shrl $4,%eax
	xorl %edx,%eax
	andl $0x0f0f0f0f,%eax

	xorl %eax,%edx		/* right ^= work */

	/* left ^= work << 4 */
	shll $4,%eax
	xorl %eax,%edi

	/* work = ((left >> 16) ^ right) & 0x0000ffff */
	movl %edi,%eax
	shrl $16,%eax
	xorl %edx,%eax
	andl $0xffff,%eax

	xorl %eax,%edx		/* right ^= work */

	/* left ^= work << 16 */
	shll $16,%eax
	xorl %eax,%edi

	/* work = ((right >> 2) ^ left) & 0x33333333 */
	movl %edx,%eax
	shrl $2,%eax
	xorl %edi,%eax
	andl $0x33333333,%eax

	/* left ^= work */
	xorl %eax,%edi
	shll $2,%eax

	xorl %eax,%edx		/* right ^= (work << 2) */

	/* work = ((right >> 8) ^ left) & 0x00ff00ff */
	movl %edx,%eax
	shrl $8,%eax
	xorl %edi,%eax
	andl $0x00ff00ff,%eax

	xorl %eax,%edi		/* left ^= work */

	/* right ^= (work << 8) */
	shll $8,%eax
	xorl %eax,%edx

	roll $1,%edx		/* right <<<= 1 */

	/* work = (left ^ right) & 0xaaaaaaaa */
	movl %edi,%eax
	xorl %edx,%eax
	andl $0xaaaaaaaa,%eax

	xorl %eax,%edi		/* left ^= work */
	xorl %eax,%edx		/* right ^= work */

	roll $3,%edi		/* left <<<= 3 */
	roll $2,%edx		/* right <<<= 2 */

	/* Set up for the rounds */
	movl 8(%ebp),%esi	/* esi = key schedule */
	xorl %ebx,%ebx		/* Upper 3 bytes of indexes must be zero */
	xorl %ecx,%ecx

	/* Do the rounds.
	 * Each F(l,r,key) does l ^= f(r, subkey at key(%esi)); the two
	 * halves swap roles from one round to the next by swapping the
	 * macro arguments rather than by moving data.
	 */

	/* First pass: subkeys 0-15 (offsets 0-120) */
	F(%edi,%edx,0)
	F(%edx,%edi,8)
	F(%edi,%edx,16)
	F(%edx,%edi,24)
	F(%edi,%edx,32)
	F(%edx,%edi,40)
	F(%edi,%edx,48)
	F(%edx,%edi,56)
	F(%edi,%edx,64)
	F(%edx,%edi,72)
	F(%edi,%edx,80)
	F(%edx,%edi,88)
	F(%edi,%edx,96)
	F(%edx,%edi,104)
	F(%edi,%edx,112)
	F(%edx,%edi,120)

	/* Second pass: subkeys 16-31 (offsets 128-248).
	 * No permutation is done between passes. Instead the register
	 * roles are exchanged here (this pass starts with edx as the
	 * target), which stands in for the half swap at the end of the
	 * previous pass.
	 */
	F(%edx,%edi,128)
	F(%edi,%edx,136)
	F(%edx,%edi,144)
	F(%edi,%edx,152)
	F(%edx,%edi,160)
	F(%edi,%edx,168)
	F(%edx,%edi,176)
	F(%edi,%edx,184)
	F(%edx,%edi,192)
	F(%edi,%edx,200)
	F(%edx,%edi,208)
	F(%edi,%edx,216)
	F(%edx,%edi,224)
	F(%edi,%edx,232)
	F(%edx,%edi,240)
	F(%edi,%edx,248)

	/* Third pass: subkeys 32-47 (offsets 256-376); roles exchanged
	 * again, so this pass starts with edi as the target.
	 */
	F(%edi,%edx,256)
	F(%edx,%edi,264)
	F(%edi,%edx,272)
	F(%edx,%edi,280)
	F(%edi,%edx,288)
	F(%edx,%edi,296)
	F(%edi,%edx,304)
	F(%edx,%edi,312)
	F(%edi,%edx,320)
	F(%edx,%edi,328)
	F(%edi,%edx,336)
	F(%edx,%edi,344)
	F(%edi,%edx,352)
	F(%edx,%edi,360)
	F(%edi,%edx,368)
	F(%edx,%edi,376)

	/* Inverse permutation.
	 * The steps of the initial permutation are undone in reverse order
	 * with edi and edx exchanging roles; the halves are then stored in
	 * swapped order below.
	 */
	rorl $2,%edi	/* left >>>= 2 */
	rorl $3,%edx	/* right >>>= 3 */

	/* work = (left ^ right) & 0xaaaaaaaa */
	movl %edi,%eax
	xorl %edx,%eax
	andl $0xaaaaaaaa,%eax

	xorl %eax,%edi	/* left ^= work */
	xorl %eax,%edx	/* right ^= work */
	rorl $1,%edi	/* left >>>= 1 */

	/* work = ((left >> 8) ^ right) & 0x00ff00ff */
	movl %edi,%eax
	shrl $8,%eax
	xorl %edx,%eax
	andl $0x00ff00ff,%eax

	xorl %eax,%edx	/* right ^= work */

	/* left ^= work << 8 */
	shll $8,%eax
	xorl %eax,%edi

	/* work = ((left >> 2) ^ right) & 0x33333333 */
	movl %edi,%eax
	shrl $2,%eax
	xorl %edx,%eax
	andl $0x33333333,%eax

	xorl %eax,%edx	/* right ^= work */

	/* left ^= work << 2 */
	shll $2,%eax
	xorl %eax,%edi

	/* work = ((right >> 16) ^ left) & 0x0000ffff */
	movl %edx,%eax
	shrl $16,%eax
	xorl %edi,%eax
	andl $0xffff,%eax

	xorl %eax,%edi	/* left ^= work */

	/* right ^= work << 16 */
	shll $16,%eax
	xorl %eax,%edx

	/* work = ((right >> 4) ^ left) & 0x0f0f0f0f */
	movl %edx,%eax
	shrl $4,%eax
	xorl %edi,%eax
	andl $0x0f0f0f0f,%eax

	xorl %eax,%edi	/* left ^= work */

	/* right ^= work << 4 */
	shll $4,%eax
	xorl %eax,%edx

	/* Write edi and edx into user's buffer block in big-endian order
	 * after final swap: edx goes to the first word, edi to the second.
	 */
	movl 12(%ebp),%esi	/* esi = block (esi held the key schedule) */

	xchgb %dl,%dh		/* bswap edx */
	roll $16,%edx
	xchgb %dl,%dh
	movl %edx,(%esi)

	movl %edi,%eax		/* swap via eax: edi has no byte registers */
	xchgb %al,%ah		/* bswap eax */
	roll $16,%eax
	xchgb %al,%ah
	movl %eax,4(%esi)

	/* Restore callee-saved registers in reverse push order */
	popl %ebx
	popl %edi
	popl %esi
	leave
	ret
