	.TITLE	 	Twofish for 6805
	.SUBTITLE	Author: Doug Whiting, Hi/fn, May 1998
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;	Copyright 1998, Hi/fn.  All rights reserved	
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
	.symbols
	.linklist
	.debug asm
	.spaces	on
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;	Speed/space tradeoff settings
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Set all to 1 to minimize time, 0 to minimize code size
;
; UNROLL_SUBKEY: 1 = computeSubkey expands the f32 macro twice (K32e, then K32o);
;                0 = one f32 expansion is run twice in a loop with X as key ptr
UNROLL_SUBKEY	.equ	1	;use two f32s in computeSubkey
; USE_ALPHA_TAB: 1 = f32 looks up the "alpha" table instead of expanding mulAlpha
;                (that f32 path is assembled only when USE_MDS_TAB is 0)
USE_ALPHA_TAB	.equ	0	;use 256-byte alpha lookup table
; USE_MDS_TAB:   1 = f32 looks up tab5B/tabEF for the MDS products
USE_MDS_TAB	.equ	1	;use two 256-byte lookups  (EF, 5B)
; UNROLL_ROUND:  1 = RoundXor expands its xors inline; 0 = it calls rXor
UNROLL_ROUND	.equ	1	;use unique round code
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;	Constant definitions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
BLK_SIZE	equ	16		;bytes in one cipher block (size of Text)
KEY_SIZE	equ	16		;bytes of key; K32e/K32o/SboxKey are KEY_SIZE/4 each
ROUNDS		equ	16		;encrypt loop runs until round == 8+2*ROUNDS
GF_FDBK		equ	(169h.SHR.1)	;xor'ed in by mulAlpha when its right shift carries out a 1

;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	.PAGE0		;RAM variables
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
RAM0	.var	$		;start of cipher RAM (RAM_size is measured from here)
Text	.blkb	BLK_SIZE	;encryption block goes here
; Key material is stored as two groups of KBUMP bytes each.  Sbox8 reads
; one byte at "key" (first group) and one at "key+KBUMP" (second group).
K32e	.blkb	KEY_SIZE/4	;encryption key   goes here
K32o	.blkb	KEY_SIZE/4	;(even dword,odd dword,RS dword)
SboxKey	.blkb	KEY_SIZE/4
KBUMP	.var	$-K32e		;byte distance from first group to second group
	.blkb	KBUMP		;(even dword,odd dword,RS dword)
sk0	.blkb	4		;round subkey
				;(RSremainder also uses sk0..sk0+5 as scratch)
sk1	.blkb	4		
; computeSubkey has f32 write to sk1+2..sk1+5, i.e. into t0 and t0+1,
; and then reads sk1+5; computeSubkey asserts this adjacency.
t0	.blkb	4		;t0 MUST follow sk1 directly!
t1	.blkb	4
sPtr	.blkb	1		;key pointer saved/reloaded by Sbox8 when key < 4
round	.blkb	1		;round number
tmp	.blkb	2		;scratch for f32 when USE_MDS_TAB is 0

RAM_size .var	$-RAM0		;RAM bytes used by the cipher (test variables excluded)

; variables for the test code
kTabPtr	.blkb	1		;current offset into keyTable
tmpPtr	.blkb	1		;current offset into the copy/compare destination
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	.CODE
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
	; test data structure
; Test vectors.  The contents live in vector.inc (not visible here); main
; consumes each entry as:
;	1 byte			nonzero (a zero byte ends the list)
;	KEY_SIZE bytes		key: 8 bytes copied to K32e.., 8 to K32e+KBUMP..
;	BLK_SIZE bytes		plaintext, copied to Text
;	BLK_SIZE bytes		expected ciphertext
; main indexes the table with an 8-bit offset (kTabPtr).
keyTable:
	include	vector.inc	;load vectors generated by C code

;
;----------------------------------------------------------------
; MDS matrix:
;
;      01  EF  5B  5B		; 5B = alpha**-2 + 1
;      5B  EF  EF  01		; EF = alpha**-2 + alpha**-1 + 1
;      EF  5B  01  EF
;      EF  01  EF  5B
;----------------------------------------------------------------
;
; Load src byte, run through S-box.  Return result in X
;
; Assembly-time bookkeeping only (no code is emitted here).  The macros
; below add the size of each expansion to these symbols so the totals can
; be displayed near the end of the file.
codeSize	.var $		;start address; later replaced by ($ - this value)
Sbox8ByteCnt	.var 0		;total bytes emitted by all Sbox8 expansions
Sbox8CallCnt	.var 0		;number of Sbox8 expansions
f32ByteCnt	.var 0		;total bytes emitted by all f32 expansions
f32Size		.var 0		;size of the most recent f32 expansion
swap1ByteCnt	.var 0		;total bytes emitted by all swap1 expansions
op1ByteCnt	.var 0		;total bytes emitted by all op1 expansions
rotByteCnt	.var 0		;total bytes emitted by rol4_1/ror4_1 expansions
  .ife USE_ALPHA_TAB.and.(.not.USE_MDS_TAB)
; mulAlpha: A = A shifted right one bit, xor GF_FDBK if the bit shifted
; out was 1.  f32 uses it for the alpha**-1 step when no tables are used.
; Clobbers carry.  The "$+4" skips the 2-byte bcc plus the 2-byte eor, so
; the expansion must stay exactly these three instructions.
mulAlpha .macro			;do the alpha thing inline
	lsra
	bcc	$+4
	eor	#GF_FDBK
    .endm
  .endif

; Sbox8: one keyed byte substitution,
;	result = q2[ q1[ q0[src] ^ key[KBUMP] ] ^ key[0] ]
;   src      = address of the input byte
;   q0,q1,q2 = 256-byte lookup tables (f32 passes p0/p1)
;   key      = address of the first key byte; the second is at key+KBUMP
; key < 4 --> use X as ptr to key
;   In that mode "key" is an offset 0..3 from the pointer.  key == 0 saves
;   the incoming X in sPtr; key 1..3 reload X from sPtr, so they rely on an
;   earlier key == 0 expansion having stored it.
; Result is returned in X when either table option is enabled, else in A.
; Updates Sbox8CallCnt, Sbox8Size and Sbox8ByteCnt at assembly time.
Sbox8	.macro	src,q0,q1,q2,key
tmpS8		.var $
  .if key.lt.4			;use X as key ptr?
      .if key.eq.0
	stx	sPtr		;first use: remember the key pointer
      .else
	ldx	sPtr		;later uses: X was clobbered, get it back
      .endif
	lda	key+KBUMP,X	;A = second-group key byte
	ldx	src
	eor	q0,X		;A = q0[src] ^ key[KBUMP]
	tax
	lda	q1,X
	ldx	sPtr
	eor	key,X		;A = q1[..] ^ key[0]
  .else
	ldx	src
	lda	q0,X
	eor	key+KBUMP	;A = q0[src] ^ key[KBUMP]
	tax
	lda	q1,X
	eor	key		;A = q1[..] ^ key[0]
  .endif
	tax			;X = index into q2
  .if USE_ALPHA_TAB.or.USE_MDS_TAB
	ldx	q2,X		;return result in X
  .else
	lda	q2,X		;return result in A
  .endif
Sbox8CallCnt	.var Sbox8CallCnt+1
Sbox8Size	.var $-tmpS8
Sbox8ByteCnt	.var Sbox8ByteCnt+Sbox8Size
	.endm	

; f32: run four bytes through Sbox8 and multiply the results by the MDS
; matrix, leaving the 4-byte product in dst..dst+3.
;   srcB0  = address of the first source byte
;   rotCnt = byte rotation applied to the source
;   step   = source index step; byte i is read from
;            srcB0 + ((rotCnt + step*i) AND 3)   (step 0 = same byte 4 times)
;   dst    = address of the 4-byte result
;   key    = key address passed to Sbox8 (key, key+1, key+2, key+3)
; key == 0 --> use X as ptr to key
; Each S-box output y contributes one MDS column: the bytes y (01), the
; 5B multiple of y and the EF multiple of y are xor'ed into dst..dst+3 in
; the order given by the column comments below.
; Clobbers A and X; also tmp/tmp+1 when USE_MDS_TAB is 0, and sPtr when
; key == 0.  Updates f32Size and f32ByteCnt at assembly time.
f32	.macro	srcB0,rotCnt,step,dst,key
tmpf32		.var $
	;--- 1st byte, first MDS column = 01 5B EF EF
	Sbox8	srcB0+((rotCnt+step*0).AND.3),p0,p0,p1,key
  .if USE_MDS_TAB
	stx	dst		;X = 01
	lda	tab5B,X
	sta	dst+1
	lda	tabEF,X
	sta	dst+2
	sta	dst+3
  .else
    .if USE_ALPHA_TAB
	stx	dst		;X = 01
	ldx	alpha,X
	stx	tmp+1		;tmp+1 = alpha**-1
	lda	alpha,X		;A = alpha**-2
    .else 
	sta	dst		;A = 01.  Store it
	mulAlpha		;a=a*alpha**-1
	sta	tmp+1
	mulAlpha		;A = alpha**-2
    .endif
	eor	dst		;now A = 5B
	sta	dst+1
	eor	tmp+1		;now A = EF
	sta	dst+2
	sta	dst+3
  .endif
	;--- 2nd byte, 2nd MDS column = EF EF 5B 01
	Sbox8	srcB0+((rotCnt+step*1).AND.3),p1,p0,p0,key+1
  .if USE_MDS_TAB
	lda	dst
	eor	tabEF,X
	sta	dst
	lda	dst+1
	eor	tabEF,X
	sta	dst+1
	lda	dst+2
	eor	tab5B,X
	sta	dst+2
	txa
	eor	dst+3
	sta	dst+3
  .else
    .if USE_ALPHA_TAB
	stx	tmp
	txa
	eor	dst+3
	sta	dst+3
	ldx	alpha,X
	stx	tmp+1		;tmp+1 = alpha**-1
	lda	alpha,X
    .else
	sta	tmp		;save the 01 thing
	eor	dst+3
	sta	dst+3		;xor the 01
	lda	tmp
	mulAlpha
	sta	tmp+1
	mulAlpha
    .endif
	eor	tmp		;now A = 5B
	tax			;save 5B in X
	eor	dst+2
	sta	dst+2
	txa			;A = 5B
	eor	tmp+1		;now A = EF
	tax			;save EF in X
	eor	dst+1
	sta	dst+1
	txa	
	eor	dst
	sta	dst
  .endif
	;--- 3rd byte, 3rd MDS column = 5B EF 01 EF
	Sbox8	srcB0+((rotCnt+step*2).AND.3),p0,p1,p1,key+2
  .if USE_MDS_TAB
	lda	dst
	eor	tab5B,X
	sta	dst
	lda	dst+1
	eor	tabEF,X
	sta	dst+1
	txa	
	eor	dst+2
	sta	dst+2
	lda	dst+3
	eor	tabEF,X
	sta	dst+3
  .else
    .if USE_ALPHA_TAB
	stx	tmp
	txa
	eor	dst+2
	sta	dst+2
	ldx	alpha,X
	stx	tmp+1		;tmp+1 = alpha**-1
	lda	alpha,X
    .else
	sta	tmp		;save the 01 thing
	eor	dst+2
	sta	dst+2		;xor the 01
	lda	tmp
	mulAlpha
	sta	tmp+1
	mulAlpha
    .endif
	eor	tmp		;now A = 5B
	tax			;save 5B in X
	eor	dst
	sta	dst
	txa			;A = 5B
	eor	tmp+1		;now A = EF
	tax			;save EF in X
	eor	dst+1
	sta	dst+1
	txa	
	eor	dst+3
	sta	dst+3
  .endif
	;--- 4th byte, 4th MDS column = 5B 01 EF 5B
	Sbox8	srcB0+((rotCnt+step*3).AND.3),p1,p1,p0,key+3
  .if USE_MDS_TAB
	lda	dst
	eor	tab5B,X
	sta	dst
	txa
	eor	dst+1
	sta	dst+1
	lda	dst+2
	eor	tabEF,X
	sta	dst+2
	lda	dst+3
	eor	tab5B,X
	sta	dst+3
  .else
    .if USE_ALPHA_TAB
	stx	tmp
	txa
	eor	dst+1
	sta	dst+1
	ldx	alpha,X
	stx	tmp+1		;tmp+1 = alpha**-1
	lda	alpha,X
    .else
	sta	tmp		;save the 01 thing
	eor	dst+1
	sta	dst+1		;xor the 01
	lda	tmp
	mulAlpha
	sta	tmp+1
	mulAlpha
    .endif
	eor	tmp		;now A = 5B
	tax			;save 5B in X
	eor	dst
	sta	dst
	txa			;A = 5B
	eor	dst+3
	sta	dst+3
	txa
	eor	tmp+1		;now A = EF
	eor	dst+2
	sta	dst+2
  .endif
f32Size		.var $-tmpf32
f32ByteCnt	.var f32ByteCnt+f32Size
	.endm


; op1: one-byte read-modify-write, a = a <opCode> b.
;   opCode = mnemonic applied to b with A holding a (add, adc, eor here)
;   a,b    = operand addresses
; Leaves the stored result in A.  When opCode is add/adc its carry out
; is still valid afterwards, which add4 relies on to chain bytes.
; The expansion size is accumulated in op1ByteCnt.
op1	.macro	opCode,a,b
tmpOp		.var $
	lda	a
	opCode	b
	sta	a
tmpOp		.var $-tmpOp
op1ByteCnt .var op1ByteCnt+tmpOp
	.endm

; op4: four-byte version of op1, processed from a (first) to a+3 (last).
;   opCode1 = mnemonic used for byte 0
;   opCode2 = mnemonic used for bytes 1..3 (e.g. adc after add)
; On exit A holds the byte stored at a+3.
op4	.macro	opCode1,opCode2,a,b
	op1	opCode1,a  ,b
	op1	opCode2,a+1,b+1
	op1	opCode2,a+2,b+2
	op1	opCode2,a+3,b+3
	.endm

; swap1: exchange the bytes at y and z.
; Clobbers A (ends holding the old z) and X (ends holding the old y).
; The expansion size is accumulated in swap1ByteCnt.
swap1	.macro	y,z
tmpSwap		.var	$
	ldx	y
	lda	z
	stx	z
	sta	y
tmpSwap		.var	$-tmpSwap
swap1ByteCnt .var swap1ByteCnt+tmpSwap
	.endm

; swap4: exchange the four bytes at y..y+3 with those at z..z+3, one
; swap1 per byte.  Clobbers A and X.
swap4	.macro	y,z
	swap1	y  ,z
	swap1	y+1,z+1
	swap1	y+2,z+2
	swap1	y+3,z+3
	.endm

; mov4: copy four bytes from src..src+3 to dst..dst+3, lowest address
; first.  Clobbers A.  Used by computeSubkey when UNROLL_SUBKEY is 0.
mov4	.macro	src,dst
	lda	src
	sta	dst
	lda	src+1
	sta	dst+1
	lda	src+2
	sta	dst+2
	lda	src+3
	sta	dst+3
	.endm

; add4: dword addition, a = a + b.
; Byte 0 (at a) is added with "add"; bytes 1..3 use "adc" so the carry
; ripples from a up to a+3.  On exit A holds the byte stored at a+3.
add4	.macro	a,b
	op1	add,a  ,b
	op1	adc,a+1,b+1
	op1	adc,a+2,b+2
	op1	adc,a+3,b+3
	.endm

; add4_1: same result as add4 (a = a + b), but the caller must enter with
; A already equal to the byte at b, so the first load is skipped.
add4_1	.macro	a,b		;add4, but assume A already == b[0]
	add	a		;(save one opcode, three clocks)
	sta	a		;a[0] = b[0] + a[0]
	op1	adc,a+1,b+1	;carry ripples through the upper three bytes
	op1	adc,a+2,b+2
	op1	adc,a+3,b+3
	.endm

; xor4: dword exclusive-or, a = a ^ b, done one byte at a time from a up
; to a+3.  On exit A holds the byte stored at a+3.
xor4	.macro	a,b
	op1	eor,a  ,b
	op1	eor,a+1,b+1
	op1	eor,a+2,b+2
	op1	eor,a+3,b+3
	.endm

; rol4_1: rotate the dword at a..a+3 left by one bit (a is the low byte).
; The rotate chain starts at a+1 and ends at a, so the carry is primed
; with bit 7 of a first; the last rol then shifts a itself and receives
; the bit that fell out of a+3.  Clobbers A and carry.
; The expansion size is accumulated in rotByteCnt.
rol4_1	.macro	a
tmpRot		.var $
	lda	a
	add	#80h		;set carry
	rol	a+1
	rol	a+2
	rol	a+3
	rol	a		;bit 0 = old bit 7 of a+3
tmpRot		.var $-tmpRot
rotByteCnt	.var rotByteCnt+tmpRot
	.endm

; ror4_1: rotate the dword at a..a+3 right by one bit (a is the low byte).
; The carry is primed with bit 0 of a+3 using a shift of A (a+3 itself is
; not changed yet); the chain then runs a+2, a+1, a and finally a+3, which
; receives the bit that fell out of a.  Clobbers A and carry.
; The expansion size is accumulated in rotByteCnt.
ror4_1	.macro	a
tmpRot		.var $
	lda	a+3
	lsra			;start the carry bit
	ror	a+2
	ror	a+1
	ror	a
	ror	a+3		;bit 7 = old bit 0 of a
tmpRot		.var $-tmpRot
rotByteCnt	.var rotByteCnt+tmpRot
	.endm

;----------------------------------------------------------------
; RoundXor: merge the round function output into the text block
;
;	Text+8 ..Text+11 ^= t0
;	Text+12..Text+15 ^= t1
;
; With UNROLL_ROUND set the eight byte-xors are expanded in place;
; otherwise a call to the single shared copy (rXor) is emitted.
; Clobbers A.
;
RoundXor .macro
  .if UNROLL_ROUND
	op4	eor,eor,Text+8 ,t0
	op4	eor,eor,Text+12,t1
  .else
	jsr	rXor
  .endif
.endm

  .ife UNROLL_ROUND 
; shared body, only assembled when rounds are not unrolled
rXor:	op4	eor,eor,Text+8 ,t0
	op4	eor,eor,Text+12,t1
	rts
  .endif


;----------------------------------------------------------------
; computeSubkey: compute subkey values sk0,sk1 from the key bytes
;		at K32e / K32o (and the same offsets + KBUMP)
;
;   Input:	round	=	subkey number
;   Output:	sk0,sk1	=	subkeys
;		round incremented by two
;		modifies sPtr also
;
;   With A = f32(round,K32e) and B = ROL(f32(round+1,K32o),8):
;		sk0 = A + B
;		sk1 = ROL(A + 2*B, 9)
;   The second f32 result is written to sk1+2..sk1+5, so t0 and t0+1
;   are overwritten.  Clobbers A and X.
;----------------------------------------------------------------
;
computeSubkey:
.if UNROLL_SUBKEY
	f32	round,0,0,sk0,K32e	;step 0: all four S-box inputs are "round"
	inc	round
	f32	round,0,0,sk1+2,K32o
	inc	round
.else	; just use one f32 invocation, copy result and loop
	ldx	#K32e
skLoop:	f32	round,0,0,sk1+2,0;0 as key ptr --> use X
	inc	round
	brclr	0,round,f32Done	;round even again --> both passes done
	mov4	sk1+2,sk0	;move the result to sk0
	ldx	#K32o		;use other one
	jmp	skLoop
f32Done:	
.endif
   .if (sk1+4)-t0		;assert(sk1+4 == t0)
	error	;!!
   .endif
	; B sits at sk1+2..sk1+5.  Copying its top byte down to sk1+1 makes
	; sk1+1..sk1+4 hold ROL(B,8).
	lda	sk1+5		;handle the ROL(sk1,8)
	sta	sk1+1
	add4_1	sk0,sk1+1	;do the PHT
	add4	sk1+1,sk0	;sk1+1..sk1+4 = A + 2*B; A reg = byte at sk1+4
	
	; Storing the dword one byte lower (sk1..sk1+3) with its top byte
	; wrapped around to sk1 is the rotate by 8; the shifts below add the
	; ninth bit.
;	lda	sk1+4		;handle ROL(A+2*B,9) ;; already set from add
	lsla			;set the carry bit
	rol	sk1+1		;propagate through
	rol	sk1+2
	rol	sk1+3
	adc	#0		;merge in new carry
	sta	sk1		;and store result

	rts
subkeyByteCnt	.var	$-computeSubkey

;
;----------------------------------------------------------------
; roundFunc
;
;   Input:	round	=	subkey index
;		Text	=	text to process
;   Output:	t0,t1 computed
;		round incremented by two (by computeSubkey)
;		sk0,sk1 hold the two round subkeys
;
;   roundSwap is a second entry point: it first exchanges
;   Text..Text+7 with Text+8..Text+15 and then falls into roundFunc.
;   Clobbers A and X.
;----------------------------------------------------------------
;
roundSwap:
	swap4	Text  ,Text+8
	swap4	Text+4,Text+12
roundFunc:
	; compute the round keys first (so we can use t0/t1 as tmps)
	jsr	computeSubkey
;
	f32	Text  ,0,1,t0,SboxKey	;source bytes Text+0,1,2,3
	f32	Text+4,3,1,t1,SboxKey	;source bytes Text+7,4,5,6 (rotated)
	add4	t0,t1			;do the PHT thing
	add4	t1,t0		

	add4	t0,sk0			;then add in the round subkeys
	add4	t1,sk1
	rts

;----------------------------------------------------------------
; encryptBlock:		Encryption routine
;
;   Input:	Text	=	plaintext block to encrypt
;		Key	=	key to use for encryption
;				(read from K32e/K32o and the KBUMP group)
;		SboxKey	=	RS remainder
;   Output:	Text	=	encrypted ciphertext
;
;   Subkeys 0..3 whiten the input, subkeys 8..8+2*ROUNDS-1 are used
;   two per round, and subkeys 4..7 whiten the output.
;
;   Other entry points in this routine:
;	whiten:	A = first subkey number; xors Text with four
;		consecutive subkeys and returns
;	outW:	whiten with A = 4
;   Clobbers A, X, round, sk0, sk1, t0, t1.
;----------------------------------------------------------------
;
encryptBlock:
	clra
	jsr	whiten		
	lda	#8			;set up for the round subkeys
	sta	round
	jsr	roundFunc		;skip swap the first time
	bra	eStart			;then enter the loop
eRound:	jsr	roundSwap
eStart:	rol4_1	Text+12			;x[3] = ROL(x[3],1)
	RoundXor
	ror4_1	Text+8			;x[2] = ROR(x[2],1)

	lda	round
	cmp	#8+2*ROUNDS
	bcs	eRound			;carry (borrow) set: round still below limit

	; here to do final output whitening
outW:	lda	#4
whiten:	sta	round
	jsr	computeSubkey		;subkeys round, round+1
	xor4	Text  ,sk0
	xor4	Text+4,sk1
	jsr	computeSubkey		;subkeys round+2, round+3
	xor4	Text+8,sk0
	xor4	Text+12,sk1
	rts
;
;----------------------------------------------------------------
; decryptBlock:		Decryption routine
;
;   Input:	Text	=	ciphertext block to decrypt
;		Key	=	key to use for decryption
;				(read from K32e/K32o and the KBUMP group)
;		SboxKey	=	RS remainder
;   Output:	Text	=	decrypted plaintext
;
;   Mirrors encryptBlock: the round subkey pairs are used in descending
;   order and the rotates are applied to the opposite words.
;   Clobbers A, X, round, sk0, sk1, t0, t1.
;----------------------------------------------------------------
;
decryptBlock:
	bsr	outW			;undo output whitening
	lda	#8+2*ROUNDS-2		;set up for the round subkeys
	sta	round
	jsr	roundFunc		;skip swap the first time
	bra	dStart			;then enter the loop
dRound:	jsr	roundSwap
dStart:	rol4_1	Text+8			;x[2]=ROL(x[2],1)
	RoundXor
	ror4_1	Text+12			;x[3]=ROR(x[3],1)
	lda	round
	sub	#4			;back up (undo double increment)
	sta	round
	cmp	#8
	bcc	dRound			;carry clear: round >= 8, more rounds left
	; here to undo input whitening as final decryption step
	clra
	jmp	whiten			;whiten's rts returns to our caller
;
;
;----------------------------------------------------------------
; RSremainder:		Compute remainder
;
;   Input:	X	=	ptr to 8 bytes of key 
;   Output:	remainder computed
;		(4 bytes, written at offsets 8..11 from the input ptr)
;
;   The key bytes are consumed from offset 7 down to offset 0.
;   Uses sk0..sk0+5 as scratch:
;	sk0	remainder ptr		sk0+3	feedback byte
;	sk0+1	loop counter		sk0+4	feedback * alpha
;	sk0+2	data ptr		sk0+5	feedback * (alpha + 1/alpha)
;   Clobbers A; X is not preserved (returns as input ptr - 8).
;----------------------------------------------------------------
; Reed-Solomon code parameters: (12,8) reversible code
;	g(x) = x**4 + (a + 1/a) x**3 + a x**2 + (a + 1/a) x + 1
;   where a = primitive root of field generator 0x14D
;
RS_FDBK	.var	14Dh
RSremainder:
	stx	sk0		;use sk0 as temp location
	clr	8,X		;remainder starts out as zero
	clr	9,X	
	clr	10,X	
	clr	11,X	
	lda	#8
	sta	sk0+1		;use this as a counter
RSloop:	lda	7,X		;next key byte
	decx
	stx	sk0+2		;store ptr to next data byte
	ldx	sk0		;get back remainder ptr
	eor	11,X		;feedback = data ^ top remainder byte
; new code
	sta	sk0+3
	asla			;multiply by alpha
	bcc	noXor1	
	eor	#(RS_FDBK.AND.0FFH)
noXor1:	sta	sk0+4		;save alpha term
	lda	sk0+3
	lsra			;multiply by 1/alpha
	bcc	noXor2
	eor	#(RS_FDBK.SHR.1)
noXor2:	eor	sk0+4		;this is the alpha + 1/alpha term
	sta	sk0+5
	eor	10,X		;feedback and shift all at once
	sta	11,X
	lda	sk0+4
	eor	9,X
	sta	10,X
	lda	sk0+5
	eor	8,X
	sta	9,X
	lda	sk0+3
	sta	8,X
; end of new code
	ldx	sk0+2		;get back the pointer
	dec	sk0+1
	bne	RSloop
	rts

;----------------------------------------------------------------
;	Twofish lookup tables
;	(contents come from table.inc, which is not visible here)
tableStart	.var	$
	include	table.inc
	; display sizes (allow for easy grep)
	; Each symbol below is assigned to itself (or to a final sum) on a
	; line tagged ";_size_" so that the tag can be searched for.
RAM_size	.var RAM_size		;_size_
Sbox8Size	.var Sbox8Size		;_size_
Sbox8CallCnt	.var Sbox8CallCnt	;_size_
Sbox8ByteCnt	.var Sbox8ByteCnt	;_size_
subkeyByteCnt	.var subkeyByteCnt	;_size_
f32Size		.var f32Size		;_size_
tableSize	.var $-tableStart	;_size_
f32ByteCnt	.var f32ByteCnt		;_size_
swap1ByteCnt	.var swap1ByteCnt	;_size_
op1ByteCnt	.var op1ByteCnt		;_size_
rotByteCnt	.var rotByteCnt		;_size_
_sum_		.var tableSize+f32ByteCnt+swap1ByteCnt+op1ByteCnt+rotByteCnt
sumByteCnt	.var _sum_		;_size_
	; codeSize was set to the address just after keyTable, so this is the
	; byte count from there to here (test code below is not included).
codeSize	.var $-codeSize		;_size_
;
;----------------------------------------------------------------
;
; main entry point for test code
;
; For each keyTable entry: load key and plaintext, compute both RS
; remainders, encrypt and compare with the expected ciphertext, then
; decrypt and compare with the original plaintext.  A zero byte where
; an entry should start ends the list and the whole test restarts.
; A mismatch parks the CPU at badEnc or badDec.  The bp1..bp4 and
; done labels mark nop instructions (apparently breakpoint anchors).
;
;----------------------------------------------------------------
;
main:	clr	kTabPtr			;point into keyTable
	
encLoop:ldx	kTabPtr
	lda	keyTable,x
	bne	doTest			;is this list done?
done:	nop
	bra	main			;do it over again!
doTest:	inx				;bump past key length
	stx	kTabPtr			;copy over the test code
	; copy over the next example
	clr	tmpPtr
	; The first KEY_SIZE/2 bytes land at K32e.., then the destination
	; offset jumps over SboxKey so the rest lands at K32e+KBUMP..
copyKey:ldx	kTabPtr			;copy key, split into even/odd
	lda	keyTable,x
	inx
	stx	kTabPtr
	ldx	tmpPtr
	sta	K32e,x
	inx
	stx	tmpPtr
	cpx	#KEY_SIZE/2		;time to skip SboxKey?
	bne	noSkipK
	txa
	add	#KEY_SIZE/4
	sta	tmpPtr

noSkipK:cpx	#KEY_SIZE+KEY_SIZE/4	;all KEY_SIZE bytes stored?
	bne	copyKey
	clr	tmpPtr
copyBlk:ldx	kTabPtr			;copy plaintext into Text
	lda	keyTable,x
	inx
	stx	kTabPtr
	ldx	tmpPtr
	sta	Text,x
	inx
	stx	tmpPtr
	cpx	#BLK_SIZE
	bne	copyBlk
	; compute remainder
	; (one per key group; each is written 8 bytes past its input ptr,
	; i.e. into SboxKey and SboxKey+KBUMP, and the two are then exchanged)
	ldx	#K32e
	jsr	RSremainder
	ldx	#K32e+KBUMP
	jsr	RSremainder
	swap4	SboxKey,SboxKey+KBUMP

copied:	jsr	encryptBlock
	; should compare results here
	clr	tmpPtr
bp4:	nop
cmpLp:	ldx	kTabPtr			;compare cipherText
	lda	keyTable,X
	inx	
	stx	kTabPtr			;store bumped ptr
	ldx	tmpPtr
	cmp	Text,X
	bne	badEnc
	inx	
	stx	tmpPtr
	cpx	#BLK_SIZE
	bne	cmpLp
bp1:	nop				;compare ok!
	jsr	decryptBlock	

	lda	kTabPtr
	sub	#BLK_SIZE*2
	sta	kTabPtr			;back up
	clr	tmpPtr
cmpLp2:	ldx	kTabPtr			;compare plainText
	lda	keyTable,X
	inx	
	stx	kTabPtr			;store bumped ptr
	ldx	tmpPtr
	cmp	Text,X
	bne	badDec
	inx	
	stx	tmpPtr
	cpx	#BLK_SIZE
	bne	cmpLp2
bp3:	nop
	lda	kTabPtr
	add	#BLK_SIZE		;skip the ciphertext to reach the next entry
	sta	kTabPtr
	jmp	encLoop
bp2:	nop				;never reached by fall-through (follows jmp)

badDec:	bra	$			;hang here: decrypt mismatch
badEnc:	bra	$			;hang here: encrypt mismatch
	
	.end	main
