[linux.git] / arch / i386 / crypto / aes-i586-asm.S

// -------------------------------------------------------------------------
// Copyright (c) 2001, Dr Brian Gladman <                 >, Worcester, UK.
// All rights reserved.
//
// LICENSE TERMS
//
// The free distribution and use of this software in both source and binary 
// form is allowed (with or without changes) provided that:
//
//   1. distributions of this source code include the above copyright 
//      notice, this list of conditions and the following disclaimer//
//
//   2. distributions in binary form include the above copyright
//      notice, this list of conditions and the following disclaimer
//      in the documentation and/or other associated materials//
//
//   3. the copyright holder's name is not used to endorse products 
//      built using this software without specific written permission.
//
//
// ALTERNATIVELY, provided that this notice is retained in full, this product
// may be distributed under the terms of the GNU General Public License (GPL),
// in which case the provisions of the GPL apply INSTEAD OF those given above.
//
// Copyright (c) 2004 Linus Torvalds <[email protected]>
// Copyright (c) 2004 Red Hat, Inc., James Morris <[email protected]>

// DISCLAIMER
//
// This software is provided 'as is' with no explicit or implied warranties
// in respect of its properties including, but not limited to, correctness 
// and fitness for purpose.
// -------------------------------------------------------------------------
// Issue Date: 29/07/2002

.file "aes-i586-asm.S"
.text

// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);
// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);
	
#define tlen 1024   // byte length of each of 4 'xor' arrays (256 32-bit words)

// Byte offsets from %esp to the stack-passed parameters, valid when exactly
// one register has been pushed (saved register at 0, return address at 4).
// Code that has pushed more registers adds 4 per extra push: the routines
// below use in_blk+4 after two pushes and out_blk+12 after four.

#define in_blk    8  // input byte array address parameter
#define out_blk  12  // output byte array address parameter
#define ctx      16  // AES context structure address parameter

// byte offsets of the fields used here within the context structure

#define ekey     0   // start of the encryption key schedule
#define nrnd   256   // number of rounds (read as a 32-bit word)
#define dkey   260   // start of the decryption key schedule

// register mapping for encrypt and decrypt subroutines
// Only r0..r3 (eax, ebx, ecx, edx) have the byte sub-register names
// defined below, so only they can be given to l() and h(); r4 and r5
// (esi, edi) cannot.

#define r0  eax
#define r1  ebx
#define r2  ecx
#define r3  edx
#define r4  esi
#define r5  edi

// <reg>l names the byte in bits 0-7 of <reg>, <reg>h the byte in bits 8-15

#define eaxl  al
#define eaxh  ah
#define ebxl  bl
#define ebxh  bh
#define ecxl  cl
#define ecxh  ch
#define edxl  dl
#define edxh  dh

// h(reg) and l(reg) expand to the high / low byte sub-register name of reg.
// The extra _h/_l level lets the preprocessor expand an argument that is
// itself a macro (e.g. r0 -> eax) before ## pastes the suffix on, so that
// the pasted name (e.g. eaxl) is one of the defines above.

#define _h(reg) reg##h
#define h(reg) _h(reg)

#define _l(reg) reg##l
#define l(reg) _l(reg)

// This macro takes a 32-bit word representing a column and uses
// each of its four bytes to index into four tables of 256 32-bit
// words to obtain values that are then xored into the four output
// registers a1..a4.

// Parameters:
// table base address of the first table; the four tables are tlen bytes apart
//   a1  register xored with the entry of table         selected by bits  0-7  of idx
//   a2  register xored with the entry of table+tlen    selected by bits  8-15 of idx
//   a3  register xored with the entry of table+2*tlen  selected by bits 16-23 of idx
//   a4  register xored with the entry of table+3*tlen  selected by bits 24-31 of idx
//   idx input register for the round (destroyed: it ends up holding its
//       own former top byte); it is passed to l() and h(), so it must be
//       one of eax, ebx, ecx, edx
//   tmp scratch register for the round

#define do_col(table, a1,a2,a3,a4, idx, tmp)	\
	movzx   %l(idx),%tmp;			\
	xor     table(,%tmp,4),%a1;		\
	movzx   %h(idx),%tmp;			\
	shr     $16,%idx;			\
	xor     table+tlen(,%tmp,4),%a2;	\
	movzx   %l(idx),%tmp;			\
	movzx   %h(idx),%idx;			\
	xor     table+2*tlen(,%tmp,4),%a3;	\
	xor     table+3*tlen(,%idx,4),%a4;

// initialise output registers from the key schedule, then xor in the
// table entries selected by the four bytes of idx (used by the forward
// round macros for the first column of a round):
//   a1 = word at  0 sched  ^  entry of table         for bits  0-7  of idx
//   a2 = word at 12 sched  ^  entry of table+tlen    for bits  8-15 of idx
//   a3 = word at  8 sched  ^  entry of table+2*tlen  for bits 16-23 of idx
//   a4 = word at  4 sched  ^  entry of table+3*tlen  for bits 24-31 of idx
// sched is pasted textually straight after the numeric offset, so it has
// to be a memory operand of the form (%reg), +N(%reg) or -N(%reg).
// idx must be one of eax, ebx, ecx, edx (it is passed to l() and h());
// tmp is a scratch register.
// NB1: original value of a3 is in idx on exit
// NB2: original values of a1,a2,a4 aren't used
#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
	mov     0 sched,%a1;			\
	movzx   %l(idx),%tmp;			\
	mov     12 sched,%a2;			\
	xor     table(,%tmp,4),%a1;		\
	mov     4 sched,%a4;			\
	movzx   %h(idx),%tmp;			\
	shr     $16,%idx;			\
	xor     table+tlen(,%tmp,4),%a2;	\
	movzx   %l(idx),%tmp;			\
	movzx   %h(idx),%idx;			\
	xor     table+3*tlen(,%idx,4),%a4;	\
	mov     %a3,%idx;			\
	mov     8 sched,%a3;			\
	xor     table+2*tlen(,%tmp,4),%a3;

// initialise output registers from the key schedule, then xor in the
// table entries selected by the four bytes of idx (used by the inverse
// round macros for the first column of a round):
//   a1 = word at  0 sched  ^  entry of table         for bits  0-7  of idx
//   a2 = word at  4 sched  ^  entry of table+tlen    for bits  8-15 of idx
//   a3 = word at  8 sched  ^  entry of table+2*tlen  for bits 16-23 of idx
//   a4 = word at 12 sched  ^  entry of table+3*tlen  for bits 24-31 of idx
// sched is pasted textually straight after the numeric offset, so it has
// to be a memory operand of the form (%reg), +N(%reg) or -N(%reg).
// idx must be one of eax, ebx, ecx, edx (it is passed to l() and h());
// tmp is a scratch register.
// NB1: original value of a3 is in idx on exit
// NB2: original values of a1,a2,a4 aren't used
#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
	mov     0 sched,%a1;			\
	movzx   %l(idx),%tmp;			\
	mov     4 sched,%a2;			\
	xor     table(,%tmp,4),%a1;		\
	mov     12 sched,%a4;			\
	movzx   %h(idx),%tmp;			\
	shr     $16,%idx;			\
	xor     table+tlen(,%tmp,4),%a2;	\
	movzx   %l(idx),%tmp;			\
	movzx   %h(idx),%idx;			\
	xor     table+3*tlen(,%idx,4),%a4;	\
	mov     %a3,%idx;			\
	mov     8 sched,%a3;			\
	xor     table+2*tlen(,%tmp,4),%a3;


// original Gladman had conditional saves to MMX regs.
// Here a value is parked in a 32-bit stack slot at 4*slot(%esp); the
// slots must already have been reserved below the stack pointer.
// NOTE: the two macros take their arguments in opposite orders.

// save(a1, a2): store register a2 into stack slot number a1
#define save(a1, a2)		\
	mov     %a2,4*a1(%esp)

// restore(a1, a2): load register a1 from stack slot number a2
#define restore(a1, a2)		\
	mov     4*a2(%esp),%a1

// These macros perform a forward encryption cycle. Each macro is one
// round: it is entered with the previous round column values in four
// registers and exits with the new values, using stack slots 0 and 1
// for temporary storage and r3 as scratch. The two variants differ only
// in the register holding the first column: fwd_rnd1 takes it in r0 and
// leaves it in r2, fwd_rnd2 takes it in r2 and leaves it in r0, so a
// fwd_rnd1/fwd_rnd2 pair ends with the columns back in r0,r1,r4,r5.
//
// arg    memory operand addressing the 16-byte round key for this round
// table  base address of the four lookup tables to use
//
// The idx=rN notes name the entry register whose value is being
// consumed through the index register at that step.

// round column values
// on entry: r0,r1,r4,r5
// on exit:  r2,r1,r4,r5
#define fwd_rnd1(arg, table)						\
	save   (0,r1);							\
	save   (1,r5);							\
									\
	/* compute new column values */					\
	do_fcol(table, r2,r5,r4,r1, r0,r3, arg);	/* idx=r0 */	\
	do_col (table, r4,r1,r2,r5, r0,r3);		/* idx=r4 */	\
	restore(r0,0);							\
	do_col (table, r1,r2,r5,r4, r0,r3);		/* idx=r1 */	\
	restore(r0,1);							\
	do_col (table, r5,r4,r1,r2, r0,r3);		/* idx=r5 */

// One forward round with the first column entering in r2 and leaving in
// r0. r1 and r5 are parked in stack slots 0 and 1 because they are
// overwritten with new column values before their old values are
// consumed; r3 is scratch and r2 is destroyed.
//
// arg    memory operand addressing the 16-byte round key for this round
// table  base address of the four lookup tables to use
//
// round column values
// on entry: r2,r1,r4,r5
// on exit:  r0,r1,r4,r5
#define fwd_rnd2(arg, table)						\
	save   (0,r1);							\
	save   (1,r5);							\
									\
	/* compute new column values */					\
	do_fcol(table, r0,r5,r4,r1, r2,r3, arg);	/* idx=r2 */	\
	do_col (table, r4,r1,r0,r5, r2,r3);		/* idx=r4 */	\
	restore(r2,0);							\
	do_col (table, r1,r0,r5,r4, r2,r3);		/* idx=r1 */	\
	restore(r2,1);							\
	do_col (table, r5,r4,r1,r0, r2,r3);		/* idx=r5 */

// These macros perform an inverse (decryption) cycle. Each macro is one
// round: it is entered with the previous round column values in four
// registers and exits with the new values, using stack slots 0 and 1
// for temporary storage and r3 as scratch. The two variants differ only
// in the register holding the first column: inv_rnd1 takes it in r0 and
// leaves it in r2, inv_rnd2 takes it in r2 and leaves it in r0, so an
// inv_rnd1/inv_rnd2 pair ends with the columns back in r0,r1,r4,r5.
//
// arg    memory operand addressing the 16-byte round key for this round
// table  base address of the four lookup tables to use
//
// The idx=rN notes name the entry register whose value is being
// consumed through the index register at that step.

// round column values
// on entry: r0,r1,r4,r5
// on exit:  r2,r1,r4,r5
#define inv_rnd1(arg, table)						\
	save    (0,r1);							\
	save    (1,r5);							\
									\
	/* compute new column values */					\
	do_icol(table, r2,r1,r4,r5, r0,r3, arg);	/* idx=r0 */	\
	do_col (table, r4,r5,r2,r1, r0,r3);		/* idx=r4 */	\
	restore(r0,0);							\
	do_col (table, r1,r4,r5,r2, r0,r3);		/* idx=r1 */	\
	restore(r0,1);							\
	do_col (table, r5,r2,r1,r4, r0,r3);		/* idx=r5 */

// One inverse round with the first column entering in r2 and leaving in
// r0. r1 and r5 are parked in stack slots 0 and 1 because they are
// overwritten with new column values before their old values are
// consumed; r3 is scratch and r2 is destroyed.
//
// arg    memory operand addressing the 16-byte round key for this round
// table  base address of the four lookup tables to use
//
// round column values
// on entry: r2,r1,r4,r5
// on exit:  r0,r1,r4,r5
#define inv_rnd2(arg, table)						\
	save    (0,r1);							\
	save    (1,r5);							\
									\
	/* compute new column values */					\
	do_icol(table, r0,r1,r4,r5, r2,r3, arg);	/* idx=r2 */	\
	do_col (table, r4,r5,r0,r1, r2,r3);		/* idx=r4 */	\
	restore(r2,0);							\
	do_col (table, r1,r4,r5,r0, r2,r3);		/* idx=r1 */	\
	restore(r2,1);							\
	do_col (table, r5,r0,r1,r4, r2,r3);		/* idx=r5 */

// AES (Rijndael) Encryption Subroutine
//
// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])
//
// All three arguments are read from the stack. The 16 bytes at in_blk are
// loaded as four 32-bit words, combined with the round keys found at
// offset ekey of the context, and the four result words are stored at
// out_blk. The round count is read from offset nrnd of the context.
// Returns 1 in %eax. %ebp, %ebx, %esi and %edi are saved and restored;
// %ecx and %edx are overwritten.

.global  aes_enc_blk

.extern  ft_tab
.extern  fl_tab

.align 4

aes_enc_blk:
	push    %ebp
	mov     ctx(%esp),%ebp      // pointer to context

// CAUTION: the order and the values used in these assigns
// rely on the register mappings

1:	push    %ebx
	mov     in_blk+4(%esp),%r2  // +4 because two registers are now pushed
	push    %esi
	mov     nrnd(%ebp),%r3   // number of rounds
	push    %edi
#if ekey != 0
	lea     ekey(%ebp),%ebp  // key pointer
#endif

// input four columns and xor in first round key

	mov     (%r2),%r0
	mov     4(%r2),%r1
	mov     8(%r2),%r4
	mov     12(%r2),%r5
	xor     (%ebp),%r0
	xor     4(%ebp),%r1
	xor     8(%ebp),%r4
	xor     12(%ebp),%r5

// Choose the entry point into the unrolled rounds. %ebp is advanced by a
// further 32 bytes for each extra pair of rounds, so that the round-key
// displacements of the last ten rounds are the same on every path. Any
// round count other than 10 or 12 falls through to the 14-round path.

	sub     $8,%esp           // two 4-byte slots for save/restore
	add     $16,%ebp          // increment to next round key
	sub     $10,%r3          
	je      4f              // 10 rounds for 128-bit key
	add     $32,%ebp
	sub     $2,%r3
	je      3f              // 12 rounds for 192-bit key
	add     $32,%ebp

// r3 doubles as the scratch register of the round macros from here on

2:	fwd_rnd1( -64(%ebp) ,ft_tab)	// 14 rounds for 256-bit key
	fwd_rnd2( -48(%ebp) ,ft_tab)
3:	fwd_rnd1( -32(%ebp) ,ft_tab)	// 12 rounds for 192-bit key
	fwd_rnd2( -16(%ebp) ,ft_tab)
4:	fwd_rnd1(    (%ebp) ,ft_tab)	// 10 rounds for 128-bit key
	fwd_rnd2( +16(%ebp) ,ft_tab)
	fwd_rnd1( +32(%ebp) ,ft_tab)
	fwd_rnd2( +48(%ebp) ,ft_tab)
	fwd_rnd1( +64(%ebp) ,ft_tab)
	fwd_rnd2( +80(%ebp) ,ft_tab)
	fwd_rnd1( +96(%ebp) ,ft_tab)
	fwd_rnd2(+112(%ebp) ,ft_tab)
	fwd_rnd1(+128(%ebp) ,ft_tab)
	fwd_rnd2(+144(%ebp) ,fl_tab)	// last round uses a different table

// move final values to the output array.  CAUTION: the
// order of these assigns relies on the register mappings

	add     $8,%esp               // release the two save/restore slots
	mov     out_blk+12(%esp),%ebp // +12 because four registers are pushed
	mov     %r5,12(%ebp)
	pop     %edi
	mov     %r4,8(%ebp)
	pop     %esi
	mov     %r1,4(%ebp)
	pop     %ebx
	mov     %r0,(%ebp)            // r0 is eax: store it before loading the return value
	pop     %ebp
	mov     $1,%eax               // return value
	ret

// AES (Rijndael) Decryption Subroutine
//
// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])
//
// All three arguments are read from the stack. The 16 bytes at in_blk are
// loaded as four 32-bit words, combined with the round keys found at
// offset dkey of the context (walked from the highest address downwards),
// and the four result words are stored at out_blk. The round count is
// read from offset nrnd of the context.
// Returns 1 in %eax. %ebp, %ebx, %esi and %edi are saved and restored;
// %ecx and %edx are overwritten.

.global  aes_dec_blk

.extern  it_tab
.extern  il_tab

.align 4

aes_dec_blk:
	push    %ebp
	mov     ctx(%esp),%ebp       // pointer to context

// CAUTION: the order and the values used in these assigns
// rely on the register mappings

1:	push    %ebx
	mov     in_blk+4(%esp),%r2   // +4 because two registers are now pushed
	push    %esi
	mov     nrnd(%ebp),%r3   // number of rounds
	push    %edi
#if dkey != 0
	lea     dkey(%ebp),%ebp  // key pointer
#endif
	mov     %r3,%r0
	shl     $4,%r0           // 16 bytes per round key
	add     %r0,%ebp         // point at round key number nrnd, the first one used
	
// input four columns and xor in first round key

	mov     (%r2),%r0
	mov     4(%r2),%r1
	mov     8(%r2),%r4
	mov     12(%r2),%r5
	xor     (%ebp),%r0
	xor     4(%ebp),%r1
	xor     8(%ebp),%r4
	xor     12(%ebp),%r5

// Choose the entry point into the unrolled rounds. %ebp is moved down by a
// further 32 bytes for each extra pair of rounds, so that the round-key
// displacements of the last ten rounds are the same on every path. Any
// round count other than 10 or 12 falls through to the 14-round path.

	sub     $8,%esp         // two 4-byte slots for save/restore
	sub     $16,%ebp        // step down to next round key
	sub     $10,%r3          
	je      4f              // 10 rounds for 128-bit key
	sub     $32,%ebp
	sub     $2,%r3
	je      3f              // 12 rounds for 192-bit key
	sub     $32,%ebp

// r3 doubles as the scratch register of the round macros from here on

2:	inv_rnd1( +64(%ebp), it_tab)	// 14 rounds for 256-bit key
	inv_rnd2( +48(%ebp), it_tab)
3:	inv_rnd1( +32(%ebp), it_tab)	// 12 rounds for 192-bit key
	inv_rnd2( +16(%ebp), it_tab)
4:	inv_rnd1(    (%ebp), it_tab)	// 10 rounds for 128-bit key
	inv_rnd2( -16(%ebp), it_tab)
	inv_rnd1( -32(%ebp), it_tab)
	inv_rnd2( -48(%ebp), it_tab)
	inv_rnd1( -64(%ebp), it_tab)
	inv_rnd2( -80(%ebp), it_tab)
	inv_rnd1( -96(%ebp), it_tab)
	inv_rnd2(-112(%ebp), it_tab)
	inv_rnd1(-128(%ebp), it_tab)
	inv_rnd2(-144(%ebp), il_tab)	// last round uses a different table

// move final values to the output array.  CAUTION: the
// order of these assigns relies on the register mappings

	add     $8,%esp               // release the two save/restore slots
	mov     out_blk+12(%esp),%ebp // +12 because four registers are pushed
	mov     %r5,12(%ebp)
	pop     %edi
	mov     %r4,8(%ebp)
	pop     %esi
	mov     %r1,4(%ebp)
	pop     %ebx
	mov     %r0,(%ebp)            // r0 is eax: store it before loading the return value
	pop     %ebp
	mov     $1,%eax               // return value
	ret
Commit	Line	Data
1da177e4 LT	1	// -------------------------------------------------------------------------
	2	// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK.
	3	// All rights reserved.
	4	//
	5	// LICENSE TERMS
	6	//
	7	// The free distribution and use of this software in both source and binary
	8	// form is allowed (with or without changes) provided that:
	9	//
	10	// 1. distributions of this source code include the above copyright
	11	// notice, this list of conditions and the following disclaimer//
	12	//
	13	// 2. distributions in binary form include the above copyright
	14	// notice, this list of conditions and the following disclaimer
	15	// in the documentation and/or other associated materials//
	16	//
	17	// 3. the copyright holder's name is not used to endorse products
	18	// built using this software without specific written permission.
	19	//
	20	//
	21	// ALTERNATIVELY, provided that this notice is retained in full, this product
	22	// may be distributed under the terms of the GNU General Public License (GPL),
	23	// in which case the provisions of the GPL apply INSTEAD OF those given above.
	24	//
	25	// Copyright (c) 2004 Linus Torvalds <[email protected]>
	26	// Copyright (c) 2004 Red Hat, Inc., James Morris <[email protected]>
	27
	28	// DISCLAIMER
	29	//
	30	// This software is provided 'as is' with no explicit or implied warranties
	31	// in respect of its properties including, but not limited to, correctness
	32	// and fitness for purpose.
	33	// -------------------------------------------------------------------------
	34	// Issue Date: 29/07/2002
	35
	36	.file "aes-i586-asm.S"
	37	.text
	38
	39	// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])//
	40	// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])//
	41
	42	#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words)
	43
	44	// offsets to parameters with one register pushed onto stack
	45
	46	#define in_blk 8 // input byte array address parameter
	47	#define out_blk 12 // output byte array address parameter
	48	#define ctx 16 // AES context structure
	49
	50	// offsets in context structure
	51
	52	#define ekey 0 // encryption key schedule base address
	53	#define nrnd 256 // number of rounds
	54	#define dkey 260 // decryption key schedule base address
	55
	56	// register mapping for encrypt and decrypt subroutines
	57
	58	#define r0 eax
	59	#define r1 ebx
	60	#define r2 ecx
	61	#define r3 edx
	62	#define r4 esi
	63	#define r5 edi
	64
65	#define eaxl al
66	#define eaxh ah
67	#define ebxl bl
68	#define ebxh bh
69	#define ecxl cl
70	#define ecxh ch
71	#define edxl dl
72	#define edxh dh
73
74	#define _h(reg) reg##h
75	#define h(reg) _h(reg)
76
77	#define _l(reg) reg##l
78	#define l(reg) _l(reg)
79
80	// This macro takes a 32-bit word representing a column and uses
81	// each of its four bytes to index into four tables of 256 32-bit
82	// words to obtain values that are then xored into the appropriate
83	// output registers r0, r1, r4 or r5.
84
85	// Parameters:
86	// table table base address
87	// %1 out_state[0]
88	// %2 out_state[1]
89	// %3 out_state[2]
90	// %4 out_state[3]
91	// idx input register for the round (destroyed)
92	// tmp scratch register for the round
93	// sched key schedule
94
95	#define do_col(table, a1,a2,a3,a4, idx, tmp) \
96	movzx %l(idx),%tmp; \
97	xor table(,%tmp,4),%a1; \
98	movzx %h(idx),%tmp; \
99	shr $16,%idx; \
100	xor table+tlen(,%tmp,4),%a2; \
101	movzx %l(idx),%tmp; \
102	movzx %h(idx),%idx; \
103	xor table+2*tlen(,%tmp,4),%a3; \
104	xor table+3*tlen(,%idx,4),%a4;
105
106	// initialise output registers from the key schedule
107	// NB1: original value of a3 is in idx on exit
108	// NB2: original values of a1,a2,a4 aren't used
109	#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
110	mov 0 sched,%a1; \
111	movzx %l(idx),%tmp; \
112	mov 12 sched,%a2; \
113	xor table(,%tmp,4),%a1; \
114	mov 4 sched,%a4; \
115	movzx %h(idx),%tmp; \
116	shr $16,%idx; \
117	xor table+tlen(,%tmp,4),%a2; \
118	movzx %l(idx),%tmp; \
119	movzx %h(idx),%idx; \
120	xor table+3*tlen(,%idx,4),%a4; \
121	mov %a3,%idx; \
122	mov 8 sched,%a3; \
123	xor table+2*tlen(,%tmp,4),%a3;
124
125	// initialise output registers from the key schedule
126	// NB1: original value of a3 is in idx on exit
127	// NB2: original values of a1,a2,a4 aren't used
128	#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
129	mov 0 sched,%a1; \
130	movzx %l(idx),%tmp; \
131	mov 4 sched,%a2; \
132	xor table(,%tmp,4),%a1; \
133	mov 12 sched,%a4; \
134	movzx %h(idx),%tmp; \
135	shr $16,%idx; \
136	xor table+tlen(,%tmp,4),%a2; \
137	movzx %l(idx),%tmp; \
138	movzx %h(idx),%idx; \
139	xor table+3*tlen(,%idx,4),%a4; \
140	mov %a3,%idx; \
141	mov 8 sched,%a3; \
142	xor table+2*tlen(,%tmp,4),%a3;
143
144
145	// original Gladman had conditional saves to MMX regs.
146	#define save(a1, a2) \
147	mov %a2,4*a1(%esp)
148
149	#define restore(a1, a2) \
150	mov 4*a2(%esp),%a1
151
152	// These macros perform a forward encryption cycle. They are entered with
153	// the first previous round column values in r0,r1,r4,r5 and
154	// exit with the final values in the same registers, using stack
155	// for temporary storage.
156
157	// round column values
158	// on entry: r0,r1,r4,r5
159	// on exit: r2,r1,r4,r5
160	#define fwd_rnd1(arg, table) \
161	save (0,r1); \
162	save (1,r5); \
163	\
164	/* compute new column values */ \
165	do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \
166	do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \
167	restore(r0,0); \
168	do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \
169	restore(r0,1); \
170	do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */
171
172	// round column values
173	// on entry: r2,r1,r4,r5
174	// on exit: r0,r1,r4,r5
175	#define fwd_rnd2(arg, table) \
176	save (0,r1); \
177	save (1,r5); \
178	\
179	/* compute new column values */ \
180	do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \
181	do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \
182	restore(r2,0); \
183	do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \
184	restore(r2,1); \
185	do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */
186
187	// These macros performs an inverse encryption cycle. They are entered with
188	// the first previous round column values in r0,r1,r4,r5 and
189	// exit with the final values in the same registers, using stack
190	// for temporary storage
191
192	// round column values
193	// on entry: r0,r1,r4,r5
194	// on exit: r2,r1,r4,r5
195	#define inv_rnd1(arg, table) \
196	save (0,r1); \
197	save (1,r5); \
198	\
199	/* compute new column values */ \
200	do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \
201	do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \
202	restore(r0,0); \
203	do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \
204	restore(r0,1); \
205	do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */
206
207	// round column values
208	// on entry: r2,r1,r4,r5
209	// on exit: r0,r1,r4,r5
210	#define inv_rnd2(arg, table) \
211	save (0,r1); \
212	save (1,r5); \
213	\
214	/* compute new column values */ \
215	do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \
216	do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \
217	restore(r2,0); \
218	do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \
219	restore(r2,1); \
220	do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */
221
222	// AES (Rijndael) Encryption Subroutine
223
224	.global aes_enc_blk
225
226	.extern ft_tab
227	.extern fl_tab
228
229	.align 4
230
231	aes_enc_blk:
232	push %ebp
233	mov ctx(%esp),%ebp // pointer to context
234
235	// CAUTION: the order and the values used in these assigns
236	// rely on the register mappings
237
238	1: push %ebx
239	mov in_blk+4(%esp),%r2
240	push %esi
241	mov nrnd(%ebp),%r3 // number of rounds
242	push %edi
243	#if ekey != 0
244	lea ekey(%ebp),%ebp // key pointer
245	#endif
246
247	// input four columns and xor in first round key
248
249	mov (%r2),%r0
250	mov 4(%r2),%r1
251	mov 8(%r2),%r4
252	mov 12(%r2),%r5
253	xor (%ebp),%r0
254	xor 4(%ebp),%r1
255	xor 8(%ebp),%r4
256	xor 12(%ebp),%r5
257
258	sub $8,%esp // space for register saves on stack
259	add $16,%ebp // increment to next round key
260	sub $10,%r3
261	je 4f // 10 rounds for 128-bit key
262	add $32,%ebp
263	sub $2,%r3
264	je 3f // 12 rounds for 128-bit key
265	add $32,%ebp
266
267	2: fwd_rnd1( -64(%ebp) ,ft_tab) // 14 rounds for 128-bit key
268	fwd_rnd2( -48(%ebp) ,ft_tab)
269	3: fwd_rnd1( -32(%ebp) ,ft_tab) // 12 rounds for 128-bit key
270	fwd_rnd2( -16(%ebp) ,ft_tab)
271	4: fwd_rnd1( (%ebp) ,ft_tab) // 10 rounds for 128-bit key
272	fwd_rnd2( +16(%ebp) ,ft_tab)
273	fwd_rnd1( +32(%ebp) ,ft_tab)
274	fwd_rnd2( +48(%ebp) ,ft_tab)
275	fwd_rnd1( +64(%ebp) ,ft_tab)
276	fwd_rnd2( +80(%ebp) ,ft_tab)
277	fwd_rnd1( +96(%ebp) ,ft_tab)
278	fwd_rnd2(+112(%ebp) ,ft_tab)
279	fwd_rnd1(+128(%ebp) ,ft_tab)
280	fwd_rnd2(+144(%ebp) ,fl_tab) // last round uses a different table
281
282	// move final values to the output array. CAUTION: the
283	// order of these assigns rely on the register mappings
284
285	add $8,%esp
286	mov out_blk+12(%esp),%ebp
287	mov %r5,12(%ebp)
288	pop %edi
289	mov %r4,8(%ebp)
290	pop %esi
291	mov %r1,4(%ebp)
292	pop %ebx
293	mov %r0,(%ebp)
294	pop %ebp
295	mov $1,%eax
296	ret
297
298	// AES (Rijndael) Decryption Subroutine
299
300	.global aes_dec_blk
301
302	.extern it_tab
303	.extern il_tab
304
305	.align 4
306
307	aes_dec_blk:
308	push %ebp
309	mov ctx(%esp),%ebp // pointer to context
310
311	// CAUTION: the order and the values used in these assigns
312	// rely on the register mappings
313
314	1: push %ebx
315	mov in_blk+4(%esp),%r2
316	push %esi
317	mov nrnd(%ebp),%r3 // number of rounds
318	push %edi
319	#if dkey != 0
320	lea dkey(%ebp),%ebp // key pointer
321	#endif
322	mov %r3,%r0
323	shl $4,%r0
324	add %r0,%ebp
325
326	// input four columns and xor in first round key
327
328	mov (%r2),%r0
329	mov 4(%r2),%r1
330	mov 8(%r2),%r4
331	mov 12(%r2),%r5
332	xor (%ebp),%r0
333	xor 4(%ebp),%r1
334	xor 8(%ebp),%r4
335	xor 12(%ebp),%r5
336
337	sub $8,%esp // space for register saves on stack
338	sub $16,%ebp // increment to next round key
339	sub $10,%r3
340	je 4f // 10 rounds for 128-bit key
341	sub $32,%ebp
342	sub $2,%r3
343	je 3f // 12 rounds for 128-bit key
344	sub $32,%ebp
345
346	2: inv_rnd1( +64(%ebp), it_tab) // 14 rounds for 128-bit key
347	inv_rnd2( +48(%ebp), it_tab)
348	3: inv_rnd1( +32(%ebp), it_tab) // 12 rounds for 128-bit key
349	inv_rnd2( +16(%ebp), it_tab)
350	4: inv_rnd1( (%ebp), it_tab) // 10 rounds for 128-bit key
351	inv_rnd2( -16(%ebp), it_tab)
352	inv_rnd1( -32(%ebp), it_tab)
353	inv_rnd2( -48(%ebp), it_tab)
354	inv_rnd1( -64(%ebp), it_tab)
355	inv_rnd2( -80(%ebp), it_tab)
356	inv_rnd1( -96(%ebp), it_tab)
357	inv_rnd2(-112(%ebp), it_tab)
358	inv_rnd1(-128(%ebp), it_tab)
359	inv_rnd2(-144(%ebp), il_tab) // last round uses a different table
360
361	// move final values to the output array. CAUTION: the
362	// order of these assigns rely on the register mappings
363
364	add $8,%esp
365	mov out_blk+12(%esp),%ebp
366	mov %r5,12(%ebp)
367	pop %edi
368	mov %r4,8(%ebp)
369	pop %esi
370	mov %r1,4(%ebp)
371	pop %ebx
372	mov %r0,(%ebp)
373	pop %ebp
374	mov $1,%eax
375	ret
376