VeraCrypt/src/Crypto/AesSmall_x86.asm
2014-11-08 23:18:07 +01:00

1445 lines
38 KiB
NASM

; ---------------------------------------------------------------------------
; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
;
; LICENSE TERMS
;
; The free distribution and use of this software is allowed (with or without
; changes) provided that:
;
; 1. source code distributions include the above copyright notice, this
; list of conditions and the following disclaimer;
;
; 2. binary distributions include the above copyright notice, this list
; of conditions and the following disclaimer in their documentation;
;
; 3. the name of the copyright holder is not used to endorse products
; built using this software without specific written permission.
;
; DISCLAIMER
;
; This software is provided 'as is' with no explicit or implied warranties
; in respect of its properties, including, but not limited to, correctness
; and/or fitness for purpose.
; ---------------------------------------------------------------------------
; Issue 20/12/2007
;
; This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h
; and the same define to be set here as well. If AES_V2C is set this file
; requires the C files aeskey.c and aestab.c for support.
; An AES implementation for x86 processors using the YASM (or NASM) assembler.
; This is a full assembler implementation covering encryption, decryption and
; key scheduling. It uses 2k bytes of tables but its encryption and decryption
; performance is very close to that obtained using large tables. Key schedule
; expansion is slower for both encryption and decryption but this is likely to
; be offset by the much smaller load that this version places on the processor
; cache. I acknowledge the contribution made by Daniel Bernstein to aspects of
; the design of the AES round function used here.
;
; This code provides the standard AES block size (128 bits, 16 bytes) and the
; three standard AES key sizes (128, 192 and 256 bits). It has the same call
; interface as my C implementation. The ebx, esi, edi and ebp registers are
; preserved across calls but eax, ecx and edx and the artihmetic status flags
; are not. Although this is a full assembler implementation, it can be used
; in conjunction with my C code which provides faster key scheduling using
; large tables. In this case aeskey.c should be compiled with ASM_X86_V2C
; defined. It is also important that the defines below match those used in the
; C code. This code uses the VC++ register saving conentions; if it is used
; with another compiler, conventions for using and saving registers may need
; to be checked (and calling conventions). The YASM command line for the VC++
; custom build step is:
;
; yasm -Xvc -f win32 -D <Z> -o "$(TargetDir)\$(InputName).obj" "$(InputPath)"
;
; For the cryptlib build this is (pcg):
;
; yasm -Xvc -f win32 -D ASM_X86_V2C -o aescrypt2.obj aes_x86_v2.asm
;
; where <Z> is ASM_X86_V2 or ASM_X86_V2C. The calling intefaces are:
;
; AES_RETURN aes_encrypt(const unsigned char in_blk[],
; unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
;
; AES_RETURN aes_decrypt(const unsigned char in_blk[],
; unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
;
; AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
; const aes_encrypt_ctx cx[1]);
;
; AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
; const aes_decrypt_ctx cx[1]);
;
; AES_RETURN aes_encrypt_key(const unsigned char key[],
; unsigned int len, const aes_decrypt_ctx cx[1]);
;
; AES_RETURN aes_decrypt_key(const unsigned char key[],
; unsigned int len, const aes_decrypt_ctx cx[1]);
;
; where <NNN> is 128, 102 or 256. In the last two calls the length can be in
; either bits or bytes.
; The DLL interface must use the _stdcall convention in which the number
; of bytes of parameter space is added after an @ to the sutine's name.
; We must also remove our parameters from the stack before return (see
; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
;
; Adapted for TrueCrypt:
; - All tables generated at run-time
; - Adapted for 16-bit environment
;
CPU 386
USE16
SEGMENT _TEXT PUBLIC CLASS=CODE USE16
SEGMENT _DATA PUBLIC CLASS=DATA USE16
GROUP DGROUP _TEXT _DATA
extern _aes_dec_tab ; Aestab.c
extern _aes_enc_tab
; %define DLL_EXPORT
; The size of the code can be reduced by using functions for the encryption
; and decryption rounds in place of macro expansion
%define REDUCE_CODE_SIZE
; Comment in/out the following lines to obtain the desired subroutines. These
; selections MUST match those in the C header file aes.h
; %define AES_128 ; define if AES with 128 bit keys is needed
; %define AES_192 ; define if AES with 192 bit keys is needed
%define AES_256 ; define if AES with 256 bit keys is needed
; %define AES_VAR ; define if a variable key size is needed
%define ENCRYPTION ; define if encryption is needed
%define DECRYPTION ; define if decryption is needed
; %define AES_REV_DKS ; define if key decryption schedule is reversed
%ifndef ASM_X86_V2C
%define ENCRYPTION_KEY_SCHEDULE ; define if encryption key expansion is needed
%define DECRYPTION_KEY_SCHEDULE ; define if decryption key expansion is needed
%endif
; The encryption key schedule has the following in memory layout where N is the
; number of rounds (10, 12 or 14):
;
; lo: | input key (round 0) | ; each round is four 32-bit words
; | encryption round 1 |
; | encryption round 2 |
; ....
; | encryption round N-1 |
; hi: | encryption round N |
;
; The decryption key schedule is normally set up so that it has the same
; layout as above by actually reversing the order of the encryption key
; schedule in memory (this happens when AES_REV_DKS is set):
;
; lo: | decryption round 0 | = | encryption round N |
; | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
; | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
; .... ....
; | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]
; hi: | decryption round N | = | input key (round 0) |
;
; with rounds except the first and last modified using inv_mix_column()
; But if AES_REV_DKS is NOT set the order of keys is left as it is for
; encryption so that it has to be accessed in reverse when used for
; decryption (although the inverse mix column modifications are done)
;
; lo: | decryption round 0 | = | input key (round 0) |
; | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]
; | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]
; .... ....
; | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
; hi: | decryption round N | = | encryption round N |
;
; This layout is faster when the assembler key scheduling provided here
; is used.
;
; End of user defines
%ifdef AES_VAR
%ifndef AES_128
%define AES_128
%endif
%ifndef AES_192
%define AES_192
%endif
%ifndef AES_256
%define AES_256
%endif
%endif
%ifdef AES_VAR
%define KS_LENGTH 60
%elifdef AES_256
%define KS_LENGTH 60
%elifdef AES_192
%define KS_LENGTH 52
%else
%define KS_LENGTH 44
%endif
; These macros implement stack based local variables
%macro save 2
mov [esp+4*%1],%2
%endmacro
%macro restore 2
mov %1,[esp+4*%2]
%endmacro
%ifdef REDUCE_CODE_SIZE
%macro mf_call 1
call %1
%endmacro
%else
%macro mf_call 1
%1
%endmacro
%endif
; the DLL has to implement the _stdcall calling interface on return
; In this case we have to take our parameters (3 4-byte pointers)
; off the stack
%define parms 12
%macro do_name 1-2 parms
%ifndef DLL_EXPORT
global %1
%1:
%else
global %1@%2
export %1@%2
%1@%2:
%endif
%endmacro
%macro do_call 1-2 parms
%ifndef DLL_EXPORT
call %1
add esp,%2
%else
call %1@%2
%endif
%endmacro
%macro do_exit 0-1 parms
%ifdef DLL_EXPORT
ret %1
%else
ret
%endif
%endmacro
; finite field multiplies by {02}, {04} and {08}
%define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
%define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
%define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
; finite field multiplies required in table generation
%define f3(x) (f2(x) ^ x)
%define f9(x) (f8(x) ^ x)
%define fb(x) (f8(x) ^ f2(x) ^ x)
%define fd(x) (f8(x) ^ f4(x) ^ x)
%define fe(x) (f8(x) ^ f4(x) ^ f2(x))
%define etab_0(x) [_aes_enc_tab+4+8*x]
%define etab_1(x) [_aes_enc_tab+3+8*x]
%define etab_2(x) [_aes_enc_tab+2+8*x]
%define etab_3(x) [_aes_enc_tab+1+8*x]
%define etab_b(x) byte [_aes_enc_tab+1+8*x] ; used with movzx for 0x000000xx
%define etab_w(x) word [_aes_enc_tab+8*x] ; used with movzx for 0x0000xx00
%define btab_0(x) [_aes_enc_tab+6+8*x]
%define btab_1(x) [_aes_enc_tab+5+8*x]
%define btab_2(x) [_aes_enc_tab+4+8*x]
%define btab_3(x) [_aes_enc_tab+3+8*x]
; ROUND FUNCTION. Build column[2] on ESI and column[3] on EDI that have the
; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX.
;
; Input:
;
; EAX column[0]
; EBX column[1]
; ECX column[2]
; EDX column[3]
; ESI column key[round][2]
; EDI column key[round][3]
; EBP scratch
;
; Output:
;
; EBP column[0] unkeyed
; EBX column[1] unkeyed
; ESI column[2] keyed
; EDI column[3] keyed
; EAX scratch
; ECX scratch
; EDX scratch
%macro rnd_fun 2
rol ebx,16
%1 esi, cl, 0, ebp
%1 esi, dh, 1, ebp
%1 esi, bh, 3, ebp
%1 edi, dl, 0, ebp
%1 edi, ah, 1, ebp
%1 edi, bl, 2, ebp
%2 ebp, al, 0, ebp
shr ebx,16
and eax,0xffff0000
or eax,ebx
shr edx,16
%1 ebp, ah, 1, ebx
%1 ebp, dh, 3, ebx
%2 ebx, dl, 2, ebx
%1 ebx, ch, 1, edx
%1 ebx, al, 0, edx
shr eax,16
shr ecx,16
%1 ebp, cl, 2, edx
%1 edi, ch, 3, edx
%1 esi, al, 2, edx
%1 ebx, ah, 3, edx
%endmacro
; Basic MOV and XOR Operations for normal rounds
%macro nr_xor 4
movzx %4,%2
xor %1,etab_%3(%4)
%endmacro
%macro nr_mov 4
movzx %4,%2
mov %1,etab_%3(%4)
%endmacro
; Basic MOV and XOR Operations for last round
%if 1
%macro lr_xor 4
movzx %4,%2
movzx %4,etab_b(%4)
%if %3 != 0
shl %4,8*%3
%endif
xor %1,%4
%endmacro
%macro lr_mov 4
movzx %4,%2
movzx %1,etab_b(%4)
%if %3 != 0
shl %1,8*%3
%endif
%endmacro
%else ; less effective but worth leaving as an option
%macro lr_xor 4
movzx %4,%2
mov %4,btab_%3(%4)
and %4,0x000000ff << 8 * %3
xor %1,%4
%endmacro
%macro lr_mov 4
movzx %4,%2
mov %1,btab_%3(%4)
and %1,0x000000ff << 8 * %3
%endmacro
%endif
; Apply S-Box to the 4 bytes in a 32-bit word and rotate byte positions
%ifdef REDUCE_CODE_SIZE
l3s_col:
movzx ecx,al ; in eax
movzx ecx, etab_b(ecx) ; out eax
xor edx,ecx ; scratch ecx,edx
movzx ecx,ah
movzx ecx, etab_b(ecx)
shl ecx,8
xor edx,ecx
shr eax,16
movzx ecx,al
movzx ecx, etab_b(ecx)
shl ecx,16
xor edx,ecx
movzx ecx,ah
movzx ecx, etab_b(ecx)
shl ecx,24
xor edx,ecx
mov eax,edx
ret
%else
%macro l3s_col 0
movzx ecx,al ; in eax
movzx ecx, etab_b(ecx) ; out eax
xor edx,ecx ; scratch ecx,edx
movzx ecx,ah
movzx ecx, etab_b(ecx)
shl ecx,8
xor edx,ecx
shr eax,16
movzx ecx,al
movzx ecx, etab_b(ecx)
shl ecx,16
xor edx,ecx
movzx ecx,ah
movzx ecx, etab_b(ecx)
shl ecx,24
xor edx,ecx
mov eax,edx
%endmacro
%endif
; offsets to parameters
in_blk equ 2 ; input byte array address parameter
out_blk equ 4 ; output byte array address parameter
ctx equ 6 ; AES context structure
stk_spc equ 20 ; stack space
%ifdef ENCRYPTION
; %define ENCRYPTION_TABLE
%ifdef REDUCE_CODE_SIZE
enc_round:
sub sp, 2
add ebp,16
save 1,ebp
mov esi,[ebp+8]
mov edi,[ebp+12]
rnd_fun nr_xor, nr_mov
mov eax,ebp
mov ecx,esi
mov edx,edi
restore ebp,1
xor eax,[ebp]
xor ebx,[ebp+4]
add sp, 2
ret
%else
%macro enc_round 0
add ebp,16
save 0,ebp
mov esi,[ebp+8]
mov edi,[ebp+12]
rnd_fun nr_xor, nr_mov
mov eax,ebp
mov ecx,esi
mov edx,edi
restore ebp,0
xor eax,[ebp]
xor ebx,[ebp+4]
%endmacro
%endif
%macro enc_last_round 0
add ebp,16
save 0,ebp
mov esi,[ebp+8]
mov edi,[ebp+12]
rnd_fun lr_xor, lr_mov
mov eax,ebp
restore ebp,0
xor eax,[ebp]
xor ebx,[ebp+4]
%endmacro
section _TEXT
; AES Encryption Subroutine
do_name _aes_encrypt,12
mov ax, sp
movzx esp, ax
sub esp,stk_spc
mov [esp+16],ebp
mov [esp+12],ebx
mov [esp+ 8],esi
mov [esp+ 4],edi
movzx esi,word [esp+in_blk+stk_spc] ; input pointer
mov eax,[esi ]
mov ebx,[esi+ 4]
mov ecx,[esi+ 8]
mov edx,[esi+12]
movzx ebp,word [esp+ctx+stk_spc] ; key pointer
movzx edi,byte [ebp+4*KS_LENGTH]
xor eax,[ebp ]
xor ebx,[ebp+ 4]
xor ecx,[ebp+ 8]
xor edx,[ebp+12]
; determine the number of rounds
%ifndef AES_256
cmp edi,10*16
je .3
cmp edi,12*16
je .2
cmp edi,14*16
je .1
mov eax,-1
jmp .5
%endif
.1: mf_call enc_round
mf_call enc_round
.2: mf_call enc_round
mf_call enc_round
.3: mf_call enc_round
mf_call enc_round
mf_call enc_round
mf_call enc_round
mf_call enc_round
mf_call enc_round
mf_call enc_round
mf_call enc_round
mf_call enc_round
enc_last_round
movzx edx,word [esp+out_blk+stk_spc]
mov [edx],eax
mov [edx+4],ebx
mov [edx+8],esi
mov [edx+12],edi
xor eax,eax
.5: mov ebp,[esp+16]
mov ebx,[esp+12]
mov esi,[esp+ 8]
mov edi,[esp+ 4]
add esp,stk_spc
do_exit 12
%endif
%macro f_key 2
push ecx
push edx
mov edx,esi
ror eax,8
mf_call l3s_col
mov esi,eax
pop edx
pop ecx
xor esi,rc_val
mov [ebp+%1*%2],esi
xor edi,esi
mov [ebp+%1*%2+4],edi
xor ecx,edi
mov [ebp+%1*%2+8],ecx
xor edx,ecx
mov [ebp+%1*%2+12],edx
mov eax,edx
%if %2 == 24
%if %1 < 7
xor eax,[ebp+%1*%2+16-%2]
mov [ebp+%1*%2+16],eax
xor eax,[ebp+%1*%2+20-%2]
mov [ebp+%1*%2+20],eax
%endif
%elif %2 == 32
%if %1 < 6
push ecx
push edx
mov edx,[ebp+%1*%2+16-%2]
mf_call l3s_col
pop edx
pop ecx
mov [ebp+%1*%2+16],eax
xor eax,[ebp+%1*%2+20-%2]
mov [ebp+%1*%2+20],eax
xor eax,[ebp+%1*%2+24-%2]
mov [ebp+%1*%2+24],eax
xor eax,[ebp+%1*%2+28-%2]
mov [ebp+%1*%2+28],eax
%endif
%endif
%assign rc_val f2(rc_val)
%endmacro
%ifdef ENCRYPTION_KEY_SCHEDULE
%ifdef AES_128
%ifndef ENCRYPTION_TABLE
; %define ENCRYPTION_TABLE
%endif
%assign rc_val 1
do_name _aes_encrypt_key128,8
push ebp
push ebx
push esi
push edi
mov ebp,[esp+24]
mov [ebp+4*KS_LENGTH],dword 10*16
mov ebx,[esp+20]
mov esi,[ebx]
mov [ebp],esi
mov edi,[ebx+4]
mov [ebp+4],edi
mov ecx,[ebx+8]
mov [ebp+8],ecx
mov edx,[ebx+12]
mov [ebp+12],edx
add ebp,16
mov eax,edx
f_key 0,16 ; 11 * 4 = 44 unsigned longs
f_key 1,16 ; 4 + 4 * 10 generated = 44
f_key 2,16
f_key 3,16
f_key 4,16
f_key 5,16
f_key 6,16
f_key 7,16
f_key 8,16
f_key 9,16
pop edi
pop esi
pop ebx
pop ebp
xor eax,eax
do_exit 8
%endif
%ifdef AES_192
%ifndef ENCRYPTION_TABLE
; %define ENCRYPTION_TABLE
%endif
%assign rc_val 1
do_name _aes_encrypt_key192,8
push ebp
push ebx
push esi
push edi
mov ebp,[esp+24]
mov [ebp+4*KS_LENGTH],dword 12 * 16
mov ebx,[esp+20]
mov esi,[ebx]
mov [ebp],esi
mov edi,[ebx+4]
mov [ebp+4],edi
mov ecx,[ebx+8]
mov [ebp+8],ecx
mov edx,[ebx+12]
mov [ebp+12],edx
mov eax,[ebx+16]
mov [ebp+16],eax
mov eax,[ebx+20]
mov [ebp+20],eax
add ebp,24
f_key 0,24 ; 13 * 4 = 52 unsigned longs
f_key 1,24 ; 6 + 6 * 8 generated = 54
f_key 2,24
f_key 3,24
f_key 4,24
f_key 5,24
f_key 6,24
f_key 7,24
pop edi
pop esi
pop ebx
pop ebp
xor eax,eax
do_exit 8
%endif
%ifdef AES_256
%ifndef ENCRYPTION_TABLE
; %define ENCRYPTION_TABLE
%endif
%assign rc_val 1
do_name _aes_encrypt_key256,8
mov ax, sp
movzx esp, ax
push ebp
push ebx
push esi
push edi
movzx ebp, word [esp+20] ; ks
mov [ebp+4*KS_LENGTH],dword 14 * 16
movzx ebx, word [esp+18] ; key
mov esi,[ebx]
mov [ebp],esi
mov edi,[ebx+4]
mov [ebp+4],edi
mov ecx,[ebx+8]
mov [ebp+8],ecx
mov edx,[ebx+12]
mov [ebp+12],edx
mov eax,[ebx+16]
mov [ebp+16],eax
mov eax,[ebx+20]
mov [ebp+20],eax
mov eax,[ebx+24]
mov [ebp+24],eax
mov eax,[ebx+28]
mov [ebp+28],eax
add ebp,32
f_key 0,32 ; 15 * 4 = 60 unsigned longs
f_key 1,32 ; 8 + 8 * 7 generated = 64
f_key 2,32
f_key 3,32
f_key 4,32
f_key 5,32
f_key 6,32
pop edi
pop esi
pop ebx
pop ebp
xor eax,eax
do_exit 8
%endif
%ifdef AES_VAR
%ifndef ENCRYPTION_TABLE
; %define ENCRYPTION_TABLE
%endif
do_name _aes_encrypt_key,12
mov ecx,[esp+4]
mov eax,[esp+8]
mov edx,[esp+12]
push edx
push ecx
cmp eax,16
je .1
cmp eax,128
je .1
cmp eax,24
je .2
cmp eax,192
je .2
cmp eax,32
je .3
cmp eax,256
je .3
mov eax,-1
add esp,8
do_exit 12
.1: do_call _aes_encrypt_key128,8
do_exit 12
.2: do_call _aes_encrypt_key192,8
do_exit 12
.3: do_call _aes_encrypt_key256,8
do_exit 12
%endif
%endif
%ifdef ENCRYPTION_TABLE
; S-box data - 256 entries
section _DATA
%define u8(x) 0, x, x, f3(x), f2(x), x, x, f3(x)
_aes_enc_tab:
db u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5)
db u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76)
db u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0)
db u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0)
db u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc)
db u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15)
db u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a)
db u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75)
db u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0)
db u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84)
db u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b)
db u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf)
db u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85)
db u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8)
db u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5)
db u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2)
db u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17)
db u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73)
db u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88)
db u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb)
db u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c)
db u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79)
db u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9)
db u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08)
db u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6)
db u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a)
db u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e)
db u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e)
db u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94)
db u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf)
db u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68)
db u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16)
%endif
%ifdef DECRYPTION
; %define DECRYPTION_TABLE
%define dtab_0(x) [_aes_dec_tab+ 8*x]
%define dtab_1(x) [_aes_dec_tab+3+8*x]
%define dtab_2(x) [_aes_dec_tab+2+8*x]
%define dtab_3(x) [_aes_dec_tab+1+8*x]
%define dtab_x(x) byte [_aes_dec_tab+7+8*x]
%macro irn_fun 2
rol eax,16
%1 esi, cl, 0, ebp
%1 esi, bh, 1, ebp
%1 esi, al, 2, ebp
%1 edi, dl, 0, ebp
%1 edi, ch, 1, ebp
%1 edi, ah, 3, ebp
%2 ebp, bl, 0, ebp
shr eax,16
and ebx,0xffff0000
or ebx,eax
shr ecx,16
%1 ebp, bh, 1, eax
%1 ebp, ch, 3, eax
%2 eax, cl, 2, ecx
%1 eax, bl, 0, ecx
%1 eax, dh, 1, ecx
shr ebx,16
shr edx,16
%1 esi, dh, 3, ecx
%1 ebp, dl, 2, ecx
%1 eax, bh, 3, ecx
%1 edi, bl, 2, ecx
%endmacro
; Basic MOV and XOR Operations for normal rounds
%macro ni_xor 4
movzx %4,%2
xor %1,dtab_%3(%4)
%endmacro
%macro ni_mov 4
movzx %4,%2
mov %1,dtab_%3(%4)
%endmacro
; Basic MOV and XOR Operations for last round
%macro li_xor 4
movzx %4,%2
movzx %4,dtab_x(%4)
%if %3 != 0
shl %4,8*%3
%endif
xor %1,%4
%endmacro
%macro li_mov 4
movzx %4,%2
movzx %1,dtab_x(%4)
%if %3 != 0
shl %1,8*%3
%endif
%endmacro
%ifdef REDUCE_CODE_SIZE
dec_round:
sub sp, 2
%ifdef AES_REV_DKS
add ebp,16
%else
sub ebp,16
%endif
save 1,ebp
mov esi,[ebp+8]
mov edi,[ebp+12]
irn_fun ni_xor, ni_mov
mov ebx,ebp
mov ecx,esi
mov edx,edi
restore ebp,1
xor eax,[ebp]
xor ebx,[ebp+4]
add sp, 2
ret
%else
%macro dec_round 0
%ifdef AES_REV_DKS
add ebp,16
%else
sub ebp,16
%endif
save 0,ebp
mov esi,[ebp+8]
mov edi,[ebp+12]
irn_fun ni_xor, ni_mov
mov ebx,ebp
mov ecx,esi
mov edx,edi
restore ebp,0
xor eax,[ebp]
xor ebx,[ebp+4]
%endmacro
%endif
%macro dec_last_round 0
%ifdef AES_REV_DKS
add ebp,16
%else
sub ebp,16
%endif
save 0,ebp
mov esi,[ebp+8]
mov edi,[ebp+12]
irn_fun li_xor, li_mov
mov ebx,ebp
restore ebp,0
xor eax,[ebp]
xor ebx,[ebp+4]
%endmacro
section _TEXT
; AES Decryption Subroutine
do_name _aes_decrypt,12
mov ax, sp
movzx esp, ax
sub esp,stk_spc
mov [esp+16],ebp
mov [esp+12],ebx
mov [esp+ 8],esi
mov [esp+ 4],edi
; input four columns and xor in first round key
movzx esi,word [esp+in_blk+stk_spc] ; input pointer
mov eax,[esi ]
mov ebx,[esi+ 4]
mov ecx,[esi+ 8]
mov edx,[esi+12]
lea esi,[esi+16]
movzx ebp, word [esp+ctx+stk_spc] ; key pointer
movzx edi,byte[ebp+4*KS_LENGTH]
%ifndef AES_REV_DKS ; if decryption key schedule is not reversed
lea ebp,[ebp+edi] ; we have to access it from the top down
%endif
xor eax,[ebp ] ; key schedule
xor ebx,[ebp+ 4]
xor ecx,[ebp+ 8]
xor edx,[ebp+12]
; determine the number of rounds
%ifndef AES_256
cmp edi,10*16
je .3
cmp edi,12*16
je .2
cmp edi,14*16
je .1
mov eax,-1
jmp .5
%endif
.1: mf_call dec_round
mf_call dec_round
.2: mf_call dec_round
mf_call dec_round
.3: mf_call dec_round
mf_call dec_round
mf_call dec_round
mf_call dec_round
mf_call dec_round
mf_call dec_round
mf_call dec_round
mf_call dec_round
mf_call dec_round
dec_last_round
; move final values to the output array.
movzx ebp,word [esp+out_blk+stk_spc]
mov [ebp],eax
mov [ebp+4],ebx
mov [ebp+8],esi
mov [ebp+12],edi
xor eax,eax
.5: mov ebp,[esp+16]
mov ebx,[esp+12]
mov esi,[esp+ 8]
mov edi,[esp+ 4]
add esp,stk_spc
do_exit 12
%endif
%ifdef REDUCE_CODE_SIZE
inv_mix_col:
movzx ecx,dl ; input eax, edx
movzx ecx,etab_b(ecx) ; output eax
mov eax,dtab_0(ecx) ; used ecx
movzx ecx,dh
shr edx,16
movzx ecx,etab_b(ecx)
xor eax,dtab_1(ecx)
movzx ecx,dl
movzx ecx,etab_b(ecx)
xor eax,dtab_2(ecx)
movzx ecx,dh
movzx ecx,etab_b(ecx)
xor eax,dtab_3(ecx)
ret
%else
%macro inv_mix_col 0
movzx ecx,dl ; input eax, edx
movzx ecx,etab_b(ecx) ; output eax
mov eax,dtab_0(ecx) ; used ecx
movzx ecx,dh
shr edx,16
movzx ecx,etab_b(ecx)
xor eax,dtab_1(ecx)
movzx ecx,dl
movzx ecx,etab_b(ecx)
xor eax,dtab_2(ecx)
movzx ecx,dh
movzx ecx,etab_b(ecx)
xor eax,dtab_3(ecx)
%endmacro
%endif
%ifdef DECRYPTION_KEY_SCHEDULE
%ifdef AES_128
%ifndef DECRYPTION_TABLE
; %define DECRYPTION_TABLE
%endif
do_name _aes_decrypt_key128,8
push ebp
push ebx
push esi
push edi
mov eax,[esp+24] ; context
mov edx,[esp+20] ; key
push eax
push edx
do_call _aes_encrypt_key128,8 ; generate expanded encryption key
mov eax,10*16
mov esi,[esp+24] ; pointer to first round key
lea edi,[esi+eax] ; pointer to last round key
add esi,32
; the inverse mix column transformation
mov edx,[esi-16] ; needs to be applied to all round keys
mf_call inv_mix_col ; except first and last. Hence start by
mov [esi-16],eax ; transforming the four sub-keys in the
mov edx,[esi-12] ; second round key
mf_call inv_mix_col
mov [esi-12],eax ; transformations for subsequent rounds
mov edx,[esi-8] ; can then be made more efficient by
mf_call inv_mix_col ; noting that for three of the four sub-keys
mov [esi-8],eax ; in the encryption round key ek[r]:
mov edx,[esi-4] ;
mf_call inv_mix_col ; ek[r][n] = ek[r][n-1] ^ ek[r-1][n]
mov [esi-4],eax ;
; where n is 1..3. Hence the corresponding
.0: mov edx,[esi] ; subkeys in the decryption round key dk[r]
mf_call inv_mix_col ; also obey since inv_mix_col is linear in
mov [esi],eax ; GF(256):
xor eax,[esi-12] ;
mov [esi+4],eax ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
xor eax,[esi-8] ;
mov [esi+8],eax ; So we only need one inverse mix column
xor eax,[esi-4] ; operation (n = 0) for each four word cycle
mov [esi+12],eax ; in the expanded key.
add esi,16
cmp edi,esi
jg .0
jmp dec_end
%endif
%ifdef AES_192
%ifndef DECRYPTION_TABLE
; %define DECRYPTION_TABLE
%endif
do_name _aes_decrypt_key192,8
push ebp
push ebx
push esi
push edi
mov eax,[esp+24] ; context
mov edx,[esp+20] ; key
push eax
push edx
do_call _aes_encrypt_key192,8 ; generate expanded encryption key
mov eax,12*16
mov esi,[esp+24] ; first round key
lea edi,[esi+eax] ; last round key
add esi,48 ; the first 6 words are the key, of
; which the top 2 words are part of
mov edx,[esi-32] ; the second round key and hence
mf_call inv_mix_col ; need to be modified. After this we
mov [esi-32],eax ; need to do a further six values prior
mov edx,[esi-28] ; to using a more efficient technique
mf_call inv_mix_col ; based on:
mov [esi-28],eax ;
; dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
mov edx,[esi-24] ;
mf_call inv_mix_col ; for n = 1 .. 5 where the key expansion
mov [esi-24],eax ; cycle is now 6 words long
mov edx,[esi-20]
mf_call inv_mix_col
mov [esi-20],eax
mov edx,[esi-16]
mf_call inv_mix_col
mov [esi-16],eax
mov edx,[esi-12]
mf_call inv_mix_col
mov [esi-12],eax
mov edx,[esi-8]
mf_call inv_mix_col
mov [esi-8],eax
mov edx,[esi-4]
mf_call inv_mix_col
mov [esi-4],eax
.0: mov edx,[esi] ; the expanded key is 13 * 4 = 44 32-bit words
mf_call inv_mix_col ; of which 11 * 4 = 44 have to be modified
mov [esi],eax ; using inv_mix_col. We have already done 8
xor eax,[esi-20] ; of these so 36 are left - hence we need
mov [esi+4],eax ; exactly 6 loops of six here
xor eax,[esi-16]
mov [esi+8],eax
xor eax,[esi-12]
mov [esi+12],eax
xor eax,[esi-8]
mov [esi+16],eax
xor eax,[esi-4]
mov [esi+20],eax
add esi,24
cmp edi,esi
jg .0
jmp dec_end
%endif
%ifdef AES_256
%ifndef DECRYPTION_TABLE
; %define DECRYPTION_TABLE
%endif
do_name _aes_decrypt_key256,8
mov ax, sp
movzx esp, ax
push ebp
push ebx
push esi
push edi
movzx eax, word [esp+20] ; ks
movzx edx, word [esp+18] ; key
push ax
push dx
do_call _aes_encrypt_key256,4 ; generate expanded encryption key
mov eax,14*16
movzx esi, word [esp+20] ; ks
lea edi,[esi+eax]
add esi,64
mov edx,[esi-48] ; the primary key is 8 words, of which
mf_call inv_mix_col ; the top four require modification
mov [esi-48],eax
mov edx,[esi-44]
mf_call inv_mix_col
mov [esi-44],eax
mov edx,[esi-40]
mf_call inv_mix_col
mov [esi-40],eax
mov edx,[esi-36]
mf_call inv_mix_col
mov [esi-36],eax
mov edx,[esi-32] ; the encryption key expansion cycle is
mf_call inv_mix_col ; now eight words long so we need to
mov [esi-32],eax ; start by doing one complete block
mov edx,[esi-28]
mf_call inv_mix_col
mov [esi-28],eax
mov edx,[esi-24]
mf_call inv_mix_col
mov [esi-24],eax
mov edx,[esi-20]
mf_call inv_mix_col
mov [esi-20],eax
mov edx,[esi-16]
mf_call inv_mix_col
mov [esi-16],eax
mov edx,[esi-12]
mf_call inv_mix_col
mov [esi-12],eax
mov edx,[esi-8]
mf_call inv_mix_col
mov [esi-8],eax
mov edx,[esi-4]
mf_call inv_mix_col
mov [esi-4],eax
.0: mov edx,[esi] ; we can now speed up the remaining
mf_call inv_mix_col ; rounds by using the technique
mov [esi],eax ; outlined earlier. But note that
xor eax,[esi-28] ; there is one extra inverse mix
mov [esi+4],eax ; column operation as the 256 bit
xor eax,[esi-24] ; key has an extra non-linear step
mov [esi+8],eax ; for the midway element.
xor eax,[esi-20]
mov [esi+12],eax ; the expanded key is 15 * 4 = 60
mov edx,[esi+16] ; 32-bit words of which 52 need to
mf_call inv_mix_col ; be modified. We have already done
mov [esi+16],eax ; 12 so 40 are left - which means
xor eax,[esi-12] ; that we need exactly 5 loops of 8
mov [esi+20],eax
xor eax,[esi-8]
mov [esi+24],eax
xor eax,[esi-4]
mov [esi+28],eax
add esi,32
cmp edi,esi
jg .0
%endif
dec_end:
%ifdef AES_REV_DKS
movzx esi,word [esp+20] ; this reverses the order of the
.1: mov eax,[esi] ; round keys if required
mov ebx,[esi+4]
mov ebp,[edi]
mov edx,[edi+4]
mov [esi],ebp
mov [esi+4],edx
mov [edi],eax
mov [edi+4],ebx
mov eax,[esi+8]
mov ebx,[esi+12]
mov ebp,[edi+8]
mov edx,[edi+12]
mov [esi+8],ebp
mov [esi+12],edx
mov [edi+8],eax
mov [edi+12],ebx
add esi,16
sub edi,16
cmp edi,esi
jg .1
%endif
pop edi
pop esi
pop ebx
pop ebp
xor eax,eax
do_exit 8
%ifdef AES_VAR
do_name _aes_decrypt_key,12
mov ecx,[esp+4]
mov eax,[esp+8]
mov edx,[esp+12]
push edx
push ecx
cmp eax,16
je .1
cmp eax,128
je .1
cmp eax,24
je .2
cmp eax,192
je .2
cmp eax,32
je .3
cmp eax,256
je .3
mov eax,-1
add esp,8
do_exit 12
.1: do_call _aes_decrypt_key128,8
do_exit 12
.2: do_call _aes_decrypt_key192,8
do_exit 12
.3: do_call _aes_decrypt_key256,8
do_exit 12
%endif
%endif
%ifdef DECRYPTION_TABLE
; Inverse S-box data - 256 entries
section _DATA
%define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
_aes_dec_tab:
db v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38)
db v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb)
db v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87)
db v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb)
db v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d)
db v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e)
db v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2)
db v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25)
db v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16)
db v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92)
db v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda)
db v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84)
db v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a)
db v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06)
db v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02)
db v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b)
db v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea)
db v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73)
db v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85)
db v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e)
db v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89)
db v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b)
db v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20)
db v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4)
db v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31)
db v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f)
db v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d)
db v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef)
db v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0)
db v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61)
db v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26)
db v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d)
%endif