mirror of
https://github.com/veracrypt/VeraCrypt
synced 2024-11-27 21:43:29 +01:00
SIMD speed optimization for Kuznyechik cipher implementation (up to 2x speedup). Based on https://github.com/aprelev/libgost15.
This commit is contained in:
parent
685fad2d5d
commit
f53eb8e260
@ -254,6 +254,20 @@ void EncipherBlocks (int cipher, void *dataPtr, void *ks, size_t blockCount)
|
||||
else if (cipher == CAMELLIA) {
|
||||
camellia_encrypt_blocks(ks, data, data, (uint32) blockCount);
|
||||
}
|
||||
#endif
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && !defined (_UEFI)
|
||||
else if (cipher == KUZNYECHIK
|
||||
&& HasSSE2()
|
||||
#if defined (TC_WINDOWS_DRIVER) && !defined (_WIN64)
|
||||
&& (blockCount >= 4) && NT_SUCCESS (KeSaveFloatingPointState (&floatingPointState))
|
||||
#endif
|
||||
)
|
||||
{
|
||||
kuznyechik_encrypt_blocks (data, data, blockCount, ks);
|
||||
#if defined (TC_WINDOWS_DRIVER) && !defined (_WIN64)
|
||||
KeRestoreFloatingPointState (&floatingPointState);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
else if (cipher == GOST89) {
|
||||
gost_encrypt(data, data, ks, (int)blockCount);
|
||||
@ -357,6 +371,20 @@ void DecipherBlocks (int cipher, void *dataPtr, void *ks, size_t blockCount)
|
||||
else if (cipher == CAMELLIA) {
|
||||
camellia_decrypt_blocks(ks, data, data, (uint32) blockCount);
|
||||
}
|
||||
#endif
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && !defined (_UEFI)
|
||||
else if (cipher == KUZNYECHIK
|
||||
&& HasSSE2()
|
||||
#if defined (TC_WINDOWS_DRIVER) && !defined (_WIN64)
|
||||
&& (blockCount >= 4) && NT_SUCCESS (KeSaveFloatingPointState (&floatingPointState))
|
||||
#endif
|
||||
)
|
||||
{
|
||||
kuznyechik_decrypt_blocks (data, data, blockCount, ks);
|
||||
#if defined (TC_WINDOWS_DRIVER) && !defined (_WIN64)
|
||||
KeRestoreFloatingPointState (&floatingPointState);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
else if (cipher == GOST89) {
|
||||
gost_decrypt(data, data, ks, (int)blockCount);
|
||||
@ -429,6 +457,7 @@ BOOL CipherSupportsIntraDataUnitParallelization (int cipher)
|
||||
|| (cipher == GOST89)
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && !defined (_UEFI)
|
||||
|| (cipher == SERPENT && HasSSE2())
|
||||
|| (cipher == KUZNYECHIK && HasSSE2())
|
||||
#endif
|
||||
#if CRYPTOPP_BOOL_X64
|
||||
|| (cipher == TWOFISH)
|
||||
|
@ -218,6 +218,7 @@
|
||||
<ClCompile Include="cpu.c" />
|
||||
<ClCompile Include="GostCipher.c" />
|
||||
<ClCompile Include="kuznyechik.c" />
|
||||
<ClCompile Include="kuznyechik_simd.c" />
|
||||
<ClCompile Include="Rmd160.c" />
|
||||
<ClCompile Include="SerpentFast.c" />
|
||||
<ClCompile Include="SerpentFast_simd.cpp" />
|
||||
|
@ -54,6 +54,9 @@
|
||||
<ClCompile Include="Camellia.c">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="kuznyechik_simd.c">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="Aes.h">
|
||||
|
@ -35,6 +35,7 @@ SOURCES = \
|
||||
GostCipher.c \
|
||||
Streebog.c \
|
||||
kuznyechik.c \
|
||||
kuznyechik_simd.c \
|
||||
Whirlpool.c \
|
||||
Camellia.c \
|
||||
Camellia_$(TC_ARCH).S \
|
||||
|
@ -4,14 +4,21 @@ and released into public domain.
|
||||
*/
|
||||
|
||||
#include "kuznyechik.h"
|
||||
// #include <memory.h>
|
||||
// #include <algorithm>
|
||||
// #include "portability.h"
|
||||
#include "cpu.h"
|
||||
#include "misc.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define inline __forceinline
|
||||
#endif
|
||||
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
|
||||
void kuznyechik_set_key_simd(const byte* key, kuznyechik_kds *kds);
|
||||
void kuznyechik_encrypt_block_simd(byte* out, const byte* in, kuznyechik_kds* kds);
|
||||
void kuznyechik_encrypt_blocks_simd(byte* out, const byte* in, size_t blocks, kuznyechik_kds* kds);
|
||||
void kuznyechik_decrypt_block_simd(byte* out, const byte* in, kuznyechik_kds* kds);
|
||||
void kuznyechik_decrypt_blocks_simd(byte* out, const byte* in, size_t blocks, kuznyechik_kds* kds);
|
||||
#endif
|
||||
|
||||
//#define CPPCRYPTO_DEBUG
|
||||
|
||||
static const byte S[256] = {
|
||||
@ -2136,199 +2143,257 @@ and released into public domain.
|
||||
{LL(0x45aba4f6433784cc), LL(0x1dffec46132c75de)}, {LL(0x4e257c42d5ada17e), LL(0x1e80a3e223281c39)}, {LL(0xf65f342ea7db0310), LL(0x1f14273f33953b64)}, {LL(0x619b141e58d8a75e), LL(0x20a8ed9c45c16af1)}
|
||||
};
|
||||
|
||||
static inline void LS(uint64 x1, uint64 x2, uint64* t1, uint64* t2)
|
||||
{
|
||||
*t1 = T[0][(byte)(x1)][0] ^ T[1][(byte)(x1 >> 8)][0] ^ T[2][(byte)(x1 >> 16)][0] ^ T[3][(byte)(x1 >> 24)][0] ^ T[4][(byte)(x1 >> 32)][0] ^ T[5][(byte)(x1 >> 40)][0] ^
|
||||
T[6][(byte)(x1 >> 48)][0] ^ T[7][(byte)(x1 >> 56)][0] ^ T[8][(byte)(x2)][0] ^ T[9][(byte)(x2 >> 8)][0] ^ T[10][(byte)(x2 >> 16)][0] ^ T[11][(byte)(x2 >> 24)][0] ^
|
||||
T[12][(byte)(x2 >> 32)][0] ^ T[13][(byte)(x2 >> 40)][0] ^ T[14][(byte)(x2 >> 48)][0] ^ T[15][(byte)(x2 >> 56)][0];
|
||||
*t2 = T[0][(byte)(x1)][1] ^ T[1][(byte)(x1 >> 8)][1] ^ T[2][(byte)(x1 >> 16)][1] ^ T[3][(byte)(x1 >> 24)][1] ^ T[4][(byte)(x1 >> 32)][1] ^ T[5][(byte)(x1 >> 40)][1] ^
|
||||
T[6][(byte)(x1 >> 48)][1] ^ T[7][(byte)(x1 >> 56)][1] ^ T[8][(byte)(x2)][1] ^ T[9][(byte)(x2 >> 8)][1] ^ T[10][(byte)(x2 >> 16)][1] ^ T[11][(byte)(x2 >> 24)][1] ^
|
||||
T[12][(byte)(x2 >> 32)][1] ^ T[13][(byte)(x2 >> 40)][1] ^ T[14][(byte)(x2 >> 48)][1] ^ T[15][(byte)(x2 >> 56)][1];
|
||||
#define LS(x1,x2,t1,t2) { \
|
||||
t1 = T[0][(byte)(x1)][0] ^ T[1][(byte)(x1 >> 8)][0] ^ T[2][(byte)(x1 >> 16)][0] ^ T[3][(byte)(x1 >> 24)][0] ^ T[4][(byte)(x1 >> 32)][0] ^ T[5][(byte)(x1 >> 40)][0] ^ \
|
||||
T[6][(byte)(x1 >> 48)][0] ^ T[7][(byte)(x1 >> 56)][0] ^ T[8][(byte)(x2)][0] ^ T[9][(byte)(x2 >> 8)][0] ^ T[10][(byte)(x2 >> 16)][0] ^ T[11][(byte)(x2 >> 24)][0] ^ \
|
||||
T[12][(byte)(x2 >> 32)][0] ^ T[13][(byte)(x2 >> 40)][0] ^ T[14][(byte)(x2 >> 48)][0] ^ T[15][(byte)(x2 >> 56)][0]; \
|
||||
t2 = T[0][(byte)(x1)][1] ^ T[1][(byte)(x1 >> 8)][1] ^ T[2][(byte)(x1 >> 16)][1] ^ T[3][(byte)(x1 >> 24)][1] ^ T[4][(byte)(x1 >> 32)][1] ^ T[5][(byte)(x1 >> 40)][1] ^ \
|
||||
T[6][(byte)(x1 >> 48)][1] ^ T[7][(byte)(x1 >> 56)][1] ^ T[8][(byte)(x2)][1] ^ T[9][(byte)(x2 >> 8)][1] ^ T[10][(byte)(x2 >> 16)][1] ^ T[11][(byte)(x2 >> 24)][1] ^ \
|
||||
T[12][(byte)(x2 >> 32)][1] ^ T[13][(byte)(x2 >> 40)][1] ^ T[14][(byte)(x2 >> 48)][1] ^ T[15][(byte)(x2 >> 56)][1]; \
|
||||
}
|
||||
|
||||
static inline void ILS(uint64 x1, uint64 x2, uint64* t1, uint64* t2)
|
||||
{
|
||||
*t1 = IT[0][(byte)(x1)][0] ^ IT[1][(byte)(x1 >> 8)][0] ^ IT[2][(byte)(x1 >> 16)][0] ^ IT[3][(byte)(x1 >> 24)][0] ^ IT[4][(byte)(x1 >> 32)][0] ^ IT[5][(byte)(x1 >> 40)][0] ^
|
||||
IT[6][(byte)(x1 >> 48)][0] ^ IT[7][(byte)(x1 >> 56)][0] ^ IT[8][(byte)(x2)][0] ^ IT[9][(byte)(x2 >> 8)][0] ^ IT[10][(byte)(x2 >> 16)][0] ^ IT[11][(byte)(x2 >> 24)][0] ^
|
||||
IT[12][(byte)(x2 >> 32)][0] ^ IT[13][(byte)(x2 >> 40)][0] ^ IT[14][(byte)(x2 >> 48)][0] ^ IT[15][(byte)(x2 >> 56)][0];
|
||||
*t2 = IT[0][(byte)(x1)][1] ^ IT[1][(byte)(x1 >> 8)][1] ^ IT[2][(byte)(x1 >> 16)][1] ^ IT[3][(byte)(x1 >> 24)][1] ^ IT[4][(byte)(x1 >> 32)][1] ^ IT[5][(byte)(x1 >> 40)][1] ^
|
||||
IT[6][(byte)(x1 >> 48)][1] ^ IT[7][(byte)(x1 >> 56)][1] ^ IT[8][(byte)(x2)][1] ^ IT[9][(byte)(x2 >> 8)][1] ^ IT[10][(byte)(x2 >> 16)][1] ^ IT[11][(byte)(x2 >> 24)][1] ^
|
||||
IT[12][(byte)(x2 >> 32)][1] ^ IT[13][(byte)(x2 >> 40)][1] ^ IT[14][(byte)(x2 >> 48)][1] ^ IT[15][(byte)(x2 >> 56)][1];
|
||||
#define ILS(x1,x2,t1,t2) { \
|
||||
t1 = IT[0][(byte)(x1)][0] ^ IT[1][(byte)(x1 >> 8)][0] ^ IT[2][(byte)(x1 >> 16)][0] ^ IT[3][(byte)(x1 >> 24)][0] ^ IT[4][(byte)(x1 >> 32)][0] ^ IT[5][(byte)(x1 >> 40)][0] ^ \
|
||||
IT[6][(byte)(x1 >> 48)][0] ^ IT[7][(byte)(x1 >> 56)][0] ^ IT[8][(byte)(x2)][0] ^ IT[9][(byte)(x2 >> 8)][0] ^ IT[10][(byte)(x2 >> 16)][0] ^ IT[11][(byte)(x2 >> 24)][0] ^ \
|
||||
IT[12][(byte)(x2 >> 32)][0] ^ IT[13][(byte)(x2 >> 40)][0] ^ IT[14][(byte)(x2 >> 48)][0] ^ IT[15][(byte)(x2 >> 56)][0]; \
|
||||
t2 = IT[0][(byte)(x1)][1] ^ IT[1][(byte)(x1 >> 8)][1] ^ IT[2][(byte)(x1 >> 16)][1] ^ IT[3][(byte)(x1 >> 24)][1] ^ IT[4][(byte)(x1 >> 32)][1] ^ IT[5][(byte)(x1 >> 40)][1] ^ \
|
||||
IT[6][(byte)(x1 >> 48)][1] ^ IT[7][(byte)(x1 >> 56)][1] ^ IT[8][(byte)(x2)][1] ^ IT[9][(byte)(x2 >> 8)][1] ^ IT[10][(byte)(x2 >> 16)][1] ^ IT[11][(byte)(x2 >> 24)][1] ^ \
|
||||
IT[12][(byte)(x2 >> 32)][1] ^ IT[13][(byte)(x2 >> 40)][1] ^ IT[14][(byte)(x2 >> 48)][1] ^ IT[15][(byte)(x2 >> 56)][1]; \
|
||||
}
|
||||
|
||||
static inline void ILSS(uint64 x1, uint64 x2, uint64* t1, uint64* t2)
|
||||
{
|
||||
*t1 = IT[0][S[(byte)(x1)]][0] ^ IT[1][S[(byte)(x1 >> 8)]][0] ^ IT[2][S[(byte)(x1 >> 16)]][0] ^ IT[3][S[(byte)(x1 >> 24)]][0] ^ IT[4][S[(byte)(x1 >> 32)]][0] ^ IT[5][S[(byte)(x1 >> 40)]][0] ^
|
||||
IT[6][S[(byte)(x1 >> 48)]][0] ^ IT[7][S[(byte)(x1 >> 56)]][0] ^ IT[8][S[(byte)(x2)]][0] ^ IT[9][S[(byte)(x2 >> 8)]][0] ^ IT[10][S[(byte)(x2 >> 16)]][0] ^ IT[11][S[(byte)(x2 >> 24)]][0] ^
|
||||
IT[12][S[(byte)(x2 >> 32)]][0] ^ IT[13][S[(byte)(x2 >> 40)]][0] ^ IT[14][S[(byte)(x2 >> 48)]][0] ^ IT[15][S[(byte)(x2 >> 56)]][0];
|
||||
*t2 = IT[0][S[(byte)(x1)]][1] ^ IT[1][S[(byte)(x1 >> 8)]][1] ^ IT[2][S[(byte)(x1 >> 16)]][1] ^ IT[3][S[(byte)(x1 >> 24)]][1] ^ IT[4][S[(byte)(x1 >> 32)]][1] ^ IT[5][S[(byte)(x1 >> 40)]][1] ^
|
||||
IT[6][S[(byte)(x1 >> 48)]][1] ^ IT[7][S[(byte)(x1 >> 56)]][1] ^ IT[8][S[(byte)(x2)]][1] ^ IT[9][S[(byte)(x2 >> 8)]][1] ^ IT[10][S[(byte)(x2 >> 16)]][1] ^ IT[11][S[(byte)(x2 >> 24)]][1] ^
|
||||
IT[12][S[(byte)(x2 >> 32)]][1] ^ IT[13][S[(byte)(x2 >> 40)]][1] ^ IT[14][S[(byte)(x2 >> 48)]][1] ^ IT[15][S[(byte)(x2 >> 56)]][1];
|
||||
#define ILSS(x1,x2,t1,t2) { \
|
||||
t1 = IT[0][S[(byte)(x1)]][0] ^ IT[1][S[(byte)(x1 >> 8)]][0] ^ IT[2][S[(byte)(x1 >> 16)]][0] ^ IT[3][S[(byte)(x1 >> 24)]][0] ^ IT[4][S[(byte)(x1 >> 32)]][0] ^ IT[5][S[(byte)(x1 >> 40)]][0] ^ \
|
||||
IT[6][S[(byte)(x1 >> 48)]][0] ^ IT[7][S[(byte)(x1 >> 56)]][0] ^ IT[8][S[(byte)(x2)]][0] ^ IT[9][S[(byte)(x2 >> 8)]][0] ^ IT[10][S[(byte)(x2 >> 16)]][0] ^ IT[11][S[(byte)(x2 >> 24)]][0] ^ \
|
||||
IT[12][S[(byte)(x2 >> 32)]][0] ^ IT[13][S[(byte)(x2 >> 40)]][0] ^ IT[14][S[(byte)(x2 >> 48)]][0] ^ IT[15][S[(byte)(x2 >> 56)]][0]; \
|
||||
t2 = IT[0][S[(byte)(x1)]][1] ^ IT[1][S[(byte)(x1 >> 8)]][1] ^ IT[2][S[(byte)(x1 >> 16)]][1] ^ IT[3][S[(byte)(x1 >> 24)]][1] ^ IT[4][S[(byte)(x1 >> 32)]][1] ^ IT[5][S[(byte)(x1 >> 40)]][1] ^ \
|
||||
IT[6][S[(byte)(x1 >> 48)]][1] ^ IT[7][S[(byte)(x1 >> 56)]][1] ^ IT[8][S[(byte)(x2)]][1] ^ IT[9][S[(byte)(x2 >> 8)]][1] ^ IT[10][S[(byte)(x2 >> 16)]][1] ^ IT[11][S[(byte)(x2 >> 24)]][1] ^ \
|
||||
IT[12][S[(byte)(x2 >> 32)]][1] ^ IT[13][S[(byte)(x2 >> 40)]][1] ^ IT[14][S[(byte)(x2 >> 48)]][1] ^ IT[15][S[(byte)(x2 >> 56)]][1]; \
|
||||
}
|
||||
|
||||
static inline void ISI(byte* val)
|
||||
{
|
||||
val[0] = IS[val[0]];
|
||||
val[1] = IS[val[1]];
|
||||
val[2] = IS[val[2]];
|
||||
val[3] = IS[val[3]];
|
||||
val[4] = IS[val[4]];
|
||||
val[5] = IS[val[5]];
|
||||
val[6] = IS[val[6]];
|
||||
val[7] = IS[val[7]];
|
||||
#define ISI(val) { \
|
||||
(val)[0] = IS[(val)[0]]; \
|
||||
(val)[1] = IS[(val)[1]]; \
|
||||
(val)[2] = IS[(val)[2]]; \
|
||||
(val)[3] = IS[(val)[3]]; \
|
||||
(val)[4] = IS[(val)[4]]; \
|
||||
(val)[5] = IS[(val)[5]]; \
|
||||
(val)[6] = IS[(val)[6]]; \
|
||||
(val)[7] = IS[(val)[7]]; \
|
||||
}
|
||||
|
||||
static inline void F(uint64 k00, uint64 k01, uint64 k10, uint64 k11, int i, uint64* o00, uint64* o01, uint64* o10, uint64* o11)
|
||||
{
|
||||
*o10 = k00;
|
||||
*o11 = k01;
|
||||
k00 ^= C[i][0];
|
||||
k01 ^= C[i][1];
|
||||
LS(k00, k01, o00, o01);
|
||||
*o00 ^= k10;
|
||||
*o01 ^= k11;
|
||||
}
|
||||
#define F(k00,k01,k10,k11,i,o00,o01,o10,o11) { \
|
||||
o10 = k00; \
|
||||
o11 = k01; \
|
||||
k00 ^= C[i][0]; \
|
||||
k01 ^= C[i][1]; \
|
||||
LS(k00, k01, o00, o01); \
|
||||
o00 ^= k10; \
|
||||
o01 ^= k11; \
|
||||
}
|
||||
|
||||
static inline void FK(uint64* k00, uint64* k01, uint64* k10, uint64* k11, int ist)
|
||||
{
|
||||
uint64 t00, t01, t10, t11;
|
||||
int i;
|
||||
for (i = 0; i < 8; i += 2)
|
||||
{
|
||||
F(*k00, *k01, *k10, *k11, i + ist, &t00, &t01, &t10, &t11);
|
||||
F(t00, t01, t10, t11, i + 1 + ist, k00, k01, k10, k11);
|
||||
}
|
||||
#define FK(k00,k01,k10,k11,ist) { \
|
||||
for (i = 0; i < 8; i += 2) \
|
||||
{ \
|
||||
F(k00, k01, k10, k11, i + ist, t00, t01, t10, t11); \
|
||||
F(t00, t01, t10, t11, i + 1 + ist, k00, k01, k10, k11); \
|
||||
} \
|
||||
}
|
||||
|
||||
void kuznyechik_set_key(const byte* key, kuznyechik_kds* kds)
|
||||
{
|
||||
int i;
|
||||
uint64 k00 = *(const uint64*)key;
|
||||
uint64 k01 = *(((const uint64*)key) + 1);
|
||||
uint64 k10 = *(((const uint64*)key) + 2);
|
||||
uint64 k11 = *(((const uint64*)key) + 3);
|
||||
|
||||
kds->rke[0][0] = k00;
|
||||
kds->rke[0][1] = k01;
|
||||
kds->rke[1][0] = k10;
|
||||
kds->rke[1][1] = k11;
|
||||
FK(&k00, &k01, &k10, &k11, 0);
|
||||
kds->rke[2][0] = k00;
|
||||
kds->rke[2][1] = k01;
|
||||
kds->rke[3][0] = k10;
|
||||
kds->rke[3][1] = k11;
|
||||
FK(&k00, &k01, &k10, &k11, 8);
|
||||
kds->rke[4][0] = k00;
|
||||
kds->rke[4][1] = k01;
|
||||
kds->rke[5][0] = k10;
|
||||
kds->rke[5][1] = k11;
|
||||
FK(&k00, &k01, &k10, &k11, 16);
|
||||
kds->rke[6][0] = k00;
|
||||
kds->rke[6][1] = k01;
|
||||
kds->rke[7][0] = k10;
|
||||
kds->rke[7][1] = k11;
|
||||
FK(&k00, &k01, &k10, &k11, 24);
|
||||
kds->rke[8][0] = k00;
|
||||
kds->rke[8][1] = k01;
|
||||
kds->rke[9][0] = k10;
|
||||
kds->rke[9][1] = k11;
|
||||
|
||||
kds->rkd[0][0] = kds->rke[0][0];
|
||||
kds->rkd[0][1] = kds->rke[0][1];
|
||||
|
||||
for (i = 1; i < 10; i++)
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && !defined(_UEFI) && (!defined (TC_WINDOWS_DRIVER) || (!defined (DEBUG) && defined (_WIN64)))
|
||||
if(HasSSE2())
|
||||
{
|
||||
uint64 t1 = kds->rke[i][0], t2 = kds->rke[i][1];
|
||||
kds->rkd[i][0] = t1; kds->rkd[i][1] = t2;
|
||||
ILSS(t1, t2, &kds->rkd[i][0], &kds->rkd[i][1]);
|
||||
kuznyechik_set_key_simd (key, kds);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
int i;
|
||||
uint64 k00 = *(const uint64*)key;
|
||||
uint64 k01 = *(((const uint64*)key) + 1);
|
||||
uint64 k10 = *(((const uint64*)key) + 2);
|
||||
uint64 k11 = *(((const uint64*)key) + 3);
|
||||
uint64 t00, t01, t10, t11;
|
||||
|
||||
kds->rke[0] = k00;
|
||||
kds->rke[1] = k01;
|
||||
kds->rke[2] = k10;
|
||||
kds->rke[3] = k11;
|
||||
FK(k00, k01, k10, k11, 0);
|
||||
kds->rke[4] = k00;
|
||||
kds->rke[5] = k01;
|
||||
kds->rke[6] = k10;
|
||||
kds->rke[7] = k11;
|
||||
FK(k00, k01, k10, k11, 8);
|
||||
kds->rke[8] = k00;
|
||||
kds->rke[9] = k01;
|
||||
kds->rke[10] = k10;
|
||||
kds->rke[11] = k11;
|
||||
FK(k00, k01, k10, k11, 16);
|
||||
kds->rke[12] = k00;
|
||||
kds->rke[13] = k01;
|
||||
kds->rke[14] = k10;
|
||||
kds->rke[15] = k11;
|
||||
FK(k00, k01, k10, k11, 24);
|
||||
kds->rke[16] = k00;
|
||||
kds->rke[17] = k01;
|
||||
kds->rke[18] = k10;
|
||||
kds->rke[19] = k11;
|
||||
|
||||
kds->rkd[0] = kds->rke[0];
|
||||
kds->rkd[1] = kds->rke[1];
|
||||
|
||||
for (i = 1; i < 10; i++)
|
||||
{
|
||||
uint64 t1 = kds->rke[2*i], t2 = kds->rke[2*i+1];
|
||||
kds->rkd[2*i] = t1; kds->rkd[2*i + 1] = t2;
|
||||
ILSS(t1, t2, kds->rkd[2*i], kds->rkd[2*i+1]);
|
||||
}
|
||||
}
|
||||
#ifdef CPPCRYPTO_DEBUG
|
||||
for(int i = 0; i < 10; i++)
|
||||
printf("key[%d]: { 0x%016I64X, 0x%016I64X }\n", i, kds->rke[i][0], kds->rke[i][1]);
|
||||
printf("key[%d]: { 0x%016I64X, 0x%016I64X }\n", i, kds->rke[2*i], kds->rke[2*i+1]);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void kuznyechik_encrypt_block(byte* out, const byte* in, kuznyechik_kds* kds)
|
||||
{
|
||||
uint64 x1 = *(const uint64*)in;
|
||||
uint64 x2 = *(((const uint64*)in)+1);
|
||||
uint64 t1, t2;
|
||||
x1 ^= kds->rke[0][0];
|
||||
x2 ^= kds->rke[0][1];
|
||||
LS(x1, x2, &t1, &t2);
|
||||
t1 ^= kds->rke[1][0];
|
||||
t2 ^= kds->rke[1][1];
|
||||
LS(t1, t2, &x1, &x2);
|
||||
x1 ^= kds->rke[2][0];
|
||||
x2 ^= kds->rke[2][1];
|
||||
LS(x1, x2, &t1, &t2);
|
||||
t1 ^= kds->rke[3][0];
|
||||
t2 ^= kds->rke[3][1];
|
||||
LS(t1, t2, &x1, &x2);
|
||||
x1 ^= kds->rke[4][0];
|
||||
x2 ^= kds->rke[4][1];
|
||||
LS(x1, x2, &t1, &t2);
|
||||
t1 ^= kds->rke[5][0];
|
||||
t2 ^= kds->rke[5][1];
|
||||
LS(t1, t2, &x1, &x2);
|
||||
x1 ^= kds->rke[6][0];
|
||||
x2 ^= kds->rke[6][1];
|
||||
LS(x1, x2, &t1, &t2);
|
||||
t1 ^= kds->rke[7][0];
|
||||
t2 ^= kds->rke[7][1];
|
||||
LS(t1, t2, &x1, &x2);
|
||||
x1 ^= kds->rke[8][0];
|
||||
x2 ^= kds->rke[8][1];
|
||||
LS(x1, x2, &t1, &t2);
|
||||
t1 ^= kds->rke[9][0];
|
||||
t2 ^= kds->rke[9][1];
|
||||
*(uint64*)out = t1;
|
||||
*(((uint64*)out) + 1) = t2;
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && !defined(_UEFI) && (!defined (TC_WINDOWS_DRIVER) || (!defined (DEBUG) && defined (_WIN64)))
|
||||
if(HasSSE2())
|
||||
{
|
||||
kuznyechik_encrypt_block_simd (out, in, kds);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
uint64 x1 = *(const uint64*)in;
|
||||
uint64 x2 = *(((const uint64*)in)+1);
|
||||
uint64 t1, t2;
|
||||
x1 ^= kds->rke[0];
|
||||
x2 ^= kds->rke[1];
|
||||
LS(x1, x2, t1, t2);
|
||||
t1 ^= kds->rke[2];
|
||||
t2 ^= kds->rke[3];
|
||||
LS(t1, t2, x1, x2);
|
||||
x1 ^= kds->rke[4];
|
||||
x2 ^= kds->rke[5];
|
||||
LS(x1, x2, t1, t2);
|
||||
t1 ^= kds->rke[6];
|
||||
t2 ^= kds->rke[7];
|
||||
LS(t1, t2, x1, x2);
|
||||
x1 ^= kds->rke[8];
|
||||
x2 ^= kds->rke[9];
|
||||
LS(x1, x2, t1, t2);
|
||||
t1 ^= kds->rke[10];
|
||||
t2 ^= kds->rke[11];
|
||||
LS(t1, t2, x1, x2);
|
||||
x1 ^= kds->rke[12];
|
||||
x2 ^= kds->rke[13];
|
||||
LS(x1, x2, t1, t2);
|
||||
t1 ^= kds->rke[14];
|
||||
t2 ^= kds->rke[15];
|
||||
LS(t1, t2, x1, x2);
|
||||
x1 ^= kds->rke[16];
|
||||
x2 ^= kds->rke[17];
|
||||
LS(x1, x2, t1, t2);
|
||||
t1 ^= kds->rke[18];
|
||||
t2 ^= kds->rke[19];
|
||||
*(uint64*)out = t1;
|
||||
*(((uint64*)out) + 1) = t2;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Encrypt `blocks` consecutive 16-byte blocks from `in` into `out` using the
 * key schedule in `kds`.
 *
 * When the build enables SSE2 intrinsics (and is neither a UEFI build nor a
 * debug Windows-driver build) and HasSSE2() reports support at run time, the
 * whole batch is handed to kuznyechik_encrypt_blocks_simd. Otherwise each
 * block goes through kuznyechik_encrypt_block in turn.
 */
void kuznyechik_encrypt_blocks(byte* out, const byte* in, size_t blocks, kuznyechik_kds* kds)
{
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && !defined(_UEFI) && (!defined (DEBUG) || !defined (TC_WINDOWS_DRIVER))
	if (HasSSE2())
	{
		/* Bulk path: the SIMD routine consumes the entire batch. */
		kuznyechik_encrypt_blocks_simd (out, in, blocks, kds);
		return;
	}
#endif

	/* Portable path: one block per iteration, advancing both cursors by 16 bytes. */
	for (; blocks != 0; --blocks)
	{
		kuznyechik_encrypt_block (out, in, kds);
		in += 16;
		out += 16;
	}
}
|
||||
|
||||
void kuznyechik_decrypt_block(byte* out, const byte* in, kuznyechik_kds* kds)
|
||||
{
|
||||
uint64 x1 = *(const uint64*)in;
|
||||
uint64 x2 = *(((const uint64*)in) + 1);
|
||||
uint64 t1, t2;
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && !defined(_UEFI) && (!defined (TC_WINDOWS_DRIVER) || (!defined (DEBUG) && defined (_WIN64)))
|
||||
if(HasSSE2())
|
||||
{
|
||||
kuznyechik_decrypt_block_simd (out, in, kds);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
uint64 x1 = *(const uint64*)in;
|
||||
uint64 x2 = *(((const uint64*)in) + 1);
|
||||
uint64 t1, t2;
|
||||
|
||||
ILSS(x1, x2, &t1, &t2);
|
||||
t1 ^= kds->rkd[9][0];
|
||||
t2 ^= kds->rkd[9][1];
|
||||
ILS(t1, t2, &x1, &x2);
|
||||
x1 ^= kds->rkd[8][0];
|
||||
x2 ^= kds->rkd[8][1];
|
||||
ILS(x1, x2, &t1, &t2);
|
||||
t1 ^= kds->rkd[7][0];
|
||||
t2 ^= kds->rkd[7][1];
|
||||
ILS(t1, t2, &x1, &x2);
|
||||
x1 ^= kds->rkd[6][0];
|
||||
x2 ^= kds->rkd[6][1];
|
||||
ILS(x1, x2, &t1, &t2);
|
||||
t1 ^= kds->rkd[5][0];
|
||||
t2 ^= kds->rkd[5][1];
|
||||
ILS(t1, t2, &x1, &x2);
|
||||
x1 ^= kds->rkd[4][0];
|
||||
x2 ^= kds->rkd[4][1];
|
||||
ILS(x1, x2, &t1, &t2);
|
||||
t1 ^= kds->rkd[3][0];
|
||||
t2 ^= kds->rkd[3][1];
|
||||
ILS(t1, t2, &x1, &x2);
|
||||
x1 ^= kds->rkd[2][0];
|
||||
x2 ^= kds->rkd[2][1];
|
||||
ILS(x1, x2, &t1, &t2);
|
||||
t1 ^= kds->rkd[1][0];
|
||||
t2 ^= kds->rkd[1][1];
|
||||
ISI((byte*)&t1);
|
||||
ISI((byte*)&t2);
|
||||
t1 ^= kds->rkd[0][0];
|
||||
t2 ^= kds->rkd[0][1];
|
||||
*(uint64*)out = t1;
|
||||
*(((uint64*)out) + 1) = t2;
|
||||
ILSS(x1, x2, t1, t2);
|
||||
t1 ^= kds->rkd[18];
|
||||
t2 ^= kds->rkd[19];
|
||||
ILS(t1, t2, x1, x2);
|
||||
x1 ^= kds->rkd[16];
|
||||
x2 ^= kds->rkd[17];
|
||||
ILS(x1, x2, t1, t2);
|
||||
t1 ^= kds->rkd[14];
|
||||
t2 ^= kds->rkd[15];
|
||||
ILS(t1, t2, x1, x2);
|
||||
x1 ^= kds->rkd[12];
|
||||
x2 ^= kds->rkd[13];
|
||||
ILS(x1, x2, t1, t2);
|
||||
t1 ^= kds->rkd[10];
|
||||
t2 ^= kds->rkd[11];
|
||||
ILS(t1, t2, x1, x2);
|
||||
x1 ^= kds->rkd[8];
|
||||
x2 ^= kds->rkd[9];
|
||||
ILS(x1, x2, t1, t2);
|
||||
t1 ^= kds->rkd[6];
|
||||
t2 ^= kds->rkd[7];
|
||||
ILS(t1, t2, x1, x2);
|
||||
x1 ^= kds->rkd[4];
|
||||
x2 ^= kds->rkd[5];
|
||||
ILS(x1, x2, t1, t2);
|
||||
t1 ^= kds->rkd[2];
|
||||
t2 ^= kds->rkd[3];
|
||||
ISI((byte*)&t1);
|
||||
ISI((byte*)&t2);
|
||||
t1 ^= kds->rkd[0];
|
||||
t2 ^= kds->rkd[1];
|
||||
*(uint64*)out = t1;
|
||||
*(((uint64*)out) + 1) = t2;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Decrypt `blocks` consecutive 16-byte blocks from `in` into `out` using the
 * key schedule in `kds`.
 *
 * When the build enables SSE2 intrinsics (and is neither a UEFI build nor a
 * debug Windows-driver build) and HasSSE2() reports support at run time, the
 * whole batch is handed to kuznyechik_decrypt_blocks_simd. Otherwise each
 * block goes through kuznyechik_decrypt_block in turn.
 */
void kuznyechik_decrypt_blocks(byte* out, const byte* in, size_t blocks, kuznyechik_kds* kds)
{
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && !defined(_UEFI) && (!defined (DEBUG) || !defined (TC_WINDOWS_DRIVER))
	if (HasSSE2())
	{
		/* Bulk path: the SIMD routine consumes the entire batch. */
		kuznyechik_decrypt_blocks_simd (out, in, blocks, kds);
		return;
	}
#endif

	/* Portable path: one block per iteration, advancing both cursors by 16 bytes. */
	for (; blocks != 0; --blocks)
	{
		kuznyechik_decrypt_block (out, in, kds);
		in += 16;
		out += 16;
	}
}
|
||||
|
||||
#if 0
|
||||
static inline uint8_t mul_gf(uint8_t x, uint8_t y, uint16_t p) {
|
||||
|
@ -16,14 +16,16 @@ extern "C" {
|
||||
|
||||
typedef struct _kuznyechik_kds
|
||||
{
|
||||
uint64 rke[10][2];
|
||||
uint64 rkd[10][2];
|
||||
uint64 rke[20];
|
||||
uint64 rkd[20];
|
||||
} kuznyechik_kds;
|
||||
|
||||
#define KUZNYECHIK_KS (sizeof(kuznyechik_kds))
|
||||
|
||||
void kuznyechik_encrypt_block(byte* out, const byte* in, kuznyechik_kds* kds);
|
||||
void kuznyechik_encrypt_blocks(byte* out, const byte* in, size_t blocks, kuznyechik_kds* kds);
|
||||
void kuznyechik_decrypt_block(byte* out, const byte* in, kuznyechik_kds* kds);
|
||||
void kuznyechik_decrypt_blocks(byte* out, const byte* in, size_t blocks, kuznyechik_kds* kds);
|
||||
void kuznyechik_set_key(const byte* key, kuznyechik_kds *kds);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
9517
src/Crypto/kuznyechik_simd.c
Normal file
9517
src/Crypto/kuznyechik_simd.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -462,5 +462,53 @@ namespace VeraCrypt
|
||||
{
|
||||
kuznyechik_set_key (key, (kuznyechik_kds *) ScheduledKey.Ptr());
|
||||
}
|
||||
// Encrypts `blockCount` blocks of `data` in place.
// Throws NotInitialized when the cipher has not been initialized.
// With SSE2 intrinsics compiled in, batches of at least 4 blocks go to
// kuznyechik_encrypt_blocks when IsHwSupportAvailable() is true; every other
// case is delegated to the generic Cipher::EncryptBlocks.
void CipherKuznyechik::EncryptBlocks (byte *data, size_t blockCount) const
{
	if (!Initialized)
		throw NotInitialized (SRC_POS);

#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
	// The size test comes first so the hardware query is skipped for small batches.
	const bool useBulkRoutine = (blockCount >= 4) && IsHwSupportAvailable();
	if (useBulkRoutine)
	{
		kuznyechik_encrypt_blocks (data, data, blockCount, (kuznyechik_kds *) ScheduledKey.Ptr());
		return;
	}
#endif

	Cipher::EncryptBlocks (data, blockCount);
}
|
||||
|
||||
// Decrypts `blockCount` blocks of `data` in place.
// Throws NotInitialized when the cipher has not been initialized.
// With SSE2 intrinsics compiled in, batches of at least 4 blocks go to
// kuznyechik_decrypt_blocks when IsHwSupportAvailable() is true; every other
// case is delegated to the generic Cipher::DecryptBlocks.
void CipherKuznyechik::DecryptBlocks (byte *data, size_t blockCount) const
{
	if (!Initialized)
		throw NotInitialized (SRC_POS);

#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
	// The size test comes first so the hardware query is skipped for small batches.
	const bool useBulkRoutine = (blockCount >= 4) && IsHwSupportAvailable();
	if (useBulkRoutine)
	{
		kuznyechik_decrypt_blocks (data, data, blockCount, (kuznyechik_kds *) ScheduledKey.Ptr());
		return;
	}
#endif

	Cipher::DecryptBlocks (data, blockCount);
}
|
||||
|
||||
bool CipherKuznyechik::IsHwSupportAvailable () const
|
||||
{
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
|
||||
static bool state = false;
|
||||
static bool stateValid = false;
|
||||
|
||||
if (!stateValid)
|
||||
{
|
||||
state = HasSSE2() ? true : false;
|
||||
stateValid = true;
|
||||
}
|
||||
return state;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
bool Cipher::HwSupportEnabled = true;
|
||||
}
|
||||
|
@ -104,13 +104,13 @@ namespace VeraCrypt
|
||||
TC_CIPHER (Serpent, 16, 32);
|
||||
TC_CIPHER (Twofish, 16, 32);
|
||||
TC_CIPHER (Camellia, 16, 32);
|
||||
TC_CIPHER (Kuznyechik, 16, 32);
|
||||
|
||||
#undef TC_CIPHER_ADD_METHODS
|
||||
#define TC_CIPHER_ADD_METHODS
|
||||
|
||||
TC_CIPHER (Gost89, 16, 32);
|
||||
TC_CIPHER (Gost89StaticSBOX, 16, 32);
|
||||
TC_CIPHER (Kuznyechik, 16, 32);
|
||||
|
||||
#undef TC_CIPHER
|
||||
|
||||
|
@ -79,6 +79,7 @@ OBJS += ../Crypto/Camellia.o
|
||||
OBJS += ../Crypto/GostCipher.o
|
||||
OBJS += ../Crypto/Streebog.o
|
||||
OBJS += ../Crypto/kuznyechik.o
|
||||
OBJS += ../Crypto/kuznyechik_simd.o
|
||||
|
||||
OBJS += ../Common/Crc.o
|
||||
OBJS += ../Common/Endian.o
|
||||
|
Loading…
Reference in New Issue
Block a user