author    Martin Willi <martin@revosec.ch>    2015-04-14 12:38:18 +0200
committer Martin Willi <martin@revosec.ch>    2015-04-15 13:44:40 +0200
commit    37794878ccea66310391194daa26e7cb1ca0e85e (patch)
tree      4d85951f1df7a19f5c98f06b1f2423ba1e2e97dd /src/libstrongswan/plugins
parent    93f0080265a700ab9060783807b53c39e1b536b1 (diff)
download  strongswan-37794878ccea66310391194daa26e7cb1ca0e85e.tar.bz2
          strongswan-37794878ccea66310391194daa26e7cb1ca0e85e.tar.xz
aesni: Avoid loading AES/GHASH round keys into local variables
The performance impact is not measurable, as the compiler loads these values into XMM registers in the unrolled loops anyway. However, we avoid spilling these sensitive keys onto the stack, which happens for the larger key schedules, where the register count is insufficient. If that key material never ends up on the stack, we don't have to wipe it explicitly after crypto operations.
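The pattern is roughly the following minimal sketch, which is illustrative only and not taken from the plugin: demo_key_t and the encrypt_block_*() helpers are hypothetical names, while the intrinsics are the real AES-NI ones from <immintrin.h> (compile with -maes). The "before" variant copies every round key into a local, which the compiler may spill to the stack for the larger AES-192/256 schedules; the "after" variant indexes the schedule through a pointer, so the round keys are only ever read from the key's own memory.

#include <immintrin.h>

typedef struct {
	__m128i schedule[11];	/* AES-128: initial key + 10 round keys */
} demo_key_t;

/* Before: one local per round key; with AES-192/256 the register count
 * is insufficient and some of these copies end up in stack slots. */
static __m128i encrypt_block_locals(demo_key_t *key, __m128i b)
{
	__m128i k0 = key->schedule[0], k1 = key->schedule[1],
			k2 = key->schedule[2], k3 = key->schedule[3],
			k4 = key->schedule[4], k5 = key->schedule[5],
			k6 = key->schedule[6], k7 = key->schedule[7],
			k8 = key->schedule[8], k9 = key->schedule[9],
			k10 = key->schedule[10];

	b = _mm_xor_si128(b, k0);
	b = _mm_aesenc_si128(b, k1);
	b = _mm_aesenc_si128(b, k2);
	b = _mm_aesenc_si128(b, k3);
	b = _mm_aesenc_si128(b, k4);
	b = _mm_aesenc_si128(b, k5);
	b = _mm_aesenc_si128(b, k6);
	b = _mm_aesenc_si128(b, k7);
	b = _mm_aesenc_si128(b, k8);
	b = _mm_aesenc_si128(b, k9);
	return _mm_aesenclast_si128(b, k10);
}

/* After: index the schedule through a pointer; round keys are loaded
 * straight from key memory and never duplicated on the stack. */
static __m128i encrypt_block_indexed(demo_key_t *key, __m128i b)
{
	__m128i *ks = key->schedule;

	b = _mm_xor_si128(b, ks[0]);
	b = _mm_aesenc_si128(b, ks[1]);
	b = _mm_aesenc_si128(b, ks[2]);
	b = _mm_aesenc_si128(b, ks[3]);
	b = _mm_aesenc_si128(b, ks[4]);
	b = _mm_aesenc_si128(b, ks[5]);
	b = _mm_aesenc_si128(b, ks[6]);
	b = _mm_aesenc_si128(b, ks[7]);
	b = _mm_aesenc_si128(b, ks[8]);
	b = _mm_aesenc_si128(b, ks[9]);
	return _mm_aesenclast_si128(b, ks[10]);
}

One way to inspect the effect is to compile both variants with gcc -O2 -maes -S and compare how often XMM registers are stored to the stack in the generated assembly.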
Diffstat (limited to 'src/libstrongswan/plugins')
-rw-r--r--  src/libstrongswan/plugins/aesni/aesni_cbc.c   | 612
-rw-r--r--  src/libstrongswan/plugins/aesni/aesni_ccm.c   | 496
-rw-r--r--  src/libstrongswan/plugins/aesni/aesni_cmac.c  |  82
-rw-r--r--  src/libstrongswan/plugins/aesni/aesni_ctr.c   | 543
-rw-r--r--  src/libstrongswan/plugins/aesni/aesni_gcm.c   | 998
-rw-r--r--  src/libstrongswan/plugins/aesni/aesni_xcbc.c  |  81
6 files changed, 1244 insertions(+), 1568 deletions(-)
diff --git a/src/libstrongswan/plugins/aesni/aesni_cbc.c b/src/libstrongswan/plugins/aesni/aesni_cbc.c
index f2fce0f13..78ada7663 100644
--- a/src/libstrongswan/plugins/aesni/aesni_cbc.c
+++ b/src/libstrongswan/plugins/aesni/aesni_cbc.c
@@ -70,22 +70,10 @@ struct private_aesni_cbc_t {
static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i t, fb, *bi, *bo;
+ __m128i *ks, t, fb, *bi, *bo;
int i;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
-
+ ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
@@ -94,19 +82,19 @@ static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
{
t = _mm_loadu_si128(bi + i);
fb = _mm_xor_si128(t, fb);
- fb = _mm_xor_si128(fb, k0);
-
- fb = _mm_aesenc_si128(fb, k1);
- fb = _mm_aesenc_si128(fb, k2);
- fb = _mm_aesenc_si128(fb, k3);
- fb = _mm_aesenc_si128(fb, k4);
- fb = _mm_aesenc_si128(fb, k5);
- fb = _mm_aesenc_si128(fb, k6);
- fb = _mm_aesenc_si128(fb, k7);
- fb = _mm_aesenc_si128(fb, k8);
- fb = _mm_aesenc_si128(fb, k9);
-
- fb = _mm_aesenclast_si128(fb, k10);
+ fb = _mm_xor_si128(fb, ks[0]);
+
+ fb = _mm_aesenc_si128(fb, ks[1]);
+ fb = _mm_aesenc_si128(fb, ks[2]);
+ fb = _mm_aesenc_si128(fb, ks[3]);
+ fb = _mm_aesenc_si128(fb, ks[4]);
+ fb = _mm_aesenc_si128(fb, ks[5]);
+ fb = _mm_aesenc_si128(fb, ks[6]);
+ fb = _mm_aesenc_si128(fb, ks[7]);
+ fb = _mm_aesenc_si128(fb, ks[8]);
+ fb = _mm_aesenc_si128(fb, ks[9]);
+
+ fb = _mm_aesenclast_si128(fb, ks[10]);
_mm_storeu_si128(bo + i, fb);
}
}
@@ -117,24 +105,12 @@ static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i last, *bi, *bo;
+ __m128i *ks, last, *bi, *bo;
__m128i t1, t2, t3, t4;
__m128i f1, f2, f3, f4;
u_int i, pblocks;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
-
+ ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@@ -153,52 +129,52 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
f4 = t3;
last = t4;
- t1 = _mm_xor_si128(t1, k0);
- t2 = _mm_xor_si128(t2, k0);
- t3 = _mm_xor_si128(t3, k0);
- t4 = _mm_xor_si128(t4, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t2 = _mm_aesdec_si128(t2, k1);
- t3 = _mm_aesdec_si128(t3, k1);
- t4 = _mm_aesdec_si128(t4, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t2 = _mm_aesdec_si128(t2, k2);
- t3 = _mm_aesdec_si128(t3, k2);
- t4 = _mm_aesdec_si128(t4, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t2 = _mm_aesdec_si128(t2, k3);
- t3 = _mm_aesdec_si128(t3, k3);
- t4 = _mm_aesdec_si128(t4, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t2 = _mm_aesdec_si128(t2, k4);
- t3 = _mm_aesdec_si128(t3, k4);
- t4 = _mm_aesdec_si128(t4, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t2 = _mm_aesdec_si128(t2, k5);
- t3 = _mm_aesdec_si128(t3, k5);
- t4 = _mm_aesdec_si128(t4, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t2 = _mm_aesdec_si128(t2, k6);
- t3 = _mm_aesdec_si128(t3, k6);
- t4 = _mm_aesdec_si128(t4, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t2 = _mm_aesdec_si128(t2, k7);
- t3 = _mm_aesdec_si128(t3, k7);
- t4 = _mm_aesdec_si128(t4, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t2 = _mm_aesdec_si128(t2, k8);
- t3 = _mm_aesdec_si128(t3, k8);
- t4 = _mm_aesdec_si128(t4, k8);
- t1 = _mm_aesdec_si128(t1, k9);
- t2 = _mm_aesdec_si128(t2, k9);
- t3 = _mm_aesdec_si128(t3, k9);
- t4 = _mm_aesdec_si128(t4, k9);
-
- t1 = _mm_aesdeclast_si128(t1, k10);
- t2 = _mm_aesdeclast_si128(t2, k10);
- t3 = _mm_aesdeclast_si128(t3, k10);
- t4 = _mm_aesdeclast_si128(t4, k10);
+ t1 = _mm_xor_si128(t1, ks[0]);
+ t2 = _mm_xor_si128(t2, ks[0]);
+ t3 = _mm_xor_si128(t3, ks[0]);
+ t4 = _mm_xor_si128(t4, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t2 = _mm_aesdec_si128(t2, ks[1]);
+ t3 = _mm_aesdec_si128(t3, ks[1]);
+ t4 = _mm_aesdec_si128(t4, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t2 = _mm_aesdec_si128(t2, ks[2]);
+ t3 = _mm_aesdec_si128(t3, ks[2]);
+ t4 = _mm_aesdec_si128(t4, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t2 = _mm_aesdec_si128(t2, ks[3]);
+ t3 = _mm_aesdec_si128(t3, ks[3]);
+ t4 = _mm_aesdec_si128(t4, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t2 = _mm_aesdec_si128(t2, ks[4]);
+ t3 = _mm_aesdec_si128(t3, ks[4]);
+ t4 = _mm_aesdec_si128(t4, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t2 = _mm_aesdec_si128(t2, ks[5]);
+ t3 = _mm_aesdec_si128(t3, ks[5]);
+ t4 = _mm_aesdec_si128(t4, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t2 = _mm_aesdec_si128(t2, ks[6]);
+ t3 = _mm_aesdec_si128(t3, ks[6]);
+ t4 = _mm_aesdec_si128(t4, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t2 = _mm_aesdec_si128(t2, ks[7]);
+ t3 = _mm_aesdec_si128(t3, ks[7]);
+ t4 = _mm_aesdec_si128(t4, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t2 = _mm_aesdec_si128(t2, ks[8]);
+ t3 = _mm_aesdec_si128(t3, ks[8]);
+ t4 = _mm_aesdec_si128(t4, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+ t2 = _mm_aesdec_si128(t2, ks[9]);
+ t3 = _mm_aesdec_si128(t3, ks[9]);
+ t4 = _mm_aesdec_si128(t4, ks[9]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[10]);
+ t2 = _mm_aesdeclast_si128(t2, ks[10]);
+ t3 = _mm_aesdeclast_si128(t3, ks[10]);
+ t4 = _mm_aesdeclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, f1);
t2 = _mm_xor_si128(t2, f2);
t3 = _mm_xor_si128(t3, f3);
@@ -213,19 +189,19 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
for (i = pblocks; i < blocks; i++)
{
last = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(last, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t1 = _mm_aesdec_si128(t1, k9);
-
- t1 = _mm_aesdeclast_si128(t1, k10);
+ t1 = _mm_xor_si128(last, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, f1);
_mm_storeu_si128(bo + i, t1);
f1 = last;
@@ -238,24 +214,10 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i t, fb, *bi, *bo;
+ __m128i *ks, t, fb, *bi, *bo;
int i;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
- k11 = key->schedule[11];
- k12 = key->schedule[12];
-
+ ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
@@ -264,21 +226,21 @@ static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
{
t = _mm_loadu_si128(bi + i);
fb = _mm_xor_si128(t, fb);
- fb = _mm_xor_si128(fb, k0);
-
- fb = _mm_aesenc_si128(fb, k1);
- fb = _mm_aesenc_si128(fb, k2);
- fb = _mm_aesenc_si128(fb, k3);
- fb = _mm_aesenc_si128(fb, k4);
- fb = _mm_aesenc_si128(fb, k5);
- fb = _mm_aesenc_si128(fb, k6);
- fb = _mm_aesenc_si128(fb, k7);
- fb = _mm_aesenc_si128(fb, k8);
- fb = _mm_aesenc_si128(fb, k9);
- fb = _mm_aesenc_si128(fb, k10);
- fb = _mm_aesenc_si128(fb, k11);
-
- fb = _mm_aesenclast_si128(fb, k12);
+ fb = _mm_xor_si128(fb, ks[0]);
+
+ fb = _mm_aesenc_si128(fb, ks[1]);
+ fb = _mm_aesenc_si128(fb, ks[2]);
+ fb = _mm_aesenc_si128(fb, ks[3]);
+ fb = _mm_aesenc_si128(fb, ks[4]);
+ fb = _mm_aesenc_si128(fb, ks[5]);
+ fb = _mm_aesenc_si128(fb, ks[6]);
+ fb = _mm_aesenc_si128(fb, ks[7]);
+ fb = _mm_aesenc_si128(fb, ks[8]);
+ fb = _mm_aesenc_si128(fb, ks[9]);
+ fb = _mm_aesenc_si128(fb, ks[10]);
+ fb = _mm_aesenc_si128(fb, ks[11]);
+
+ fb = _mm_aesenclast_si128(fb, ks[12]);
_mm_storeu_si128(bo + i, fb);
}
}
@@ -289,26 +251,12 @@ static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i last, *bi, *bo;
+ __m128i *ks, last, *bi, *bo;
__m128i t1, t2, t3, t4;
__m128i f1, f2, f3, f4;
u_int i, pblocks;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
- k11 = key->schedule[11];
- k12 = key->schedule[12];
-
+ ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@@ -327,60 +275,60 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
f4 = t3;
last = t4;
- t1 = _mm_xor_si128(t1, k0);
- t2 = _mm_xor_si128(t2, k0);
- t3 = _mm_xor_si128(t3, k0);
- t4 = _mm_xor_si128(t4, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t2 = _mm_aesdec_si128(t2, k1);
- t3 = _mm_aesdec_si128(t3, k1);
- t4 = _mm_aesdec_si128(t4, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t2 = _mm_aesdec_si128(t2, k2);
- t3 = _mm_aesdec_si128(t3, k2);
- t4 = _mm_aesdec_si128(t4, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t2 = _mm_aesdec_si128(t2, k3);
- t3 = _mm_aesdec_si128(t3, k3);
- t4 = _mm_aesdec_si128(t4, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t2 = _mm_aesdec_si128(t2, k4);
- t3 = _mm_aesdec_si128(t3, k4);
- t4 = _mm_aesdec_si128(t4, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t2 = _mm_aesdec_si128(t2, k5);
- t3 = _mm_aesdec_si128(t3, k5);
- t4 = _mm_aesdec_si128(t4, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t2 = _mm_aesdec_si128(t2, k6);
- t3 = _mm_aesdec_si128(t3, k6);
- t4 = _mm_aesdec_si128(t4, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t2 = _mm_aesdec_si128(t2, k7);
- t3 = _mm_aesdec_si128(t3, k7);
- t4 = _mm_aesdec_si128(t4, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t2 = _mm_aesdec_si128(t2, k8);
- t3 = _mm_aesdec_si128(t3, k8);
- t4 = _mm_aesdec_si128(t4, k8);
- t1 = _mm_aesdec_si128(t1, k9);
- t2 = _mm_aesdec_si128(t2, k9);
- t3 = _mm_aesdec_si128(t3, k9);
- t4 = _mm_aesdec_si128(t4, k9);
- t1 = _mm_aesdec_si128(t1, k10);
- t2 = _mm_aesdec_si128(t2, k10);
- t3 = _mm_aesdec_si128(t3, k10);
- t4 = _mm_aesdec_si128(t4, k10);
- t1 = _mm_aesdec_si128(t1, k11);
- t2 = _mm_aesdec_si128(t2, k11);
- t3 = _mm_aesdec_si128(t3, k11);
- t4 = _mm_aesdec_si128(t4, k11);
-
- t1 = _mm_aesdeclast_si128(t1, k12);
- t2 = _mm_aesdeclast_si128(t2, k12);
- t3 = _mm_aesdeclast_si128(t3, k12);
- t4 = _mm_aesdeclast_si128(t4, k12);
+ t1 = _mm_xor_si128(t1, ks[0]);
+ t2 = _mm_xor_si128(t2, ks[0]);
+ t3 = _mm_xor_si128(t3, ks[0]);
+ t4 = _mm_xor_si128(t4, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t2 = _mm_aesdec_si128(t2, ks[1]);
+ t3 = _mm_aesdec_si128(t3, ks[1]);
+ t4 = _mm_aesdec_si128(t4, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t2 = _mm_aesdec_si128(t2, ks[2]);
+ t3 = _mm_aesdec_si128(t3, ks[2]);
+ t4 = _mm_aesdec_si128(t4, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t2 = _mm_aesdec_si128(t2, ks[3]);
+ t3 = _mm_aesdec_si128(t3, ks[3]);
+ t4 = _mm_aesdec_si128(t4, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t2 = _mm_aesdec_si128(t2, ks[4]);
+ t3 = _mm_aesdec_si128(t3, ks[4]);
+ t4 = _mm_aesdec_si128(t4, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t2 = _mm_aesdec_si128(t2, ks[5]);
+ t3 = _mm_aesdec_si128(t3, ks[5]);
+ t4 = _mm_aesdec_si128(t4, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t2 = _mm_aesdec_si128(t2, ks[6]);
+ t3 = _mm_aesdec_si128(t3, ks[6]);
+ t4 = _mm_aesdec_si128(t4, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t2 = _mm_aesdec_si128(t2, ks[7]);
+ t3 = _mm_aesdec_si128(t3, ks[7]);
+ t4 = _mm_aesdec_si128(t4, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t2 = _mm_aesdec_si128(t2, ks[8]);
+ t3 = _mm_aesdec_si128(t3, ks[8]);
+ t4 = _mm_aesdec_si128(t4, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+ t2 = _mm_aesdec_si128(t2, ks[9]);
+ t3 = _mm_aesdec_si128(t3, ks[9]);
+ t4 = _mm_aesdec_si128(t4, ks[9]);
+ t1 = _mm_aesdec_si128(t1, ks[10]);
+ t2 = _mm_aesdec_si128(t2, ks[10]);
+ t3 = _mm_aesdec_si128(t3, ks[10]);
+ t4 = _mm_aesdec_si128(t4, ks[10]);
+ t1 = _mm_aesdec_si128(t1, ks[11]);
+ t2 = _mm_aesdec_si128(t2, ks[11]);
+ t3 = _mm_aesdec_si128(t3, ks[11]);
+ t4 = _mm_aesdec_si128(t4, ks[11]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[12]);
+ t2 = _mm_aesdeclast_si128(t2, ks[12]);
+ t3 = _mm_aesdeclast_si128(t3, ks[12]);
+ t4 = _mm_aesdeclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, f1);
t2 = _mm_xor_si128(t2, f2);
t3 = _mm_xor_si128(t3, f3);
@@ -395,21 +343,21 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
for (i = pblocks; i < blocks; i++)
{
last = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(last, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t1 = _mm_aesdec_si128(t1, k9);
- t1 = _mm_aesdec_si128(t1, k10);
- t1 = _mm_aesdec_si128(t1, k11);
-
- t1 = _mm_aesdeclast_si128(t1, k12);
+ t1 = _mm_xor_si128(last, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+ t1 = _mm_aesdec_si128(t1, ks[10]);
+ t1 = _mm_aesdec_si128(t1, ks[11]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, f1);
_mm_storeu_si128(bo + i, t1);
f1 = last;
@@ -422,26 +370,10 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i t, fb, *bi, *bo;
+ __m128i *ks, t, fb, *bi, *bo;
int i;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
- k11 = key->schedule[11];
- k12 = key->schedule[12];
- k13 = key->schedule[13];
- k14 = key->schedule[14];
-
+ ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
@@ -450,23 +382,23 @@ static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
{
t = _mm_loadu_si128(bi + i);
fb = _mm_xor_si128(t, fb);
- fb = _mm_xor_si128(fb, k0);
-
- fb = _mm_aesenc_si128(fb, k1);
- fb = _mm_aesenc_si128(fb, k2);
- fb = _mm_aesenc_si128(fb, k3);
- fb = _mm_aesenc_si128(fb, k4);
- fb = _mm_aesenc_si128(fb, k5);
- fb = _mm_aesenc_si128(fb, k6);
- fb = _mm_aesenc_si128(fb, k7);
- fb = _mm_aesenc_si128(fb, k8);
- fb = _mm_aesenc_si128(fb, k9);
- fb = _mm_aesenc_si128(fb, k10);
- fb = _mm_aesenc_si128(fb, k11);
- fb = _mm_aesenc_si128(fb, k12);
- fb = _mm_aesenc_si128(fb, k13);
-
- fb = _mm_aesenclast_si128(fb, k14);
+ fb = _mm_xor_si128(fb, ks[0]);
+
+ fb = _mm_aesenc_si128(fb, ks[1]);
+ fb = _mm_aesenc_si128(fb, ks[2]);
+ fb = _mm_aesenc_si128(fb, ks[3]);
+ fb = _mm_aesenc_si128(fb, ks[4]);
+ fb = _mm_aesenc_si128(fb, ks[5]);
+ fb = _mm_aesenc_si128(fb, ks[6]);
+ fb = _mm_aesenc_si128(fb, ks[7]);
+ fb = _mm_aesenc_si128(fb, ks[8]);
+ fb = _mm_aesenc_si128(fb, ks[9]);
+ fb = _mm_aesenc_si128(fb, ks[10]);
+ fb = _mm_aesenc_si128(fb, ks[11]);
+ fb = _mm_aesenc_si128(fb, ks[12]);
+ fb = _mm_aesenc_si128(fb, ks[13]);
+
+ fb = _mm_aesenclast_si128(fb, ks[14]);
_mm_storeu_si128(bo + i, fb);
}
}
@@ -477,28 +409,12 @@ static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i last, *bi, *bo;
+ __m128i *ks, last, *bi, *bo;
__m128i t1, t2, t3, t4;
__m128i f1, f2, f3, f4;
u_int i, pblocks;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
- k11 = key->schedule[11];
- k12 = key->schedule[12];
- k13 = key->schedule[13];
- k14 = key->schedule[14];
-
+ ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@@ -517,68 +433,68 @@ static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
f4 = t3;
last = t4;
- t1 = _mm_xor_si128(t1, k0);
- t2 = _mm_xor_si128(t2, k0);
- t3 = _mm_xor_si128(t3, k0);
- t4 = _mm_xor_si128(t4, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t2 = _mm_aesdec_si128(t2, k1);
- t3 = _mm_aesdec_si128(t3, k1);
- t4 = _mm_aesdec_si128(t4, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t2 = _mm_aesdec_si128(t2, k2);
- t3 = _mm_aesdec_si128(t3, k2);
- t4 = _mm_aesdec_si128(t4, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t2 = _mm_aesdec_si128(t2, k3);
- t3 = _mm_aesdec_si128(t3, k3);
- t4 = _mm_aesdec_si128(t4, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t2 = _mm_aesdec_si128(t2, k4);
- t3 = _mm_aesdec_si128(t3, k4);
- t4 = _mm_aesdec_si128(t4, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t2 = _mm_aesdec_si128(t2, k5);
- t3 = _mm_aesdec_si128(t3, k5);
- t4 = _mm_aesdec_si128(t4, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t2 = _mm_aesdec_si128(t2, k6);
- t3 = _mm_aesdec_si128(t3, k6);
- t4 = _mm_aesdec_si128(t4, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t2 = _mm_aesdec_si128(t2, k7);
- t3 = _mm_aesdec_si128(t3, k7);
- t4 = _mm_aesdec_si128(t4, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t2 = _mm_aesdec_si128(t2, k8);
- t3 = _mm_aesdec_si128(t3, k8);
- t4 = _mm_aesdec_si128(t4, k8);
- t1 = _mm_aesdec_si128(t1, k9);
- t2 = _mm_aesdec_si128(t2, k9);
- t3 = _mm_aesdec_si128(t3, k9);
- t4 = _mm_aesdec_si128(t4, k9);
- t1 = _mm_aesdec_si128(t1, k10);
- t2 = _mm_aesdec_si128(t2, k10);
- t3 = _mm_aesdec_si128(t3, k10);
- t4 = _mm_aesdec_si128(t4, k10);
- t1 = _mm_aesdec_si128(t1, k11);
- t2 = _mm_aesdec_si128(t2, k11);
- t3 = _mm_aesdec_si128(t3, k11);
- t4 = _mm_aesdec_si128(t4, k11);
- t1 = _mm_aesdec_si128(t1, k12);
- t2 = _mm_aesdec_si128(t2, k12);
- t3 = _mm_aesdec_si128(t3, k12);
- t4 = _mm_aesdec_si128(t4, k12);
- t1 = _mm_aesdec_si128(t1, k13);
- t2 = _mm_aesdec_si128(t2, k13);
- t3 = _mm_aesdec_si128(t3, k13);
- t4 = _mm_aesdec_si128(t4, k13);
-
- t1 = _mm_aesdeclast_si128(t1, k14);
- t2 = _mm_aesdeclast_si128(t2, k14);
- t3 = _mm_aesdeclast_si128(t3, k14);
- t4 = _mm_aesdeclast_si128(t4, k14);
+ t1 = _mm_xor_si128(t1, ks[0]);
+ t2 = _mm_xor_si128(t2, ks[0]);
+ t3 = _mm_xor_si128(t3, ks[0]);
+ t4 = _mm_xor_si128(t4, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t2 = _mm_aesdec_si128(t2, ks[1]);
+ t3 = _mm_aesdec_si128(t3, ks[1]);
+ t4 = _mm_aesdec_si128(t4, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t2 = _mm_aesdec_si128(t2, ks[2]);
+ t3 = _mm_aesdec_si128(t3, ks[2]);
+ t4 = _mm_aesdec_si128(t4, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t2 = _mm_aesdec_si128(t2, ks[3]);
+ t3 = _mm_aesdec_si128(t3, ks[3]);
+ t4 = _mm_aesdec_si128(t4, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t2 = _mm_aesdec_si128(t2, ks[4]);
+ t3 = _mm_aesdec_si128(t3, ks[4]);
+ t4 = _mm_aesdec_si128(t4, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t2 = _mm_aesdec_si128(t2, ks[5]);
+ t3 = _mm_aesdec_si128(t3, ks[5]);
+ t4 = _mm_aesdec_si128(t4, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t2 = _mm_aesdec_si128(t2, ks[6]);
+ t3 = _mm_aesdec_si128(t3, ks[6]);
+ t4 = _mm_aesdec_si128(t4, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t2 = _mm_aesdec_si128(t2, ks[7]);
+ t3 = _mm_aesdec_si128(t3, ks[7]);
+ t4 = _mm_aesdec_si128(t4, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t2 = _mm_aesdec_si128(t2, ks[8]);
+ t3 = _mm_aesdec_si128(t3, ks[8]);
+ t4 = _mm_aesdec_si128(t4, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+ t2 = _mm_aesdec_si128(t2, ks[9]);
+ t3 = _mm_aesdec_si128(t3, ks[9]);
+ t4 = _mm_aesdec_si128(t4, ks[9]);
+ t1 = _mm_aesdec_si128(t1, ks[10]);
+ t2 = _mm_aesdec_si128(t2, ks[10]);
+ t3 = _mm_aesdec_si128(t3, ks[10]);
+ t4 = _mm_aesdec_si128(t4, ks[10]);
+ t1 = _mm_aesdec_si128(t1, ks[11]);
+ t2 = _mm_aesdec_si128(t2, ks[11]);
+ t3 = _mm_aesdec_si128(t3, ks[11]);
+ t4 = _mm_aesdec_si128(t4, ks[11]);
+ t1 = _mm_aesdec_si128(t1, ks[12]);
+ t2 = _mm_aesdec_si128(t2, ks[12]);
+ t3 = _mm_aesdec_si128(t3, ks[12]);
+ t4 = _mm_aesdec_si128(t4, ks[12]);
+ t1 = _mm_aesdec_si128(t1, ks[13]);
+ t2 = _mm_aesdec_si128(t2, ks[13]);
+ t3 = _mm_aesdec_si128(t3, ks[13]);
+ t4 = _mm_aesdec_si128(t4, ks[13]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[14]);
+ t2 = _mm_aesdeclast_si128(t2, ks[14]);
+ t3 = _mm_aesdeclast_si128(t3, ks[14]);
+ t4 = _mm_aesdeclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, f1);
t2 = _mm_xor_si128(t2, f2);
t3 = _mm_xor_si128(t3, f3);
@@ -593,23 +509,23 @@ static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
for (i = pblocks; i < blocks; i++)
{
last = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(last, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t1 = _mm_aesdec_si128(t1, k9);
- t1 = _mm_aesdec_si128(t1, k10);
- t1 = _mm_aesdec_si128(t1, k11);
- t1 = _mm_aesdec_si128(t1, k12);
- t1 = _mm_aesdec_si128(t1, k13);
-
- t1 = _mm_aesdeclast_si128(t1, k14);
+ t1 = _mm_xor_si128(last, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+ t1 = _mm_aesdec_si128(t1, ks[10]);
+ t1 = _mm_aesdec_si128(t1, ks[11]);
+ t1 = _mm_aesdec_si128(t1, ks[12]);
+ t1 = _mm_aesdec_si128(t1, ks[13]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, f1);
_mm_storeu_si128(bo + i, t1);
f1 = last;
diff --git a/src/libstrongswan/plugins/aesni/aesni_ccm.c b/src/libstrongswan/plugins/aesni/aesni_ccm.c
index 0e4a24f30..d523bc17a 100644
--- a/src/libstrongswan/plugins/aesni/aesni_ccm.c
+++ b/src/libstrongswan/plugins/aesni/aesni_ccm.c
@@ -159,17 +159,18 @@ static void build_ctr(private_aesni_ccm_t *this, u_int32_t i, u_char *iv,
static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
u_int16_t alen, u_char *assoc)
{
- __m128i b, t, c;
+ __m128i *ks, b, t, c;
u_int i, round, blocks, rem;
+ ks = this->key->schedule;
build_b0(this, len, alen, iv, &b);
c = _mm_loadu_si128(&b);
- c = _mm_xor_si128(c, this->key->schedule[0]);
+ c = _mm_xor_si128(c, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- c = _mm_aesenc_si128(c, this->key->schedule[round]);
+ c = _mm_aesenc_si128(c, ks[round]);
}
- c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
+ c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
if (alen)
{
@@ -200,12 +201,12 @@ static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
t = _mm_loadu_si128(((__m128i*)(assoc - sizeof(alen))) + i);
}
c = _mm_xor_si128(t, c);
- c = _mm_xor_si128(c, this->key->schedule[0]);
+ c = _mm_xor_si128(c, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- c = _mm_aesenc_si128(c, this->key->schedule[round]);
+ c = _mm_aesenc_si128(c, ks[round]);
}
- c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
+ c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
}
}
return c;
@@ -217,18 +218,19 @@ static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
static void crypt_icv(private_aesni_ccm_t *this, u_char *iv,
__m128i c, u_char *icv)
{
- __m128i b, t;
+ __m128i *ks, b, t;
u_int round;
+ ks = this->key->schedule;
build_ctr(this, 0, iv, &b);
t = _mm_loadu_si128(&b);
- t = _mm_xor_si128(t, this->key->schedule[0]);
+ t = _mm_xor_si128(t, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- t = _mm_aesenc_si128(t, this->key->schedule[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
t = _mm_xor_si128(t, c);
@@ -258,23 +260,24 @@ static inline __m128i increment_be(__m128i x)
static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
void *in, void *out, __m128i c)
{
- __m128i t, b, d;
+ __m128i *ks, t, b, d;
u_int round;
+ ks = key->schedule;
memset(&b, 0, sizeof(b));
memcpy(&b, in, rem);
d = _mm_loadu_si128(&b);
c = _mm_xor_si128(d, c);
- c = _mm_xor_si128(c, key->schedule[0]);
- t = _mm_xor_si128(state, key->schedule[0]);
+ c = _mm_xor_si128(c, ks[0]);
+ t = _mm_xor_si128(state, ks[0]);
for (round = 1; round < key->rounds; round++)
{
- c = _mm_aesenc_si128(c, key->schedule[round]);
- t = _mm_aesenc_si128(t, key->schedule[round]);
+ c = _mm_aesenc_si128(c, ks[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- c = _mm_aesenclast_si128(c, key->schedule[key->rounds]);
- t = _mm_aesenclast_si128(t, key->schedule[key->rounds]);
+ c = _mm_aesenclast_si128(c, ks[key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[key->rounds]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(&b, t);
@@ -290,31 +293,32 @@ static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
static __m128i decrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
void *in, void *out, __m128i c)
{
- __m128i t, b, d;
+ __m128i *ks, t, b, d;
u_int round;
+ ks = key->schedule;
memset(&b, 0, sizeof(b));
memcpy(&b, in, rem);
d = _mm_loadu_si128(&b);
- t = _mm_xor_si128(state, key->schedule[0]);
+ t = _mm_xor_si128(state, ks[0]);
for (round = 1; round < key->rounds; round++)
{
- t = _mm_aesenc_si128(t, key->schedule[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- t = _mm_aesenclast_si128(t, key->schedule[key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[key->rounds]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(&b, t);
memset((u_char*)&b + rem, 0, sizeof(b) - rem);
t = _mm_loadu_si128(&b);
c = _mm_xor_si128(t, c);
- c = _mm_xor_si128(c, key->schedule[0]);
+ c = _mm_xor_si128(c, ks[0]);
for (round = 1; round < key->rounds; round++)
{
- c = _mm_aesenc_si128(c, key->schedule[round]);
+ c = _mm_aesenc_si128(c, ks[round]);
}
- c = _mm_aesenclast_si128(c, key->schedule[key->rounds]);
+ c = _mm_aesenclast_si128(c, ks[key->rounds]);
memcpy(out, &b, rem);
@@ -328,8 +332,7 @@ static void encrypt_ccm128(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@@ -340,47 +343,37 @@ static void encrypt_ccm128(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
c = _mm_xor_si128(d, c);
- c = _mm_xor_si128(c, k0);
- t = _mm_xor_si128(state, k0);
-
- c = _mm_aesenc_si128(c, k1);
- t = _mm_aesenc_si128(t, k1);
- c = _mm_aesenc_si128(c, k2);
- t = _mm_aesenc_si128(t, k2);
- c = _mm_aesenc_si128(c, k3);
- t = _mm_aesenc_si128(t, k3);
- c = _mm_aesenc_si128(c, k4);
- t = _mm_aesenc_si128(t, k4);
- c = _mm_aesenc_si128(c, k5);
- t = _mm_aesenc_si128(t, k5);
- c = _mm_aesenc_si128(c, k6);
- t = _mm_aesenc_si128(t, k6);
- c = _mm_aesenc_si128(c, k7);
- t = _mm_aesenc_si128(t, k7);
- c = _mm_aesenc_si128(c, k8);
- t = _mm_aesenc_si128(t, k8);
- c = _mm_aesenc_si128(c, k9);
- t = _mm_aesenc_si128(t, k9);
-
- c = _mm_aesenclast_si128(c, k10);
- t = _mm_aesenclast_si128(t, k10);
+ c = _mm_xor_si128(c, ks[0]);
+ t = _mm_xor_si128(state, ks[0]);
+
+ c = _mm_aesenc_si128(c, ks[1]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
+ t = _mm_aesenc_si128(t, ks[9]);
+
+ c = _mm_aesenclast_si128(c, ks[10]);
+ t = _mm_aesenclast_si128(t, ks[10]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
@@ -402,8 +395,7 @@ static void decrypt_ccm128(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@@ -414,52 +406,42 @@ static void decrypt_ccm128(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
- t = _mm_xor_si128(state, k0);
+ t = _mm_xor_si128(state, ks[0]);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
- t = _mm_aesenclast_si128(t, k10);
+ t = _mm_aesenclast_si128(t, ks[10]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
- c = _mm_xor_si128(c, k0);
+ c = _mm_xor_si128(c, ks[0]);
- c = _mm_aesenc_si128(c, k1);
- c = _mm_aesenc_si128(c, k2);
- c = _mm_aesenc_si128(c, k3);
- c = _mm_aesenc_si128(c, k4);
- c = _mm_aesenc_si128(c, k5);
- c = _mm_aesenc_si128(c, k6);
- c = _mm_aesenc_si128(c, k7);
- c = _mm_aesenc_si128(c, k8);
- c = _mm_aesenc_si128(c, k9);
+ c = _mm_aesenc_si128(c, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
- c = _mm_aesenclast_si128(c, k10);
+ c = _mm_aesenclast_si128(c, ks[10]);
state = increment_be(state);
}
@@ -478,8 +460,7 @@ static void encrypt_ccm192(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@@ -490,53 +471,41 @@ static void encrypt_ccm192(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
c = _mm_xor_si128(d, c);
- c = _mm_xor_si128(c, k0);
- t = _mm_xor_si128(state, k0);
-
- c = _mm_aesenc_si128(c, k1);
- t = _mm_aesenc_si128(t, k1);
- c = _mm_aesenc_si128(c, k2);
- t = _mm_aesenc_si128(t, k2);
- c = _mm_aesenc_si128(c, k3);
- t = _mm_aesenc_si128(t, k3);
- c = _mm_aesenc_si128(c, k4);
- t = _mm_aesenc_si128(t, k4);
- c = _mm_aesenc_si128(c, k5);
- t = _mm_aesenc_si128(t, k5);
- c = _mm_aesenc_si128(c, k6);
- t = _mm_aesenc_si128(t, k6);
- c = _mm_aesenc_si128(c, k7);
- t = _mm_aesenc_si128(t, k7);
- c = _mm_aesenc_si128(c, k8);
- t = _mm_aesenc_si128(t, k8);
- c = _mm_aesenc_si128(c, k9);
- t = _mm_aesenc_si128(t, k9);
- c = _mm_aesenc_si128(c, k10);
- t = _mm_aesenc_si128(t, k10);
- c = _mm_aesenc_si128(c, k11);
- t = _mm_aesenc_si128(t, k11);
-
- c = _mm_aesenclast_si128(c, k12);
- t = _mm_aesenclast_si128(t, k12);
+ c = _mm_xor_si128(c, ks[0]);
+ t = _mm_xor_si128(state, ks[0]);
+
+ c = _mm_aesenc_si128(c, ks[1]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ c = _mm_aesenc_si128(c, ks[10]);
+ t = _mm_aesenc_si128(t, ks[10]);
+ c = _mm_aesenc_si128(c, ks[11]);
+ t = _mm_aesenc_si128(t, ks[11]);
+
+ c = _mm_aesenclast_si128(c, ks[12]);
+ t = _mm_aesenclast_si128(t, ks[12]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
@@ -558,8 +527,7 @@ static void decrypt_ccm192(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@@ -570,58 +538,46 @@ static void decrypt_ccm192(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
- t = _mm_xor_si128(state, k0);
-
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenc_si128(t, k10);
- t = _mm_aesenc_si128(t, k11);
-
- t = _mm_aesenclast_si128(t, k12);
+ t = _mm_xor_si128(state, ks[0]);
+
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ t = _mm_aesenc_si128(t, ks[10]);
+ t = _mm_aesenc_si128(t, ks[11]);
+
+ t = _mm_aesenclast_si128(t, ks[12]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
- c = _mm_xor_si128(c, k0);
-
- c = _mm_aesenc_si128(c, k1);
- c = _mm_aesenc_si128(c, k2);
- c = _mm_aesenc_si128(c, k3);
- c = _mm_aesenc_si128(c, k4);
- c = _mm_aesenc_si128(c, k5);
- c = _mm_aesenc_si128(c, k6);
- c = _mm_aesenc_si128(c, k7);
- c = _mm_aesenc_si128(c, k8);
- c = _mm_aesenc_si128(c, k9);
- c = _mm_aesenc_si128(c, k10);
- c = _mm_aesenc_si128(c, k11);
-
- c = _mm_aesenclast_si128(c, k12);
+ c = _mm_xor_si128(c, ks[0]);
+
+ c = _mm_aesenc_si128(c, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
+ c = _mm_aesenc_si128(c, ks[10]);
+ c = _mm_aesenc_si128(c, ks[11]);
+
+ c = _mm_aesenclast_si128(c, ks[12]);
state = increment_be(state);
}
@@ -640,8 +596,7 @@ static void encrypt_ccm256(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@@ -652,59 +607,45 @@ static void encrypt_ccm256(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
- k13 = this->key->schedule[13];
- k14 = this->key->schedule[14];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
c = _mm_xor_si128(d, c);
- c = _mm_xor_si128(c, k0);
- t = _mm_xor_si128(state, k0);
-
- c = _mm_aesenc_si128(c, k1);
- t = _mm_aesenc_si128(t, k1);
- c = _mm_aesenc_si128(c, k2);
- t = _mm_aesenc_si128(t, k2);
- c = _mm_aesenc_si128(c, k3);
- t = _mm_aesenc_si128(t, k3);
- c = _mm_aesenc_si128(c, k4);
- t = _mm_aesenc_si128(t, k4);
- c = _mm_aesenc_si128(c, k5);
- t = _mm_aesenc_si128(t, k5);
- c = _mm_aesenc_si128(c, k6);
- t = _mm_aesenc_si128(t, k6);
- c = _mm_aesenc_si128(c, k7);
- t = _mm_aesenc_si128(t, k7);
- c = _mm_aesenc_si128(c, k8);
- t = _mm_aesenc_si128(t, k8);
- c = _mm_aesenc_si128(c, k9);
- t = _mm_aesenc_si128(t, k9);
- c = _mm_aesenc_si128(c, k10);
- t = _mm_aesenc_si128(t, k10);
- c = _mm_aesenc_si128(c, k11);
- t = _mm_aesenc_si128(t, k11);
- c = _mm_aesenc_si128(c, k12);
- t = _mm_aesenc_si128(t, k12);
- c = _mm_aesenc_si128(c, k13);
- t = _mm_aesenc_si128(t, k13);
-
- c = _mm_aesenclast_si128(c, k14);
- t = _mm_aesenclast_si128(t, k14);
+ c = _mm_xor_si128(c, ks[0]);
+ t = _mm_xor_si128(state, ks[0]);
+
+ c = _mm_aesenc_si128(c, ks[1]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ c = _mm_aesenc_si128(c, ks[10]);
+ t = _mm_aesenc_si128(t, ks[10]);
+ c = _mm_aesenc_si128(c, ks[11]);
+ t = _mm_aesenc_si128(t, ks[11]);
+ c = _mm_aesenc_si128(c, ks[12]);
+ t = _mm_aesenc_si128(t, ks[12]);
+ c = _mm_aesenc_si128(c, ks[13]);
+ t = _mm_aesenc_si128(t, ks[13]);
+
+ c = _mm_aesenclast_si128(c, ks[14]);
+ t = _mm_aesenclast_si128(t, ks[14]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
@@ -726,8 +667,7 @@ static void decrypt_ccm256(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@@ -738,64 +678,50 @@ static void decrypt_ccm256(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
- k13 = this->key->schedule[13];
- k14 = this->key->schedule[14];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
- t = _mm_xor_si128(state, k0);
-
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenc_si128(t, k10);
- t = _mm_aesenc_si128(t, k11);
- t = _mm_aesenc_si128(t, k12);
- t = _mm_aesenc_si128(t, k13);
-
- t = _mm_aesenclast_si128(t, k14);
+ t = _mm_xor_si128(state, ks[0]);
+
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ t = _mm_aesenc_si128(t, ks[10]);
+ t = _mm_aesenc_si128(t, ks[11]);
+ t = _mm_aesenc_si128(t, ks[12]);
+ t = _mm_aesenc_si128(t, ks[13]);
+
+ t = _mm_aesenclast_si128(t, ks[14]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
- c = _mm_xor_si128(c, k0);
-
- c = _mm_aesenc_si128(c, k1);
- c = _mm_aesenc_si128(c, k2);
- c = _mm_aesenc_si128(c, k3);
- c = _mm_aesenc_si128(c, k4);
- c = _mm_aesenc_si128(c, k5);
- c = _mm_aesenc_si128(c, k6);
- c = _mm_aesenc_si128(c, k7);
- c = _mm_aesenc_si128(c, k8);
- c = _mm_aesenc_si128(c, k9);
- c = _mm_aesenc_si128(c, k10);
- c = _mm_aesenc_si128(c, k11);
- c = _mm_aesenc_si128(c, k12);
- c = _mm_aesenc_si128(c, k13);
-
- c = _mm_aesenclast_si128(c, k14);
+ c = _mm_xor_si128(c, ks[0]);
+
+ c = _mm_aesenc_si128(c, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
+ c = _mm_aesenc_si128(c, ks[10]);
+ c = _mm_aesenc_si128(c, ks[11]);
+ c = _mm_aesenc_si128(c, ks[12]);
+ c = _mm_aesenc_si128(c, ks[13]);
+
+ c = _mm_aesenclast_si128(c, ks[14]);
state = increment_be(state);
}
diff --git a/src/libstrongswan/plugins/aesni/aesni_cmac.c b/src/libstrongswan/plugins/aesni/aesni_cmac.c
index a35445fb4..d6a87e6d7 100644
--- a/src/libstrongswan/plugins/aesni/aesni_cmac.c
+++ b/src/libstrongswan/plugins/aesni/aesni_cmac.c
@@ -67,8 +67,7 @@ struct private_mac_t {
METHOD(mac_t, get_mac, bool,
private_mac_t *this, chunk_t data, u_int8_t *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i t, l, *bi;
+ __m128i *ks, t, l, *bi;
u_int blocks, rem, i;
if (!this->k)
@@ -76,18 +75,7 @@ METHOD(mac_t, get_mac, bool,
return FALSE;
}
- k0 = this->k->schedule[0];
- k1 = this->k->schedule[1];
- k2 = this->k->schedule[2];
- k3 = this->k->schedule[3];
- k4 = this->k->schedule[4];
- k5 = this->k->schedule[5];
- k6 = this->k->schedule[6];
- k7 = this->k->schedule[7];
- k8 = this->k->schedule[8];
- k9 = this->k->schedule[9];
- k10 = this->k->schedule[10];
-
+ ks = this->k->schedule;
t = this->t;
if (this->rem_size + data.len > AES_BLOCK_SIZE)
@@ -105,17 +93,17 @@ METHOD(mac_t, get_mac, bool,
t = _mm_xor_si128(t, _mm_loadu_si128((__m128i*)this->rem));
- t = _mm_xor_si128(t, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenclast_si128(t, k10);
+ t = _mm_xor_si128(t, ks[0]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ t = _mm_aesenclast_si128(t, ks[10]);
/* process blocks M_2 ... M_n-1 */
bi = (__m128i*)data.ptr;
@@ -132,17 +120,17 @@ METHOD(mac_t, get_mac, bool,
{
t = _mm_xor_si128(t, _mm_loadu_si128(bi + i));
- t = _mm_xor_si128(t, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenclast_si128(t, k10);
+ t = _mm_xor_si128(t, ks[0]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ t = _mm_aesenclast_si128(t, ks[10]);
}
/* store remaining bytes of block M_n */
@@ -188,17 +176,17 @@ METHOD(mac_t, get_mac, bool,
*/
t = _mm_xor_si128(l, t);
- t = _mm_xor_si128(t, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenclast_si128(t, k10);
+ t = _mm_xor_si128(t, ks[0]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ t = _mm_aesenclast_si128(t, ks[10]);
_mm_storeu_si128((__m128i*)out, t);
diff --git a/src/libstrongswan/plugins/aesni/aesni_ctr.c b/src/libstrongswan/plugins/aesni/aesni_ctr.c
index e6f9b841a..989813814 100644
--- a/src/libstrongswan/plugins/aesni/aesni_ctr.c
+++ b/src/libstrongswan/plugins/aesni/aesni_ctr.c
@@ -87,10 +87,9 @@ static inline __m128i increment_be(__m128i x)
static void encrypt_ctr128(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
- __m128i state, b, *bi, *bo;
+ __m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;
state = _mm_load_si128((__m128i*)&this->state);
@@ -100,17 +99,7 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
@@ -119,56 +108,56 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t2 = _mm_xor_si128(state, k0);
+ t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t3 = _mm_xor_si128(state, k0);
+ t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t4 = _mm_xor_si128(state, k0);
+ t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
-
- t1 = _mm_aesenclast_si128(t1, k10);
- t2 = _mm_aesenclast_si128(t2, k10);
- t3 = _mm_aesenclast_si128(t3, k10);
- t4 = _mm_aesenclast_si128(t4, k10);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
+ t2 = _mm_aesenclast_si128(t2, ks[10]);
+ t3 = _mm_aesenclast_si128(t3, ks[10]);
+ t4 = _mm_aesenclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
@@ -183,20 +172,20 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
-
- t1 = _mm_aesenclast_si128(t1, k10);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
@@ -207,19 +196,19 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
memcpy(&b, bi + blocks, rem);
d1 = _mm_loadu_si128(&b);
- t1 = _mm_xor_si128(state, k0);
-
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
-
- t1 = _mm_aesenclast_si128(t1, k10);
+ t1 = _mm_xor_si128(state, ks[0]);
+
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
@@ -233,10 +222,9 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
static void encrypt_ctr192(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
- __m128i state, b, *bi, *bo;
+ __m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;
state = _mm_load_si128((__m128i*)&this->state);
@@ -246,19 +234,7 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
@@ -267,64 +243,64 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t2 = _mm_xor_si128(state, k0);
+ t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t3 = _mm_xor_si128(state, k0);
+ t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t4 = _mm_xor_si128(state, k0);
+ t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
-
- t1 = _mm_aesenclast_si128(t1, k12);
- t2 = _mm_aesenclast_si128(t2, k12);
- t3 = _mm_aesenclast_si128(t3, k12);
- t4 = _mm_aesenclast_si128(t4, k12);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
+ t2 = _mm_aesenclast_si128(t2, ks[12]);
+ t3 = _mm_aesenclast_si128(t3, ks[12]);
+ t4 = _mm_aesenclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
@@ -339,22 +315,22 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
-
- t1 = _mm_aesenclast_si128(t1, k12);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
@@ -365,21 +341,21 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
memcpy(&b, bi + blocks, rem);
d1 = _mm_loadu_si128(&b);
- t1 = _mm_xor_si128(state, k0);
-
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
-
- t1 = _mm_aesenclast_si128(t1, k12);
+ t1 = _mm_xor_si128(state, ks[0]);
+
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
@@ -393,10 +369,9 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
static void encrypt_ctr256(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
- __m128i state, b, *bi, *bo;
+ __m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;
state = _mm_load_si128((__m128i*)&this->state);
@@ -406,21 +381,7 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
- k13 = this->key->schedule[13];
- k14 = this->key->schedule[14];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
@@ -429,72 +390,72 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t2 = _mm_xor_si128(state, k0);
+ t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t3 = _mm_xor_si128(state, k0);
+ t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t4 = _mm_xor_si128(state, k0);
+ t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t2 = _mm_aesenc_si128(t2, k12);
- t3 = _mm_aesenc_si128(t3, k12);
- t4 = _mm_aesenc_si128(t4, k12);
- t1 = _mm_aesenc_si128(t1, k13);
- t2 = _mm_aesenc_si128(t2, k13);
- t3 = _mm_aesenc_si128(t3, k13);
- t4 = _mm_aesenc_si128(t4, k13);
-
- t1 = _mm_aesenclast_si128(t1, k14);
- t2 = _mm_aesenclast_si128(t2, k14);
- t3 = _mm_aesenclast_si128(t3, k14);
- t4 = _mm_aesenclast_si128(t4, k14);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t2 = _mm_aesenc_si128(t2, ks[12]);
+ t3 = _mm_aesenc_si128(t3, ks[12]);
+ t4 = _mm_aesenc_si128(t4, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+ t2 = _mm_aesenc_si128(t2, ks[13]);
+ t3 = _mm_aesenc_si128(t3, ks[13]);
+ t4 = _mm_aesenc_si128(t4, ks[13]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
+ t2 = _mm_aesenclast_si128(t2, ks[14]);
+ t3 = _mm_aesenclast_si128(t3, ks[14]);
+ t4 = _mm_aesenclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
@@ -509,24 +470,24 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t1 = _mm_aesenc_si128(t1, k13);
-
- t1 = _mm_aesenclast_si128(t1, k14);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
@@ -537,23 +498,23 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
memcpy(&b, bi + blocks, rem);
d1 = _mm_loadu_si128(&b);
- t1 = _mm_xor_si128(state, k0);
-
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t1 = _mm_aesenc_si128(t1, k13);
-
- t1 = _mm_aesenclast_si128(t1, k14);
+ t1 = _mm_xor_si128(state, ks[0]);
+
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
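
Across all three key sizes the CTR code now indexes the expanded key through the ks pointer instead of copying each round key into its own local. A self-contained sketch of one AES-128 keystream block with that access pattern (compile with -maes; ctr128_block is a hypothetical name, ks points at the 11-entry AES-128 schedule):

	#include <immintrin.h>

	/* one CTR keystream block, AES-128: ks[0..10] is the expanded key */
	static inline __m128i ctr128_block(const __m128i *ks, __m128i ctr)
	{
		__m128i t;

		t = _mm_xor_si128(ctr, ks[0]);
		t = _mm_aesenc_si128(t, ks[1]);
		t = _mm_aesenc_si128(t, ks[2]);
		t = _mm_aesenc_si128(t, ks[3]);
		t = _mm_aesenc_si128(t, ks[4]);
		t = _mm_aesenc_si128(t, ks[5]);
		t = _mm_aesenc_si128(t, ks[6]);
		t = _mm_aesenc_si128(t, ks[7]);
		t = _mm_aesenc_si128(t, ks[8]);
		t = _mm_aesenc_si128(t, ks[9]);
		return _mm_aesenclast_si128(t, ks[10]);
	}

	/* caller xors the keystream into the data block:
	 *   c = _mm_xor_si128(ctr128_block(ks, ctr), p);
	 */
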
diff --git a/src/libstrongswan/plugins/aesni/aesni_gcm.c b/src/libstrongswan/plugins/aesni/aesni_gcm.c
index 6296ad2fd..53c0b144e 100644
--- a/src/libstrongswan/plugins/aesni/aesni_gcm.c
+++ b/src/libstrongswan/plugins/aesni/aesni_gcm.c
@@ -327,15 +327,16 @@ static __m128i icv_tailer(private_aesni_gcm_t *this, __m128i y,
static void icv_crypt(private_aesni_gcm_t *this, __m128i y, __m128i j,
u_char *icv)
{
- __m128i t, b;
+ __m128i *ks, t, b;
u_int round;
- t = _mm_xor_si128(j, this->key->schedule[0]);
+ ks = this->key->schedule;
+ t = _mm_xor_si128(j, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- t = _mm_aesenc_si128(t, this->key->schedule[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
t = _mm_xor_si128(y, t);
@@ -375,18 +376,19 @@ static inline __m128i create_j(private_aesni_gcm_t *this, u_char *iv)
static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
void *in, void *out, __m128i cb, __m128i y)
{
- __m128i t, b;
+ __m128i *ks, t, b;
u_int round;
memset(&b, 0, sizeof(b));
memcpy(&b, in, rem);
- t = _mm_xor_si128(cb, this->key->schedule[0]);
+ ks = this->key->schedule;
+ t = _mm_xor_si128(cb, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- t = _mm_aesenc_si128(t, this->key->schedule[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
b = _mm_xor_si128(t, b);
memcpy(out, &b, rem);
@@ -401,7 +403,7 @@ static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
void *in, void *out, __m128i cb, __m128i y)
{
- __m128i t, b;
+ __m128i *ks, t, b;
u_int round;
memset(&b, 0, sizeof(b));
@@ -409,12 +411,13 @@ static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
y = ghash(this->h, y, b);
- t = _mm_xor_si128(cb, this->key->schedule[0]);
+ ks = this->key->schedule;
+ t = _mm_xor_si128(cb, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- t = _mm_aesenc_si128(t, this->key->schedule[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
b = _mm_xor_si128(t, b);
memcpy(out, &b, rem);
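
icv_crypt() and the *_gcm_rem() helpers above handle only a single, possibly partial block, so they walk the schedule with the run-time round count instead of unrolled code. A hedged sketch of that pattern (crypt_rem and its parameter list are illustrative; rem is the number of trailing bytes, cb the counter block):

	#include <string.h>
	#include <immintrin.h>

	/* crypt the final partial block: pad to 16 bytes on the stack,
	 * generate one keystream block, copy only rem bytes back out */
	static void crypt_rem(const __m128i *ks, unsigned rounds, unsigned rem,
						  const void *in, void *out, __m128i cb)
	{
		__m128i b, t;
		unsigned round;

		memset(&b, 0, sizeof(b));
		memcpy(&b, in, rem);

		t = _mm_xor_si128(cb, ks[0]);
		for (round = 1; round < rounds; round++)
		{
			t = _mm_aesenc_si128(t, ks[round]);
		}
		t = _mm_aesenclast_si128(t, ks[rounds]);

		b = _mm_xor_si128(t, b);
		memcpy(out, &b, rem);
	}
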
@@ -429,9 +432,8 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
@@ -443,22 +445,7 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
@@ -467,56 +454,56 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
-
- t1 = _mm_aesenclast_si128(t1, k10);
- t2 = _mm_aesenclast_si128(t2, k10);
- t3 = _mm_aesenclast_si128(t3, k10);
- t4 = _mm_aesenclast_si128(t4, k10);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
+ t2 = _mm_aesenclast_si128(t2, ks[10]);
+ t3 = _mm_aesenclast_si128(t3, ks[10]);
+ t4 = _mm_aesenclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
@@ -524,7 +511,7 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
t4 = _mm_xor_si128(t4, d4);
y = _mm_xor_si128(y, t1);
- y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
_mm_storeu_si128(bo + i + 0, t1);
_mm_storeu_si128(bo + i + 1, t2);
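
The four-block sections feed GHASH in aggregated form: the first ciphertext block is folded into y and mult4xor() then absorbs all four blocks at once. Assuming the members hhhh, hhh, hh and h hold H^4, H^3, H^2 and H as their names suggest, this is the standard aggregation

	y' = (y XOR c1)*H^4 XOR c2*H^3 XOR c3*H^2 XOR c4*H

with multiplication in GF(2^128), which equals four sequential y = (y XOR ci)*H steps while letting the four products be computed independently and combined once.
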
@@ -536,22 +523,22 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenclast_si128(t1, k10);
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
- y = ghash(h4, y, t1);
+ y = ghash(this->h, y, t1);
cb = increment_be(cb);
}
@@ -571,9 +558,8 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
@@ -585,22 +571,7 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
@@ -610,58 +581,58 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
d4 = _mm_loadu_si128(bi + i + 3);
y = _mm_xor_si128(y, d1);
- y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
-
- t1 = _mm_aesenclast_si128(t1, k10);
- t2 = _mm_aesenclast_si128(t2, k10);
- t3 = _mm_aesenclast_si128(t3, k10);
- t4 = _mm_aesenclast_si128(t4, k10);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
+ t2 = _mm_aesenclast_si128(t2, ks[10]);
+ t3 = _mm_aesenclast_si128(t3, ks[10]);
+ t4 = _mm_aesenclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
@@ -678,19 +649,19 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
{
d1 = _mm_loadu_si128(bi + i);
- y = ghash(h4, y, d1);
+ y = ghash(this->h, y, d1);
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenclast_si128(t1, k10);
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
@@ -713,9 +684,8 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
@@ -727,24 +697,7 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
@@ -753,64 +706,64 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
-
- t1 = _mm_aesenclast_si128(t1, k12);
- t2 = _mm_aesenclast_si128(t2, k12);
- t3 = _mm_aesenclast_si128(t3, k12);
- t4 = _mm_aesenclast_si128(t4, k12);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
+ t2 = _mm_aesenclast_si128(t2, ks[12]);
+ t3 = _mm_aesenclast_si128(t3, ks[12]);
+ t4 = _mm_aesenclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
@@ -818,7 +771,7 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
t4 = _mm_xor_si128(t4, d4);
y = _mm_xor_si128(y, t1);
- y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
_mm_storeu_si128(bo + i + 0, t1);
_mm_storeu_si128(bo + i + 1, t2);
@@ -830,24 +783,24 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenclast_si128(t1, k12);
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
- y = ghash(h4, y, t1);
+ y = ghash(this->h, y, t1);
cb = increment_be(cb);
}
@@ -867,9 +820,8 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
@@ -881,24 +833,7 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
@@ -908,66 +843,66 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
d4 = _mm_loadu_si128(bi + i + 3);
y = _mm_xor_si128(y, d1);
- y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
-
- t1 = _mm_aesenclast_si128(t1, k12);
- t2 = _mm_aesenclast_si128(t2, k12);
- t3 = _mm_aesenclast_si128(t3, k12);
- t4 = _mm_aesenclast_si128(t4, k12);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
+ t2 = _mm_aesenclast_si128(t2, ks[12]);
+ t3 = _mm_aesenclast_si128(t3, ks[12]);
+ t4 = _mm_aesenclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
@@ -984,21 +919,21 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
{
d1 = _mm_loadu_si128(bi + i);
- y = ghash(h4, y, d1);
-
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenclast_si128(t1, k12);
+ y = ghash(this->h, y, d1);
+
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
@@ -1021,9 +956,8 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
@@ -1035,26 +969,7 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
- k13 = this->key->schedule[13];
- k14 = this->key->schedule[14];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
@@ -1063,72 +978,72 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t2 = _mm_aesenc_si128(t2, k12);
- t3 = _mm_aesenc_si128(t3, k12);
- t4 = _mm_aesenc_si128(t4, k12);
- t1 = _mm_aesenc_si128(t1, k13);
- t2 = _mm_aesenc_si128(t2, k13);
- t3 = _mm_aesenc_si128(t3, k13);
- t4 = _mm_aesenc_si128(t4, k13);
-
- t1 = _mm_aesenclast_si128(t1, k14);
- t2 = _mm_aesenclast_si128(t2, k14);
- t3 = _mm_aesenclast_si128(t3, k14);
- t4 = _mm_aesenclast_si128(t4, k14);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t2 = _mm_aesenc_si128(t2, ks[12]);
+ t3 = _mm_aesenc_si128(t3, ks[12]);
+ t4 = _mm_aesenc_si128(t4, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+ t2 = _mm_aesenc_si128(t2, ks[13]);
+ t3 = _mm_aesenc_si128(t3, ks[13]);
+ t4 = _mm_aesenc_si128(t4, ks[13]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
+ t2 = _mm_aesenclast_si128(t2, ks[14]);
+ t3 = _mm_aesenclast_si128(t3, ks[14]);
+ t4 = _mm_aesenclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
@@ -1136,7 +1051,7 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
t4 = _mm_xor_si128(t4, d4);
y = _mm_xor_si128(y, t1);
- y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
_mm_storeu_si128(bo + i + 0, t1);
_mm_storeu_si128(bo + i + 1, t2);
@@ -1148,21 +1063,21 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t1 = _mm_aesenc_si128(t1, k13);
- t1 = _mm_aesenclast_si128(t1, k14);
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
@@ -1187,9 +1102,8 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
@@ -1201,26 +1115,7 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
- k13 = this->key->schedule[13];
- k14 = this->key->schedule[14];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
@@ -1230,74 +1125,74 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
d4 = _mm_loadu_si128(bi + i + 3);
y = _mm_xor_si128(y, d1);
- y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t2 = _mm_aesenc_si128(t2, k12);
- t3 = _mm_aesenc_si128(t3, k12);
- t4 = _mm_aesenc_si128(t4, k12);
- t1 = _mm_aesenc_si128(t1, k13);
- t2 = _mm_aesenc_si128(t2, k13);
- t3 = _mm_aesenc_si128(t3, k13);
- t4 = _mm_aesenc_si128(t4, k13);
-
- t1 = _mm_aesenclast_si128(t1, k14);
- t2 = _mm_aesenclast_si128(t2, k14);
- t3 = _mm_aesenclast_si128(t3, k14);
- t4 = _mm_aesenclast_si128(t4, k14);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t2 = _mm_aesenc_si128(t2, ks[12]);
+ t3 = _mm_aesenc_si128(t3, ks[12]);
+ t4 = _mm_aesenc_si128(t4, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+ t2 = _mm_aesenc_si128(t2, ks[13]);
+ t3 = _mm_aesenc_si128(t3, ks[13]);
+ t4 = _mm_aesenc_si128(t4, ks[13]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
+ t2 = _mm_aesenclast_si128(t2, ks[14]);
+ t3 = _mm_aesenclast_si128(t3, ks[14]);
+ t4 = _mm_aesenclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
@@ -1314,23 +1209,23 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
{
d1 = _mm_loadu_si128(bi + i);
- y = ghash(h4, y, d1);
-
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t1 = _mm_aesenc_si128(t1, k13);
- t1 = _mm_aesenclast_si128(t1, k14);
+ y = ghash(this->h, y, d1);
+
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
@@ -1423,7 +1318,7 @@ METHOD(aead_t, set_key, bool,
private_aesni_gcm_t *this, chunk_t key)
{
u_int round;
- __m128i h;
+ __m128i *ks, h;
if (key.len != this->key_size + SALT_SIZE)
{
@@ -1436,12 +1331,13 @@ METHOD(aead_t, set_key, bool,
DESTROY_IF(this->key);
this->key = aesni_key_create(TRUE, key);
- h = _mm_xor_si128(_mm_setzero_si128(), this->key->schedule[0]);
+ ks = this->key->schedule;
+ h = _mm_xor_si128(_mm_setzero_si128(), ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- h = _mm_aesenc_si128(h, this->key->schedule[round]);
+ h = _mm_aesenc_si128(h, ks[round]);
}
- h = _mm_aesenclast_si128(h, this->key->schedule[this->key->rounds]);
+ h = _mm_aesenclast_si128(h, ks[this->key->rounds]);
this->h = h;
h = swap128(h);
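
set_key() derives the GHASH hash subkey by encrypting the all-zero block with the freshly expanded key, again reading the round keys through the ks pointer. A reduced sketch of that derivation (function name is illustrative; swap128() and the H^2..H^4 precomputation that follow in the plugin are omitted):

	/* H = AES-K(0^128), computed straight from the schedule */
	static __m128i ghash_subkey(const __m128i *ks, unsigned rounds)
	{
		__m128i h;
		unsigned round;

		h = _mm_xor_si128(_mm_setzero_si128(), ks[0]);
		for (round = 1; round < rounds; round++)
		{
			h = _mm_aesenc_si128(h, ks[round]);
		}
		return _mm_aesenclast_si128(h, ks[rounds]);
	}
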
diff --git a/src/libstrongswan/plugins/aesni/aesni_xcbc.c b/src/libstrongswan/plugins/aesni/aesni_xcbc.c
index b2e8cd5ca..24a75cec0 100644
--- a/src/libstrongswan/plugins/aesni/aesni_xcbc.c
+++ b/src/libstrongswan/plugins/aesni/aesni_xcbc.c
@@ -72,8 +72,7 @@ struct private_aesni_mac_t {
METHOD(mac_t, get_mac, bool,
private_aesni_mac_t *this, chunk_t data, u_int8_t *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i e, *bi;
+ __m128i *ks, e, *bi;
u_int blocks, rem, i;
if (!this->k1)
@@ -81,17 +80,7 @@ METHOD(mac_t, get_mac, bool,
return FALSE;
}
- k0 = this->k1->schedule[0];
- k1 = this->k1->schedule[1];
- k2 = this->k1->schedule[2];
- k3 = this->k1->schedule[3];
- k4 = this->k1->schedule[4];
- k5 = this->k1->schedule[5];
- k6 = this->k1->schedule[6];
- k7 = this->k1->schedule[7];
- k8 = this->k1->schedule[8];
- k9 = this->k1->schedule[9];
- k10 = this->k1->schedule[10];
+ ks = this->k1->schedule;
e = this->e;
@@ -114,17 +103,17 @@ METHOD(mac_t, get_mac, bool,
e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));
- e = _mm_xor_si128(e, k0);
- e = _mm_aesenc_si128(e, k1);
- e = _mm_aesenc_si128(e, k2);
- e = _mm_aesenc_si128(e, k3);
- e = _mm_aesenc_si128(e, k4);
- e = _mm_aesenc_si128(e, k5);
- e = _mm_aesenc_si128(e, k6);
- e = _mm_aesenc_si128(e, k7);
- e = _mm_aesenc_si128(e, k8);
- e = _mm_aesenc_si128(e, k9);
- e = _mm_aesenclast_si128(e, k10);
+ e = _mm_xor_si128(e, ks[0]);
+ e = _mm_aesenc_si128(e, ks[1]);
+ e = _mm_aesenc_si128(e, ks[2]);
+ e = _mm_aesenc_si128(e, ks[3]);
+ e = _mm_aesenc_si128(e, ks[4]);
+ e = _mm_aesenc_si128(e, ks[5]);
+ e = _mm_aesenc_si128(e, ks[6]);
+ e = _mm_aesenc_si128(e, ks[7]);
+ e = _mm_aesenc_si128(e, ks[8]);
+ e = _mm_aesenc_si128(e, ks[9]);
+ e = _mm_aesenclast_si128(e, ks[10]);
bi = (__m128i*)data.ptr;
rem = data.len % AES_BLOCK_SIZE;
@@ -140,17 +129,17 @@ METHOD(mac_t, get_mac, bool,
{
e = _mm_xor_si128(e, _mm_loadu_si128(bi + i));
- e = _mm_xor_si128(e, k0);
- e = _mm_aesenc_si128(e, k1);
- e = _mm_aesenc_si128(e, k2);
- e = _mm_aesenc_si128(e, k3);
- e = _mm_aesenc_si128(e, k4);
- e = _mm_aesenc_si128(e, k5);
- e = _mm_aesenc_si128(e, k6);
- e = _mm_aesenc_si128(e, k7);
- e = _mm_aesenc_si128(e, k8);
- e = _mm_aesenc_si128(e, k9);
- e = _mm_aesenclast_si128(e, k10);
+ e = _mm_xor_si128(e, ks[0]);
+ e = _mm_aesenc_si128(e, ks[1]);
+ e = _mm_aesenc_si128(e, ks[2]);
+ e = _mm_aesenc_si128(e, ks[3]);
+ e = _mm_aesenc_si128(e, ks[4]);
+ e = _mm_aesenc_si128(e, ks[5]);
+ e = _mm_aesenc_si128(e, ks[6]);
+ e = _mm_aesenc_si128(e, ks[7]);
+ e = _mm_aesenc_si128(e, ks[8]);
+ e = _mm_aesenc_si128(e, ks[9]);
+ e = _mm_aesenclast_si128(e, ks[10]);
}
/* store remaining bytes of block M[n] */
@@ -196,17 +185,17 @@ METHOD(mac_t, get_mac, bool,
}
e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));
- e = _mm_xor_si128(e, k0);
- e = _mm_aesenc_si128(e, k1);
- e = _mm_aesenc_si128(e, k2);
- e = _mm_aesenc_si128(e, k3);
- e = _mm_aesenc_si128(e, k4);
- e = _mm_aesenc_si128(e, k5);
- e = _mm_aesenc_si128(e, k6);
- e = _mm_aesenc_si128(e, k7);
- e = _mm_aesenc_si128(e, k8);
- e = _mm_aesenc_si128(e, k9);
- e = _mm_aesenclast_si128(e, k10);
+ e = _mm_xor_si128(e, ks[0]);
+ e = _mm_aesenc_si128(e, ks[1]);
+ e = _mm_aesenc_si128(e, ks[2]);
+ e = _mm_aesenc_si128(e, ks[3]);
+ e = _mm_aesenc_si128(e, ks[4]);
+ e = _mm_aesenc_si128(e, ks[5]);
+ e = _mm_aesenc_si128(e, ks[6]);
+ e = _mm_aesenc_si128(e, ks[7]);
+ e = _mm_aesenc_si128(e, ks[8]);
+ e = _mm_aesenc_si128(e, ks[9]);
+ e = _mm_aesenclast_si128(e, ks[10]);
_mm_storeu_si128((__m128i*)out, e);
/* (2) Define E[0] = 0x00000000000000000000000000000000 */
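
The XCBC hunks above follow the same per-block shape: each complete message block is folded into the running state with one AES-128 pass over K1's schedule. A compact sketch of that absorption step (xcbc_absorb is a hypothetical helper; the plugin unrolls the ten rounds, and its buffering of trailing bytes and the K2/K3 padding of the final block are left out):

	/* E[i] = AES-K1(E[i-1] XOR M[i]) for each complete 16-byte block */
	static __m128i xcbc_absorb(const __m128i *ks, __m128i e,
							   const __m128i *bi, unsigned blocks)
	{
		unsigned i, r;

		for (i = 0; i < blocks; i++)
		{
			e = _mm_xor_si128(e, _mm_loadu_si128(bi + i));
			e = _mm_xor_si128(e, ks[0]);
			for (r = 1; r < 10; r++)
			{
				e = _mm_aesenc_si128(e, ks[r]);
			}
			e = _mm_aesenclast_si128(e, ks[10]);
		}
		return e;
	}
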