From 11e9d19d7c6c3461cbab5e5670d66974cd7cf819 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Fri, 4 Jun 2010 15:48:16 +0300 Subject: [PATCH 3/3] engine/padlock: implement sha1/sha224/sha256 acceleration Limited support for VIA C7 that works only when EVP_MD_CTX_FLAG_ONESHOT is used appropriately (as done by EVP_Digest, and my previous HMAC patch). Full support for VIA Nano including partial transformation. Benchmarks from VIA Nano 1.6GHz, done with including the previous HMAC and apps/speed patches done. From single run, error margin of about 100-200k. No padlock type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes sha1 20057.60k 51514.05k 99721.39k 130167.81k 142811.14k sha256 7757.72k 16907.18k 28937.05k 35181.23k 37568.51k hmac(sha1) 8582.53k 27644.69k 70402.30k 114602.67k 140167.85k With the patch sha1 37713.77k 114562.71k 259637.33k 379907.41k 438818.13k sha256 34262.86k 103233.75k 232476.07k 338386.60k 389860.01k hmac(sha1) 8424.70k 31475.11k 104036.10k 245559.30k 406667.26k --- engines/e_padlock.c | 596 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 files changed, 553 insertions(+), 43 deletions(-) diff --git a/engines/e_padlock.c b/engines/e_padlock.c index 381a746..2f8c72a 100644 --- a/engines/e_padlock.c +++ b/engines/e_padlock.c @@ -3,6 +3,9 @@ * Written by Michal Ludvig * http://www.logix.cz/michal * + * SHA support by Timo Teras . Portions based on + * code originally written by Michal Ludvig. + * * Big thanks to Andy Polyakov for a help with optimization, * assembler fixes, port to MS Windows and a lot of other * valuable work on this engine! @@ -74,12 +77,23 @@ #ifndef OPENSSL_NO_AES #include #endif +#ifndef OPENSSL_NO_SHA +#include +#endif #include #include #ifndef OPENSSL_NO_HW #ifndef OPENSSL_NO_HW_PADLOCK +/* PadLock RNG is disabled by default */ +#define PADLOCK_NO_RNG 1 + +/* No ASM routines for SHA in MSC yet */ +#ifdef _MSC_VER +#define OPENSSL_NO_SHA +#endif + /* Attempt to have a single source for both 0.9.7 and 0.9.8 :-) */ #if (OPENSSL_VERSION_NUMBER >= 0x00908000L) # ifndef OPENSSL_NO_DYNAMIC_ENGINE @@ -140,58 +154,40 @@ static int padlock_available(void); static int padlock_init(ENGINE *e); /* RNG Stuff */ +#ifndef PADLOCK_NO_RNG static RAND_METHOD padlock_rand; - -/* Cipher Stuff */ -#ifndef OPENSSL_NO_AES -static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid); #endif /* Engine names */ static const char *padlock_id = "padlock"; static char padlock_name[100]; -/* Available features */ -static int padlock_use_ace = 0; /* Advanced Cryptography Engine */ -static int padlock_use_rng = 0; /* Random Number Generator */ -#ifndef OPENSSL_NO_AES -static int padlock_aes_align_required = 1; -#endif +static int padlock_bind_helper(ENGINE *e); -/* ===== Engine "management" functions ===== */ - -/* Prepare the ENGINE structure for registration */ -static int -padlock_bind_helper(ENGINE *e) -{ - /* Check available features */ - padlock_available(); - -#if 1 /* disable RNG for now, see commentary in vicinity of RNG code */ - padlock_use_rng=0; -#endif - - /* Generate a nice engine name with available features */ - BIO_snprintf(padlock_name, sizeof(padlock_name), - "VIA PadLock (%s, %s)", - padlock_use_rng ? "RNG" : "no-RNG", - padlock_use_ace ? "ACE" : "no-ACE"); + /* Available features */ +enum padlock_flags { + PADLOCK_RNG = 0x01, + PADLOCK_ACE = 0x02, + PADLOCK_ACE2 = 0x04, + PADLOCK_PHE = 0x08, + PADLOCK_PMM = 0x10, + PADLOCK_NANO = 0x20, +}; +enum padlock_flags padlock_flags; - /* Register everything or return with an error */ - if (!ENGINE_set_id(e, padlock_id) || - !ENGINE_set_name(e, padlock_name) || +#define PADLOCK_HAVE_RNG (padlock_flags & PADLOCK_RNG) +#define PADLOCK_HAVE_ACE (padlock_flags & (PADLOCK_ACE|PADLOCK_ACE2)) +#define PADLOCK_HAVE_ACE1 (padlock_flags & PADLOCK_ACE) +#define PADLOCK_HAVE_ACE2 (padlock_flags & PADLOCK_ACE2) +#define PADLOCK_HAVE_PHE (padlock_flags & PADLOCK_PHE) +#define PADLOCK_HAVE_PMM (padlock_flags & PADLOCK_PMM) +#define PADLOCK_HAVE_NANO (padlock_flags & PADLOCK_NANO) - !ENGINE_set_init_function(e, padlock_init) || #ifndef OPENSSL_NO_AES - (padlock_use_ace && !ENGINE_set_ciphers (e, padlock_ciphers)) || +static int padlock_aes_align_required = 1; #endif - (padlock_use_rng && !ENGINE_set_RAND (e, &padlock_rand))) { - return 0; - } - /* Everything looks good */ - return 1; -} +/* ===== Engine "management" functions ===== */ /* Constructor */ static ENGINE * @@ -215,7 +211,7 @@ ENGINE_padlock(void) static int padlock_init(ENGINE *e) { - return (padlock_use_rng || padlock_use_ace); + return padlock_flags; } /* This stuff is needed if this ENGINE is being compiled into a self-contained @@ -367,10 +363,20 @@ padlock_available(void) : "+a"(eax), "=d"(edx) : : "ecx"); /* Fill up some flags */ - padlock_use_ace = ((edx & (0x3<<6)) == (0x3<<6)); - padlock_use_rng = ((edx & (0x3<<2)) == (0x3<<2)); + padlock_flags |= ((edx & (0x3<<3)) ? PADLOCK_RNG : 0); + padlock_flags |= ((edx & (0x3<<7)) ? PADLOCK_ACE : 0); + padlock_flags |= ((edx & (0x3<<9)) ? PADLOCK_ACE2 : 0); + padlock_flags |= ((edx & (0x3<<11)) ? PADLOCK_PHE : 0); + padlock_flags |= ((edx & (0x3<<13)) ? PADLOCK_PMM : 0); + + /* Check for VIA Nano CPU */ + eax = 0x00000001; + asm volatile ("pushl %%ebx; cpuid; popl %%ebx" + : "+a"(eax) : : "ecx", "edx"); + if ((eax | 0x000F) == 0x06FF) + padlock_flags |= PADLOCK_NANO; - return padlock_use_ace + padlock_use_rng; + return padlock_flags; } #ifndef OPENSSL_NO_AES @@ -1159,6 +1165,454 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, #endif /* OPENSSL_NO_AES */ +#ifndef OPENSSL_NO_SHA + +static inline void +padlock_copy_bswap(void *dst, void *src, size_t count) +{ + uint32_t *udst = dst, *usrc = src; + unsigned int reg; + int i = 0; + + for (i = 0; i < count; i++) { + reg = usrc[i]; + asm volatile("bswapl %0" : "+&r"(reg)); + udst[i] = reg; + } +} + +#define PADLOCK_SHA_ALIGN(dd) (uint32_t*)(((uintptr_t)(dd) + 15) & ~15) +#define PADLOCK_SHA_HWCTX (128+16) + +static void +padlock_sha1(void *hwctx, const void *buf, uint32_t total, uint32_t now) +{ + uint32_t pos = total - now; + + asm volatile ("xsha1" + : "+S"(buf), "+D"(hwctx), "+a"(pos), "+c"(total) + : : "memory"); +} + +static void +padlock_sha1_partial(void *hwctx, const void *buf, uint32_t blocks) +{ + asm volatile ("xsha1" + : "+S"(buf), "+D"(hwctx), "+c"(blocks) + : "a"(-1) : "memory"); +} + +static int padlock_sha1_init(EVP_MD_CTX *ctx) +{ + return SHA1_Init(ctx->md_data); +} + +static int padlock_sha1_update(EVP_MD_CTX *ctx, const void *data, + size_t len) +{ + unsigned char hwctx[PADLOCK_SHA_HWCTX]; + uint32_t *aligned = PADLOCK_SHA_ALIGN(hwctx); + SHA_CTX *c = ctx->md_data; + uint_fast64_t total; + const unsigned char *p = data; + unsigned int l = 0; + + /* Calculate total length (Nl,Nh) is length in bits */ + total = (((uint_fast64_t) c->Nh) << 29) + (c->Nl >> 3); + total += len; + + if ((ctx->flags & EVP_MD_CTX_FLAG_ONESHOT) && + (total <= 0xfffffffe)) { + if (c->num != 0) { + l = (len < SHA_CBLOCK) ? len : SHA_CBLOCK; + if (!SHA1_Update(c, data, l)) + return 0; + p += l; + if (c->num != 0) { + p = (unsigned char *) c->data; + len = c->num; + l = 0; + } + } + memcpy(aligned, &c->h0, 5 * sizeof(SHA_LONG)); + padlock_sha1(aligned, p, total, len - l); + memcpy(&c->h0, aligned, 5 * sizeof(SHA_LONG)); + c->num = -1; + return 1; + } + + return SHA1_Update(c, data, len); +} + +static int padlock_nano_sha1_update(EVP_MD_CTX *ctx, const void *data, + size_t len) +{ + unsigned char hwctx[PADLOCK_SHA_HWCTX]; + uint32_t *aligned = PADLOCK_SHA_ALIGN(hwctx); + SHA_CTX *c = ctx->md_data; + uint_fast64_t total; + unsigned char *p; + unsigned int n; + + /* Calculate total length (Nl,Nh) is length in bits */ + total = (((uint_fast64_t) c->Nh) << 29) + (c->Nl >> 3); + total += len; + c->Nh = total >> 29; + c->Nl = (total << 3) & 0xffffffffUL; + + memcpy(aligned, &c->h0, 5 * sizeof(SHA_LONG)); + + /* Check partial data */ + n = c->num; + if (n) { + p = (unsigned char *) c->data; + if (len >= SHA_CBLOCK || len+n >= SHA_CBLOCK) { + memcpy(p+n, data, SHA_CBLOCK-n); + padlock_sha1_partial(aligned, p, 1); + n = SHA_CBLOCK - n; + data += n; + len -= n; + c->num = 0; + memset(p, 0, SHA_CBLOCK); + } else { + memcpy(p+n, data, len); + c->num += (unsigned int)len; + return 1; + } + } + + /* Can we finalize straight away? */ + if ((ctx->flags & EVP_MD_CTX_FLAG_ONESHOT) && + (total <= 0xfffffffe)) { + padlock_sha1(aligned, data, total, len); + memcpy(&c->h0, aligned, 5 * sizeof(SHA_LONG)); + c->num = -1; + return 1; + } + + /* Use nonfinalizing update */ + n = len / SHA_CBLOCK; + if (n != 0) { + padlock_sha1_partial(aligned, data, n); + data += n * SHA_CBLOCK; + len -= n * SHA_CBLOCK; + } + memcpy(&c->h0, aligned, 5 * sizeof(SHA_LONG)); + + /* Buffer remaining bytes */ + if (len) { + memcpy(c->data, data, len); + c->num = len; + } + + return 1; +} + +static int padlock_sha1_final(EVP_MD_CTX *ctx, unsigned char *md) +{ + SHA_CTX *c = ctx->md_data; + uint_fast64_t total; + + if (c->num == -1) { + padlock_copy_bswap(md, &c->h0, 5); + c->num = 0; + return 1; + } + + total = (((uint_fast64_t) c->Nh) << 29) + (c->Nl >> 3); + if (total <= 0xfffffffe) { + unsigned char hwctx[PADLOCK_SHA_HWCTX]; + uint32_t *aligned = PADLOCK_SHA_ALIGN(hwctx); + + memcpy(aligned, &c->h0, 5 * sizeof(SHA_LONG)); + padlock_sha1(aligned, c->data, total, c->num); + padlock_copy_bswap(md, aligned, 5); + c->num = 0; + return 1; + } + + return SHA1_Final(md, c); +} + +static EVP_MD padlock_sha1_md = { + NID_sha1, + NID_sha1WithRSAEncryption, + SHA_DIGEST_LENGTH, + 0, + padlock_sha1_init, + padlock_sha1_update, + padlock_sha1_final, + NULL, + NULL, + EVP_PKEY_RSA_method, + SHA_CBLOCK, + sizeof(SHA_CTX), +}; + +static EVP_MD padlock_dss1_md = { + NID_dsa, + NID_dsaWithSHA1, + SHA_DIGEST_LENGTH, + 0, + padlock_sha1_init, + padlock_sha1_update, + padlock_sha1_final, + NULL, + NULL, + EVP_PKEY_DSA_method, + SHA_CBLOCK, + sizeof(SHA_CTX), +}; + + +#if !defined(OPENSSL_NO_SHA256) + +static void +padlock_sha256(void *hwctx, const void *buf, uint32_t total, uint32_t now) +{ + uint32_t pos = total - now; + + asm volatile ("xsha256" + : "+S"(buf), "+D"(hwctx), "+a"(pos), "+c"(total) + : : "memory"); +} + +static void +padlock_sha256_partial(void *hwctx, const void *buf, uint32_t blocks) +{ + asm volatile ("xsha256" + : "+S"(buf), "+D"(hwctx), "+c"(blocks) + : "a"(-1) : "memory"); +} + +static int padlock_sha256_update(EVP_MD_CTX *ctx, const void *data, + size_t len) +{ + unsigned char hwctx[PADLOCK_SHA_HWCTX]; + uint32_t *aligned = PADLOCK_SHA_ALIGN(hwctx); + SHA256_CTX *c = ctx->md_data; + uint_fast64_t total; + const unsigned char *p = data; + unsigned int l = 0; + + /* Calculate total length (Nl,Nh) is length in bits */ + total = (((uint_fast64_t) c->Nh) << 29) + (c->Nl >> 3); + total += len; + + if ((ctx->flags & EVP_MD_CTX_FLAG_ONESHOT) && + (total <= 0xfffffffe)) { + if (c->num != 0) { + l = (len < SHA256_CBLOCK) ? len : SHA256_CBLOCK; + if (!SHA256_Update(c, data, l)) + return 0; + p += l; + if (c->num != 0) { + p = (unsigned char *) c->data; + len = c->num; + l = 0; + } + } + memcpy(aligned, c->h, sizeof(c->h)); + padlock_sha256(aligned, p, total, len - l); + memcpy(c->h, aligned, sizeof(c->h)); + c->num = -1; + return 1; + } + + return SHA256_Update(c, data, len); +} + +static int padlock_nano_sha256_update(EVP_MD_CTX *ctx, const void *data, + size_t len) +{ + unsigned char hwctx[PADLOCK_SHA_HWCTX]; + uint32_t *aligned = PADLOCK_SHA_ALIGN(hwctx); + SHA256_CTX *c = ctx->md_data; + uint_fast64_t total; + unsigned char *p; + unsigned int n; + + /* Calculate total length (Nl,Nh) is length in bits */ + total = (((uint_fast64_t) c->Nh) << 29) + (c->Nl >> 3); + total += len; + c->Nh = total >> 29; + c->Nl = (total << 3) & 0xffffffffUL; + + memcpy(aligned, c->h, sizeof(c->h)); + + /* Check partial data */ + n = c->num; + if (n) { + p = (unsigned char *) c->data; + if (len >= SHA256_CBLOCK || len+n >= SHA256_CBLOCK) { + memcpy(p+n, data, SHA256_CBLOCK-n); + padlock_sha256_partial(aligned, p, 1); + n = SHA256_CBLOCK - n; + data += n; + len -= n; + c->num = 0; + memset(p, 0, SHA256_CBLOCK); + } else { + memcpy(p+n, data, len); + c->num += (unsigned int)len; + return 1; + } + } + + /* Can we finalize straight away? */ + if ((ctx->flags & EVP_MD_CTX_FLAG_ONESHOT) && + (total <= 0xfffffffe)) { + padlock_sha256(aligned, data, total, len); + memcpy(c->h, aligned, sizeof(c->h)); + c->num = -1; + return 1; + } + + /* Use nonfinalizing update */ + n = len / SHA256_CBLOCK; + if (n != 0) { + padlock_sha256_partial(aligned, data, n); + data += n * SHA256_CBLOCK; + len -= n * SHA256_CBLOCK; + } + memcpy(c->h, aligned, sizeof(c->h)); + + /* Buffer remaining bytes */ + if (len) { + memcpy(c->data, data, len); + c->num = len; + } + + return 1; +} + +static int padlock_sha256_final(EVP_MD_CTX *ctx, unsigned char *md) +{ + SHA256_CTX *c = ctx->md_data; + uint_fast64_t total; + + if (c->num == -1) { + padlock_copy_bswap(md, c->h, sizeof(c->h)/sizeof(c->h[0])); + c->num = 0; + return 1; + } + + total = (((uint_fast64_t) c->Nh) << 29) + (c->Nl >> 3); + if (total <= 0xfffffffe) { + unsigned char hwctx[PADLOCK_SHA_HWCTX]; + uint32_t *aligned = PADLOCK_SHA_ALIGN(hwctx); + + memcpy(aligned, c->h, sizeof(c->h)); + padlock_sha256(aligned, c->data, total, c->num); + padlock_copy_bswap(md, aligned, sizeof(c->h)/sizeof(c->h[0])); + c->num = 0; + return 1; + } + + return SHA256_Final(md, c); +} + +#if !defined(OPENSSL_NO_SHA224) + +static int padlock_sha224_init(EVP_MD_CTX *ctx) +{ + return SHA224_Init(ctx->md_data); +} + +static EVP_MD padlock_sha224_md = { + NID_sha224, + NID_sha224WithRSAEncryption, + SHA224_DIGEST_LENGTH, + 0, + padlock_sha224_init, + padlock_sha256_update, + padlock_sha256_final, + NULL, + NULL, + EVP_PKEY_RSA_method, + SHA_CBLOCK, + sizeof(SHA256_CTX), +}; +#endif /* !OPENSSL_NO_SHA224 */ + +static int padlock_sha256_init(EVP_MD_CTX *ctx) +{ + return SHA256_Init(ctx->md_data); +} + +static EVP_MD padlock_sha256_md = { + NID_sha256, + NID_sha256WithRSAEncryption, + SHA256_DIGEST_LENGTH, + 0, + padlock_sha256_init, + padlock_sha256_update, + padlock_sha256_final, + NULL, + NULL, + EVP_PKEY_RSA_method, + SHA_CBLOCK, + sizeof(SHA256_CTX), +}; +#endif /* !OPENSSL_NO_SHA256 */ + +static int padlock_digest_nids[] = { +#if !defined(OPENSSL_NO_SHA) + NID_sha1, + NID_dsa, +#endif +#if !defined(OPENSSL_NO_SHA256) +#if !defined(OPENSSL_NO_SHA224) + NID_sha224, +#endif + NID_sha256, +#endif +}; + +static int padlock_digest_nids_num = sizeof(padlock_digest_nids)/sizeof(padlock_digest_nids[0]); + +static int +padlock_digests (ENGINE *e, const EVP_MD **digest, const int **nids, int nid) +{ + /* No specific digest => return a list of supported nids ... */ + if (!digest) { + *nids = padlock_digest_nids; + return padlock_digest_nids_num; + } + + /* ... or the requested "digest" otherwise */ + switch (nid) { +#if !defined(OPENSSL_NO_SHA) + case NID_sha1: + *digest = &padlock_sha1_md; + break; + case NID_dsa: + *digest = &padlock_dss1_md; + break; +#endif +#if !defined(OPENSSL_NO_SHA256) +#if !defined(OPENSSL_NO_SHA224) + case NID_sha224: + *digest = &padlock_sha224_md; + break; +#endif /* OPENSSL_NO_SHA224 */ + case NID_sha256: + *digest = &padlock_sha256_md; + break; +#endif /* OPENSSL_NO_SHA256 */ + default: + /* Sorry, we don't support this NID */ + *digest = NULL; + return 0; + } + + return 1; +} + +#endif /* OPENSSL_NO_SHA */ + +#ifndef PADLOCK_NO_RNG + /* ===== Random Number Generator ===== */ /* * This code is not engaged. The reason is that it does not comply @@ -1215,6 +1669,62 @@ static RAND_METHOD padlock_rand = { padlock_rand_status, /* rand status */ }; +#endif /* PADLOCK_NO_RNG */ + +/* Prepare the ENGINE structure for registration */ +static int +padlock_bind_helper(ENGINE *e) +{ + /* Check available features */ + padlock_available(); + + /* Generate a nice engine name with available features */ + BIO_snprintf(padlock_name, sizeof(padlock_name), + "VIA PadLock: %s%s%s%s%s%s", + padlock_flags ? "" : "not supported", + PADLOCK_HAVE_RNG ? "RNG " : "", + PADLOCK_HAVE_ACE ? (PADLOCK_HAVE_ACE2 ? "ACE2 " : "ACE ") : "", + PADLOCK_HAVE_PHE ? "PHE " : "", + PADLOCK_HAVE_PMM ? "PMM " : "", + PADLOCK_HAVE_NANO ? "NANO " : "" + ); + +#ifndef OPENSSL_NO_SHA + /* Use Nano SHA acceleration? */ + if (PADLOCK_HAVE_NANO) { + padlock_sha1_md.update = padlock_nano_sha1_update; + padlock_dss1_md.update = padlock_nano_sha1_update; +#if !defined(OPENSSL_NO_SHA256) +#if !defined(OPENSSL_NO_SHA224) + padlock_sha224_md.update = padlock_nano_sha256_update; +#endif + padlock_sha256_md.update = padlock_nano_sha256_update; +#endif + } +#endif + + /* Register everything or return with an error */ + if (!ENGINE_set_id(e, padlock_id) || + !ENGINE_set_name(e, padlock_name) || + + !ENGINE_set_init_function(e, padlock_init) +#ifndef OPENSSL_NO_AES + || (PADLOCK_HAVE_ACE && !ENGINE_set_ciphers (e, padlock_ciphers)) +#endif +#ifndef OPENSSL_NO_SHA + || (PADLOCK_HAVE_PHE && !ENGINE_set_digests (e, padlock_digests)) +#endif +#ifndef PADLOCK_NO_RNG + || (PADLOCK_HAVE_RNG && !ENGINE_set_RAND (e, &padlock_rand)) +#endif + ) { + return 0; + } + + /* Everything looks good */ + return 1; +} + #else /* !COMPILE_HW_PADLOCK */ #ifndef OPENSSL_NO_DYNAMIC_ENGINE OPENSSL_EXPORT -- 1.7.0.4