SECLIB-667: Accelerate SHA-256 with A64 crypto extensions

Provide an additional pair of #defines, MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
and MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY. At most one of them may be
defined. If either is used, the code must be compiled with
-march=armv8-a+crypto.
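
For example (illustrative only; how the option is set depends on the
build configuration in use), run-time detection could be enabled with:

    /* In the mbed TLS configuration header: pick at most one of these */
    #define MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT  /* run-time detection */
    /* #define MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY */  /* no C fallback */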

The MBEDTLS_SHA256_PROCESS_ALT and MBEDTLS_SHA256_ALT mechanisms
continue to work and are mutually exclusive with the A64_CRYPTO options.
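
A compile-time guard along the following lines enforces that exclusion
(a sketch only; the exact check and its location are not shown in this
patch):

    #if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) || \
        defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
    #if defined(MBEDTLS_SHA256_ALT) || defined(MBEDTLS_SHA256_PROCESS_ALT)
    #error "SHA-256 ALT mechanisms cannot be combined with A64_CRYPTO options"
    #endif
    #endif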

There should be minimal code size impact if no A64_CRYPTO option is set.

Signed-off-by: Tom Cosgrove <tom.cosgrove@arm.com>
diff --git a/library/sha256.c b/library/sha256.c
index c3573f8..0db5f4d 100644
--- a/library/sha256.c
+++ b/library/sha256.c
@@ -44,12 +44,97 @@
 #endif /* MBEDTLS_PLATFORM_C */
 #endif /* MBEDTLS_SELF_TEST */
 
+#if defined(__aarch64__)
+#  if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) || \
+      defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
+#    include <arm_neon.h>
+#  endif
+#  if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) && defined(__linux__)
+#    include <sys/auxv.h>
+#  endif
+#else
+#  undef MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY
+#  undef MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
+#endif
+
+#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT)
+/*
+ * Capability detection code comes early, so we can disable
+ * MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT if no detection mechanism is found
+ */
+#if defined(HWCAP_SHA2)
+static int mbedtls_a64_crypto_sha256_check_support( void )
+{
+    return( ( getauxval( AT_HWCAP ) & HWCAP_SHA2 ) ? 1 : 0 );
+}
+#elif defined(__APPLE__)
+static int mbedtls_a64_crypto_sha256_check_support( void )
+{
+    return( 1 );
+}
+#elif defined(__unix__) && defined(SIG_SETMASK)
+/* Detection with SIGILL, setjmp() and longjmp() */
+#include <signal.h>
+#include <setjmp.h>
+
+#ifndef asm
+#define asm __asm__
+#endif
+
+static jmp_buf return_from_sigill;
+
+/*
+ * A64 SHA256 support detection via SIGILL
+ */
+static void sigill_handler( int signal )
+{
+    (void) signal;
+    longjmp( return_from_sigill, 1 );
+}
+
+static int mbedtls_a64_crypto_sha256_check_support( void )
+{
+    struct sigaction old_action, new_action;
+
+    sigset_t old_mask;
+    if( sigprocmask( 0, NULL, &old_mask ) )  /* read the current mask */
+        return( 0 );
+
+    sigemptyset( &new_action.sa_mask );
+    new_action.sa_flags = 0;
+    new_action.sa_handler = sigill_handler;
+
+    sigaction( SIGILL, &new_action, &old_action );
+
+    static int ret = 0;    /* static, so it keeps its value across longjmp() */
+
+    if( setjmp( return_from_sigill ) == 0 )        /* First return only */
+    {
+        /* If this traps, we will return a second time from setjmp() with 1 */
+        asm( "sha256h q0, q0, v0.4s" : : : "v0" );
+        ret = 1;
+    }
+
+    sigaction( SIGILL, &old_action, NULL );
+    sigprocmask( SIG_SETMASK, &old_mask, NULL );
+
+    return( ret );
+}
+#else
+#warning "No mechanism to detect A64_CRYPTO found, using C code only"
+#undef MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
+#endif  /* HWCAP_SHA2, __APPLE__, __unix__ && SIG_SETMASK */
+
+#endif  /* MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT */
+
 #define SHA256_VALIDATE_RET(cond)                           \
     MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA256_BAD_INPUT_DATA )
 #define SHA256_VALIDATE(cond)  MBEDTLS_INTERNAL_VALIDATE( cond )
 
 #if !defined(MBEDTLS_SHA256_ALT)
 
+#define SHA256_BLOCK_SIZE 64
+
 void mbedtls_sha256_init( mbedtls_sha256_context *ctx )
 {
     SHA256_VALIDATE( ctx != NULL );
@@ -143,6 +228,132 @@
     0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
 };
 
+#endif
+
+#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) || \
+    defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
+
+#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
+#  define mbedtls_internal_sha256_process_many_a64_crypto mbedtls_internal_sha256_process_many
+#  define mbedtls_internal_sha256_process_a64_crypto      mbedtls_internal_sha256_process
+#endif
+
+static size_t mbedtls_internal_sha256_process_many_a64_crypto(
+                  mbedtls_sha256_context *ctx, const uint8_t *msg, size_t len )
+{
+    uint32x4_t abcd = vld1q_u32( &ctx->state[0] );
+    uint32x4_t efgh = vld1q_u32( &ctx->state[4] );
+
+    size_t processed = 0;
+
+    for( ;
+         len >= SHA256_BLOCK_SIZE;
+         processed += SHA256_BLOCK_SIZE,
+               msg += SHA256_BLOCK_SIZE,
+               len -= SHA256_BLOCK_SIZE )
+    {
+        uint32x4_t tmp, abcd_prev;
+
+        uint32x4_t abcd_orig = abcd;
+        uint32x4_t efgh_orig = efgh;
+
+        uint32x4_t sched0 = vld1q_u32( (const uint32_t *)( msg + 16 * 0 ) );
+        uint32x4_t sched1 = vld1q_u32( (const uint32_t *)( msg + 16 * 1 ) );
+        uint32x4_t sched2 = vld1q_u32( (const uint32_t *)( msg + 16 * 2 ) );
+        uint32x4_t sched3 = vld1q_u32( (const uint32_t *)( msg + 16 * 3 ) );
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__  /* Will be true if not defined */
+                                               /* Untested on BE */
+        sched0 = vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( sched0 ) ) );
+        sched1 = vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( sched1 ) ) );
+        sched2 = vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( sched2 ) ) );
+        sched3 = vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( sched3 ) ) );
+#endif
+
+        /* Rounds 0 to 3 */
+        tmp = vaddq_u32( sched0, vld1q_u32( &K[0] ) );
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
+        efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
+
+        /* Rounds 4 to 7 */
+        tmp = vaddq_u32( sched1, vld1q_u32( &K[4] ) );
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
+        efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
+
+        /* Rounds 8 to 11 */
+        tmp = vaddq_u32( sched2, vld1q_u32( &K[8] ) );
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
+        efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
+
+        /* Rounds 12 to 15 */
+        tmp = vaddq_u32( sched3, vld1q_u32( &K[12] ) );
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
+        efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
+
+        for( int t = 16; t < 64; t += 16 )
+        {
+            /* Rounds t to t + 3 */
+            sched0 = vsha256su1q_u32( vsha256su0q_u32( sched0, sched1 ), sched2, sched3 );
+            tmp = vaddq_u32( sched0, vld1q_u32( &K[t] ) );
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
+            efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
+
+            /* Rounds t + 4 to t + 7 */
+            sched1 = vsha256su1q_u32( vsha256su0q_u32( sched1, sched2 ), sched3, sched0 );
+            tmp = vaddq_u32( sched1, vld1q_u32( &K[t + 4] ) );
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
+            efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
+
+            /* Rounds t + 8 to t + 11 */
+            sched2 = vsha256su1q_u32( vsha256su0q_u32( sched2, sched3 ), sched0, sched1 );
+            tmp = vaddq_u32( sched2, vld1q_u32( &K[t + 8] ) );
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
+            efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
+
+            /* Rounds t + 12 to t + 15 */
+            sched3 = vsha256su1q_u32( vsha256su0q_u32( sched3, sched0 ), sched1, sched2 );
+            tmp = vaddq_u32( sched3, vld1q_u32( &K[t + 12] ) );
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
+            efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
+        }
+
+        abcd = vaddq_u32( abcd, abcd_orig );
+        efgh = vaddq_u32( efgh, efgh_orig );
+    }
+
+    vst1q_u32( &ctx->state[0], abcd );
+    vst1q_u32( &ctx->state[4], efgh );
+
+    return( processed );
+}
+
+int mbedtls_internal_sha256_process_a64_crypto( mbedtls_sha256_context *ctx,
+        const unsigned char data[SHA256_BLOCK_SIZE] )
+{
+    return( ( mbedtls_internal_sha256_process_many_a64_crypto( ctx, data,
+                SHA256_BLOCK_SIZE ) == SHA256_BLOCK_SIZE ) ? 0 : -1 );
+}
+
+#endif /* MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT || MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY */
+
+
+#if !defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT)
+#define mbedtls_internal_sha256_process_many_c mbedtls_internal_sha256_process_many
+#define mbedtls_internal_sha256_process_c      mbedtls_internal_sha256_process
+#endif
+
+
+#if !defined(MBEDTLS_SHA256_PROCESS_ALT) && \
+    !defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
+
 #define  SHR(x,n) (((x) & 0xFFFFFFFF) >> (n))
 #define ROTR(x,n) (SHR(x,n) | ((x) << (32 - (n))))
 
@@ -169,8 +380,8 @@
         (d) += local.temp1; (h) = local.temp1 + local.temp2;        \
     } while( 0 )
 
-int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
-                                const unsigned char data[64] )
+int mbedtls_internal_sha256_process_c( mbedtls_sha256_context *ctx,
+                                const unsigned char data[SHA256_BLOCK_SIZE] )
 {
     struct
     {
@@ -257,7 +468,69 @@
     return( 0 );
 }
 
-#endif /* !MBEDTLS_SHA256_PROCESS_ALT */
+#endif /* !MBEDTLS_SHA256_PROCESS_ALT && !MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY */
+
+
+#if !defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
+
+static size_t mbedtls_internal_sha256_process_many_c(
+                  mbedtls_sha256_context *ctx, const uint8_t *data, size_t len )
+{
+    size_t processed = 0;
+
+    while( len >= SHA256_BLOCK_SIZE )
+    {
+        if( mbedtls_internal_sha256_process_c( ctx, data ) != 0 )
+            return( 0 );
+
+        data += SHA256_BLOCK_SIZE;
+        len  -= SHA256_BLOCK_SIZE;
+
+        processed += SHA256_BLOCK_SIZE;
+    }
+
+    return( processed );
+}
+
+#endif /* !MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY */
+
+
+#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT)
+
+static int mbedtls_a64_crypto_sha256_has_support( void )
+{
+    static int done = 0;
+    static int supported = 0;
+
+    if( !done )
+    {
+        supported = mbedtls_a64_crypto_sha256_check_support();
+        done = 1;
+    }
+
+    return( supported );
+}
+
+static size_t mbedtls_internal_sha256_process_many( mbedtls_sha256_context *ctx,
+                  const uint8_t *msg, size_t len )
+{
+    if( mbedtls_a64_crypto_sha256_has_support() )
+        return( mbedtls_internal_sha256_process_many_a64_crypto( ctx, msg, len ) );
+    else
+        return( mbedtls_internal_sha256_process_many_c( ctx, msg, len ) );
+}
+
+int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
+        const unsigned char data[SHA256_BLOCK_SIZE] )
+{
+    if( mbedtls_a64_crypto_sha256_has_support() )
+        return( mbedtls_internal_sha256_process_a64_crypto( ctx, data ) );
+    else
+        return( mbedtls_internal_sha256_process_c( ctx, data ) );
+}
+
+#endif /* MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT */
+
 
 /*
  * SHA-256 process buffer
@@ -277,7 +550,7 @@
         return( 0 );
 
     left = ctx->total[0] & 0x3F;
-    fill = 64 - left;
+    fill = SHA256_BLOCK_SIZE - left;
 
     ctx->total[0] += (uint32_t) ilen;
     ctx->total[0] &= 0xFFFFFFFF;
@@ -297,13 +570,15 @@
         left = 0;
     }
 
-    while( ilen >= 64 )
+    while( ilen >= SHA256_BLOCK_SIZE )
     {
-        if( ( ret = mbedtls_internal_sha256_process( ctx, input ) ) != 0 )
-            return( ret );
+        size_t processed =
+                    mbedtls_internal_sha256_process_many( ctx, input, ilen );
+        if( processed < SHA256_BLOCK_SIZE )
+            return( MBEDTLS_ERR_ERROR_GENERIC_ERROR );
 
-        input += 64;
-        ilen  -= 64;
+        input += processed;
+        ilen  -= processed;
     }
 
     if( ilen > 0 )
@@ -340,7 +615,7 @@
     else
     {
         /* We'll need an extra block */
-        memset( ctx->buffer + used, 0, 64 - used );
+        memset( ctx->buffer + used, 0, SHA256_BLOCK_SIZE - used );
 
         if( ( ret = mbedtls_internal_sha256_process( ctx, ctx->buffer ) ) != 0 )
             return( ret );