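Perf improvement in memcpy_if

When MBEDTLS_CT_SIZE_64 is defined, use 64-bit masks and, where
MBEDTLS_EFFICIENT_UNALIGNED_ACCESS is also defined, copy 8 bytes per
loop iteration instead of 4. Other builds keep the existing 32-bit path.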

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
diff --git a/library/constant_time.c b/library/constant_time.c
index 86cc066..6c7ef56 100644
--- a/library/constant_time.c
+++ b/library/constant_time.c
@@ -152,8 +152,13 @@
                           const unsigned char *src2,
                           size_t len)
 {
+#if defined(MBEDTLS_CT_SIZE_64)
+    const uint64_t mask     = (uint64_t) condition;
+    const uint64_t not_mask = (uint64_t) ~mbedtls_ct_compiler_opaque(condition);
+#else
     const uint32_t mask     = (uint32_t) condition;
     const uint32_t not_mask = (uint32_t) ~mbedtls_ct_compiler_opaque(condition);
+#endif
 
     /* If src2 is NULL, setup src2 so that we read from the destination address.
      *
@@ -167,11 +172,19 @@
     /* dest[i] = c1 == c2 ? src[i] : dest[i] */
     size_t i = 0;
 #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
+#if defined(MBEDTLS_CT_SIZE_64)
+    for (; (i + 8) <= len; i += 8) {
+        uint64_t a = mbedtls_get_unaligned_uint64(src1 + i) & mask;
+        uint64_t b = mbedtls_get_unaligned_uint64(src2 + i) & not_mask;
+        mbedtls_put_unaligned_uint64(dest + i, a | b);
+    }
+#else
     for (; (i + 4) <= len; i += 4) {
         uint32_t a = mbedtls_get_unaligned_uint32(src1 + i) & mask;
         uint32_t b = mbedtls_get_unaligned_uint32(src2 + i) & not_mask;
         mbedtls_put_unaligned_uint32(dest + i, a | b);
     }
+#endif /* defined(MBEDTLS_CT_SIZE_64) */
 #endif /* MBEDTLS_EFFICIENT_UNALIGNED_ACCESS */
     for (; i < len; i++) {
         dest[i] = (src1[i] & mask) | (src2[i] & not_mask);