Merge pull request #7051 from gabor-mezei-arm/6376_Secp521r1_fast_reduction

Add a raw entry point to Secp521r1 fast reduction
diff --git a/library/ecp_curves.c b/library/ecp_curves.c
index 2a97b8c..1a027d6 100644
--- a/library/ecp_curves.c
+++ b/library/ecp_curves.c
@@ -4584,6 +4584,8 @@
 #endif
 #if defined(MBEDTLS_ECP_DP_SECP521R1_ENABLED)
 static int ecp_mod_p521(mbedtls_mpi *);
+MBEDTLS_STATIC_TESTABLE
+int mbedtls_ecp_mod_p521_raw(mbedtls_mpi_uint *N_p, size_t N_n);
 #endif
 
 #define NIST_MODP(P)      grp->modp = ecp_mod_ ## P;
@@ -5189,11 +5191,6 @@
           MBEDTLS_ECP_DP_SECP384R1_ENABLED */
 
 #if defined(MBEDTLS_ECP_DP_SECP521R1_ENABLED)
-/*
- * Here we have an actual Mersenne prime, so things are more straightforward.
- * However, chunks are aligned on a 'weird' boundary (521 bits).
- */
-
 /* Size of p521 in terms of mbedtls_mpi_uint */
 #define P521_WIDTH      (521 / 8 / sizeof(mbedtls_mpi_uint) + 1)
 
@@ -5201,48 +5198,81 @@
 #define P521_MASK       0x01FF
 
 /*
- * Fast quasi-reduction modulo p521 (FIPS 186-3 D.2.5)
- * Write N as A1 + 2^521 A0, return A0 + A1
+ * Fast quasi-reduction modulo p521 = 2^521 - 1 (FIPS 186-3 D.2.5)
  */
 static int ecp_mod_p521(mbedtls_mpi *N)
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    size_t i;
-    mbedtls_mpi M;
-    mbedtls_mpi_uint Mp[P521_WIDTH + 1];
-    /* Worst case for the size of M is when mbedtls_mpi_uint is 16 bits:
-     * we need to hold bits 513 to 1056, which is 34 limbs, that is
-     * P521_WIDTH + 1. Otherwise P521_WIDTH is enough. */
-
-    if (N->n < P521_WIDTH) {
-        return 0;
-    }
-
-    /* M = A1 */
-    M.s = 1;
-    M.n = N->n - (P521_WIDTH - 1);
-    if (M.n > P521_WIDTH + 1) {
-        M.n = P521_WIDTH + 1;
-    }
-    M.p = Mp;
-    memcpy(Mp, N->p + P521_WIDTH - 1, M.n * sizeof(mbedtls_mpi_uint));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_shift_r(&M, 521 % (8 * sizeof(mbedtls_mpi_uint))));
-
-    /* N = A0 */
-    N->p[P521_WIDTH - 1] &= P521_MASK;
-    for (i = P521_WIDTH; i < N->n; i++) {
-        N->p[i] = 0;
-    }
-
-    /* N = A0 + A1 */
-    MBEDTLS_MPI_CHK(mbedtls_mpi_add_abs(N, N, &M));
-
+    size_t expected_width = 2 * P521_WIDTH;
+    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(N, expected_width));
+    ret = mbedtls_ecp_mod_p521_raw(N->p, expected_width);
 cleanup:
     return ret;
 }
 
+MBEDTLS_STATIC_TESTABLE
+int mbedtls_ecp_mod_p521_raw(mbedtls_mpi_uint *X, size_t X_limbs)
+{
+    mbedtls_mpi_uint carry = 0;
+
+    if (X_limbs != 2 * P521_WIDTH || X[2 * P521_WIDTH - 1] != 0) {
+        return MBEDTLS_ERR_ECP_BAD_INPUT_DATA;
+    }
+
+    /* Step 1: Reduction to P521_WIDTH limbs */
+    /* Helper references for bottom part of X */
+    mbedtls_mpi_uint *X0 = X;
+    size_t X0_limbs = P521_WIDTH;
+    /* Helper references for top part of X */
+    mbedtls_mpi_uint *X1 = X + X0_limbs;
+    size_t X1_limbs = X_limbs - X0_limbs;
+    /* Split X as X0 + 2^P521_WIDTH X1 and compute X0 + 2^(biL - 9) X1.
+     * (We are using that 2^P521_WIDTH = 2^(512 + biL) and that
+     * 2^(512 + biL) X1 = 2^(biL - 9) X1 mod P521.)
+     * The high order limb of the result will be held in carry and the rest
+     * in X0 (that is the result will be represented as
+     * 2^P521_WIDTH carry + X0).
+     *
+     * Also, note that the resulting carry is either 0 or 1:
+     * X0 < 2^P521_WIDTH = 2^(512 + biL) and X1 < 2^(P521_WIDTH-biL) = 2^512
+     * therefore
+     * X0 + 2^(biL - 9) X1 < 2^(512 + biL) + 2^(512 + biL - 9)
+     * which in turn is less than 2 * 2^(512 + biL).
+     */
+    mbedtls_mpi_uint shift = ((mbedtls_mpi_uint) 1u) << (biL - 9);
+    carry = mbedtls_mpi_core_mla(X0, X0_limbs, X1, X1_limbs, shift);
+    /* Set X to X0 (by clearing the top part). */
+    memset(X1, 0, X1_limbs * sizeof(mbedtls_mpi_uint));
+
+    /* Step 2: Reduction modulo P521
+     *
+     * At this point X is reduced to P521_WIDTH limbs. What remains is to add
+     * the carry (that is 2^P521_WIDTH carry) and to reduce mod P521. */
+
+    /* 2^P521_WIDTH carry = 2^(512 + biL) carry = 2^(biL - 9) carry mod P521.
+     * Also, recall that carry is either 0 or 1. */
+    mbedtls_mpi_uint addend = carry << (biL - 9);
+    /* Keep the top 9 bits and reduce the rest, using 2^521 = 1 mod P521. */
+    addend += (X[P521_WIDTH - 1] >> 9);
+    X[P521_WIDTH - 1] &= P521_MASK;
+
+    /* Resuse the top part of X (already zeroed) as a helper array for
+     * carrying out the addition. */
+    mbedtls_mpi_uint *addend_arr = X + P521_WIDTH;
+    addend_arr[0] = addend;
+    (void) mbedtls_mpi_core_add(X, X, addend_arr, P521_WIDTH);
+    /* Both addends were less than P521 therefore X < 2 * P521. (This also means
+     * that the result fit in P521_WIDTH limbs and there won't be any carry.) */
+
+    /* Clear the reused part of X. */
+    addend_arr[0] = 0;
+
+    return 0;
+}
+
 #undef P521_WIDTH
 #undef P521_MASK
+
 #endif /* MBEDTLS_ECP_DP_SECP521R1_ENABLED */
 
 #endif /* MBEDTLS_ECP_NIST_OPTIM */
diff --git a/library/ecp_invasive.h b/library/ecp_invasive.h
index 3ee238e..3d1321c 100644
--- a/library/ecp_invasive.h
+++ b/library/ecp_invasive.h
@@ -95,6 +95,28 @@
 
 #endif /* MBEDTLS_ECP_DP_SECP192R1_ENABLED */
 
+#if defined(MBEDTLS_ECP_DP_SECP521R1_ENABLED)
+
+/** Fast quasi-reduction modulo p521 = 2^521 - 1 (FIPS 186-3 D.2.5)
+ *
+ * \param[in,out]   X       The address of the MPI to be converted.
+ *                          Must have twice as many limbs as the modulus
+ *                          (the modulus is 521 bits long). Upon return this
+ *                          holds the reduced value. The reduced value is
+ *                          in range `0 <= X < 2 * N` (where N is the modulus).
+ *                          and its the bitlength is one plus the bitlength
+ *                          of the modulus.
+ * \param[in]       X_limbs The length of \p X in limbs.
+ *
+ * \return          \c 0 on success.
+ * \return          #MBEDTLS_ERR_ECP_BAD_INPUT_DATA if \p X_limbs does not have
+ *                  twice as many limbs as the modulus.
+ */
+MBEDTLS_STATIC_TESTABLE
+int mbedtls_ecp_mod_p521_raw(mbedtls_mpi_uint *X, size_t X_limbs);
+
+#endif /* MBEDTLS_ECP_DP_SECP521R1_ENABLED */
+
 #endif /* MBEDTLS_TEST_HOOKS && MBEDTLS_ECP_C */
 
 #endif /* MBEDTLS_ECP_INVASIVE_H */
diff --git a/scripts/mbedtls_dev/ecp.py b/scripts/mbedtls_dev/ecp.py
index 93cd212..6370d25 100644
--- a/scripts/mbedtls_dev/ecp.py
+++ b/scripts/mbedtls_dev/ecp.py
@@ -75,3 +75,94 @@
     @property
     def is_valid(self) -> bool:
         return True
+
+class EcpP521R1Raw(bignum_common.ModOperationCommon,
+                   EcpTarget):
+    """Test cases for ecp quasi_reduction()."""
+    test_function = "ecp_mod_p521_raw"
+    test_name = "ecp_mod_p521_raw"
+    input_style = "arch_split"
+    arity = 1
+
+    moduli = [("01ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+               "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff")
+             ] # type: List[str]
+
+    input_values = [
+        "0", "1",
+
+        # Corner case: maximum canonical P521 multiplication result
+        ("0003ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+         "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+         "fffff800"
+         "0000000000000000000000000000000000000000000000000000000000000000"
+         "0000000000000000000000000000000000000000000000000000000000000004"),
+
+        # Test case for overflow during addition
+        ("0001efffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+         "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+         "000001ef"
+         "0000000000000000000000000000000000000000000000000000000000000000"
+         "000000000000000000000000000000000000000000000000000000000f000000"),
+
+        # First 8 number generated by random.getrandbits(1042) - seed(2,2)
+        ("0003cc2e82523e86feac7eb7dc38f519b91751dacdbd47d364be8049a372db8f"
+         "6e405d93ffed9235288bc781ae66267594c9c9500925e4749b575bd13653f8dd"
+         "9b1f282e"
+         "4067c3584ee207f8da94e3e8ab73738fcf1822ffbc6887782b491044d5e34124"
+         "5c6e433715ba2bdd177219d30e7a269fd95bafc8f2a4d27bdcf4bb99f4bea973"),
+        ("00017052829e07b0829a48d422fe99a22c70501e533c91352d3d854e061b9030"
+         "3b08c6e33c7295782d6c797f8f7d9b782a1be9cd8697bbd0e2520e33e44c5055"
+         "6c71c4a6"
+         "6148a86fe8624fab5186ee32ee8d7ee9770348a05d300cb90706a045defc044a"
+         "09325626e6b58de744ab6cce80877b6f71e1f6d2ef8acd128b4f2fc15f3f57eb"),
+        ("00021f15a7a83ee0761ebfd2bd143fa9b714210c665d7435c1066932f4767f26"
+         "294365b2721dea3bf63f23d0dbe53fcafb2147df5ca495fa5a91c89b97eeab64"
+         "ca2ce6bc"
+         "5d3fd983c34c769fe89204e2e8168561867e5e15bc01bfce6a27e0dfcbf87544"
+         "72154e76e4c11ab2fec3f6b32e8d4b8a8f54f8ceacaab39e83844b40ffa9b9f1"),
+        ("000381bc2a838af8d5c44a4eb3172062d08f1bb2531d6460f0caeef038c89b38"
+         "a8acb5137c9260dc74e088a9b9492f258ebdbfe3eb9ac688b9d39cca91551e82"
+         "59cc60b1"
+         "7604e4b4e73695c3e652c71a74667bffe202849da9643a295a9ac6decbd4d3e2"
+         "d4dec9ef83f0be4e80371eb97f81375eecc1cb6347733e847d718d733ff98ff3"),
+        ("00034816c8c69069134bccd3e1cf4f589f8e4ce0af29d115ef24bd625dd961e6"
+         "830b54fa7d28f93435339774bb1e386c4fd5079e681b8f5896838b769da59b74"
+         "a6c3181c"
+         "81e220df848b1df78feb994a81167346d4c0dca8b4c9e755cc9c3adcf515a823"
+         "4da4daeb4f3f87777ad1f45ae9500ec9c5e2486c44a4a8f69dc8db48e86ec9c6"),
+        ("000397846c4454b90f756132e16dce72f18e859835e1f291d322a7353ead4efe"
+         "440e2b4fda9c025a22f1a83185b98f5fc11e60de1b343f52ea748db9e020307a"
+         "aeb6db2c"
+         "3a038a709779ac1f45e9dd320c855fdfa7251af0930cdbd30f0ad2a81b2d19a2"
+         "beaa14a7ff3fe32a30ffc4eed0a7bd04e85bfcdd0227eeb7b9d7d01f5769da05"),
+        ("00002c3296e6bc4d62b47204007ee4fab105d83e85e951862f0981aebc1b00d9"
+         "2838e766ef9b6bf2d037fe2e20b6a8464174e75a5f834da70569c018eb2b5693"
+         "babb7fbb"
+         "0a76c196067cfdcb11457d9cf45e2fa01d7f4275153924800600571fac3a5b26"
+         "3fdf57cd2c0064975c3747465cc36c270e8a35b10828d569c268a20eb78ac332"),
+        ("00009d23b4917fc09f20dbb0dcc93f0e66dfe717c17313394391b6e2e6eacb0f"
+         "0bb7be72bd6d25009aeb7fa0c4169b148d2f527e72daf0a54ef25c0707e33868"
+         "7d1f7157"
+         "5653a45c49390aa51cf5192bbf67da14be11d56ba0b4a2969d8055a9f03f2d71"
+         "581d8e830112ff0f0948eccaf8877acf26c377c13f719726fd70bddacb4deeec"),
+
+        # Next 2 number generated by random.getrandbits(521)
+        ("12b84ae65e920a63ac1f2b64df6dff07870c9d531ae72a47403063238da1a1fe"
+         "3f9d6a179fa50f96cd4aff9261aa92c0e6f17ec940639bc2ccdf572df00790813e3"),
+        ("166049dd332a73fa0b26b75196cf87eb8a09b27ec714307c68c425424a1574f1"
+         "eedf5b0f16cdfdb839424d201e653f53d6883ca1c107ca6e706649889c0c7f38608")
+    ]
+
+    @property
+    def arg_a(self) -> str:
+        # Number of limbs: 2 * N
+        return super().format_arg('{:x}'.format(self.int_a)).zfill(2 * self.hex_digits)
+
+    def result(self) -> List[str]:
+        result = self.int_a % self.int_n
+        return [self.format_result(result)]
+
+    @property
+    def is_valid(self) -> bool:
+        return True
diff --git a/tests/suites/test_suite_ecp.function b/tests/suites/test_suite_ecp.function
index b9d2b5e..4e74d9b 100644
--- a/tests/suites/test_suite_ecp.function
+++ b/tests/suites/test_suite_ecp.function
@@ -4,8 +4,8 @@
 #include "mbedtls/ecdh.h"
 
 #include "bignum_core.h"
-#include "bignum_mod_raw_invasive.h"
 #include "ecp_invasive.h"
+#include "bignum_mod_raw_invasive.h"
 
 #if defined(MBEDTLS_TEST_HOOKS) &&                  \
     (defined(MBEDTLS_ECP_DP_SECP224R1_ENABLED) ||  \
@@ -1344,3 +1344,46 @@
     mbedtls_free(N);
 }
 /* END_CASE */
+
+/* BEGIN_CASE depends_on:MBEDTLS_TEST_HOOKS */
+void ecp_mod_p521_raw(char *input_N,
+                      char *input_X,
+                      char *result)
+{
+    mbedtls_mpi_uint *X = NULL;
+    mbedtls_mpi_uint *N = NULL;
+    mbedtls_mpi_uint *res = NULL;
+    size_t limbs_X;
+    size_t limbs_N;
+    size_t limbs_res;
+
+    mbedtls_mpi_mod_modulus m;
+    mbedtls_mpi_mod_modulus_init(&m);
+
+    TEST_EQUAL(mbedtls_test_read_mpi_core(&X,   &limbs_X,   input_X), 0);
+    TEST_EQUAL(mbedtls_test_read_mpi_core(&N,   &limbs_N,   input_N), 0);
+    TEST_EQUAL(mbedtls_test_read_mpi_core(&res, &limbs_res, result),  0);
+
+    size_t limbs = limbs_N;
+    size_t bytes = limbs * sizeof(mbedtls_mpi_uint);
+
+    TEST_EQUAL(limbs_X, 2 * limbs);
+    TEST_EQUAL(limbs_res, limbs);
+
+    TEST_EQUAL(mbedtls_mpi_mod_modulus_setup(
+                   &m, N, limbs,
+                   MBEDTLS_MPI_MOD_REP_MONTGOMERY), 0);
+
+    TEST_EQUAL(mbedtls_ecp_mod_p521_raw(X, limbs_X), 0);
+    TEST_LE_U(mbedtls_mpi_core_bitlen(X, limbs_X), 522);
+    mbedtls_mpi_mod_raw_fix_quasi_reduction(X, &m);
+    ASSERT_COMPARE(X, bytes, res, bytes);
+
+exit:
+    mbedtls_free(X);
+    mbedtls_free(res);
+
+    mbedtls_mpi_mod_modulus_free(&m);
+    mbedtls_free(N);
+}
+/* END_CASE */