ecp_curves: Refactored `mbedtls_ecp_mod_p255`.

This patch introduces following methods, as implemented in the
design prototype, and updates them to utilise the _core methods
available for multiplication and addition.
* `mbedtls_ecp_mod_p255()`
* `mbedtls_ecp_mod_p255_raw()`

An entry has been exposed in the `ecp_invasive.h` header
to facilitate testing.

Signed-off-by: Minos Galanakis <minos.galanakis@arm.com>
diff --git a/library/ecp_curves.c b/library/ecp_curves.c
index 6573f89..98e2f95 100644
--- a/library/ecp_curves.c
+++ b/library/ecp_curves.c
@@ -4604,6 +4604,8 @@
 /* Additional forward declarations */
 #if defined(MBEDTLS_ECP_DP_CURVE25519_ENABLED)
 static int ecp_mod_p255(mbedtls_mpi *);
+MBEDTLS_STATIC_TESTABLE
+int mbedtls_ecp_mod_p255_raw(mbedtls_mpi_uint *X, size_t X_limbs);
 #endif
 #if defined(MBEDTLS_ECP_DP_CURVE448_ENABLED)
 static int ecp_mod_p448(mbedtls_mpi *);
@@ -5417,26 +5419,47 @@
  */
 static int ecp_mod_p255(mbedtls_mpi *N)
 {
-    mbedtls_mpi_uint Mp[P255_WIDTH];
+    int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
+    size_t expected_width = 2 * ((256 + biL - 1) / biL);
+    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(N, expected_width));
+    ret = mbedtls_ecp_mod_p255_raw(N->p, expected_width);
+cleanup:
+    return ret;
+}
 
-    /* Helper references for top part of N */
-    mbedtls_mpi_uint * const NT_p = N->p + P255_WIDTH;
-    const size_t NT_n = N->n - P255_WIDTH;
-    if (N->n <= P255_WIDTH) {
+MBEDTLS_STATIC_TESTABLE
+int mbedtls_ecp_mod_p255_raw(mbedtls_mpi_uint *X, size_t X_Limbs)
+{
+    mbedtls_mpi_uint carry[P255_WIDTH];
+    memset(carry, 0, sizeof(mbedtls_mpi_uint) * P255_WIDTH);
+
+    if (X_Limbs > 2*P255_WIDTH) {
+        X_Limbs = 2*P255_WIDTH;
+    } else if (X_Limbs < P255_WIDTH) {
         return 0;
     }
-    if (NT_n > P255_WIDTH) {
-        return MBEDTLS_ERR_ECP_BAD_INPUT_DATA;
+
+    /* Step 1: Reduction to P255_WIDTH limbs */
+    if (X_Limbs > P255_WIDTH) {
+        /* Helper references for top part of N */
+        mbedtls_mpi_uint * const NT_p = X + P255_WIDTH;
+        const size_t NT_n = X_Limbs - P255_WIDTH;
+
+        /* N = A0 + 38 * A1, capture carry out */
+        carry[0] = mbedtls_mpi_core_mla(X, P255_WIDTH, NT_p, NT_n, 38);
+        /* Clear top part */
+        memset(NT_p, 0, sizeof(mbedtls_mpi_uint) * NT_n);
     }
 
-    /* Split N as N + 2^256 M */
-    memcpy(Mp,   NT_p, sizeof(mbedtls_mpi_uint) * NT_n);
-    memset(NT_p, 0,    sizeof(mbedtls_mpi_uint) * NT_n);
+    /* Step 2: Reduce to <p
+     * Split as A0 + 2^255*c, with c a scalar, and compute A0 + 19*c */
+    carry[0] <<= 1;
+    carry[0] += (X[P255_WIDTH-1] >> (biL - 1));
+    carry[0] *= 19;
 
-    /* N = A0 + 38 * A1 */
-    mbedtls_mpi_core_mla(N->p, P255_WIDTH + 1,
-                         Mp, NT_n,
-                         38);
+    /* Clear top bit */
+    X[P255_WIDTH-1] <<= 1; X[P255_WIDTH-1] >>= 1;
+    (void) mbedtls_mpi_core_add(X, X, &carry[0], P255_WIDTH);
 
     return 0;
 }