Merge pull request #5377 from hanno-arm/ecp_add_mixed_fewer_mpis

Minor improvements to ECC arithmetic subroutines
diff --git a/library/ecp.c b/library/ecp.c
index 0212069..ba76abb 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -342,6 +342,18 @@
 
 #endif /* MBEDTLS_ECP_RESTARTABLE */
 
+static void mpi_init_many( mbedtls_mpi *arr, size_t size )
+{
+    while( size-- )
+        mbedtls_mpi_init( arr++ );
+}
+
+static void mpi_free_many( mbedtls_mpi *arr, size_t size )
+{
+    while( size-- )
+        mbedtls_mpi_free( arr++ );
+}
+
 /*
  * List of supported curves:
  *  - internal ID
@@ -1093,9 +1105,11 @@
  * Reduce a mbedtls_mpi mod p in-place, to use after mbedtls_mpi_sub_mpi
  * N->s < 0 is a very fast test, which fails only if N is 0
  */
-#define MOD_SUB( N )                                                    \
-    while( (N).s < 0 && mbedtls_mpi_cmp_int( &(N), 0 ) != 0 )           \
-        MBEDTLS_MPI_CHK( mbedtls_mpi_add_mpi( &(N), &(N), &grp->P ) )
+#define MOD_SUB( N )                                                          \
+    do {                                                                      \
+        while( (N)->s < 0 && mbedtls_mpi_cmp_int( (N), 0 ) != 0 )             \
+            MBEDTLS_MPI_CHK( mbedtls_mpi_add_mpi( (N), (N), &grp->P ) );      \
+    } while( 0 )
 
 #if ( defined(MBEDTLS_ECP_SHORT_WEIERSTRASS_ENABLED) && \
       !( defined(MBEDTLS_ECP_NO_FALLBACK) && \
@@ -1111,7 +1125,7 @@
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( X, A, B ) );
-    MOD_SUB( *X );
+    MOD_SUB( X );
 cleanup:
     return( ret );
 }
@@ -1122,9 +1136,9 @@
  * We known P, N and the result are positive, so sub_abs is correct, and
  * a bit faster.
  */
-#define MOD_ADD( N )                                                    \
-    while( mbedtls_mpi_cmp_mpi( &(N), &grp->P ) >= 0 )                  \
-        MBEDTLS_MPI_CHK( mbedtls_mpi_sub_abs( &(N), &(N), &grp->P ) )
+#define MOD_ADD( N )                                                   \
+    while( mbedtls_mpi_cmp_mpi( (N), &grp->P ) >= 0 )                  \
+        MBEDTLS_MPI_CHK( mbedtls_mpi_sub_abs( (N), (N), &grp->P ) )
 
 static inline int mbedtls_mpi_add_mod( const mbedtls_ecp_group *grp,
                                        mbedtls_mpi *X,
@@ -1133,11 +1147,40 @@
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     MBEDTLS_MPI_CHK( mbedtls_mpi_add_mpi( X, A, B ) );
-    MOD_ADD( *X );
+    MOD_ADD( X );
 cleanup:
     return( ret );
 }
 
+static inline int mbedtls_mpi_mul_int_mod( const mbedtls_ecp_group *grp,
+                                           mbedtls_mpi *X,
+                                           const mbedtls_mpi *A,
+                                           mbedtls_mpi_uint c )
+{
+    int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
+
+    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_int( X, A, c ) );
+    MOD_ADD( X );
+cleanup:
+    return( ret );
+}
+
+static inline int mbedtls_mpi_sub_int_mod( const mbedtls_ecp_group *grp,
+                                           mbedtls_mpi *X,
+                                           const mbedtls_mpi *A,
+                                           mbedtls_mpi_uint c )
+{
+    int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
+
+    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_int( X, A, c ) );
+    MOD_SUB( X );
+cleanup:
+    return( ret );
+}
+
+#define MPI_ECP_SUB_INT( X, A, c )             \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_int_mod( grp, X, A, c ) )
+
 #if defined(MBEDTLS_ECP_SHORT_WEIERSTRASS_ENABLED) && \
     !( defined(MBEDTLS_ECP_NO_FALLBACK) && \
        defined(MBEDTLS_ECP_DOUBLE_JAC_ALT) && \
@@ -1148,12 +1191,77 @@
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     MBEDTLS_MPI_CHK( mbedtls_mpi_shift_l( X, count ) );
-    MOD_ADD( *X );
+    MOD_ADD( X );
 cleanup:
     return( ret );
 }
 #endif /* All functions referencing mbedtls_mpi_shift_l_mod() are alt-implemented without fallback */
 
+/*
+ * Macro wrappers around ECP modular arithmetic
+ *
+ * Currently, these wrappers are defined via the bignum module.
+ */
+
+#define MPI_ECP_ADD( X, A, B )                                                  \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_add_mod( grp, X, A, B ) )
+
+#define MPI_ECP_SUB( X, A, B )                                                  \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, X, A, B ) )
+
+#define MPI_ECP_MUL( X, A, B )                                                  \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, X, A, B ) )
+
+#define MPI_ECP_SQR( X, A )                                                     \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, X, A, A ) )
+
+#define MPI_ECP_MUL_INT( X, A, c )                                              \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_int_mod( grp, X, A, c ) )
+
+#define MPI_ECP_INV( dst, src )                                                 \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_inv_mod( (dst), (src), &grp->P ) )
+
+#define MPI_ECP_MOV( X, A )                                                     \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( X, A ) )
+
+#define MPI_ECP_SHIFT_L( X, count )                                             \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_l_mod( grp, X, count ) )
+
+#define MPI_ECP_LSET( X, c )                                                    \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_lset( X, c ) )
+
+#define MPI_ECP_CMP_INT( X, c )                                                 \
+    mbedtls_mpi_cmp_int( X, c )
+
+#define MPI_ECP_CMP( X, Y )                                                     \
+    mbedtls_mpi_cmp_mpi( X, Y )
+
+/* Needs f_rng, p_rng to be defined. */
+#define MPI_ECP_RAND( X )                                                       \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_random( (X), 2, &grp->P, f_rng, p_rng ) )
+
+/* Conditional negation
+ * Needs grp and a temporary MPI tmp to be defined. */
+#define MPI_ECP_COND_NEG( X, cond )                                        \
+    do                                                                     \
+    {                                                                      \
+        unsigned char nonzero = mbedtls_mpi_cmp_int( (X), 0 ) != 0;        \
+        MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &tmp, &grp->P, (X) ) );      \
+        MBEDTLS_MPI_CHK( mbedtls_mpi_safe_cond_assign( (X), &tmp,          \
+                                                       nonzero & cond ) ); \
+    } while( 0 )
+
+#define MPI_ECP_NEG( X ) MPI_ECP_COND_NEG( (X), 1 )
+
+#define MPI_ECP_VALID( X )                      \
+    ( (X)->p != NULL )
+
+#define MPI_ECP_COND_ASSIGN( X, Y, cond )       \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_safe_cond_assign( (X), (Y), (cond) ) )
+
+#define MPI_ECP_COND_SWAP( X, Y, cond )       \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_safe_cond_swap( (X), (Y), (cond) ) )
+
 #if defined(MBEDTLS_ECP_SHORT_WEIERSTRASS_ENABLED)
 /*
  * For curves in short Weierstrass form, we do all the internal operations in
@@ -1169,7 +1277,7 @@
  */
 static int ecp_normalize_jac( const mbedtls_ecp_group *grp, mbedtls_ecp_point *pt )
 {
-    if( mbedtls_mpi_cmp_int( &pt->Z, 0 ) == 0 )
+    if( MPI_ECP_CMP_INT( &pt->Z, 0 ) == 0 )
         return( 0 );
 
 #if defined(MBEDTLS_ECP_NORMALIZE_JAC_ALT)
@@ -1181,30 +1289,20 @@
     return( MBEDTLS_ERR_ECP_FEATURE_UNAVAILABLE );
 #else
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    mbedtls_mpi Zi, ZZi;
-    mbedtls_mpi_init( &Zi ); mbedtls_mpi_init( &ZZi );
+    mbedtls_mpi T;
+    mbedtls_mpi_init( &T );
 
-    /*
-     * X = X / Z^2  mod p
-     */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_inv_mod( &Zi,      &pt->Z,     &grp->P ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &ZZi,     &Zi,        &Zi     ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &pt->X,   &pt->X,     &ZZi    ) );
+    MPI_ECP_INV( &T,       &pt->Z );          /* T   <-          1 / Z   */
+    MPI_ECP_MUL( &pt->Y,   &pt->Y,     &T );  /* Y'  <- Y*T    = Y / Z   */
+    MPI_ECP_SQR( &T,       &T             );  /* T   <- T^2    = 1 / Z^2 */
+    MPI_ECP_MUL( &pt->X,   &pt->X,     &T );  /* X   <- X  * T = X / Z^2 */
+    MPI_ECP_MUL( &pt->Y,   &pt->Y,     &T );  /* Y'' <- Y' * T = Y / Z^3 */
 
-    /*
-     * Y = Y / Z^3  mod p
-     */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &pt->Y,   &pt->Y,     &ZZi    ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &pt->Y,   &pt->Y,     &Zi     ) );
-
-    /*
-     * Z = 1
-     */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &pt->Z, 1 ) );
+    MPI_ECP_LSET( &pt->Z, 1 );
 
 cleanup:
 
-    mbedtls_mpi_free( &Zi ); mbedtls_mpi_free( &ZZi );
+    mbedtls_mpi_free( &T );
 
     return( ret );
 #endif /* !defined(MBEDTLS_ECP_NO_FALLBACK) || !defined(MBEDTLS_ECP_NORMALIZE_JAC_ALT) */
@@ -1237,52 +1335,58 @@
 #else
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     size_t i;
-    mbedtls_mpi *c, u, Zi, ZZi;
+    mbedtls_mpi *c, t;
 
     if( ( c = mbedtls_calloc( T_size, sizeof( mbedtls_mpi ) ) ) == NULL )
         return( MBEDTLS_ERR_ECP_ALLOC_FAILED );
 
-    for( i = 0; i < T_size; i++ )
-        mbedtls_mpi_init( &c[i] );
+    mbedtls_mpi_init( &t );
 
-    mbedtls_mpi_init( &u ); mbedtls_mpi_init( &Zi ); mbedtls_mpi_init( &ZZi );
-
+    mpi_init_many( c, T_size );
     /*
-     * c[i] = Z_0 * ... * Z_i
+     * c[i] = Z_0 * ... * Z_i,   i = 0,..,n := T_size-1
      */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &c[0], &T[0]->Z ) );
+    MPI_ECP_MOV( &c[0], &T[0]->Z );
     for( i = 1; i < T_size; i++ )
     {
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &c[i], &c[i-1], &T[i]->Z ) );
+        MPI_ECP_MUL( &c[i], &c[i-1], &T[i]->Z );
     }
 
     /*
-     * u = 1 / (Z_0 * ... * Z_n) mod P
+     * c[n] = 1 / (Z_0 * ... * Z_n) mod P
      */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_inv_mod( &u, &c[T_size-1], &grp->P ) );
+    MPI_ECP_INV( &c[T_size-1], &c[T_size-1] );
 
     for( i = T_size - 1; ; i-- )
     {
-        /*
-         * Zi = 1 / Z_i mod p
-         * u = 1 / (Z_0 * ... * Z_i) mod P
+        /* At the start of iteration i (note that i decrements), we have
+         * - c[j] = Z_0 * .... * Z_j        for j  < i,
+         * - c[j] = 1 / (Z_0 * .... * Z_j)  for j == i,
+         *
+         * This is maintained via
+         * - c[i-1] <- c[i] * Z_i
+         *
+         * We also derive 1/Z_i = c[i] * c[i-1] for i>0 and use that
+         * to do the actual normalization. For i==0, we already have
+         * c[0] = 1 / Z_0.
          */
-        if( i == 0 ) {
-            MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &Zi, &u ) );
+
+        if( i > 0 )
+        {
+            /* Compute 1/Z_i and establish invariant for the next iteration. */
+            MPI_ECP_MUL( &t,      &c[i], &c[i-1]  );
+            MPI_ECP_MUL( &c[i-1], &c[i], &T[i]->Z );
         }
         else
         {
-            MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &Zi, &u, &c[i-1]  ) );
-            MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &u,  &u, &T[i]->Z ) );
+            MPI_ECP_MOV( &t, &c[0] );
         }
 
-        /*
-         * proceed as in normalize()
-         */
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &ZZi,     &Zi,      &Zi  ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T[i]->X, &T[i]->X, &ZZi ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T[i]->Y, &T[i]->Y, &ZZi ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T[i]->Y, &T[i]->Y, &Zi  ) );
+        /* Now t holds 1 / Z_i; normalize as in ecp_normalize_jac() */
+        MPI_ECP_MUL( &T[i]->Y, &T[i]->Y, &t );
+        MPI_ECP_SQR( &t,       &t           );
+        MPI_ECP_MUL( &T[i]->X, &T[i]->X, &t );
+        MPI_ECP_MUL( &T[i]->Y, &T[i]->Y, &t );
 
         /*
          * Post-precessing: reclaim some memory by shrinking coordinates
@@ -1292,7 +1396,8 @@
          */
         MBEDTLS_MPI_CHK( mbedtls_mpi_shrink( &T[i]->X, grp->P.n ) );
         MBEDTLS_MPI_CHK( mbedtls_mpi_shrink( &T[i]->Y, grp->P.n ) );
-        mbedtls_mpi_free( &T[i]->Z );
+
+        MPI_ECP_LSET( &T[i]->Z, 1 );
 
         if( i == 0 )
             break;
@@ -1300,9 +1405,8 @@
 
 cleanup:
 
-    mbedtls_mpi_free( &u ); mbedtls_mpi_free( &Zi ); mbedtls_mpi_free( &ZZi );
-    for( i = 0; i < T_size; i++ )
-        mbedtls_mpi_free( &c[i] );
+    mbedtls_mpi_free( &t );
+    mpi_free_many( c, T_size );
     mbedtls_free( c );
 
     return( ret );
@@ -1318,19 +1422,13 @@
                             unsigned char inv )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    unsigned char nonzero;
-    mbedtls_mpi mQY;
+    mbedtls_mpi tmp;
+    mbedtls_mpi_init( &tmp );
 
-    mbedtls_mpi_init( &mQY );
-
-    /* Use the fact that -Q.Y mod P = P - Q.Y unless Q.Y == 0 */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &mQY, &grp->P, &Q->Y ) );
-    nonzero = mbedtls_mpi_cmp_int( &Q->Y, 0 ) != 0;
-    MBEDTLS_MPI_CHK( mbedtls_mpi_safe_cond_assign( &Q->Y, &mQY, inv & nonzero ) );
+    MPI_ECP_COND_NEG( &Q->Y, inv );
 
 cleanup:
-    mbedtls_mpi_free( &mQY );
-
+    mbedtls_mpi_free( &tmp );
     return( ret );
 }
 
@@ -1349,7 +1447,8 @@
  *             3M + 6S + 1a     otherwise
  */
 static int ecp_double_jac( const mbedtls_ecp_group *grp, mbedtls_ecp_point *R,
-                           const mbedtls_ecp_point *P )
+                           const mbedtls_ecp_point *P,
+                           mbedtls_mpi tmp[4] )
 {
 #if defined(MBEDTLS_SELF_TEST)
     dbl_count++;
@@ -1364,67 +1463,64 @@
     return( MBEDTLS_ERR_ECP_FEATURE_UNAVAILABLE );
 #else
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    mbedtls_mpi M, S, T, U;
-
-    mbedtls_mpi_init( &M ); mbedtls_mpi_init( &S ); mbedtls_mpi_init( &T ); mbedtls_mpi_init( &U );
 
     /* Special case for A = -3 */
     if( grp->A.p == NULL )
     {
-        /* M = 3(X + Z^2)(X - Z^2) */
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &S,  &P->Z,  &P->Z   ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_add_mod( grp, &T,  &P->X,  &S      ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &U,  &P->X,  &S      ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &S,  &T,     &U      ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_int( &M,  &S,     3       ) ); MOD_ADD( M );
+        /* tmp[0] <- M = 3(X + Z^2)(X - Z^2) */
+        MPI_ECP_SQR(     &tmp[1],  &P->Z                );
+        MPI_ECP_ADD(     &tmp[2],  &P->X,  &tmp[1]      );
+        MPI_ECP_SUB(     &tmp[3],  &P->X,  &tmp[1]      );
+        MPI_ECP_MUL(     &tmp[1],  &tmp[2],     &tmp[3] );
+        MPI_ECP_MUL_INT( &tmp[0],  &tmp[1],     3       );
     }
     else
     {
-        /* M = 3.X^2 */
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &S,  &P->X,  &P->X   ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_int( &M,  &S,     3       ) ); MOD_ADD( M );
+        /* tmp[0] <- M = 3.X^2 + A.Z^4 */
+        MPI_ECP_SQR(     &tmp[1],  &P->X  );
+        MPI_ECP_MUL_INT( &tmp[0],  &tmp[1],  3 );
 
         /* Optimize away for "koblitz" curves with A = 0 */
-        if( mbedtls_mpi_cmp_int( &grp->A, 0 ) != 0 )
+        if( MPI_ECP_CMP_INT( &grp->A, 0 ) != 0 )
         {
             /* M += A.Z^4 */
-            MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &S,  &P->Z,  &P->Z   ) );
-            MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T,  &S,     &S      ) );
-            MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &S,  &T,     &grp->A ) );
-            MBEDTLS_MPI_CHK( mbedtls_mpi_add_mod( grp, &M,  &M,     &S      ) );
+            MPI_ECP_SQR( &tmp[1],  &P->Z                );
+            MPI_ECP_SQR( &tmp[2],  &tmp[1]              );
+            MPI_ECP_MUL( &tmp[1],  &tmp[2],     &grp->A );
+            MPI_ECP_ADD( &tmp[0],  &tmp[0],     &tmp[1] );
         }
     }
 
-    /* S = 4.X.Y^2 */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T,  &P->Y,  &P->Y   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_l_mod( grp, &T,  1               ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &S,  &P->X,  &T      ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_l_mod( grp, &S,  1               ) );
+    /* tmp[1] <- S = 4.X.Y^2 */
+    MPI_ECP_SQR(     &tmp[2],  &P->Y     );
+    MPI_ECP_SHIFT_L( &tmp[2],  1         );
+    MPI_ECP_MUL(     &tmp[1],  &P->X, &tmp[2] );
+    MPI_ECP_SHIFT_L( &tmp[1],  1         );
 
-    /* U = 8.Y^4 */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &U,  &T,     &T      ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_l_mod( grp, &U,  1               ) );
+    /* tmp[3] <- U = 8.Y^4 */
+    MPI_ECP_SQR(     &tmp[3],  &tmp[2] );
+    MPI_ECP_SHIFT_L( &tmp[3],  1       );
 
-    /* T = M^2 - 2.S */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T,  &M,     &M      ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &T,  &T,     &S      ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &T,  &T,     &S      ) );
+    /* tmp[2] <- T = M^2 - 2.S */
+    MPI_ECP_SQR( &tmp[2],  &tmp[0]     );
+    MPI_ECP_SUB( &tmp[2],  &tmp[2], &tmp[1] );
+    MPI_ECP_SUB( &tmp[2],  &tmp[2], &tmp[1] );
 
-    /* S = M(S - T) - U */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &S,  &S,     &T      ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &S,  &S,     &M      ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &S,  &S,     &U      ) );
+    /* tmp[1] <- S = M(S - T) - U */
+    MPI_ECP_SUB( &tmp[1],  &tmp[1],     &tmp[2] );
+    MPI_ECP_MUL( &tmp[1],  &tmp[1],     &tmp[0] );
+    MPI_ECP_SUB( &tmp[1],  &tmp[1],     &tmp[3] );
 
-    /* U = 2.Y.Z */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &U,  &P->Y,  &P->Z   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_l_mod( grp, &U,  1               ) );
+    /* tmp[3] <- U = 2.Y.Z */
+    MPI_ECP_MUL(     &tmp[3],  &P->Y,  &P->Z   );
+    MPI_ECP_SHIFT_L( &tmp[3],  1               );
 
-    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &R->X, &T ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &R->Y, &S ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &R->Z, &U ) );
+    /* Store results */
+    MPI_ECP_MOV( &R->X, &tmp[2] );
+    MPI_ECP_MOV( &R->Y, &tmp[1] );
+    MPI_ECP_MOV( &R->Z, &tmp[3] );
 
 cleanup:
-    mbedtls_mpi_free( &M ); mbedtls_mpi_free( &S ); mbedtls_mpi_free( &T ); mbedtls_mpi_free( &U );
 
     return( ret );
 #endif /* !defined(MBEDTLS_ECP_NO_FALLBACK) || !defined(MBEDTLS_ECP_DOUBLE_JAC_ALT) */
@@ -1436,6 +1532,10 @@
  * The coordinates of Q must be normalized (= affine),
  * but those of P don't need to. R is not normalized.
  *
+ * P,Q,R may alias, but only at the level of EC points: they must be either
+ * equal as pointers, or disjoint (including the coordinate data buffers).
+ * Fine-grained aliasing at the level of coordinates is not supported.
+ *
  * Special cases: (1) P or Q is zero, (2) R is zero, (3) P == Q.
  * None of these cases can happen as intermediate step in ecp_mul_comb():
  * - at each step, P, Q and R are multiples of the base point, the factor
@@ -1444,12 +1544,11 @@
  *   due to the choice of precomputed points in the modified comb method.
  * So branches for these cases do not leak secret information.
  *
- * We accept Q->Z being unset (saving memory in tables) as meaning 1.
- *
  * Cost: 1A := 8M + 3S
  */
 static int ecp_add_mixed( const mbedtls_ecp_group *grp, mbedtls_ecp_point *R,
-                          const mbedtls_ecp_point *P, const mbedtls_ecp_point *Q )
+                          const mbedtls_ecp_point *P, const mbedtls_ecp_point *Q,
+                          mbedtls_mpi tmp[4] )
 {
 #if defined(MBEDTLS_SELF_TEST)
     add_count++;
@@ -1464,39 +1563,45 @@
     return( MBEDTLS_ERR_ECP_FEATURE_UNAVAILABLE );
 #else
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    mbedtls_mpi T1, T2, T3, T4, X, Y, Z;
+
+    /* NOTE: Aliasing between input and output is allowed, so one has to make
+     *       sure that at the point X,Y,Z are written, {P,Q}->{X,Y,Z} are no
+     *       longer read from. */
+    mbedtls_mpi * const X = &R->X;
+    mbedtls_mpi * const Y = &R->Y;
+    mbedtls_mpi * const Z = &R->Z;
+
+    if( !MPI_ECP_VALID( &Q->Z ) )
+        return( MBEDTLS_ERR_ECP_BAD_INPUT_DATA );
 
     /*
      * Trivial cases: P == 0 or Q == 0 (case 1)
      */
-    if( mbedtls_mpi_cmp_int( &P->Z, 0 ) == 0 )
+    if( MPI_ECP_CMP_INT( &P->Z, 0 ) == 0 )
         return( mbedtls_ecp_copy( R, Q ) );
 
-    if( Q->Z.p != NULL && mbedtls_mpi_cmp_int( &Q->Z, 0 ) == 0 )
+    if( MPI_ECP_CMP_INT( &Q->Z, 0 ) == 0 )
         return( mbedtls_ecp_copy( R, P ) );
 
     /*
      * Make sure Q coordinates are normalized
      */
-    if( Q->Z.p != NULL && mbedtls_mpi_cmp_int( &Q->Z, 1 ) != 0 )
+    if( MPI_ECP_CMP_INT( &Q->Z, 1 ) != 0 )
         return( MBEDTLS_ERR_ECP_BAD_INPUT_DATA );
 
-    mbedtls_mpi_init( &T1 ); mbedtls_mpi_init( &T2 ); mbedtls_mpi_init( &T3 ); mbedtls_mpi_init( &T4 );
-    mbedtls_mpi_init( &X ); mbedtls_mpi_init( &Y ); mbedtls_mpi_init( &Z );
-
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T1,  &P->Z,  &P->Z ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T2,  &T1,    &P->Z ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T1,  &T1,    &Q->X ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T2,  &T2,    &Q->Y ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &T1,  &T1,    &P->X ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &T2,  &T2,    &P->Y ) );
+    MPI_ECP_SQR( &tmp[0], &P->Z         );
+    MPI_ECP_MUL( &tmp[1], &tmp[0], &P->Z );
+    MPI_ECP_MUL( &tmp[0], &tmp[0], &Q->X );
+    MPI_ECP_MUL( &tmp[1], &tmp[1], &Q->Y );
+    MPI_ECP_SUB( &tmp[0], &tmp[0], &P->X );
+    MPI_ECP_SUB( &tmp[1], &tmp[1], &P->Y );
 
     /* Special cases (2) and (3) */
-    if( mbedtls_mpi_cmp_int( &T1, 0 ) == 0 )
+    if( MPI_ECP_CMP_INT( &tmp[0], 0 ) == 0 )
     {
-        if( mbedtls_mpi_cmp_int( &T2, 0 ) == 0 )
+        if( MPI_ECP_CMP_INT( &tmp[1], 0 ) == 0 )
         {
-            ret = ecp_double_jac( grp, R, P );
+            ret = ecp_double_jac( grp, R, P, tmp );
             goto cleanup;
         }
         else
@@ -1506,29 +1611,27 @@
         }
     }
 
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &Z,   &P->Z,  &T1   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T3,  &T1,    &T1   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T4,  &T3,    &T1   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T3,  &T3,    &P->X ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &T1, &T3 ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_l_mod( grp, &T1,  1     ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &X,   &T2,    &T2   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &X,   &X,     &T1   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &X,   &X,     &T4   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &T3,  &T3,    &X    ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T3,  &T3,    &T2   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &T4,  &T4,    &P->Y ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &Y,   &T3,    &T4   ) );
+    /* {P,Q}->Z no longer used, so OK to write to Z even if there's aliasing. */
+    MPI_ECP_MUL( Z,        &P->Z,    &tmp[0] );
+    MPI_ECP_SQR( &tmp[2],  &tmp[0]           );
+    MPI_ECP_MUL( &tmp[3],  &tmp[2],  &tmp[0] );
+    MPI_ECP_MUL( &tmp[2],  &tmp[2],  &P->X   );
 
-    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &R->X, &X ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &R->Y, &Y ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &R->Z, &Z ) );
+    MPI_ECP_MOV( &tmp[0], &tmp[2] );
+    MPI_ECP_SHIFT_L( &tmp[0], 1 );
+
+    /* {P,Q}->X no longer used, so OK to write to X even if there's aliasing. */
+    MPI_ECP_SQR( X,        &tmp[1]           );
+    MPI_ECP_SUB( X,        X,        &tmp[0] );
+    MPI_ECP_SUB( X,        X,        &tmp[3] );
+    MPI_ECP_SUB( &tmp[2],  &tmp[2],  X       );
+    MPI_ECP_MUL( &tmp[2],  &tmp[2],  &tmp[1] );
+    MPI_ECP_MUL( &tmp[3],  &tmp[3],  &P->Y   );
+    /* {P,Q}->Y no longer used, so OK to write to Y even if there's aliasing. */
+    MPI_ECP_SUB( Y,     &tmp[2],     &tmp[3] );
 
 cleanup:
 
-    mbedtls_mpi_free( &T1 ); mbedtls_mpi_free( &T2 ); mbedtls_mpi_free( &T3 ); mbedtls_mpi_free( &T4 );
-    mbedtls_mpi_free( &X ); mbedtls_mpi_free( &Y ); mbedtls_mpi_free( &Z );
-
     return( ret );
 #endif /* !defined(MBEDTLS_ECP_NO_FALLBACK) || !defined(MBEDTLS_ECP_ADD_MIXED_ALT) */
 }
@@ -1552,26 +1655,28 @@
     return( MBEDTLS_ERR_ECP_FEATURE_UNAVAILABLE );
 #else
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    mbedtls_mpi l, ll;
+    mbedtls_mpi l;
 
-    mbedtls_mpi_init( &l ); mbedtls_mpi_init( &ll );
+    mbedtls_mpi_init( &l );
 
     /* Generate l such that 1 < l < p */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_random( &l, 2, &grp->P, f_rng, p_rng ) );
+    MPI_ECP_RAND( &l );
 
-    /* Z = l * Z */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &pt->Z,   &pt->Z,     &l  ) );
+    /* Z' = l * Z */
+    MPI_ECP_MUL( &pt->Z,   &pt->Z,     &l );
 
-    /* X = l^2 * X */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &ll,      &l,         &l  ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &pt->X,   &pt->X,     &ll ) );
+    /* Y' = l * Y */
+    MPI_ECP_MUL( &pt->Y,   &pt->Y,     &l );
 
-    /* Y = l^3 * Y */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &ll,      &ll,        &l  ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &pt->Y,   &pt->Y,     &ll ) );
+    /* X' = l^2 * X */
+    MPI_ECP_SQR( &l,       &l             );
+    MPI_ECP_MUL( &pt->X,   &pt->X,     &l );
+
+    /* Y'' = l^2 * Y' = l^3 * Y */
+    MPI_ECP_MUL( &pt->Y,   &pt->Y,     &l );
 
 cleanup:
-    mbedtls_mpi_free( &l ); mbedtls_mpi_free( &ll );
+    mbedtls_mpi_free( &l );
 
     if( ret == MBEDTLS_ERR_MPI_NOT_ACCEPTABLE )
         ret = MBEDTLS_ERR_ECP_RANDOM_FAILED;
@@ -1714,6 +1819,10 @@
     const unsigned char T_size = 1U << ( w - 1 );
     mbedtls_ecp_point *cur, *TT[COMB_MAX_PRE - 1];
 
+    mbedtls_mpi tmp[4];
+
+    mpi_init_many( tmp, sizeof( tmp ) / sizeof( mbedtls_mpi ) );
+
 #if defined(MBEDTLS_ECP_RESTARTABLE)
     if( rs_ctx != NULL && rs_ctx->rsm != NULL )
     {
@@ -1764,7 +1873,7 @@
         if( j % d == 0 )
             MBEDTLS_MPI_CHK( mbedtls_ecp_copy( cur, T + ( i >> 1 ) ) );
 
-        MBEDTLS_MPI_CHK( ecp_double_jac( grp, cur, cur ) );
+        MBEDTLS_MPI_CHK( ecp_double_jac( grp, cur, cur, tmp ) );
     }
 
 #if defined(MBEDTLS_ECP_RESTARTABLE)
@@ -1774,8 +1883,11 @@
 norm_dbl:
 #endif
     /*
-     * Normalize current elements in T. As T has holes,
-     * use an auxiliary array of pointers to elements in T.
+     * Normalize current elements in T to allow them to be used in
+     * ecp_add_mixed() below, which requires one normalized input.
+     *
+     * As T has holes, use an auxiliary array of pointers to elements in T.
+     *
      */
     j = 0;
     for( i = 1; i < T_size; i <<= 1 )
@@ -1801,7 +1913,7 @@
     {
         j = i;
         while( j-- )
-            MBEDTLS_MPI_CHK( ecp_add_mixed( grp, &T[i + j], &T[j], &T[i] ) );
+            MBEDTLS_MPI_CHK( ecp_add_mixed( grp, &T[i + j], &T[j], &T[i], tmp ) );
     }
 
 #if defined(MBEDTLS_ECP_RESTARTABLE)
@@ -1822,7 +1934,18 @@
 
     MBEDTLS_MPI_CHK( ecp_normalize_jac_many( grp, TT, j ) );
 
+    /* Free Z coordinate (=1 after normalization) to save RAM.
+     * This makes T[i] invalid as mbedtls_ecp_points, but this is OK
+     * since from this point onwards, they are only accessed indirectly
+     * via the getter function ecp_select_comb() which does set the
+     * target's Z coordinate to 1. */
+    for( i = 0; i < T_size; i++ )
+        mbedtls_mpi_free( &T[i].Z );
+
 cleanup:
+
+    mpi_free_many( tmp, sizeof( tmp ) / sizeof( mbedtls_mpi ) );
+
 #if defined(MBEDTLS_ECP_RESTARTABLE)
     if( rs_ctx != NULL && rs_ctx->rsm != NULL &&
         ret == MBEDTLS_ERR_ECP_IN_PROGRESS )
@@ -1853,13 +1976,15 @@
     /* Read the whole table to thwart cache-based timing attacks */
     for( j = 0; j < T_size; j++ )
     {
-        MBEDTLS_MPI_CHK( mbedtls_mpi_safe_cond_assign( &R->X, &T[j].X, j == ii ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_safe_cond_assign( &R->Y, &T[j].Y, j == ii ) );
+        MPI_ECP_COND_ASSIGN( &R->X, &T[j].X, j == ii );
+        MPI_ECP_COND_ASSIGN( &R->Y, &T[j].Y, j == ii );
     }
 
     /* Safely invert result if i is "negative" */
     MBEDTLS_MPI_CHK( ecp_safe_invert_jac( grp, R, i >> 7 ) );
 
+    MPI_ECP_LSET( &R->Z, 1 );
+
 cleanup:
     return( ret );
 }
@@ -1879,9 +2004,11 @@
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     mbedtls_ecp_point Txi;
+    mbedtls_mpi tmp[4];
     size_t i;
 
     mbedtls_ecp_point_init( &Txi );
+    mpi_init_many( tmp, sizeof( tmp ) / sizeof( mbedtls_mpi ) );
 
 #if !defined(MBEDTLS_ECP_RESTARTABLE)
     (void) rs_ctx;
@@ -1907,7 +2034,6 @@
         /* Start with a non-zero point and randomize its coordinates */
         i = d;
         MBEDTLS_MPI_CHK( ecp_select_comb( grp, R, T, T_size, x[i] ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &R->Z, 1 ) );
         if( f_rng != 0 )
             MBEDTLS_MPI_CHK( ecp_randomize_jac( grp, R, f_rng, p_rng ) );
     }
@@ -1917,14 +2043,15 @@
         MBEDTLS_ECP_BUDGET( MBEDTLS_ECP_OPS_DBL + MBEDTLS_ECP_OPS_ADD );
         --i;
 
-        MBEDTLS_MPI_CHK( ecp_double_jac( grp, R, R ) );
+        MBEDTLS_MPI_CHK( ecp_double_jac( grp, R, R, tmp ) );
         MBEDTLS_MPI_CHK( ecp_select_comb( grp, &Txi, T, T_size, x[i] ) );
-        MBEDTLS_MPI_CHK( ecp_add_mixed( grp, R, R, &Txi ) );
+        MBEDTLS_MPI_CHK( ecp_add_mixed( grp, R, R, &Txi, tmp ) );
     }
 
 cleanup:
 
     mbedtls_ecp_point_free( &Txi );
+    mpi_free_many( tmp, sizeof( tmp ) / sizeof( mbedtls_mpi ) );
 
 #if defined(MBEDTLS_ECP_RESTARTABLE)
     if( rs_ctx != NULL && rs_ctx->rsm != NULL &&
@@ -2127,8 +2254,8 @@
 
     /* Is P the base point ? */
 #if MBEDTLS_ECP_FIXED_POINT_OPTIM == 1
-    p_eq_g = ( mbedtls_mpi_cmp_mpi( &P->Y, &grp->G.Y ) == 0 &&
-               mbedtls_mpi_cmp_mpi( &P->X, &grp->G.X ) == 0 );
+    p_eq_g = ( MPI_ECP_CMP( &P->Y, &grp->G.Y ) == 0 &&
+               MPI_ECP_CMP( &P->X, &grp->G.X ) == 0 );
 #else
     p_eq_g = 0;
 #endif
@@ -2258,9 +2385,9 @@
     return( MBEDTLS_ERR_ECP_FEATURE_UNAVAILABLE );
 #else
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    MBEDTLS_MPI_CHK( mbedtls_mpi_inv_mod( &P->Z, &P->Z, &grp->P ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &P->X, &P->X, &P->Z ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &P->Z, 1 ) );
+    MPI_ECP_INV( &P->Z, &P->Z );
+    MPI_ECP_MUL( &P->X, &P->X, &P->Z );
+    MPI_ECP_LSET( &P->Z, 1 );
 
 cleanup:
     return( ret );
@@ -2291,10 +2418,10 @@
     mbedtls_mpi_init( &l );
 
     /* Generate l such that 1 < l < p */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_random( &l, 2, &grp->P, f_rng, p_rng ) );
+    MPI_ECP_RAND( &l );
 
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &P->X, &P->X, &l ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &P->Z, &P->Z, &l ) );
+    MPI_ECP_MUL( &P->X, &P->X, &l );
+    MPI_ECP_MUL( &P->Z, &P->Z, &l );
 
 cleanup:
     mbedtls_mpi_free( &l );
@@ -2323,7 +2450,8 @@
 static int ecp_double_add_mxz( const mbedtls_ecp_group *grp,
                                mbedtls_ecp_point *R, mbedtls_ecp_point *S,
                                const mbedtls_ecp_point *P, const mbedtls_ecp_point *Q,
-                               const mbedtls_mpi *d )
+                               const mbedtls_mpi *d,
+                               mbedtls_mpi T[4] )
 {
 #if defined(MBEDTLS_ECP_DOUBLE_ADD_MXZ_ALT)
     if( mbedtls_internal_ecp_grp_capable( grp ) )
@@ -2334,35 +2462,27 @@
     return( MBEDTLS_ERR_ECP_FEATURE_UNAVAILABLE );
 #else
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    mbedtls_mpi A, AA, B, BB, E, C, D, DA, CB;
 
-    mbedtls_mpi_init( &A ); mbedtls_mpi_init( &AA ); mbedtls_mpi_init( &B );
-    mbedtls_mpi_init( &BB ); mbedtls_mpi_init( &E ); mbedtls_mpi_init( &C );
-    mbedtls_mpi_init( &D ); mbedtls_mpi_init( &DA ); mbedtls_mpi_init( &CB );
-
-    MBEDTLS_MPI_CHK( mbedtls_mpi_add_mod( grp, &A,    &P->X,   &P->Z ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &AA,   &A,      &A    ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &B,    &P->X,   &P->Z ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &BB,   &B,      &B    ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &E,    &AA,     &BB   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_add_mod( grp, &C,    &Q->X,   &Q->Z ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &D,    &Q->X,   &Q->Z ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &DA,   &D,      &A    ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &CB,   &C,      &B    ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_add_mod( grp, &S->X, &DA,     &CB   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &S->X, &S->X,   &S->X ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mod( grp, &S->Z, &DA,     &CB   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &S->Z, &S->Z,   &S->Z ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &S->Z, d,       &S->Z ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &R->X, &AA,     &BB   ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &R->Z, &grp->A, &E    ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_add_mod( grp, &R->Z, &BB,     &R->Z ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &R->Z, &E,      &R->Z ) );
+    MPI_ECP_ADD( &T[0], &P->X,   &P->Z ); /* Pp := PX + PZ                    */
+    MPI_ECP_SUB( &T[1], &P->X,   &P->Z ); /* Pm := PX - PZ                    */
+    MPI_ECP_ADD( &T[2], &Q->X,   &Q->Z ); /* Qp := QX + XZ                    */
+    MPI_ECP_SUB( &T[3], &Q->X,   &Q->Z ); /* Qm := QX - QZ                    */
+    MPI_ECP_MUL( &T[3], &T[3],   &T[0] ); /* Qm * Pp                          */
+    MPI_ECP_MUL( &T[2], &T[2],   &T[1] ); /* Qp * Pm                          */
+    MPI_ECP_SQR( &T[0], &T[0]          ); /* Pp^2                             */
+    MPI_ECP_SQR( &T[1], &T[1]          ); /* Pm^2                             */
+    MPI_ECP_MUL( &R->X, &T[0],   &T[1] ); /* Pp^2 * Pm^2                      */
+    MPI_ECP_SUB( &T[0], &T[0],   &T[1] ); /* Pp^2 - Pm^2                      */
+    MPI_ECP_MUL( &R->Z, &grp->A, &T[0] ); /* A * (Pp^2 - Pm^2)                */
+    MPI_ECP_ADD( &R->Z, &T[1],   &R->Z ); /* [ A * (Pp^2-Pm^2) ] + Pm^2       */
+    MPI_ECP_ADD( &S->X, &T[3],   &T[2] ); /* Qm*Pp + Qp*Pm                    */
+    MPI_ECP_SQR( &S->X, &S->X          ); /* (Qm*Pp + Qp*Pm)^2                */
+    MPI_ECP_SUB( &S->Z, &T[3],   &T[2] ); /* Qm*Pp - Qp*Pm                    */
+    MPI_ECP_SQR( &S->Z, &S->Z          ); /* (Qm*Pp - Qp*Pm)^2                */
+    MPI_ECP_MUL( &S->Z, d,       &S->Z ); /* d * ( Qm*Pp - Qp*Pm )^2          */
+    MPI_ECP_MUL( &R->Z, &T[0],   &R->Z ); /* [A*(Pp^2-Pm^2)+Pm^2]*(Pp^2-Pm^2) */
 
 cleanup:
-    mbedtls_mpi_free( &A ); mbedtls_mpi_free( &AA ); mbedtls_mpi_free( &B );
-    mbedtls_mpi_free( &BB ); mbedtls_mpi_free( &E ); mbedtls_mpi_free( &C );
-    mbedtls_mpi_free( &D ); mbedtls_mpi_free( &DA ); mbedtls_mpi_free( &CB );
 
     return( ret );
 #endif /* !defined(MBEDTLS_ECP_NO_FALLBACK) || !defined(MBEDTLS_ECP_DOUBLE_ADD_MXZ_ALT) */
@@ -2382,22 +2502,25 @@
     unsigned char b;
     mbedtls_ecp_point RP;
     mbedtls_mpi PX;
+    mbedtls_mpi tmp[4];
     mbedtls_ecp_point_init( &RP ); mbedtls_mpi_init( &PX );
 
+    mpi_init_many( tmp, sizeof( tmp ) / sizeof( mbedtls_mpi ) );
+
     if( f_rng == NULL )
         return( MBEDTLS_ERR_ECP_BAD_INPUT_DATA );
 
     /* Save PX and read from P before writing to R, in case P == R */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &PX, &P->X ) );
+    MPI_ECP_MOV( &PX, &P->X );
     MBEDTLS_MPI_CHK( mbedtls_ecp_copy( &RP, P ) );
 
     /* Set R to zero in modified x/z coordinates */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &R->X, 1 ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &R->Z, 0 ) );
+    MPI_ECP_LSET( &R->X, 1 );
+    MPI_ECP_LSET( &R->Z, 0 );
     mbedtls_mpi_free( &R->Y );
 
     /* RP.X might be sligtly larger than P, so reduce it */
-    MOD_ADD( RP.X );
+    MOD_ADD( &RP.X );
 
     /* Randomize coordinates of the starting point */
     MBEDTLS_MPI_CHK( ecp_randomize_mxz( grp, &RP, f_rng, p_rng ) );
@@ -2414,11 +2537,11 @@
          *  else   double_add( R, RP, R, RP )
          * but using safe conditional swaps to avoid leaks
          */
-        MBEDTLS_MPI_CHK( mbedtls_mpi_safe_cond_swap( &R->X, &RP.X, b ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_safe_cond_swap( &R->Z, &RP.Z, b ) );
-        MBEDTLS_MPI_CHK( ecp_double_add_mxz( grp, R, &RP, R, &RP, &PX ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_safe_cond_swap( &R->X, &RP.X, b ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_safe_cond_swap( &R->Z, &RP.Z, b ) );
+        MPI_ECP_COND_SWAP( &R->X, &RP.X, b );
+        MPI_ECP_COND_SWAP( &R->Z, &RP.Z, b );
+        MBEDTLS_MPI_CHK( ecp_double_add_mxz( grp, R, &RP, R, &RP, &PX, tmp ) );
+        MPI_ECP_COND_SWAP( &R->X, &RP.X, b );
+        MPI_ECP_COND_SWAP( &R->Z, &RP.Z, b );
     }
 
     /*
@@ -2438,6 +2561,7 @@
 cleanup:
     mbedtls_ecp_point_free( &RP ); mbedtls_mpi_free( &PX );
 
+    mpi_free_many( tmp, sizeof( tmp ) / sizeof( mbedtls_mpi ) );
     return( ret );
 }
 
@@ -2566,23 +2690,23 @@
      * YY = Y^2
      * RHS = X (X^2 + A) + B = X^3 + A X + B
      */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &YY,  &pt->Y,   &pt->Y  ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &RHS, &pt->X,   &pt->X  ) );
+    MPI_ECP_SQR( &YY,  &pt->Y );
+    MPI_ECP_SQR( &RHS, &pt->X );
 
     /* Special case for A = -3 */
     if( grp->A.p == NULL )
     {
-        MBEDTLS_MPI_CHK( mbedtls_mpi_sub_int( &RHS, &RHS, 3       ) );  MOD_SUB( RHS );
+        MPI_ECP_SUB_INT( &RHS, &RHS, 3 );
     }
     else
     {
-        MBEDTLS_MPI_CHK( mbedtls_mpi_add_mod( grp, &RHS, &RHS, &grp->A ) );
+        MPI_ECP_ADD( &RHS, &RHS, &grp->A );
     }
 
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mod( grp, &RHS, &RHS,     &pt->X  ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_add_mod( grp, &RHS, &RHS,     &grp->B ) );
+    MPI_ECP_MUL( &RHS, &RHS, &pt->X  );
+    MPI_ECP_ADD( &RHS, &RHS, &grp->B );
 
-    if( mbedtls_mpi_cmp_mpi( &YY, &RHS ) != 0 )
+    if( MPI_ECP_CMP( &YY, &RHS ) != 0 )
         ret = MBEDTLS_ERR_ECP_INVALID_KEY;
 
 cleanup:
@@ -2605,6 +2729,8 @@
                                       mbedtls_ecp_restart_ctx *rs_ctx )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
+    mbedtls_mpi tmp;
+    mbedtls_mpi_init( &tmp );
 
     if( mbedtls_mpi_cmp_int( m, 0 ) == 0 )
     {
@@ -2617,8 +2743,7 @@
     else if( mbedtls_mpi_cmp_int( m, -1 ) == 0 )
     {
         MBEDTLS_MPI_CHK( mbedtls_ecp_copy( R, P ) );
-        if( mbedtls_mpi_cmp_int( &R->Y, 0 ) != 0 )
-            MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &R->Y, &grp->P, &R->Y ) );
+        MPI_ECP_NEG( &R->Y );
     }
     else
     {
@@ -2627,6 +2752,8 @@
     }
 
 cleanup:
+    mbedtls_mpi_free( &tmp );
+
     return( ret );
 }
 
@@ -2644,6 +2771,7 @@
     mbedtls_ecp_point mP;
     mbedtls_ecp_point *pmP = &mP;
     mbedtls_ecp_point *pR = R;
+    mbedtls_mpi tmp[4];
 #if defined(MBEDTLS_ECP_INTERNAL_ALT)
     char is_grp_capable = 0;
 #endif
@@ -2658,6 +2786,7 @@
         return( MBEDTLS_ERR_ECP_FEATURE_UNAVAILABLE );
 
     mbedtls_ecp_point_init( &mP );
+    mpi_init_many( tmp, sizeof( tmp ) / sizeof( mbedtls_mpi ) );
 
     ECP_RS_ENTER( ma );
 
@@ -2699,7 +2828,7 @@
 add:
 #endif
     MBEDTLS_ECP_BUDGET( MBEDTLS_ECP_OPS_ADD );
-    MBEDTLS_MPI_CHK( ecp_add_mixed( grp, pR, pmP, pR ) );
+    MBEDTLS_MPI_CHK( ecp_add_mixed( grp, pR, pmP, pR, tmp ) );
 #if defined(MBEDTLS_ECP_RESTARTABLE)
     if( rs_ctx != NULL && rs_ctx->ma != NULL )
         rs_ctx->ma->state = ecp_rsma_norm;
@@ -2715,6 +2844,9 @@
 #endif
 
 cleanup:
+
+    mpi_free_many( tmp, sizeof( tmp ) / sizeof( mbedtls_mpi ) );
+
 #if defined(MBEDTLS_ECP_INTERNAL_ALT)
     if( is_grp_capable )
         mbedtls_internal_ecp_free( grp );