Merge remote-tracking branch 'dave/fast_xor' into fast_unaligned
diff --git a/library/aes.c b/library/aes.c
index 7d03524..6e1bcd2 100644
--- a/library/aes.c
+++ b/library/aes.c
@@ -978,7 +978,6 @@
                     const unsigned char *input,
                     unsigned char *output )
 {
-    int i;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char temp[16];
 
@@ -1009,8 +1008,7 @@
             if( ret != 0 )
                 goto exit;
 
-            for( i = 0; i < 16; i++ )
-                output[i] = (unsigned char)( output[i] ^ iv[i] );
+            mbedtls_xor( output, output, iv, 16 );
 
             memcpy( iv, temp, 16 );
 
@@ -1023,8 +1021,7 @@
     {
         while( length > 0 )
         {
-            for( i = 0; i < 16; i++ )
-                output[i] = (unsigned char)( input[i] ^ iv[i] );
+            mbedtls_xor( output, input, iv, 16 );
 
             ret = mbedtls_aes_crypt_ecb( ctx, mode, output, output );
             if( ret != 0 )
@@ -1106,8 +1103,6 @@
 
     while( blocks-- )
     {
-        size_t i;
-
         if( leftover && ( mode == MBEDTLS_AES_DECRYPT ) && blocks == 0 )
         {
             /* We are on the last block in a decrypt operation that has
@@ -1119,15 +1114,13 @@
             mbedtls_gf128mul_x_ble( tweak, tweak );
         }
 
-        for( i = 0; i < 16; i++ )
-            tmp[i] = input[i] ^ tweak[i];
+        mbedtls_xor( tmp, input, tweak, 16 );
 
         ret = mbedtls_aes_crypt_ecb( &ctx->crypt, mode, tmp, tmp );
         if( ret != 0 )
             return( ret );
 
-        for( i = 0; i < 16; i++ )
-            output[i] = tmp[i] ^ tweak[i];
+        mbedtls_xor( output, tmp, tweak, 16 );
 
         /* Update the tweak for the next block. */
         mbedtls_gf128mul_x_ble( tweak, tweak );
@@ -1148,19 +1141,18 @@
         unsigned char *prev_output = output - 16;
 
         /* Copy ciphertext bytes from the previous block to our output for each
-         * byte of ciphertext we won't steal. At the same time, copy the
-         * remainder of the input for this final round (since the loop bounds
-         * are the same). */
+         * byte of ciphertext we won't steal. */
         for( i = 0; i < leftover; i++ )
         {
             output[i] = prev_output[i];
-            tmp[i] = input[i] ^ t[i];
         }
 
+        /* XOR the leftover input bytes with t for this final round. */
+        mbedtls_xor( tmp, input, t, leftover );
+
         /* Copy ciphertext bytes from the previous block for input in this
          * round. */
-        for( ; i < 16; i++ )
-            tmp[i] = prev_output[i] ^ t[i];
+        mbedtls_xor( tmp + i, prev_output + i, t + i, 16 - i );
 
         ret = mbedtls_aes_crypt_ecb( &ctx->crypt, mode, tmp, tmp );
         if( ret != 0 )
@@ -1168,8 +1160,7 @@
 
         /* Write the result back to the previous block, overriding the previous
          * output we copied. */
-        for( i = 0; i < 16; i++ )
-            prev_output[i] = tmp[i] ^ t[i];
+        mbedtls_xor( prev_output, tmp, t, 16 );
     }
 
     return( 0 );
diff --git a/library/alignment.h b/library/alignment.h
new file mode 100644
index 0000000..de1ab91
--- /dev/null
+++ b/library/alignment.h
@@ -0,0 +1,379 @@
+/**
+ * \file alignment.h
+ *
+ * \brief Utility code for dealing with unaligned memory accesses
+ */
+/*
+ *  Copyright The Mbed TLS Contributors
+ *  SPDX-License-Identifier: Apache-2.0
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"); you may
+ *  not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#ifndef MBEDTLS_LIBRARY_ALIGNMENT_H
+#define MBEDTLS_LIBRARY_ALIGNMENT_H
+
+#include <stdint.h>
+#include <string.h>
+
+#include "mbedtls/build_info.h"
+
+/**
+ * Read an unsigned 32-bit integer in host byte order from the given address,
+ * which need not be aligned. Declared static so that every file including
+ * this header gets its own definition (plain C99 inline provides none).
+ * \param   p pointer to 4 bytes of data
+ * \return  Data at the given address
+ */
+static inline uint32_t mbedtls_get_unaligned_uint32( const void *p )
+{
+    uint32_t r;
+    memcpy( &r, p, sizeof( r ) ); /* byte copy: no alignment requirement */
+    return r;
+}
+
+/**
+ * Write an unsigned 32-bit integer in host byte order to the given address,
+ * which need not be aligned. Declared static so that every file including
+ * this header gets its own definition (plain C99 inline provides none).
+ * \param   p pointer to 4 writable bytes
+ * \param   x data to write
+ */
+static inline void mbedtls_put_unaligned_uint32( void *p, uint32_t x )
+{
+    memcpy( p, &x, sizeof( x ) ); /* byte copy: no alignment requirement */
+}
+
+/** Byte Reading Macros
+ * Given a multi-byte integer \p x, MBEDTLS_BYTE_n yields byte n of x as a
+ * uint8_t, where byte 0 is the least significant byte. \p x must be wider
+ * than the shift applied (so MBEDTLS_BYTE_4..7 need a 64-bit \p x).
+ */
+#define MBEDTLS_BYTE_0( x ) ( (uint8_t) (   ( x )         & 0xff ) )
+#define MBEDTLS_BYTE_1( x ) ( (uint8_t) ( ( ( x ) >> 8  ) & 0xff ) )
+#define MBEDTLS_BYTE_2( x ) ( (uint8_t) ( ( ( x ) >> 16 ) & 0xff ) )
+#define MBEDTLS_BYTE_3( x ) ( (uint8_t) ( ( ( x ) >> 24 ) & 0xff ) )
+#define MBEDTLS_BYTE_4( x ) ( (uint8_t) ( ( ( x ) >> 32 ) & 0xff ) )
+#define MBEDTLS_BYTE_5( x ) ( (uint8_t) ( ( ( x ) >> 40 ) & 0xff ) )
+#define MBEDTLS_BYTE_6( x ) ( (uint8_t) ( ( ( x ) >> 48 ) & 0xff ) )
+#define MBEDTLS_BYTE_7( x ) ( (uint8_t) ( ( ( x ) >> 56 ) & 0xff ) )
+
+/**
+ * Build an unsigned 32-bit integer from four bytes stored in big-endian
+ * order (MSB first).
+ *
+ * \param   data    Base address of the memory to read the four bytes from.
+ * \param   offset  Offset from \p data of the first and most significant
+ *                  byte of the four bytes. Both arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_GET_UINT32_BE
+#define MBEDTLS_GET_UINT32_BE( data , offset )                  \
+    (                                                           \
+          ( (uint32_t) ( data )[( offset )    ] << 24 )         \
+        | ( (uint32_t) ( data )[( offset ) + 1] << 16 )         \
+        | ( (uint32_t) ( data )[( offset ) + 2] <<  8 )         \
+        | ( (uint32_t) ( data )[( offset ) + 3]       )         \
+    )
+#endif
+
+/**
+ * Store a 32-bit unsigned integer in memory in big-endian order.
+ *
+ * \param   n       32-bit unsigned integer to store.
+ * \param   data    Base address of the memory in which to store \p n.
+ * \param   offset  Offset from \p data at which to store the most
+ *                  significant byte of \p n. All arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_PUT_UINT32_BE
+#define MBEDTLS_PUT_UINT32_BE( n, data, offset )                \
+{                                                               \
+    ( data )[( offset )    ] = MBEDTLS_BYTE_3( n );             \
+    ( data )[( offset ) + 1] = MBEDTLS_BYTE_2( n );             \
+    ( data )[( offset ) + 2] = MBEDTLS_BYTE_1( n );             \
+    ( data )[( offset ) + 3] = MBEDTLS_BYTE_0( n );             \
+}
+#endif
+
+/**
+ * Build an unsigned 32-bit integer from four bytes stored in little-endian
+ * order (LSB first).
+ *
+ * \param   data    Base address of the memory to read the four bytes from.
+ * \param   offset  Offset from \p data of the first and least significant
+ *                  byte of the four bytes. Both arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_GET_UINT32_LE
+#define MBEDTLS_GET_UINT32_LE( data, offset )                   \
+    (                                                           \
+          ( (uint32_t) ( data )[( offset )    ]       )         \
+        | ( (uint32_t) ( data )[( offset ) + 1] <<  8 )         \
+        | ( (uint32_t) ( data )[( offset ) + 2] << 16 )         \
+        | ( (uint32_t) ( data )[( offset ) + 3] << 24 )         \
+    )
+#endif
+
+/**
+ * Store a 32-bit unsigned integer in memory in little-endian order.
+ *
+ * \param   n       32-bit unsigned integer to store.
+ * \param   data    Base address of the memory in which to store \p n.
+ * \param   offset  Offset from \p data at which to store the least
+ *                  significant byte of \p n. All arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_PUT_UINT32_LE
+#define MBEDTLS_PUT_UINT32_LE( n, data, offset )                \
+{                                                               \
+    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
+    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
+    ( data )[( offset ) + 2] = MBEDTLS_BYTE_2( n );             \
+    ( data )[( offset ) + 3] = MBEDTLS_BYTE_3( n );             \
+}
+#endif
+
+/**
+ * Build an unsigned 16-bit integer from two bytes stored in little-endian
+ * order (LSB first).
+ *
+ * \param   data    Base address of the memory to read the two bytes from.
+ * \param   offset  Offset from \p data of the first and least significant
+ *                  byte of the two bytes. Both arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_GET_UINT16_LE
+#define MBEDTLS_GET_UINT16_LE( data, offset )                   \
+    (                                                           \
+          ( (uint16_t) ( data )[( offset )    ]       )         \
+        | ( (uint16_t) ( data )[( offset ) + 1] <<  8 )         \
+    )
+#endif
+
+/**
+ * Store a 16-bit unsigned integer in memory in little-endian order.
+ *
+ * \param   n       16-bit unsigned integer to store.
+ * \param   data    Base address of the memory in which to store \p n.
+ * \param   offset  Offset from \p data at which to store the least
+ *                  significant byte of \p n. All arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_PUT_UINT16_LE
+#define MBEDTLS_PUT_UINT16_LE( n, data, offset )                \
+{                                                               \
+    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
+    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
+}
+#endif
+
+/**
+ * Build an unsigned 16-bit integer from two bytes stored in big-endian
+ * order (MSB first).
+ *
+ * \param   data    Base address of the memory to read the two bytes from.
+ * \param   offset  Offset from \p data of the first and most significant
+ *                  byte of the two bytes. Both arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_GET_UINT16_BE
+#define MBEDTLS_GET_UINT16_BE( data, offset )                   \
+    (                                                           \
+          ( (uint16_t) ( data )[( offset )    ] << 8 )          \
+        | ( (uint16_t) ( data )[( offset ) + 1]      )          \
+    )
+#endif
+
+/**
+ * Store a 16-bit unsigned integer in memory in big-endian order.
+ *
+ * \param   n       16-bit unsigned integer to store.
+ * \param   data    Base address of the memory in which to store \p n.
+ * \param   offset  Offset from \p data at which to store the most
+ *                  significant byte of \p n. All arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_PUT_UINT16_BE
+#define MBEDTLS_PUT_UINT16_BE( n, data, offset )                \
+{                                                               \
+    ( data )[( offset )    ] = MBEDTLS_BYTE_1( n );             \
+    ( data )[( offset ) + 1] = MBEDTLS_BYTE_0( n );             \
+}
+#endif
+
+/**
+ * Build an unsigned 24-bit integer (held in a uint32_t) from three bytes
+ * stored in big-endian order (MSB first).
+ *
+ * \param   data    Base address of the memory to read the three bytes from.
+ * \param   offset  Offset from \p data of the first and most significant
+ *                  byte of the three bytes. Both arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_GET_UINT24_BE
+#define MBEDTLS_GET_UINT24_BE( data , offset )                  \
+    (                                                           \
+          ( (uint32_t) ( data )[( offset )    ] << 16 )         \
+        | ( (uint32_t) ( data )[( offset ) + 1] << 8  )         \
+        | ( (uint32_t) ( data )[( offset ) + 2]       )         \
+    )
+#endif
+
+/**
+ * Store a 24-bit unsigned integer in memory in big-endian order.
+ *
+ * \param   n       24-bit unsigned integer to store.
+ * \param   data    Base address of the memory in which to store \p n.
+ * \param   offset  Offset from \p data at which to store the most
+ *                  significant byte of \p n. All arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_PUT_UINT24_BE
+#define MBEDTLS_PUT_UINT24_BE( n, data, offset )                \
+{                                                               \
+    ( data )[( offset )    ] = MBEDTLS_BYTE_2( n );             \
+    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
+    ( data )[( offset ) + 2] = MBEDTLS_BYTE_0( n );             \
+}
+#endif
+
+/**
+ * Build an unsigned 24-bit integer (held in a uint32_t) from three bytes
+ * stored in little-endian order (LSB first).
+ *
+ * \param   data    Base address of the memory to read the three bytes from.
+ * \param   offset  Offset from \p data of the first and least significant
+ *                  byte of the three bytes. Both arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_GET_UINT24_LE
+#define MBEDTLS_GET_UINT24_LE( data, offset )                   \
+    (                                                           \
+          ( (uint32_t) ( data )[( offset )    ]       )         \
+        | ( (uint32_t) ( data )[( offset ) + 1] <<  8 )         \
+        | ( (uint32_t) ( data )[( offset ) + 2] << 16 )         \
+    )
+#endif
+
+/**
+ * Store a 24-bit unsigned integer in memory in little-endian order.
+ *
+ * \param   n       24-bit unsigned integer to store.
+ * \param   data    Base address of the memory in which to store \p n.
+ * \param   offset  Offset from \p data at which to store the least
+ *                  significant byte of \p n. All arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_PUT_UINT24_LE
+#define MBEDTLS_PUT_UINT24_LE( n, data, offset )                \
+{                                                               \
+    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
+    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
+    ( data )[( offset ) + 2] = MBEDTLS_BYTE_2( n );             \
+}
+#endif
+
+/**
+ * Build an unsigned 64-bit integer from eight bytes stored in big-endian
+ * order (MSB first).
+ *
+ * \param   data    Base address of the memory to read the eight bytes from.
+ * \param   offset  Offset from \p data of the first and most significant
+ *                  byte of the eight bytes. Both arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_GET_UINT64_BE
+#define MBEDTLS_GET_UINT64_BE( data, offset )                   \
+    (                                                           \
+          ( (uint64_t) ( data )[( offset )    ] << 56 )         \
+        | ( (uint64_t) ( data )[( offset ) + 1] << 48 )         \
+        | ( (uint64_t) ( data )[( offset ) + 2] << 40 )         \
+        | ( (uint64_t) ( data )[( offset ) + 3] << 32 )         \
+        | ( (uint64_t) ( data )[( offset ) + 4] << 24 )         \
+        | ( (uint64_t) ( data )[( offset ) + 5] << 16 )         \
+        | ( (uint64_t) ( data )[( offset ) + 6] <<  8 )         \
+        | ( (uint64_t) ( data )[( offset ) + 7]       )         \
+    )
+#endif
+
+/**
+ * Store a 64-bit unsigned integer in memory in big-endian order.
+ *
+ * \param   n       64-bit unsigned integer to store.
+ * \param   data    Base address of the memory in which to store \p n.
+ * \param   offset  Offset from \p data at which to store the most
+ *                  significant byte of \p n. All arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_PUT_UINT64_BE
+#define MBEDTLS_PUT_UINT64_BE( n, data, offset )                \
+{                                                               \
+    ( data )[( offset )    ] = MBEDTLS_BYTE_7( n );             \
+    ( data )[( offset ) + 1] = MBEDTLS_BYTE_6( n );             \
+    ( data )[( offset ) + 2] = MBEDTLS_BYTE_5( n );             \
+    ( data )[( offset ) + 3] = MBEDTLS_BYTE_4( n );             \
+    ( data )[( offset ) + 4] = MBEDTLS_BYTE_3( n );             \
+    ( data )[( offset ) + 5] = MBEDTLS_BYTE_2( n );             \
+    ( data )[( offset ) + 6] = MBEDTLS_BYTE_1( n );             \
+    ( data )[( offset ) + 7] = MBEDTLS_BYTE_0( n );             \
+}
+#endif
+
+/**
+ * Build an unsigned 64-bit integer from eight bytes stored in little-endian
+ * order (LSB first).
+ *
+ * \param   data    Base address of the memory to read the eight bytes from.
+ * \param   offset  Offset from \p data of the first and least significant
+ *                  byte of the eight bytes. Both arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_GET_UINT64_LE
+#define MBEDTLS_GET_UINT64_LE( data, offset )                   \
+    (                                                           \
+          ( (uint64_t) ( data )[( offset ) + 7] << 56 )         \
+        | ( (uint64_t) ( data )[( offset ) + 6] << 48 )         \
+        | ( (uint64_t) ( data )[( offset ) + 5] << 40 )         \
+        | ( (uint64_t) ( data )[( offset ) + 4] << 32 )         \
+        | ( (uint64_t) ( data )[( offset ) + 3] << 24 )         \
+        | ( (uint64_t) ( data )[( offset ) + 2] << 16 )         \
+        | ( (uint64_t) ( data )[( offset ) + 1] <<  8 )         \
+        | ( (uint64_t) ( data )[( offset )    ]       )         \
+    )
+#endif
+
+/**
+ * Store a 64-bit unsigned integer in memory in little-endian order.
+ *
+ * \param   n       64-bit unsigned integer to store.
+ * \param   data    Base address of the memory in which to store \p n.
+ * \param   offset  Offset from \p data at which to store the least
+ *                  significant byte of \p n. All arguments are expanded
+ *                  more than once, so they must not have side effects.
+ */
+#ifndef MBEDTLS_PUT_UINT64_LE
+#define MBEDTLS_PUT_UINT64_LE( n, data, offset )                \
+{                                                               \
+    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
+    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
+    ( data )[( offset ) + 2] = MBEDTLS_BYTE_2( n );             \
+    ( data )[( offset ) + 3] = MBEDTLS_BYTE_3( n );             \
+    ( data )[( offset ) + 4] = MBEDTLS_BYTE_4( n );             \
+    ( data )[( offset ) + 5] = MBEDTLS_BYTE_5( n );             \
+    ( data )[( offset ) + 6] = MBEDTLS_BYTE_6( n );             \
+    ( data )[( offset ) + 7] = MBEDTLS_BYTE_7( n );             \
+}
+#endif
+
+#endif /* MBEDTLS_LIBRARY_ALIGNMENT_H */
diff --git a/library/aria.c b/library/aria.c
index 5e52eea..682d4ec 100644
--- a/library/aria.c
+++ b/library/aria.c
@@ -583,7 +583,6 @@
                             const unsigned char *input,
                             unsigned char *output )
 {
-    int i;
     unsigned char temp[MBEDTLS_ARIA_BLOCKSIZE];
 
     ARIA_VALIDATE_RET( ctx != NULL );
@@ -603,8 +602,7 @@
             memcpy( temp, input, MBEDTLS_ARIA_BLOCKSIZE );
             mbedtls_aria_crypt_ecb( ctx, input, output );
 
-            for( i = 0; i < MBEDTLS_ARIA_BLOCKSIZE; i++ )
-                output[i] = (unsigned char)( output[i] ^ iv[i] );
+            mbedtls_xor( output, output, iv, MBEDTLS_ARIA_BLOCKSIZE );
 
             memcpy( iv, temp, MBEDTLS_ARIA_BLOCKSIZE );
 
@@ -617,8 +615,7 @@
     {
         while( length > 0 )
         {
-            for( i = 0; i < MBEDTLS_ARIA_BLOCKSIZE; i++ )
-                output[i] = (unsigned char)( input[i] ^ iv[i] );
+            mbedtls_xor( output, input, iv, MBEDTLS_ARIA_BLOCKSIZE );
 
             mbedtls_aria_crypt_ecb( ctx, output, output );
             memcpy( iv, output, MBEDTLS_ARIA_BLOCKSIZE );
diff --git a/library/camellia.c b/library/camellia.c
index 5dd6c56..6e781c7 100644
--- a/library/camellia.c
+++ b/library/camellia.c
@@ -526,7 +526,6 @@
                                 const unsigned char *input,
                                 unsigned char *output )
 {
-    int i;
     unsigned char temp[16];
     if( mode != MBEDTLS_CAMELLIA_ENCRYPT && mode != MBEDTLS_CAMELLIA_DECRYPT )
         return MBEDTLS_ERR_CAMELLIA_BAD_INPUT_DATA;
@@ -541,8 +540,7 @@
             memcpy( temp, input, 16 );
             mbedtls_camellia_crypt_ecb( ctx, mode, input, output );
 
-            for( i = 0; i < 16; i++ )
-                output[i] = (unsigned char)( output[i] ^ iv[i] );
+            mbedtls_xor( output, output, iv, 16 );
 
             memcpy( iv, temp, 16 );
 
@@ -555,8 +553,7 @@
     {
         while( length > 0 )
         {
-            for( i = 0; i < 16; i++ )
-                output[i] = (unsigned char)( input[i] ^ iv[i] );
+            mbedtls_xor( output, input, iv, 16 );
 
             mbedtls_camellia_crypt_ecb( ctx, mode, output, output );
             memcpy( iv, output, 16 );
diff --git a/library/ccm.c b/library/ccm.c
index 3edfba3..7bf9e9f 100644
--- a/library/ccm.c
+++ b/library/ccm.c
@@ -112,7 +112,6 @@
                               const unsigned char *input,
                               unsigned char *output )
 {
-    size_t i;
     size_t olen = 0;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char tmp_buf[16] = {0};
@@ -125,8 +124,7 @@
         return ret;
     }
 
-    for( i = 0; i < use_len; i++ )
-        output[i] = input[i] ^ tmp_buf[offset + i];
+    mbedtls_xor( output, input, tmp_buf + offset, use_len );
 
     mbedtls_platform_zeroize(tmp_buf, sizeof(tmp_buf));
     return ret;
@@ -269,7 +267,6 @@
                            size_t add_len )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    unsigned char i;
     size_t olen, use_len, offset;
 
     if( ctx->state & CCM_STATE__ERROR )
@@ -310,8 +307,7 @@
             if( use_len > add_len )
                 use_len = add_len;
 
-            for( i = 0; i < use_len; i++ )
-                ctx->y[i + offset] ^= add[i];
+            mbedtls_xor( ctx->y + offset, ctx->y + offset, add, use_len );
 
             ctx->processed += use_len;
             add_len -= use_len;
@@ -381,8 +377,7 @@
         if( ctx->mode == MBEDTLS_CCM_ENCRYPT || \
             ctx->mode == MBEDTLS_CCM_STAR_ENCRYPT )
         {
-            for( i = 0; i < use_len; i++ )
-                ctx->y[i + offset] ^= input[i];
+            mbedtls_xor( ctx->y + offset, ctx->y + offset, input, use_len );
 
             if( use_len + offset == 16 || ctx->processed == ctx->plaintext_len )
             {
@@ -411,8 +406,7 @@
             if( ret != 0 )
                 goto exit;
 
-            for( i = 0; i < use_len; i++ )
-                ctx->y[i + offset] ^= local_output[i];
+            mbedtls_xor( ctx->y + offset, ctx->y + offset, local_output, use_len );
 
             memcpy( output, local_output, use_len );
             mbedtls_platform_zeroize( local_output, 16 );
diff --git a/library/chacha20.c b/library/chacha20.c
index 85d7461..d17c58c 100644
--- a/library/chacha20.c
+++ b/library/chacha20.c
@@ -217,7 +217,6 @@
                               unsigned char *output )
 {
     size_t offset = 0U;
-    size_t i;
 
     /* Use leftover keystream bytes, if available */
     while( size > 0U && ctx->keystream_bytes_used < CHACHA20_BLOCK_SIZE_BYTES )
@@ -237,17 +236,7 @@
         chacha20_block( ctx->state, ctx->keystream8 );
         ctx->state[CHACHA20_CTR_INDEX]++;
 
-        for( i = 0U; i < 64U; i += 8U )
-        {
-            output[offset + i  ] = input[offset + i  ] ^ ctx->keystream8[i  ];
-            output[offset + i+1] = input[offset + i+1] ^ ctx->keystream8[i+1];
-            output[offset + i+2] = input[offset + i+2] ^ ctx->keystream8[i+2];
-            output[offset + i+3] = input[offset + i+3] ^ ctx->keystream8[i+3];
-            output[offset + i+4] = input[offset + i+4] ^ ctx->keystream8[i+4];
-            output[offset + i+5] = input[offset + i+5] ^ ctx->keystream8[i+5];
-            output[offset + i+6] = input[offset + i+6] ^ ctx->keystream8[i+6];
-            output[offset + i+7] = input[offset + i+7] ^ ctx->keystream8[i+7];
-        }
+        mbedtls_xor( output + offset, input + offset, ctx->keystream8, 64U );
 
         offset += CHACHA20_BLOCK_SIZE_BYTES;
         size   -= CHACHA20_BLOCK_SIZE_BYTES;
@@ -260,10 +249,7 @@
         chacha20_block( ctx->state, ctx->keystream8 );
         ctx->state[CHACHA20_CTR_INDEX]++;
 
-        for( i = 0U; i < size; i++)
-        {
-            output[offset + i] = input[offset + i] ^ ctx->keystream8[i];
-        }
+        mbedtls_xor( output + offset, input + offset, ctx->keystream8, size );
 
         ctx->keystream_bytes_used = size;
 
diff --git a/library/cmac.c b/library/cmac.c
index 3cc49d1..9870856 100644
--- a/library/cmac.c
+++ b/library/cmac.c
@@ -148,15 +148,6 @@
 #endif /* !defined(MBEDTLS_CMAC_ALT) || defined(MBEDTLS_SELF_TEST) */
 
 #if !defined(MBEDTLS_CMAC_ALT)
-static void cmac_xor_block( unsigned char *output, const unsigned char *input1,
-                            const unsigned char *input2,
-                            const size_t block_size )
-{
-    size_t idx;
-
-    for( idx = 0; idx < block_size; idx++ )
-        output[ idx ] = input1[ idx ] ^ input2[ idx ];
-}
 
 /*
  * Create padded last block from (partial) last block.
@@ -247,7 +238,7 @@
                 input,
                 block_size - cmac_ctx->unprocessed_len );
 
-        cmac_xor_block( state, cmac_ctx->unprocessed_block, state, block_size );
+        mbedtls_xor( state, cmac_ctx->unprocessed_block, state, block_size );
 
         if( ( ret = mbedtls_cipher_update( ctx, state, block_size, state,
                                            &olen ) ) != 0 )
@@ -267,7 +258,7 @@
      * final partial or complete block */
     for( j = 1; j < n; j++ )
     {
-        cmac_xor_block( state, input, state, block_size );
+        mbedtls_xor( state, input, state, block_size );
 
         if( ( ret = mbedtls_cipher_update( ctx, state, block_size, state,
                                            &olen ) ) != 0 )
@@ -319,16 +310,16 @@
     if( cmac_ctx->unprocessed_len < block_size )
     {
         cmac_pad( M_last, block_size, last_block, cmac_ctx->unprocessed_len );
-        cmac_xor_block( M_last, M_last, K2, block_size );
+        mbedtls_xor( M_last, M_last, K2, block_size );
     }
     else
     {
         /* Last block is complete block */
-        cmac_xor_block( M_last, last_block, K1, block_size );
+        mbedtls_xor( M_last, last_block, K1, block_size );
     }
 
 
-    cmac_xor_block( state, M_last, state, block_size );
+    mbedtls_xor( state, M_last, state, block_size );
     if( ( ret = mbedtls_cipher_update( ctx, state, block_size, state,
                                        &olen ) ) != 0 )
     {
diff --git a/library/common.h b/library/common.h
index 25d5294..9d3b8fe 100644
--- a/library/common.h
+++ b/library/common.h
@@ -24,9 +24,11 @@
 #define MBEDTLS_LIBRARY_COMMON_H
 
 #include "mbedtls/build_info.h"
+#include "alignment.h"
 
 #include <stddef.h>
 #include <stdint.h>
+#include <stddef.h>
 
 /** Helper to define a function as static except when building invasive tests.
  *
@@ -107,327 +109,30 @@
     return( p == NULL ? NULL : p + n );
 }
 
-/** Byte Reading Macros
- *
- * Given a multi-byte integer \p x, MBEDTLS_BYTE_n retrieves the n-th
- * byte from x, where byte 0 is the least significant byte.
- */
-#define MBEDTLS_BYTE_0( x ) ( (uint8_t) (   ( x )         & 0xff ) )
-#define MBEDTLS_BYTE_1( x ) ( (uint8_t) ( ( ( x ) >> 8  ) & 0xff ) )
-#define MBEDTLS_BYTE_2( x ) ( (uint8_t) ( ( ( x ) >> 16 ) & 0xff ) )
-#define MBEDTLS_BYTE_3( x ) ( (uint8_t) ( ( ( x ) >> 24 ) & 0xff ) )
-#define MBEDTLS_BYTE_4( x ) ( (uint8_t) ( ( ( x ) >> 32 ) & 0xff ) )
-#define MBEDTLS_BYTE_5( x ) ( (uint8_t) ( ( ( x ) >> 40 ) & 0xff ) )
-#define MBEDTLS_BYTE_6( x ) ( (uint8_t) ( ( ( x ) >> 48 ) & 0xff ) )
-#define MBEDTLS_BYTE_7( x ) ( (uint8_t) ( ( ( x ) >> 56 ) & 0xff ) )
-
 /**
- * Get the unsigned 32 bits integer corresponding to four bytes in
- * big-endian order (MSB first).
+ * Perform a fast block XOR operation, such that
+ * r[i] = a[i] ^ b[i] where 0 <= i < n
  *
- * \param   data    Base address of the memory to get the four bytes from.
- * \param   offset  Offset from \p data of the first and most significant
- *                  byte of the four bytes to build the 32 bits unsigned
- *                  integer from.
+ * \param   r Pointer to result (buffer of at least \p n bytes). \p r
+ *            may be equal to either \p a or \p b, but behaviour when
+ *            it overlaps in other ways is undefined.
+ * \param   a Pointer to input (buffer of at least \p n bytes)
+ * \param   b Pointer to input (buffer of at least \p n bytes)
+ * \param   n Number of bytes to process.
  */
-#ifndef MBEDTLS_GET_UINT32_BE
-#define MBEDTLS_GET_UINT32_BE( data , offset )                  \
-    (                                                           \
-          ( (uint32_t) ( data )[( offset )    ] << 24 )         \
-        | ( (uint32_t) ( data )[( offset ) + 1] << 16 )         \
-        | ( (uint32_t) ( data )[( offset ) + 2] <<  8 )         \
-        | ( (uint32_t) ( data )[( offset ) + 3]       )         \
-    )
-#endif
-
-/**
- * Put in memory a 32 bits unsigned integer in big-endian order.
- *
- * \param   n       32 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 32
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the most significant
- *                  byte of the 32 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT32_BE
-#define MBEDTLS_PUT_UINT32_BE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_3( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_2( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 3] = MBEDTLS_BYTE_0( n );             \
+inline void mbedtls_xor( unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n )
+{
+    size_t i;
+    for ( i = 0; ( i + 4 ) <= n; i += 4 ) /* whole 4-byte groups: XOR as one uint32_t */
+    {
+        uint32_t x = mbedtls_get_unaligned_uint32( a + i ) ^ mbedtls_get_unaligned_uint32( b + i );
+        mbedtls_put_unaligned_uint32( r + i, x );
+    }
+    for ( ; i < n; i++ ) /* remaining n % 4 trailing bytes, one at a time */
+    {
+        r[i] = a[i] ^ b[i];
+    }
 }
-#endif
-
-/**
- * Get the unsigned 32 bits integer corresponding to four bytes in
- * little-endian order (LSB first).
- *
- * \param   data    Base address of the memory to get the four bytes from.
- * \param   offset  Offset from \p data of the first and least significant
- *                  byte of the four bytes to build the 32 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT32_LE
-#define MBEDTLS_GET_UINT32_LE( data, offset )                   \
-    (                                                           \
-          ( (uint32_t) ( data )[( offset )    ]       )         \
-        | ( (uint32_t) ( data )[( offset ) + 1] <<  8 )         \
-        | ( (uint32_t) ( data )[( offset ) + 2] << 16 )         \
-        | ( (uint32_t) ( data )[( offset ) + 3] << 24 )         \
-    )
-#endif
-
-/**
- * Put in memory a 32 bits unsigned integer in little-endian order.
- *
- * \param   n       32 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 32
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the least significant
- *                  byte of the 32 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT32_LE
-#define MBEDTLS_PUT_UINT32_LE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_2( n );             \
-    ( data )[( offset ) + 3] = MBEDTLS_BYTE_3( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 16 bits integer corresponding to two bytes in
- * little-endian order (LSB first).
- *
- * \param   data    Base address of the memory to get the two bytes from.
- * \param   offset  Offset from \p data of the first and least significant
- *                  byte of the two bytes to build the 16 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT16_LE
-#define MBEDTLS_GET_UINT16_LE( data, offset )                   \
-    (                                                           \
-          ( (uint16_t) ( data )[( offset )    ]       )         \
-        | ( (uint16_t) ( data )[( offset ) + 1] <<  8 )         \
-    )
-#endif
-
-/**
- * Put in memory a 16 bits unsigned integer in little-endian order.
- *
- * \param   n       16 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 16
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the least significant
- *                  byte of the 16 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT16_LE
-#define MBEDTLS_PUT_UINT16_LE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 16 bits integer corresponding to two bytes in
- * big-endian order (MSB first).
- *
- * \param   data    Base address of the memory to get the two bytes from.
- * \param   offset  Offset from \p data of the first and most significant
- *                  byte of the two bytes to build the 16 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT16_BE
-#define MBEDTLS_GET_UINT16_BE( data, offset )                   \
-    (                                                           \
-          ( (uint16_t) ( data )[( offset )    ] << 8 )          \
-        | ( (uint16_t) ( data )[( offset ) + 1]      )          \
-    )
-#endif
-
-/**
- * Put in memory a 16 bits unsigned integer in big-endian order.
- *
- * \param   n       16 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 16
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the most significant
- *                  byte of the 16 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT16_BE
-#define MBEDTLS_PUT_UINT16_BE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_0( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 24 bits integer corresponding to three bytes in
- * big-endian order (MSB first).
- *
- * \param   data    Base address of the memory to get the three bytes from.
- * \param   offset  Offset from \p data of the first and most significant
- *                  byte of the three bytes to build the 24 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT24_BE
-#define MBEDTLS_GET_UINT24_BE( data , offset )                  \
-    (                                                           \
-          ( (uint32_t) ( data )[( offset )    ] << 16 )         \
-        | ( (uint32_t) ( data )[( offset ) + 1] << 8  )         \
-        | ( (uint32_t) ( data )[( offset ) + 2]       )         \
-    )
-#endif
-
-/**
- * Put in memory a 24 bits unsigned integer in big-endian order.
- *
- * \param   n       24 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 24
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the most significant
- *                  byte of the 24 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT24_BE
-#define MBEDTLS_PUT_UINT24_BE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_2( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_0( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 24 bits integer corresponding to three bytes in
- * little-endian order (LSB first).
- *
- * \param   data    Base address of the memory to get the three bytes from.
- * \param   offset  Offset from \p data of the first and least significant
- *                  byte of the three bytes to build the 24 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT24_LE
-#define MBEDTLS_GET_UINT24_LE( data, offset )                   \
-    (                                                           \
-          ( (uint32_t) ( data )[( offset )    ]       )         \
-        | ( (uint32_t) ( data )[( offset ) + 1] <<  8 )         \
-        | ( (uint32_t) ( data )[( offset ) + 2] << 16 )         \
-    )
-#endif
-
-/**
- * Put in memory a 24 bits unsigned integer in little-endian order.
- *
- * \param   n       24 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 24
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the least significant
- *                  byte of the 24 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT24_LE
-#define MBEDTLS_PUT_UINT24_LE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_2( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 64 bits integer corresponding to eight bytes in
- * big-endian order (MSB first).
- *
- * \param   data    Base address of the memory to get the eight bytes from.
- * \param   offset  Offset from \p data of the first and most significant
- *                  byte of the eight bytes to build the 64 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT64_BE
-#define MBEDTLS_GET_UINT64_BE( data, offset )                   \
-    (                                                           \
-          ( (uint64_t) ( data )[( offset )    ] << 56 )         \
-        | ( (uint64_t) ( data )[( offset ) + 1] << 48 )         \
-        | ( (uint64_t) ( data )[( offset ) + 2] << 40 )         \
-        | ( (uint64_t) ( data )[( offset ) + 3] << 32 )         \
-        | ( (uint64_t) ( data )[( offset ) + 4] << 24 )         \
-        | ( (uint64_t) ( data )[( offset ) + 5] << 16 )         \
-        | ( (uint64_t) ( data )[( offset ) + 6] <<  8 )         \
-        | ( (uint64_t) ( data )[( offset ) + 7]       )         \
-    )
-#endif
-
-/**
- * Put in memory a 64 bits unsigned integer in big-endian order.
- *
- * \param   n       64 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 64
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the most significant
- *                  byte of the 64 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT64_BE
-#define MBEDTLS_PUT_UINT64_BE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_7( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_6( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_5( n );             \
-    ( data )[( offset ) + 3] = MBEDTLS_BYTE_4( n );             \
-    ( data )[( offset ) + 4] = MBEDTLS_BYTE_3( n );             \
-    ( data )[( offset ) + 5] = MBEDTLS_BYTE_2( n );             \
-    ( data )[( offset ) + 6] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 7] = MBEDTLS_BYTE_0( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 64 bits integer corresponding to eight bytes in
- * little-endian order (LSB first).
- *
- * \param   data    Base address of the memory to get the eight bytes from.
- * \param   offset  Offset from \p data of the first and least significant
- *                  byte of the eight bytes to build the 64 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT64_LE
-#define MBEDTLS_GET_UINT64_LE( data, offset )                   \
-    (                                                           \
-          ( (uint64_t) ( data )[( offset ) + 7] << 56 )         \
-        | ( (uint64_t) ( data )[( offset ) + 6] << 48 )         \
-        | ( (uint64_t) ( data )[( offset ) + 5] << 40 )         \
-        | ( (uint64_t) ( data )[( offset ) + 4] << 32 )         \
-        | ( (uint64_t) ( data )[( offset ) + 3] << 24 )         \
-        | ( (uint64_t) ( data )[( offset ) + 2] << 16 )         \
-        | ( (uint64_t) ( data )[( offset ) + 1] <<  8 )         \
-        | ( (uint64_t) ( data )[( offset )    ]       )         \
-    )
-#endif
-
-/**
- * Put in memory a 64 bits unsigned integer in little-endian order.
- *
- * \param   n       64 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 64
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the least significant
- *                  byte of the 64 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT64_LE
-#define MBEDTLS_PUT_UINT64_LE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_2( n );             \
-    ( data )[( offset ) + 3] = MBEDTLS_BYTE_3( n );             \
-    ( data )[( offset ) + 4] = MBEDTLS_BYTE_4( n );             \
-    ( data )[( offset ) + 5] = MBEDTLS_BYTE_5( n );             \
-    ( data )[( offset ) + 6] = MBEDTLS_BYTE_6( n );             \
-    ( data )[( offset ) + 7] = MBEDTLS_BYTE_7( n );             \
-}
-#endif
 
 /* Fix MSVC C99 compatible issue
  *      MSVC support __func__ from visual studio 2015( 1900 )
diff --git a/library/ctr_drbg.c b/library/ctr_drbg.c
index 71c48af..f5c5e7b 100644
--- a/library/ctr_drbg.c
+++ b/library/ctr_drbg.c
@@ -174,8 +174,7 @@
 
         while( use_len > 0 )
         {
-            for( i = 0; i < MBEDTLS_CTR_DRBG_BLOCKSIZE; i++ )
-                chain[i] ^= p[i];
+            mbedtls_xor( chain, chain, p, MBEDTLS_CTR_DRBG_BLOCKSIZE );
             p += MBEDTLS_CTR_DRBG_BLOCKSIZE;
             use_len -= ( use_len >= MBEDTLS_CTR_DRBG_BLOCKSIZE ) ?
                        MBEDTLS_CTR_DRBG_BLOCKSIZE : use_len;
diff --git a/library/des.c b/library/des.c
index 65f5681..c56d4d4 100644
--- a/library/des.c
+++ b/library/des.c
@@ -635,7 +635,6 @@
                     const unsigned char *input,
                     unsigned char *output )
 {
-    int i;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char temp[8];
 
@@ -646,8 +645,7 @@
     {
         while( length > 0 )
         {
-            for( i = 0; i < 8; i++ )
-                output[i] = (unsigned char)( input[i] ^ iv[i] );
+            mbedtls_xor( output, input, iv, 8 );
 
             ret = mbedtls_des_crypt_ecb( ctx, output, output );
             if( ret != 0 )
@@ -668,8 +666,7 @@
             if( ret != 0 )
                 goto exit;
 
-            for( i = 0; i < 8; i++ )
-                output[i] = (unsigned char)( output[i] ^ iv[i] );
+            mbedtls_xor( output, output, iv, 8 );
 
             memcpy( iv, temp, 8 );
 
@@ -741,7 +738,6 @@
                      const unsigned char *input,
                      unsigned char *output )
 {
-    int i;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char temp[8];
 
@@ -752,8 +748,7 @@
     {
         while( length > 0 )
         {
-            for( i = 0; i < 8; i++ )
-                output[i] = (unsigned char)( input[i] ^ iv[i] );
+            mbedtls_xor( output, input, iv, 8 );
 
             ret = mbedtls_des3_crypt_ecb( ctx, output, output );
             if( ret != 0 )
@@ -774,8 +769,7 @@
             if( ret != 0 )
                 goto exit;
 
-            for( i = 0; i < 8; i++ )
-                output[i] = (unsigned char)( output[i] ^ iv[i] );
+            mbedtls_xor( output, output, iv, 8 );
 
             memcpy( iv, temp, 8 );
 
diff --git a/library/gcm.c b/library/gcm.c
index f004a73c..0178b5b 100644
--- a/library/gcm.c
+++ b/library/gcm.c
@@ -235,7 +235,6 @@
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char work_buf[16];
-    size_t i;
     const unsigned char *p;
     size_t use_len, olen = 0;
     uint64_t iv_bits;
@@ -268,8 +267,7 @@
         {
             use_len = ( iv_len < 16 ) ? iv_len : 16;
 
-            for( i = 0; i < use_len; i++ )
-                ctx->y[i] ^= p[i];
+            mbedtls_xor( ctx->y, ctx->y, p, use_len );
 
             gcm_mult( ctx, ctx->y, ctx->y );
 
@@ -277,8 +275,7 @@
             p += use_len;
         }
 
-        for( i = 0; i < 16; i++ )
-            ctx->y[i] ^= work_buf[i];
+        mbedtls_xor( ctx->y, ctx->y, work_buf, 16 );
 
         gcm_mult( ctx, ctx->y, ctx->y );
     }
@@ -313,7 +310,7 @@
                            const unsigned char *add, size_t add_len )
 {
     const unsigned char *p;
-    size_t use_len, i, offset;
+    size_t use_len, offset;
 
     /* IV is limited to 2^64 bits, so 2^61 bytes */
     if( (uint64_t) add_len >> 61 != 0 )
@@ -328,8 +325,7 @@
         if( use_len > add_len )
             use_len = add_len;
 
-        for( i = 0; i < use_len; i++ )
-            ctx->buf[i+offset] ^= p[i];
+        mbedtls_xor( ctx->buf + offset, ctx->buf + offset, p, use_len );
 
         if( offset + use_len == 16 )
             gcm_mult( ctx, ctx->buf, ctx->buf );
@@ -343,8 +339,7 @@
 
     while( add_len >= 16 )
     {
-        for( i = 0; i < 16; i++ )
-            ctx->buf[i] ^= p[i];
+        mbedtls_xor( ctx->buf, ctx->buf, p, 16 );
 
         gcm_mult( ctx, ctx->buf, ctx->buf );
 
@@ -354,8 +349,7 @@
 
     if( add_len > 0 )
     {
-        for( i = 0; i < add_len; i++ )
-            ctx->buf[i] ^= p[i];
+        mbedtls_xor( ctx->buf, ctx->buf, p, add_len );
     }
 
     return( 0 );
@@ -378,7 +372,6 @@
                      const unsigned char *input,
                      unsigned char *output )
 {
-    size_t i;
     size_t olen = 0;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
 
@@ -389,14 +382,12 @@
         return( ret );
     }
 
-    for( i = 0; i < use_len; i++ )
-    {
-        if( ctx->mode == MBEDTLS_GCM_DECRYPT )
-            ctx->buf[offset + i] ^= input[i];
-        output[i] = ectr[offset + i] ^ input[i];
-        if( ctx->mode == MBEDTLS_GCM_ENCRYPT )
-            ctx->buf[offset + i] ^= output[i];
-    }
+    if( ctx->mode == MBEDTLS_GCM_DECRYPT )
+        mbedtls_xor( ctx->buf + offset, ctx->buf + offset, input, use_len );
+    mbedtls_xor( output, ectr + offset, input, use_len );
+    if( ctx->mode == MBEDTLS_GCM_ENCRYPT )
+        mbedtls_xor( ctx->buf + offset, ctx->buf + offset, output, use_len );
+
     return( 0 );
 }
 
@@ -489,7 +480,6 @@
                         unsigned char *tag, size_t tag_len )
 {
     unsigned char work_buf[16];
-    size_t i;
     uint64_t orig_len;
     uint64_t orig_add_len;
 
@@ -524,13 +514,11 @@
         MBEDTLS_PUT_UINT32_BE( ( orig_len     >> 32 ), work_buf, 8  );
         MBEDTLS_PUT_UINT32_BE( ( orig_len           ), work_buf, 12 );
 
-        for( i = 0; i < 16; i++ )
-            ctx->buf[i] ^= work_buf[i];
+        mbedtls_xor( ctx->buf, ctx->buf, work_buf, 16 );
 
         gcm_mult( ctx, ctx->buf, ctx->buf );
 
-        for( i = 0; i < tag_len; i++ )
-            tag[i] ^= ctx->buf[i];
+        mbedtls_xor( tag, tag, ctx->buf, tag_len );
     }
 
     return( 0 );
diff --git a/library/md.c b/library/md.c
index 8efcf10..9c161a5 100644
--- a/library/md.c
+++ b/library/md.c
@@ -633,7 +633,6 @@
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char sum[MBEDTLS_MD_MAX_SIZE];
     unsigned char *ipad, *opad;
-    size_t i;
 
     if( ctx == NULL || ctx->md_info == NULL || ctx->hmac_ctx == NULL )
         return( MBEDTLS_ERR_MD_BAD_INPUT_DATA );
@@ -657,11 +656,8 @@
     memset( ipad, 0x36, ctx->md_info->block_size );
     memset( opad, 0x5C, ctx->md_info->block_size );
 
-    for( i = 0; i < keylen; i++ )
-    {
-        ipad[i] = (unsigned char)( ipad[i] ^ key[i] );
-        opad[i] = (unsigned char)( opad[i] ^ key[i] );
-    }
+    mbedtls_xor( ipad, ipad, key, keylen );
+    mbedtls_xor( opad, opad, key, keylen );
 
     if( ( ret = mbedtls_md_starts( ctx ) ) != 0 )
         goto cleanup;
diff --git a/library/pkcs5.c b/library/pkcs5.c
index ac5945a..1e3b17e 100644
--- a/library/pkcs5.c
+++ b/library/pkcs5.c
@@ -211,7 +211,6 @@
                               uint32_t key_length, unsigned char *output )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    int j;
     unsigned int i;
     unsigned char md1[MBEDTLS_MD_MAX_SIZE];
     unsigned char work[MBEDTLS_MD_MAX_SIZE];
@@ -263,8 +262,7 @@
 
             // U1 xor U2
             //
-            for( j = 0; j < md_size; j++ )
-                work[j] ^= md1[j];
+            mbedtls_xor( work, work, md1, md_size );
         }
 
         use_len = ( key_length < md_size ) ? key_length : md_size;
@@ -324,7 +322,6 @@
     mbedtls_md_free( &md_ctx );
     return( ret );
 #else
-    int j;
     unsigned int i;
     unsigned char md1[PSA_HASH_MAX_SIZE];
     unsigned char work[PSA_HASH_MAX_SIZE];
@@ -396,8 +393,7 @@
 
             // U1 xor U2
             //
-            for( j = 0; j < md_size; j++ )
-                work[j] ^= md1[j];
+            mbedtls_xor( work, work, md1, md_size );
         }
 
         use_len = ( key_length < md_size ) ? key_length : md_size;
diff --git a/library/platform_util.c b/library/platform_util.c
index 916a7f4..9c18dd5 100644
--- a/library/platform_util.c
+++ b/library/platform_util.c
@@ -143,3 +143,12 @@
 void (*mbedtls_test_hook_test_fail)( const char *, int, const char *);
 #endif /* MBEDTLS_TEST_HOOKS */
 
+/*
+ * Provide external definitions of some inline functions so that the compiler
+ * has the option to not inline them
+ */
+extern inline void mbedtls_xor( unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n );
+
+extern inline uint32_t mbedtls_get_unaligned_uint32( const void *p );
+
+extern inline void mbedtls_put_unaligned_uint32( void *p, uint32_t x );
diff --git a/library/ssl_msg.c b/library/ssl_msg.c
index 80471d4..732c346 100644
--- a/library/ssl_msg.c
+++ b/library/ssl_msg.c
@@ -511,15 +511,12 @@
                                     unsigned char const *dynamic_iv,
                                     size_t dynamic_iv_len )
 {
-    size_t i;
-
     /* Start with Fixed IV || 0 */
     memset( dst_iv, 0, dst_iv_len );
     memcpy( dst_iv, fixed_iv, fixed_iv_len );
 
     dst_iv += dst_iv_len - dynamic_iv_len;
-    for( i = 0; i < dynamic_iv_len; i++ )
-        dst_iv[i] ^= dynamic_iv[i];
+    mbedtls_xor( dst_iv, dst_iv, dynamic_iv, dynamic_iv_len );
 }
 #endif /* MBEDTLS_GCM_C || MBEDTLS_CCM_C || MBEDTLS_CHACHAPOLY_C */
 
diff --git a/tests/suites/test_suite_common.data b/tests/suites/test_suite_common.data
new file mode 100644
index 0000000..500852d
--- /dev/null
+++ b/tests/suites/test_suite_common.data
@@ -0,0 +1,20 @@
+Block xor, length 0
+mbedtls_xor:0
+
+Block xor, length 1
+mbedtls_xor:1
+
+Block xor, length 3
+mbedtls_xor:3
+
+Block xor, length 4
+mbedtls_xor:4
+
+Block xor, length 7
+mbedtls_xor:7
+
+Block xor, length 8
+mbedtls_xor:8
+
+Block xor, length 16
+mbedtls_xor:16
diff --git a/tests/suites/test_suite_common.function b/tests/suites/test_suite_common.function
new file mode 100644
index 0000000..4444a52
--- /dev/null
+++ b/tests/suites/test_suite_common.function
@@ -0,0 +1,105 @@
+/* BEGIN_HEADER */
+#include "../library/common.h"
+
+/*
+ * Fill the first n bytes of a, b, r1 and r2 with deterministic patterns:
+ * a[i] = 3i, b[i] = 3i + 1 and r1[i] = r2[i] = 3i + 2, each truncated to
+ * unsigned char, so that the inputs and result buffers differ at every index.
+ */
+void fill_arrays( unsigned char *a, unsigned char *b, unsigned char *r1, unsigned char *r2, size_t n )
+{
+    for ( size_t i = 0; i < n; i++ )
+    {
+        /* Cast the whole expression: the narrowing to a byte is intended. */
+        a[i]  = (unsigned char) ( i * 3 );
+        b[i]  = (unsigned char) ( i * 3 + 1 );
+        r1[i] = (unsigned char) ( i * 3 + 2 );
+        r2[i] = r1[i];
+    }
+}
+/* END_HEADER */
+
+/* BEGIN_CASE */
+void mbedtls_xor( int len )
+{
+    /*
+     * Compare mbedtls_xor() against a byte-wise reference XOR on buffers of
+     * len bytes, covering aliased arguments and misaligned pointers.
+     */
+    size_t n = (size_t) len;
+    unsigned char *a = NULL, *b = NULL, *r1 = NULL, *r2 = NULL;
+    /* One spare byte per buffer so the misaligned cases can start at offset 1 */
+    ASSERT_ALLOC( a, n + 1 );
+    ASSERT_ALLOC( b, n + 1 );
+    ASSERT_ALLOC( r1, n + 1 );
+    ASSERT_ALLOC( r2, n + 1 );
+
+    /* Test non-overlapping */
+    fill_arrays( a, b, r1, r2, n );
+    for ( size_t i = 0; i < n; i++ )
+    {
+        r1[i] = a[i] ^ b[i];
+    }
+    mbedtls_xor( r2, a, b, n );
+    ASSERT_COMPARE( r1, n, r2, n );
+
+    /* Test r == a */
+    fill_arrays( a, b, r1, r2, n );
+    for ( size_t i = 0; i < n; i++ )
+    {
+        r1[i] = r1[i] ^ b[i];
+    }
+    mbedtls_xor( r2, r2, b, n );
+    ASSERT_COMPARE( r1, n, r2, n );
+
+    /* Test r == b */
+    fill_arrays( a, b, r1, r2, n );
+    for ( size_t i = 0; i < n; i++ )
+    {
+        r1[i] = a[i] ^ r1[i];
+    }
+    mbedtls_xor( r2, a, r2, n );
+    ASSERT_COMPARE( r1, n, r2, n );
+
+    /* Test a == b */
+    fill_arrays( a, b, r1, r2, n );
+    for ( size_t i = 0; i < n; i++ )
+    {
+        r1[i] = a[i] ^ a[i];
+    }
+    mbedtls_xor( r2, a, a, n );
+    ASSERT_COMPARE( r1, n, r2, n );
+
+    /* Test a == b == r */
+    fill_arrays( a, b, r1, r2, n );
+    for ( size_t i = 0; i < n; i++ )
+    {
+        r1[i] = r1[i] ^ r1[i];
+    }
+    mbedtls_xor( r2, r2, r2, n );
+    ASSERT_COMPARE( r1, n, r2, n );
+
+    /*
+     * Test non-word-aligned buffers, for all combinations of alignedness.
+     * Bits 0, 1 and 2 of i select a one-byte offset for r, a and b
+     * respectively, so i must run over all 8 values 0..7.
+     */
+    for ( int i = 0; i < 8; i++ )
+    {
+        int r_off = i & 1, a_off = (i & 2) >> 1, b_off = (i & 4) >> 2;
+        fill_arrays( a, b, r1, r2, n + 1 );
+
+        for ( size_t j = 0; j < n; j++ )
+        {
+            r1[j + r_off] = a[j + a_off] ^ b[j + b_off];
+        }
+        mbedtls_xor( r2 + r_off, a + a_off, b + b_off, n );
+        ASSERT_COMPARE( r1 + r_off, n, r2 + r_off, n );
+    }
+exit:
+    mbedtls_free( a );
+    mbedtls_free( b );
+    mbedtls_free( r1 );
+    mbedtls_free( r2 );
+}
+/* END_CASE */