aria: optimize byte perms on Arm

Use specific instructions for moving bytes around in a word. This speeds
things up, and as a side-effect, slightly lowers code size.

ARIA_P3 and ARIA_P1 are now 1 single-cycle instruction each (those
instructions are available in all architecture versions starting from v6-M).
Note: ARIA_P3 was already translated to a single instruction by Clang 3.8 and
armclang 6.5, but not arm-gcc 5.4 nor armcc 5.06.

ARIA_P2 is already efficiently translated to the minimal number of
instructions (1 in ARM mode, 2 in Thumb mode) by all tested compilers.

Manually compiled and inspected generated code with the following compilers:
arm-gcc 5.4, clang 3.8, armcc 5.06 (with and without --gnu), armclang 6.5.

Size reduction (arm-none-eabi-gcc -march=armv6-m -mthumb -Os): 5288 -> 5044 B

Effect on execution time of self-tests on a few boards:
FRDM-K64F   (Cortex-M4):    444 ->  385 us (-13%)
LPC1768     (Cortex-M3):    488 ->  432 us (-11%)
FRDM-KL64Z  (Cortex-M0):   1429 -> 1134 us (-20%)

Measured using a config.h with no cipher mode and the following program with
aria.c and aria.h copy-pasted to the online compiler:

 #include "mbed.h"
 #include "aria.h"

int main() {
    Timer t;
    t.start();
    int ret = mbedtls_aria_self_test(0);
    t.stop();
    printf("ret = %d; time = %d us\n", ret, t.read_us());
}
diff --git a/library/aria.c b/library/aria.c
index f6ad7f1..1d1daa1 100644
--- a/library/aria.c
+++ b/library/aria.c
@@ -85,11 +85,33 @@
  * Common compilers fail to translate this to minimal number of instructions,
  * so let's provide asm versions for common platforms with C fallback.
  */
-#if defined(MBEDTLS_HAVE_ASM) && defined(__GNUC__)
-#if defined(__i386__) || defined(__amd64__) || defined( __x86_64__)
+#if defined(MBEDTLS_HAVE_ASM)
+#if defined(__arm__)
+/* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm */
+#if defined(__GNUC__) && \
+    ( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 )
+static inline uint32_t aria_p1( uint32_t x ) /* swap bytes in each halfword */
+{
+    uint32_t r; /* __asm__ below: plain asm is unavailable with -std=c99 */
+    __asm__( "rev16 %0, %1" : "=l" (r) : "l" (x) ); /* "l": low register */
+    return( r );
+}
+#define ARIA_P1 aria_p1
+#elif defined(__ARMCC_VERSION) && __ARMCC_VERSION < 6000000
+static __inline uint32_t aria_p1( uint32_t x ) /* armcc5 variant of ARIA_P1 */
+{
+    uint32_t r; /* written by the asm statement below */
+    __asm( "rev16 r, x" ); /* operands are the C variables r and x by name */
+    return( r );
+}
+#define ARIA_P1 aria_p1
+#endif
+#endif /* arm */
+#if defined(__GNUC__) && \
+    ( defined(__i386__) || defined(__amd64__) || defined( __x86_64__) )
 /* I couldn't find an Intel equivalent of ret16, so two instructions */
 #define ARIA_P1(x) ARIA_P2( ARIA_P3( x ) )
-#endif
+#endif /* x86 gnuc */
 #endif /* MBEDTLS_HAVE_ASM && GNUC */
 #if !defined(ARIA_P1)
 #define ARIA_P1(x) ((((x) >> 8) & 0x00FF00FF) ^ (((x) & 0x00FF00FF) << 8))
@@ -112,15 +134,37 @@
  * Some compilers fail to translate this to a single instruction,
  * so let's provide asm versions for common platforms with C fallback.
  */
-#if defined(MBEDTLS_HAVE_ASM) && defined(__GNUC__)
-#if defined(__i386__) || defined(__amd64__) || defined( __x86_64__)
+#if defined(MBEDTLS_HAVE_ASM)
+#if defined(__arm__)
+/* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm */
+#if defined(__GNUC__) && \
+    ( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 )
+static inline uint32_t aria_p3( uint32_t x ) /* byte-reverse the 32-bit word */
+{
+    uint32_t r; /* __asm__ below: plain asm is unavailable with -std=c99 */
+    __asm__( "rev %0, %1" : "=l" (r) : "l" (x) ); /* "l": low register */
+    return( r );
+}
+#define ARIA_P3 aria_p3
+#elif defined(__ARMCC_VERSION) && __ARMCC_VERSION < 6000000
+static __inline uint32_t aria_p3( uint32_t x ) /* armcc5 variant of ARIA_P3 */
+{
+    uint32_t r; /* written by the asm statement below */
+    __asm( "rev r, x" ); /* operands are the C variables r and x by name */
+    return( r );
+}
+#define ARIA_P3 aria_p3
+#endif
+#endif /* arm */
+#if defined(__GNUC__) && \
+    ( defined(__i386__) || defined(__amd64__) || defined( __x86_64__) )
 static inline uint32_t aria_p3( uint32_t x )
 {
     asm( "bswap %0" : "=r" (x) : "0" (x) );
     return( x );
 }
 #define ARIA_P3 aria_p3
-#endif
+#endif /* x86 gnuc */
 #endif /* MBEDTLS_HAVE_ASM && GNUC */
 #if !defined(ARIA_P3)
 #define ARIA_P3(x) ARIA_P2( ARIA_P1 ( x ) )