aria: define SLA() as a(sl())
This decreases the code size with -Os by nearly 1k, while
not hurting performance too much with -O2 and -O3.
Before:
O   aria.o  ins
s   8784    41,408
2   11112   37,001
3   13096   27,438
After:
O   aria.o  ins
s   7976    43,865
2   10520   37,631
3   13040   28,146
(See previous commit for measurement details.)
diff --git a/library/aria.c b/library/aria.c
index b71cc38..dc2192a 100644
--- a/library/aria.c
+++ b/library/aria.c
@@ -84,87 +84,62 @@
#define ARIA_FLIP2(x) ((((x) >> 8) & 0x00FF00FF) ^ (((x) & 0x00FF00FF) << 8))
/*
- * Affine Transform A
+ * ARIA Affine Transform
* (ra, rb, rc, rd) = state in/out
*/
-#define ARIA_A( ra, rb, rc, rd ) { \
- uint32_t ta, tb, tc; \
- ta = rb; \
- rb = ra; \
- ra = ARIA_FLIP1( ta ); \
- tb = ARIA_FLIP1( rd ); \
- rd = ARIA_FLIP2( rc ); \
- rc = ARIA_FLIP2( tb ); \
- ta ^= rd; \
- tc = ARIA_FLIP1( rb ); \
- ta = ARIA_FLIP2( ta ) ^ tc ^ rc; \
- tb ^= ARIA_FLIP1( rd ); \
- tc ^= ARIA_FLIP2( ra ); \
- rb ^= ta ^ tb; \
- tb = ARIA_FLIP1( tb ) ^ ta; \
- ra ^= ARIA_FLIP2( tb ); \
- ta = ARIA_FLIP1( ta ); \
- rd ^= ARIA_FLIP2( ta ) ^ tc; \
- tc = ARIA_FLIP1( tc ); \
- rc ^= ARIA_FLIP2( tc ) ^ ta; \
+static inline void aria_a( uint32_t *a, uint32_t *b,
+ uint32_t *c, uint32_t *d )
+{
+ uint32_t ta, tb, tc;
+ ta = *b;
+ *b = *a;
+ *a = ARIA_FLIP1( ta );
+ tb = ARIA_FLIP1( *d );
+ *d = ARIA_FLIP2( *c );
+ *c = ARIA_FLIP2( tb );
+ ta ^= *d;
+ tc = ARIA_FLIP1( *b );
+ ta = ARIA_FLIP2( ta ) ^ tc ^ *c;
+ tb ^= ARIA_FLIP1( *d );
+ tc ^= ARIA_FLIP2( *a );
+ *b ^= ta ^ tb;
+ tb = ARIA_FLIP1( tb ) ^ ta;
+ *a ^= ARIA_FLIP2( tb );
+ ta = ARIA_FLIP1( ta );
+ *d ^= ARIA_FLIP2( ta ) ^ tc;
+ tc = ARIA_FLIP1( tc );
+ *c ^= ARIA_FLIP2( tc ) ^ ta;
}
-
/*
- * ARIA Round function ( Substitution Layer SLx + Affine Transform A )
- * (ra, rb, rc, rd) = state in/out
+ * ARIA Substitution Layer SL1 / SL2
+ * (a, b, c, d) = state in/out
* (sa, sb, sc, sd) = 256 8-bit S-Boxes (see below)
*
- * By passing sb1, sb2, is1, is2 as S-Boxes you get SL1-then-A.
- * By passing is1, is2, sb1, sb2 as S-Boxes you get SL2-then-A.
+ * By passing sb1, sb2, is1, is2 as S-Boxes you get SL1
+ * By passing is1, is2, sb1, sb2 as S-Boxes you get SL2
*/
-static inline void aria_sla( uint32_t *a, uint32_t *b,
- uint32_t *c, uint32_t *d,
- const uint8_t sa[0x100], const uint8_t sb[0x100],
- const uint8_t sc[0x100], const uint8_t sd[0x100] )
+static inline void aria_sl( uint32_t *a, uint32_t *b,
+ uint32_t *c, uint32_t *d,
+ const uint8_t sa[0x100], const uint8_t sb[0x100],
+ const uint8_t sc[0x100], const uint8_t sd[0x100] )
{
- uint32_t ra, rb, rc, rd, ta, tb, tc;
-
- ra = *a;
- rb = *b;
- rc = *c;
- rd = *d;
-
- ta = ( (uint32_t) sc[(rb >> 16) & 0xFF]) ^
- (((uint32_t) sd[ rb >> 24]) << 8) ^
- (((uint32_t) sa[ rb & 0xFF]) << 16) ^
- (((uint32_t) sb[(rb >> 8) & 0xFF]) << 24);
- rb = ( (uint32_t) sa[ ra & 0xFF]) ^
- (((uint32_t) sb[(ra >> 8) & 0xFF]) << 8) ^
- (((uint32_t) sc[(ra >> 16) & 0xFF]) << 16) ^
- (((uint32_t) sd[ ra >> 24]) << 24);
- ra = ta;
- ta = ( (uint32_t) sd[ rd >> 24]) ^
- (((uint32_t) sc[(rd >> 16) & 0xFF]) << 8) ^
- (((uint32_t) sb[(rd >> 8) & 0xFF]) << 16) ^
- (((uint32_t) sa[ rd & 0xFF]) << 24);
- rd = ( (uint32_t) sb[(rc >> 8) & 0xFF]) ^
- (((uint32_t) sa[ rc & 0xFF]) << 8) ^
- (((uint32_t) sd[ rc >> 24]) << 16) ^
- (((uint32_t) sc[(rc >> 16) & 0xFF]) << 24);
- rc = ta;
- ta = ARIA_FLIP1( ra ) ^ rd;
- tc = ARIA_FLIP1( rb );
- ta = ARIA_FLIP2( ta ) ^ tc ^ rc;
- tb = ARIA_FLIP2( rc ) ^ ARIA_FLIP1( rd );
- tc ^= ARIA_FLIP2( ra );
- rb ^= ta ^ tb;
- tb = ARIA_FLIP1( tb ) ^ ta;
- ra ^= ARIA_FLIP2( tb );
- ta = ARIA_FLIP1( ta );
- rd ^= ARIA_FLIP2( ta ) ^ tc;
- tc = ARIA_FLIP1( tc );
- rc ^= ARIA_FLIP2( tc ) ^ ta;
-
- *a = ra;
- *b = rb;
- *c = rc;
- *d = rd;
+ *a = ( (uint32_t) sa[ *a & 0xFF]) ^
+ (((uint32_t) sb[(*a >> 8) & 0xFF]) << 8) ^
+ (((uint32_t) sc[(*a >> 16) & 0xFF]) << 16) ^
+ (((uint32_t) sd[ *a >> 24 ]) << 24);
+ *b = ( (uint32_t) sa[ *b & 0xFF]) ^
+ (((uint32_t) sb[(*b >> 8) & 0xFF]) << 8) ^
+ (((uint32_t) sc[(*b >> 16) & 0xFF]) << 16) ^
+ (((uint32_t) sd[ *b >> 24 ]) << 24);
+ *c = ( (uint32_t) sa[ *c & 0xFF]) ^
+ (((uint32_t) sb[(*c >> 8) & 0xFF]) << 8) ^
+ (((uint32_t) sc[(*c >> 16) & 0xFF]) << 16) ^
+ (((uint32_t) sd[ *c >> 24 ]) << 24);
+ *d = ( (uint32_t) sa[ *d & 0xFF]) ^
+ (((uint32_t) sb[(*d >> 8) & 0xFF]) << 8) ^
+ (((uint32_t) sc[(*d >> 16) & 0xFF]) << 16) ^
+ (((uint32_t) sd[ *d >> 24 ]) << 24);
}
/*
@@ -287,7 +262,8 @@
c = p[2] ^ k[2];
d = p[3] ^ k[3];
- aria_sla( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
+ aria_sl( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
+ aria_a( &a, &b, &c, &d );
r[0] = a ^ x[0];
r[1] = b ^ x[1];
@@ -308,7 +284,8 @@
c = p[2] ^ k[2];
d = p[3] ^ k[3];
- aria_sla( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
+ aria_sl( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
+ aria_a( &a, &b, &c, &d );
r[0] = a ^ x[0];
r[1] = b ^ x[1];
@@ -430,7 +407,7 @@
/* apply affine transform to middle keys */
for (i = 1; i < ctx->nr; i++ )
- ARIA_A( ctx->rk[i][0], ctx->rk[i][1], ctx->rk[i][2], ctx->rk[i][3] );
+ aria_a( &ctx->rk[i][0], &ctx->rk[i][1], &ctx->rk[i][2], &ctx->rk[i][3] );
return 0;
}
@@ -462,43 +439,27 @@
c ^= ctx->rk[i][2];
d ^= ctx->rk[i][3];
i++;
- aria_sla( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
+
+ aria_sl( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
+ aria_a( &a, &b, &c, &d );
a ^= ctx->rk[i][0];
b ^= ctx->rk[i][1];
c ^= ctx->rk[i][2];
d ^= ctx->rk[i][3];
i++;
+
+ aria_sl( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
if (i >= ctx->nr)
break;
-
- aria_sla( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
+ aria_a( &a, &b, &c, &d );
}
- /* final substitution */
- a = ctx->rk[i][0] ^
- ( (uint32_t) aria_is1[ a & 0xFF]) ^
- (((uint32_t) aria_is2[(a >> 8) & 0xFF]) << 8) ^
- (((uint32_t) aria_sb1[(a >> 16) & 0xFF]) << 16) ^
- (((uint32_t) aria_sb2[ a >> 24 ]) << 24);
-
- b = ctx->rk[i][1] ^
- ( (uint32_t) aria_is1[ b & 0xFF]) ^
- (((uint32_t) aria_is2[(b >> 8) & 0xFF]) << 8) ^
- (((uint32_t) aria_sb1[(b >> 16) & 0xFF]) << 16) ^
- (((uint32_t) aria_sb2[ b >> 24 ]) << 24);
-
- c = ctx->rk[i][2] ^
- ( (uint32_t) aria_is1[ c & 0xFF]) ^
- (((uint32_t) aria_is2[(c >> 8) & 0xFF]) << 8) ^
- (((uint32_t) aria_sb1[(c >> 16) & 0xFF]) << 16) ^
- (((uint32_t) aria_sb2[ c >> 24 ]) << 24);
-
- d = ctx->rk[i][3] ^
- ( (uint32_t) aria_is1[ d & 0xFF]) ^
- (((uint32_t) aria_is2[(d >> 8) & 0xFF]) << 8) ^
- (((uint32_t) aria_sb1[(d >> 16) & 0xFF]) << 16) ^
- (((uint32_t) aria_sb2[ d >> 24 ]) << 24);
+ /* final key mixing */
+ a ^= ctx->rk[i][0];
+ b ^= ctx->rk[i][1];
+ c ^= ctx->rk[i][2];
+ d ^= ctx->rk[i][3];
PUT_UINT32_LE( a, output, 0 );
PUT_UINT32_LE( b, output, 4 );