aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Manuel Pégourié-Gonnard <manuel.pegourie-gonnard@arm.com> 2018-02-21 12:03:22 +0100
committer Manuel Pégourié-Gonnard <manuel.pegourie-gonnard@arm.com> 2018-02-27 12:39:12 +0100
commit 8c76a9489e0a947dbf59e22f77ceae5414ed8c34 (patch)
tree 8b540c35ca57b88836b54e4a38d562e3c7371aaa
parent 62e813ca62a19fa6eaf8014edaa32fc6daa440ad (diff)
downloadmbedtls-8c76a9489e0a947dbf59e22f77ceae5414ed8c34.zip
mbedtls-8c76a9489e0a947dbf59e22f77ceae5414ed8c34.tar.gz
mbedtls-8c76a9489e0a947dbf59e22f77ceae5414ed8c34.tar.bz2
aria: turn macro into static inline function
Besides documenting types better and so on, this gives the compiler more room to optimise either for size or performance.

Here are some before/after measurements of:
- size of aria.o in bytes (less is better)
- instruction count for the selftest function (less is better)
with various -O flags.

Before:
O	aria.o	ins
s	10896	37,256
2	11176	37,199
3	12248	27,752

After:
O	aria.o	ins
s	8784	41,408
2	11112	37,001
3	13096	27,438

The new version allows the compiler to reach a smaller size with -Os while maintaining (actually slightly improving) performance with -O2 and -O3.

Measurements were done on x86_64 (but since this is mainly about inlining code, this should transpose well to other platforms) using the following helper program and script, after disabling CBC, CFB and CTR in config.h, in order to focus on the core functions.

==> st.c <==
#include "mbedtls/aria.h"

int main( void ) {
    return mbedtls_aria_self_test( 0 );
}

==> p.sh <==
#!/bin/sh

set -eu

ccount () {
    (
    valgrind --tool=callgrind --dump-line=no --callgrind-out-file=/dev/null --collect-atstart=no --toggle-collect=main $1
    ) 2>&1 | sed -n -e 's/.*refs: *\([0-9,]*\)/\1/p'
}

printf "O\taria.o\tins\n"
for O in s 2 3; do
    GCC="gcc -Wall -Wextra -Werror -Iinclude"

    $GCC -O$O -c library/aria.c
    $GCC -O1 st.c aria.o -o st
    ./st

    SIZE=$( du -b aria.o | cut -f1 )
    INS=$( ccount ./st )

    printf "$O\t$SIZE\t$INS\n"
done
-rw-r--r--library/aria.c87
1 files changed, 51 insertions, 36 deletions
diff --git a/library/aria.c b/library/aria.c
index 6857e99..b71cc38 100644
--- a/library/aria.c
+++ b/library/aria.c
@@ -118,38 +118,53 @@ static void mbedtls_zeroize( void *v, size_t n ) {
* By passing sb1, sb2, is1, is2 as S-Boxes you get SL1-then-A.
* By passing is1, is2, sb1, sb2 as S-Boxes you get SL2-then-A.
*/
-#define ARIA_SLA( ra, rb, rc, rd, sa, sb, sc, sd ) { \
- uint32_t ta, tb, tc; \
- ta = ( (uint32_t) sc[(rb >> 16) & 0xFF]) ^ \
- (((uint32_t) sd[ rb >> 24]) << 8) ^ \
- (((uint32_t) sa[ rb & 0xFF]) << 16) ^ \
- (((uint32_t) sb[(rb >> 8) & 0xFF]) << 24); \
- rb = ( (uint32_t) sa[ ra & 0xFF]) ^ \
- (((uint32_t) sb[(ra >> 8) & 0xFF]) << 8) ^ \
- (((uint32_t) sc[(ra >> 16) & 0xFF]) << 16) ^ \
- (((uint32_t) sd[ ra >> 24]) << 24); \
- ra = ta; \
- ta = ( (uint32_t) sd[ rd >> 24]) ^ \
- (((uint32_t) sc[(rd >> 16) & 0xFF]) << 8) ^ \
- (((uint32_t) sb[(rd >> 8) & 0xFF]) << 16) ^ \
- (((uint32_t) sa[ rd & 0xFF]) << 24); \
- rd = ( (uint32_t) sb[(rc >> 8) & 0xFF]) ^ \
- (((uint32_t) sa[ rc & 0xFF]) << 8) ^ \
- (((uint32_t) sd[ rc >> 24]) << 16) ^ \
- (((uint32_t) sc[(rc >> 16) & 0xFF]) << 24); \
- rc = ta; \
- ta = ARIA_FLIP1( ra ) ^ rd; \
- tc = ARIA_FLIP1( rb ); \
- ta = ARIA_FLIP2( ta ) ^ tc ^ rc; \
- tb = ARIA_FLIP2( rc ) ^ ARIA_FLIP1( rd ); \
- tc ^= ARIA_FLIP2( ra ); \
- rb ^= ta ^ tb; \
- tb = ARIA_FLIP1( tb ) ^ ta; \
- ra ^= ARIA_FLIP2( tb ); \
- ta = ARIA_FLIP1( ta ); \
- rd ^= ARIA_FLIP2( ta ) ^ tc; \
- tc = ARIA_FLIP1( tc ); \
- rc ^= ARIA_FLIP2( tc ) ^ ta; \
+static inline void aria_sla( uint32_t *a, uint32_t *b,
+ uint32_t *c, uint32_t *d,
+ const uint8_t sa[0x100], const uint8_t sb[0x100],
+ const uint8_t sc[0x100], const uint8_t sd[0x100] )
+{
+ uint32_t ra, rb, rc, rd, ta, tb, tc;
+
+ ra = *a;
+ rb = *b;
+ rc = *c;
+ rd = *d;
+
+ ta = ( (uint32_t) sc[(rb >> 16) & 0xFF]) ^
+ (((uint32_t) sd[ rb >> 24]) << 8) ^
+ (((uint32_t) sa[ rb & 0xFF]) << 16) ^
+ (((uint32_t) sb[(rb >> 8) & 0xFF]) << 24);
+ rb = ( (uint32_t) sa[ ra & 0xFF]) ^
+ (((uint32_t) sb[(ra >> 8) & 0xFF]) << 8) ^
+ (((uint32_t) sc[(ra >> 16) & 0xFF]) << 16) ^
+ (((uint32_t) sd[ ra >> 24]) << 24);
+ ra = ta;
+ ta = ( (uint32_t) sd[ rd >> 24]) ^
+ (((uint32_t) sc[(rd >> 16) & 0xFF]) << 8) ^
+ (((uint32_t) sb[(rd >> 8) & 0xFF]) << 16) ^
+ (((uint32_t) sa[ rd & 0xFF]) << 24);
+ rd = ( (uint32_t) sb[(rc >> 8) & 0xFF]) ^
+ (((uint32_t) sa[ rc & 0xFF]) << 8) ^
+ (((uint32_t) sd[ rc >> 24]) << 16) ^
+ (((uint32_t) sc[(rc >> 16) & 0xFF]) << 24);
+ rc = ta;
+ ta = ARIA_FLIP1( ra ) ^ rd;
+ tc = ARIA_FLIP1( rb );
+ ta = ARIA_FLIP2( ta ) ^ tc ^ rc;
+ tb = ARIA_FLIP2( rc ) ^ ARIA_FLIP1( rd );
+ tc ^= ARIA_FLIP2( ra );
+ rb ^= ta ^ tb;
+ tb = ARIA_FLIP1( tb ) ^ ta;
+ ra ^= ARIA_FLIP2( tb );
+ ta = ARIA_FLIP1( ta );
+ rd ^= ARIA_FLIP2( ta ) ^ tc;
+ tc = ARIA_FLIP1( tc );
+ rc ^= ARIA_FLIP2( tc ) ^ ta;
+
+ *a = ra;
+ *b = rb;
+ *c = rc;
+ *d = rd;
}
/*
@@ -272,7 +287,7 @@ static void aria_fo_xor( uint32_t r[4],
c = p[2] ^ k[2];
d = p[3] ^ k[3];
- ARIA_SLA( a, b, c, d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
+ aria_sla( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
r[0] = a ^ x[0];
r[1] = b ^ x[1];
@@ -293,7 +308,7 @@ static void aria_fe_xor(uint32_t r[4],
c = p[2] ^ k[2];
d = p[3] ^ k[3];
- ARIA_SLA( a, b, c, d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
+ aria_sla( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
r[0] = a ^ x[0];
r[1] = b ^ x[1];
@@ -447,7 +462,7 @@ int mbedtls_aria_crypt_ecb( mbedtls_aria_context *ctx,
c ^= ctx->rk[i][2];
d ^= ctx->rk[i][3];
i++;
- ARIA_SLA( a, b, c, d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
+ aria_sla( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
a ^= ctx->rk[i][0];
b ^= ctx->rk[i][1];
@@ -457,7 +472,7 @@ int mbedtls_aria_crypt_ecb( mbedtls_aria_context *ctx,
if (i >= ctx->nr)
break;
- ARIA_SLA( a, b, c, d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
+ aria_sla( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
}
/* final substitution */