/*
 *  Armv8-A Cryptographic Extension support functions for Aarch64 and 32-bit Arm
 *
 *  Copyright The Mbed TLS Contributors
 *  SPDX-License-Identifier: Apache-2.0
 *
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may
 *  not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8 in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Re-consider the above after https://reviews.llvm.org/D131064 is merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` could be included by any header file, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_ARCH_IS_ARMV8) && defined(__ARM_NEON)

/* Compiler version checks. */
#if defined(__clang__)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
#   endif
#elif defined(__GNUC__)
#   if __GNUC__ < 6
#       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#   endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone has verified
 * those versions, please update this check and the documentation of
 * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#   if _MSC_VER < 1929
#       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#   endif
#elif defined(__ARMCC_VERSION)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone has verified those versions, please update this check and the
 * documentation of `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#   endif
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>
#else
#error "Target does not support NEON instructions"
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__ARMCOMPILER_VERSION)
#       if __ARMCOMPILER_VERSION <= 6090000
#           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#       else
#           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#           define MBEDTLS_POP_TARGET_PRAGMA
#       endif
#   elif defined(__clang__)
#       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       pragma GCC push_options
#       pragma GCC target ("+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(_MSC_VER)
#       error "Required feature(__ARM_FEATURE_AES) is not enabled."
#   endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <asm/hwcap.h>
#include <sys/auxv.h>

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be affected by non-atomic (torn) updates.
     * It is possible that we could end up setting the result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval  = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */
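
/*
 * Note: the helpers in this file are not called directly by applications.
 * When runtime detection is compiled in (i.e. MBEDTLS_AES_USE_HARDWARE_ONLY
 * is not set), the dispatch code in aes.c/gcm.c is expected to consult the
 * cached result before using them, roughly along these lines (sketch only):
 *
 *     if (mbedtls_aesce_has_support_impl()) {
 *         ... use mbedtls_aesce_crypt_ecb() / mbedtls_aesce_gcm_mult() ...
 *     } else {
 *         ... fall back to the portable C implementation ...
 *     }
 */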

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in an atomic instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately which has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
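/* In symbols, with s the state and rk a round key, linearity of the inverse
 * MixColumns over GF(2^8) gives
 *
 *     InvMixColumns(s ^ rk) = InvMixColumns(s) ^ InvMixColumns(rk)
 *
 * so the swap above is valid provided the round keys themselves have been
 * passed through InvMixColumns first, which is what
 * mbedtls_aesce_inverse_key() below does for the middle round keys. */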
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND

static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

    if (mode == MBEDTLS_AES_ENCRYPT) {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    } else {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}

/*
 * Compute decryption round keys from encryption round keys
 */
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    /* The last encryption round key becomes the first decryption round key. */
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    /* The middle round keys are reversed and passed through inverse
     * MixColumns, to match the AddRoundKey-then-InvMixColumns ordering used
     * by aesce_decrypt_block() above. */
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    /* The first encryption round key becomes the last decryption round key. */
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
}

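/* Key-schedule helpers: aes_rot_word() is RotWord() from FIPS-197 section 5.2
 * (a cyclic rotation of the four bytes of the word, expressed here as a
 * rotate-right by 8 bits of the little-endian word), and aes_sub_word() below
 * is SubWord(), i.e. the AES S-box applied to each byte of the word. */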
static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows: using an all-zero vector as
     * the round key makes the initial AddRoundKey a no-op, and taking the
     * first row yields the correct result as ShiftRows doesn't change the
     * first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
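    /* For example (counts in 32-bit words): AES-128 has Nk = 4, so Nr = 10 and
     * Nb*(Nr+1) = 44 words (176 bytes) of round keys; AES-192: Nk = 6, Nr = 12,
     * 52 words (208 bytes); AES-256: Nk = 8, Nr = 14, 60 words (240 bytes). */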
    const uint32_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}
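
/*
 * Illustrative sketch only (hypothetical caller; in Mbed TLS these helpers
 * are internal and are invoked from aes.c, which owns the context layout and
 * the runtime dispatch). The expected sequence is: expand the key once into
 * ctx->buf + ctx->rk_offset, set ctx->nr to the round count, then process
 * 16-byte blocks:
 *
 *     mbedtls_aesce_setkey_enc((unsigned char *) (ctx->buf + ctx->rk_offset),
 *                              key, 256);
 *     ctx->nr = 14;
 *     mbedtls_aesce_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, input, output);
 */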

#if defined(MBEDTLS_GCM_C)

#if defined(MBEDTLS_ARCH_IS_ARM32)

#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types which are listed in
 * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1)
 * These are only required for GCM.
 */
#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

/* This is set to cause some more missing intrinsics to be defined below */
#define COMMON_MISSING_INTRINSICS

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm %[p], { r2-r5 } \n\t"
         "rbit r2, r2 \n\t"
         "rev r2, r2 \n\t"
         "rbit r3, r3 \n\t"
         "rev r3, r3 \n\t"
         "rbit r4, r4 \n\t"
         "rev r4, r4 \n\t"
         "rbit r5, r5 \n\t"
         "rev r5, r5 \n\t"
         "stm %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define COMMON_MISSING_INTRINSICS
#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */


#if defined(COMMON_MISSING_INTRINSICS)

/* Missing intrinsics common to both GCC 5 and Clang on 32-bit Arm */

#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    return (poly64x1_t) r;
}

#endif /* COMMON_MISSING_INTRINSICS */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports an incompatible-type error if the casts are omitted: it treats
 * poly64_t and poly64x1_t as distinct types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` if the casts are present, and Clang
 * accepts both forms (poly64_t and poly64x1_t appear to be interchangeable
 * there), so no casts are used here. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif /* MBEDTLS_COMPILER_IS_GCC */

static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            vget_low_p64(vreinterpretq_p64_u8(a)),
            vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128b polynomial multiplication on blocks in GF(2^128) defined by
 * `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm only has 64b->128b polynomial multipliers, so we need to do 4 64b
 * multiplies to generate the full 256b product.
 *
 * `poly_mult_128` executes the polynomial multiplication and outputs 256b,
 * represented by 3 128b values to keep the code size down.
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high   128b |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low    128b |
 */
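/* Concretely, writing the 128-bit inputs as a = a1:a0 and b = b1:b0 (64-bit
 * halves), the carry-less product decomposes as
 *
 *     a * b = (a1*b1) : (a1*b0 + a0*b1) : (a0*b0)
 *
 * which is the four 64b->128b multiplies performed below; the two middle
 * terms are combined with XOR, i.e. addition in GF(2). */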
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*  :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8); /*  :c1:c0 = b0:b1 */
    d = pmull_high(a, c);  /*  :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);   /*  :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);    /*  :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as f(z) =
 * z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach is to
 * consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write the
 * 256-bit operand to be reduced as a(z) = h(z)*z^128 + l(z) ≡ h(z)*r(z) + l(z).
 * That is, we simply multiply the higher part of the operand by r(z) and add
 * it to l(z). If the result is still larger than 128 bits, we reduce again.
 */
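/* Note that r(z) = z^7 + z^2 + z + 1 is the byte 0x87 (binary 1000 0111);
 * poly_mult_reduce() below builds its MODULO constant from that byte and
 * multiplies the high words by it, as described above. */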
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent loading MODULO from
     * memory. This applies to GNU C compatible compilers. */
    asm volatile ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00 */
    m = input.val[1];          /*   :m2:m1:00 */
    l = input.val[2];          /*   :  :l1:l0 */
    c = pmull_high(h, MODULO); /*   :c2:c1:00 = reduction of h3 */
    d = pmull_low(h, MODULO);  /*   :  :d1:d0 = reduction of h2 */
    e = veorq_u8(c, m);        /*   :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*   :  :f1:f0 = reduction of e2 */
    g = vextq_u8(ZERO, e, 8);  /*   :  :g1:00 = e1:00 */
    n = veorq_u8(d, l);        /*   :  :n1:n0 = d1:d0 + l1:l0 */
    o = veorq_u8(n, f);        /*       o1:o0 = f1:f0 + n1:n0 */
    return veorq_u8(o, g);     /*             = o1:o0 + g1:00 */
}

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
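/* GCM's bit ordering within each byte is the reverse of the convention used
 * by the polynomial multiplier (GCM treats the most significant bit of the
 * first byte as the coefficient of z^0), so the inputs and the result are
 * passed through vrbitq_u8(), which reverses the bits of each byte. */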
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_ARCH_IS_ARMV8 && __ARM_NEON */

#endif /* MBEDTLS_AESCE_C */