/*
 * Armv8-A Cryptographic Extension support functions for Aarch64
 *
 * Copyright The Mbed TLS Contributors
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line
 * (for example, -march=armv8-a+crypto).
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` could be included by any header file, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_ARCH_IS_ARMV8_A) && defined(__ARM_NEON)

/* Compiler version checks. */
#if defined(__clang__)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
#   endif
#elif defined(__GNUC__)
#   if __GNUC__ < 6
#       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#   endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verifies that,
 * please update this check and the documentation of `MBEDTLS_AESCE_C` in
 * `mbedtls_config.h`. */
#   if _MSC_VER < 1929
#       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#   endif
#elif defined(__ARMCC_VERSION)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verifies that, please update this check and the documentation of
 * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#   endif
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>
#else
#error "Target does not support NEON instructions"
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__ARMCOMPILER_VERSION)
#       if __ARMCOMPILER_VERSION <= 6090000
#           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#       else
#           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#           define MBEDTLS_POP_TARGET_PRAGMA
#       endif
#   elif defined(__clang__)
#       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       pragma GCC push_options
#       pragma GCC target ("+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(_MSC_VER)
#       error "Required feature (__ARM_FEATURE_AES) is not enabled."
#   endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <sys/auxv.h>
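/* Fallback definitions of the Linux hwcap bits, in case the libc headers in
 * use do not provide them. The values are intended to match the kernel's
 * uapi hwcap definitions: HWCAP_NEON/HWCAP2_AES for 32-bit Arm and
 * HWCAP_AES/HWCAP_ASIMD for AArch64. */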
#if !defined(HWCAP_NEON)
#define HWCAP_NEON (1 << 12)
#endif
#if !defined(HWCAP2_AES)
#define HWCAP2_AES (1 << 0)
#endif
#if !defined(HWCAP_AES)
#define HWCAP_AES (1 << 3)
#endif
#if !defined(HWCAP_ASIMD)
#define HWCAP_ASIMD (1 << 1)
#endif

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be vulnerable to non-atomic updates.
     * It is possible that we could end up setting the result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
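/* Note: vaeseq_u8() maps to the AESE instruction, which performs AddRoundKey,
 * SubBytes and ShiftRows in a single step, and vaesmcq_u8() maps to AESMC
 * (MixColumns), so each AESCE_ENCRYPT_ROUND above is one full AES encryption
 * round using the next 16 bytes of round key. */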

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in an atomic instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately, which has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND

static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
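/* Note: aesce_decrypt_block() above expects the decryption key schedule
 * produced by mbedtls_aesce_inverse_key() below, i.e. the round keys in
 * reverse order, with inverse MixColumns already applied to the
 * intermediate ones. */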

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

    if (mode == MBEDTLS_AES_ENCRYPT) {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    } else {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}
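/*
 * Illustrative sketch only, not part of this file's API surface: in Mbed TLS
 * the dispatch to this function is expected to happen in aes.c, but a direct
 * call (on a Linux target where mbedtls_aesce_has_support_impl() above is
 * compiled in, with `ctx` and `in` already set up by the AES key-schedule
 * code) would look roughly like:
 *
 *     unsigned char out[16];
 *     if (mbedtls_aesce_has_support_impl()) {
 *         mbedtls_aesce_crypt_ecb(&ctx, MBEDTLS_AES_ENCRYPT, in, out);
 *     }
 */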

/*
 * Compute decryption round keys from encryption round keys
 */
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    /* The last encryption round key becomes the first decryption round key. */
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    /* The intermediate round keys are reversed and get inverse MixColumns
     * applied, matching the order of operations in aesce_decrypt_block(). */
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    /* The first encryption round key becomes the last decryption round key. */
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
}

/* RotWord from the AES key schedule (FIPS 197, Section 5.2): rotate the bytes
 * of a round-key word by one position. */
static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

/* SubWord from the AES key schedule: apply the AES S-box to each byte. */
static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
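    /* For example, AES-128 has Nk = 4, so Nr = 10 and Nb*(Nr+1) = 44 round-key
     * words (176 bytes); AES-256 has Nk = 8, Nr = 14 and 60 words (240 bytes). */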
    const uint32_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                /* For 256-bit keys, an extra SubWord is applied in the middle
                 * of each 8-word stretch (FIPS 197, Section 5.2). */
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if defined(MBEDTLS_ARCH_IS_ARM32)

#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types which are listed in
 * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1)
 * These are only required for GCM.
 */
#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

/* This is set to cause some more missing intrinsics to be defined below */
#define COMMON_MISSING_INTRINSICS

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm  %[p], { r2-r5 } \n\t"
         "rbit r2, r2          \n\t"
         "rev  r2, r2          \n\t"
         "rbit r3, r3          \n\t"
         "rev  r3, r3          \n\t"
         "rbit r4, r4          \n\t"
         "rev  r4, r4          \n\t"
         "rbit r5, r5          \n\t"
         "rev  r5, r5          \n\t"
         "stm  %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define COMMON_MISSING_INTRINSICS
#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */

#if defined(COMMON_MISSING_INTRINSICS)

/* Missing intrinsics common to both GCC 5 and Clang on 32-bit Arm */

#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    return (poly64x1_t) r;
}

#endif /* COMMON_MISSING_INTRINSICS */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports an incompatible type error without the cast: it treats poly64_t
 * and poly64x1_t as different types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with the cast, while Clang accepts
 * the call with or without it, so no cast is used here. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif /* MBEDTLS_COMPILER_IS_GCC */

static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            vget_low_p64(vreinterpretq_p64_u8(a)),
            vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128b polynomial multiplication on blocks in GF(2^128) defined by
 * `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64b->128b polynomial multipliers, so we need four 64b
 * multiplies to compute the full 256b product.
 *
 * `poly_mult_128` executes the polynomial multiplication and outputs the 256b
 * result represented by three 128b values, for code-size optimization.
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high 128b   |
 * | ret.val[1] |  :m2:m1:00  | middle 128b |
 * | ret.val[2] |  : :l1:l0   | low 128b    |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*  : :l1:l0   = a0*b0 */
    c = vextq_u8(b, b, 8); /*      :c1:c0 = b0:b1 */
    d = pmull_high(a, c);  /*  :d2:d1:00  = a1*b0 */
    e = pmull_low(a, c);   /*  :e2:e1:00  = a0*b1 */
    m = veorq_u8(d, e);    /*  :m2:m1:00  = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as
 * f(z) = z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach
 * is to consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write the
 * 256-bit operand to be reduced as a(z) = h(z)z^128 + l(z) ≡ h(z)r(z) + l(z).
 * That is, we simply multiply the higher part of the operand by r(z) and add it
 * to l(z). If the result is still larger than 128 bits, we reduce again.
 */
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent the compiler from
     * loading MODULO from memory; this path is for GNUC-compatible
     * compilers. */
    asm volatile ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00                      */
    m = input.val[1];          /*  :m2:m1:00                       */
    l = input.val[2];          /*  : :l1:l0                        */
    c = pmull_high(h, MODULO); /*  :c2:c1:00 = reduction of h3     */
    d = pmull_low(h, MODULO);  /*  : :d1:d0  = reduction of h2     */
    e = veorq_u8(c, m);        /*  :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*  : :f1:f0  = reduction of e2     */
    g = vextq_u8(ZERO, e, 8);  /*  : :g1:00  = e1:00               */
    n = veorq_u8(d, l);        /*  : :n1:n0  = d1:d0 + l1:l0       */
    o = veorq_u8(n, f);        /*     o1:o0  = f1:f0 + n1:n0       */
    return veorq_u8(o, g);     /*            = o1:o0 + g1:00       */
}

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}
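/*
 * Illustrative sketch only (the variable names are assumptions, not library
 * API): in GCM the hash key is H = E_K(0^128), and each GHASH step folds a
 * block X into the accumulator Y as Y = (Y xor X) * H. One such step using
 * the function above could look roughly like this, with y, h and x
 * initialised elsewhere:
 *
 *     unsigned char y[16], h[16], x[16];
 *     for (size_t i = 0; i < 16; i++) {
 *         y[i] ^= x[i];
 *     }
 *     mbedtls_aesce_gcm_mult(y, y, h);
 */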

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_ARCH_IS_ARMV8_A */

#endif /* MBEDTLS_AESCE_C */