CMSIS-NN: Add MVE scalar versions (#1555)
Adds scalar versions for mat_mul_core_4x_s8/mat_mul_core_1x_s8 under
flag ARM_MATH_AUTOVECTORIZE, which is required with -O0.
Updates README about this.
diff --git a/CMSIS/NN/README.md b/CMSIS/NN/README.md
index 0d683dc..d21a061 100644
--- a/CMSIS/NN/README.md
+++ b/CMSIS/NN/README.md
@@ -93,6 +93,7 @@
### Compiler options
Default optimization level is Ofast. Please change according to project needs. Just bear in mind it will impact performance.
+When compiling with optimization level -O0, ARM_MATH_AUTOVECTORIZE needs to be defined.
The compiler option '-fomit-frame-pointer' is enabled by default at -O and higher. With no optimization level you may need to specifiy '-fomit-frame-pointer' as a minimum.
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c
index 5524ed8..8d87d79 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c
@@ -21,8 +21,8 @@
* Title: arm_nn_mat_mul_core_1x_s8.c
* Description: General Matrix-multiplication function
*
- * $Date: 7 July 2022
- * $Revision: V.3.0.0
+ * $Date: 22 Aug 2022
+ * $Revision: V.3.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
@@ -54,7 +54,7 @@
const int32_t *bias,
int8_t *output)
{
-#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_MVEI)
const int8_t *col_base = col_base_ref;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
@@ -70,6 +70,14 @@
int32_t sum_tmp = 0;
+#if defined(ARM_MATH_AUTOVECTORIZE)
+ for (int j = 0; j < row_elements; j++)
+ {
+ int32_t col = col_base[j];
+ sum_tmp += col;
+ acc_n0 += row_base[j] * col;
+ }
+#else
__ASM volatile(" vldrb.8 q0, [%[col]], #16 \n"
" wlstp.8 lr, %[cnt], 1f \n"
"2: \n"
@@ -82,6 +90,7 @@
: [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(row_base), [out0] "+Te"(acc_n0)
: [cnt] "r"(row_elements)
: "q0", "q1", "memory", "r14");
+#endif
sum_tmp *= conv_params->input_offset;
acc_n0 += sum_tmp;
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c
index ff427ad..46c7966 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -21,8 +21,8 @@
* Title: arm_nn_mat_mul_core_4x_s8.c
* Description: General matrix multiplication function for MVE extension
*
- * $Date: 19. April 2022
- * $Revision: V.3.0.1
+ * $Date: 22. Aug 2022
+ * $Revision: V.3.1.0
*
* Target Processor: Cortex-M processors
* -------------------------------------------------------------------- */
@@ -70,6 +70,17 @@
const int8_t *col_base = col_base_ref + i * row_elements;
int32_t sum_tmp = 0;
+#if defined(ARM_MATH_AUTOVECTORIZE)
+ for (int j = 0; j < row_elements; j++)
+ {
+ int32_t col = col_base[j];
+ sum_tmp += col;
+ acc_n0 += ip_row_0[j] * col;
+ acc_n1 += ip_row_1[j] * col;
+ acc_n2 += ip_row_2[j] * col;
+ acc_n3 += ip_row_3[j] * col;
+ }
+#else
__ASM volatile(" vldrb.8 q0, [%[col]], #16 \n"
" wlstp.8 lr, %[cnt], 1f \n"
"2: \n"
@@ -97,6 +108,7 @@
[out3] "+Te"(acc_n3)
: [cnt] "r"(row_elements)
: "q0", "q1", "q2", "q3", "q4", "memory", "r14");
+#endif
int32x4_t res = {acc_n0, acc_n1, acc_n2, acc_n3};
sum_tmp *= conv_params->input_offset;