CMSIS-NN: Add MVE scalar versions (#1555)

Adds scalar C versions of the MVE inline-assembly loops in
arm_nn_mat_mul_core_4x_s8 and arm_nn_mat_mul_core_1x_s8, enabled by defining
ARM_MATH_AUTOVECTORIZE; the define is required when building with -O0.
Updates the README accordingly.
diff --git a/CMSIS/NN/README.md b/CMSIS/NN/README.md
index 0d683dc..d21a061 100644
--- a/CMSIS/NN/README.md
+++ b/CMSIS/NN/README.md
@@ -93,6 +93,7 @@
 
 ### Compiler options
 Default optimization level is Ofast. Please change according to project needs. Just bear in mind it will impact performance.
+When building with optimization level -O0, ARM_MATH_AUTOVECTORIZE must be defined.
 
 The compiler option '-fomit-frame-pointer' is enabled by default at -O and higher. With no optimization level you may need to specifiy '-fomit-frame-pointer' as a minimum.
 
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c
index 5524ed8..8d87d79 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c
@@ -21,8 +21,8 @@
  * Title:        arm_nn_mat_mul_core_1x_s8.c
  * Description:  General Matrix-multiplication function
  *
- * $Date:        7 July 2022
- * $Revision:    V.3.0.0
+ * $Date:        22 Aug 2022
+ * $Revision:    V.3.1.0
  *
  * Target Processor:  Cortex-M cores
  * -------------------------------------------------------------------- */
@@ -54,7 +54,7 @@
                                               const int32_t *bias,
                                               int8_t *output)
 {
-#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_MVEI)
     const int8_t *col_base = col_base_ref;
     int32_t *output_mult = quant_params->multiplier;
     int32_t *output_shift = quant_params->shift;
@@ -70,6 +70,14 @@
 
         int32_t sum_tmp = 0;
 
+#if defined(ARM_MATH_AUTOVECTORIZE)
+        for (int j = 0; j < row_elements; j++)
+        {
+            int32_t col = col_base[j];
+            sum_tmp += col;
+            acc_n0 += row_base[j] * col;
+        }
+#else
         __ASM volatile("   vldrb.8         q0, [%[col]], #16     \n"
                        "   wlstp.8         lr, %[cnt], 1f       \n"
                        "2:                                      \n"
@@ -82,6 +90,7 @@
                        : [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(row_base), [out0] "+Te"(acc_n0)
                        : [cnt] "r"(row_elements)
                        : "q0", "q1", "memory", "r14");
+#endif
 
         sum_tmp *= conv_params->input_offset;
         acc_n0 += sum_tmp;
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c
index ff427ad..46c7966 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_mat_mul_core_4x_s8.c
  * Description:  General matrix multiplication function for MVE extension
  *
- * $Date:        19. April 2022
- * $Revision:    V.3.0.1
+ * $Date:        22. Aug 2022
+ * $Revision:    V.3.1.0
  *
  * Target Processor:  Cortex-M processors
  * -------------------------------------------------------------------- */
@@ -70,6 +70,17 @@
         const int8_t *col_base = col_base_ref + i * row_elements;
         int32_t sum_tmp = 0;
 
+#if defined(ARM_MATH_AUTOVECTORIZE)
+        for (int j = 0; j < row_elements; j++)
+        {
+            int32_t col = col_base[j];
+            sum_tmp += col;
+            acc_n0 += ip_row_0[j] * col;
+            acc_n1 += ip_row_1[j] * col;
+            acc_n2 += ip_row_2[j] * col;
+            acc_n3 += ip_row_3[j] * col;
+        }
+#else
         __ASM volatile("   vldrb.8         q0, [%[col]], #16     \n"
                        "   wlstp.8         lr, %[cnt], 1f       \n"
                        "2:                                      \n"
@@ -97,6 +108,7 @@
                          [out3] "+Te"(acc_n3)
                        : [cnt] "r"(row_elements)
                        : "q0", "q1", "q2", "q3", "q4", "memory", "r14");
+#endif
 
         int32x4_t res = {acc_n0, acc_n1, acc_n2, acc_n3};
         sum_tmp *= conv_params->input_offset;