CMSIS-NN: Add SVDF state tensor with 8 bit precision (#1461)

diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index 030eb6f..41c6ff2 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -23,6 +23,7 @@
        - Support for int16 average and max pooling for reference implementation
        - Support for elementwise add and mul int16 scalar version
        - Support for softmax int16 scalar version
+       - Support for SVDF with 8 bit state tensor
       CMSIS-RTOS2:
         - RTX 5.5.4 (see revision history for details)
     </release>
@@ -2242,6 +2243,7 @@
         <file category="source" name="CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c"/>
         <file category="source" name="CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c"/>
         <file category="source" name="CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c"/>
+        <file category="source" name="CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c"/>
         <file category="source" name="CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c"/>
         <file category="source" name="CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c"/>
         <file category="source" name="CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c"/>
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 02df6f8..deaade7 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
  * Title:        arm_nnfunctions.h
  * Description:  Public header file for CMSIS NN Library
  *
- * $Date:        7 April 2022
- * $Revision:    V.8.1.2
+ * $Date:        19 April 2022
+ * $Revision:    V.9.0.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -2430,7 +2430,7 @@
  */
 
 /**
- * @brief s8 SVDF function
+ * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights
  *
  * @param[in]   input_ctx Temporary scratch buffer
  * @param[in]   output_ctx Temporary output scratch buffer
@@ -2467,16 +2467,64 @@
                        const cmsis_nn_dims *input_dims,
                        const q7_t *input_data,
                        const cmsis_nn_dims *state_dims,
-                       q15_t *state_data,
+                       q7_t *state_data,
                        const cmsis_nn_dims *weights_feature_dims,
                        const q7_t *weights_feature_data,
                        const cmsis_nn_dims *weights_time_dims,
-                       const q15_t *weights_time_data,
+                       const q7_t *weights_time_data,
                        const cmsis_nn_dims *bias_dims,
                        const q31_t *bias_data,
                        const cmsis_nn_dims *output_dims,
                        q7_t *output_data);
 
+/**
+ * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights
+ *
+ * @param[in]   input_ctx Temporary scratch buffer
+ * @param[in]   output_ctx Temporary output scratch buffer
+ * @param[in]   svdf_params SVDF Parameters
+ *              Range of svdf_params->input_offset  : [-128, 127]
+ *              Range of svdf_params->output_offset  : [-128, 127]
+ * @param[in]   input_quant_params Input quantization parameters
+ * @param[in]   output_quant_params Output quantization parameters
+ * @param[in]   input_dims Input tensor dimensions
+ * @param[in]   input_data Pointer to input tensor
+ * @param[in]   state_dims State tensor dimensions
+ * @param[in,out] state_data Pointer to state tensor. It is shifted by one element and updated in place
+ * @param[in]   weights_feature_dims Weights (feature) tensor dimensions
+ * @param[in]   weights_feature_data Pointer to the weights (feature) tensor
+ * @param[in]   weights_time_dims Weights (time) tensor dimensions
+ * @param[in]   weights_time_data Pointer to the weights (time) tensor
+ * @param[in]   bias_dims Bias tensor dimensions
+ * @param[in]   bias_data Pointer to bias tensor
+ * @param[in]   output_dims Output tensor dimensions
+ * @param[out]  output_data Pointer to the output tensor
+ *
+ * @return     <code>ARM_MATH_SUCCESS</code> on success, <code>ARM_MATH_ARGUMENT_ERROR</code> if a scratch buffer is NULL
+ *
+ * @details
+ *    1. Supported framework: TensorFlow Lite micro
+ *    2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
+ *
+ */
+arm_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
+                                 const cmsis_nn_context *output_ctx,
+                                 const cmsis_nn_svdf_params *svdf_params,
+                                 const cmsis_nn_per_tensor_quant_params *input_quant_params,
+                                 const cmsis_nn_per_tensor_quant_params *output_quant_params,
+                                 const cmsis_nn_dims *input_dims,
+                                 const q7_t *input_data,
+                                 const cmsis_nn_dims *state_dims,
+                                 q15_t *state_data,
+                                 const cmsis_nn_dims *weights_feature_dims,
+                                 const q7_t *weights_feature_data,
+                                 const cmsis_nn_dims *weights_time_dims,
+                                 const q15_t *weights_time_data,
+                                 const cmsis_nn_dims *bias_dims,
+                                 const q31_t *bias_data,
+                                 const cmsis_nn_dims *output_dims,
+                                 q7_t *output_data);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
index b5e968d..4294f26 100644
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -21,9 +21,8 @@
  * Title:        arm_nnsupportfunctions.h
  * Description:  Public header file of support functions for CMSIS NN Library
  *
-
- * $Date:        16. March 2022
- * $Revision:    V.6.2.1
+ * $Date:        16 March 2022
+ * $Revision:    V.7.0.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -388,6 +387,8 @@
  * @param[in]      rhs_rows        Number of rows in the right-hand side input matrix
  * @param[in]      activation_min  Minimum value to clamp the output to. Range: int8
  * @param[in]      activation_max  Maximum value to clamp the output to. Range: int8
+ * @param[in]      address_offset  Memory position offset for dst. First output is stored at 'dst', the
+ *                                 second at 'dst + address_offset' and so on. Default value is typically 1.
  *
  * @return         The function returns <code>ARM_MATH_SUCCESS</code>
  *
@@ -404,7 +405,8 @@
                                     const int32_t rhs_cols,
                                     const int32_t rhs_rows,
                                     const int32_t activation_min,
-                                    const int32_t activation_max);
+                                    const int32_t activation_max,
+                                    const int32_t address_offset);
 
 /**
  * @brief s16 Vector by Matrix (transposed) multiplication
diff --git a/CMSIS/NN/README.md b/CMSIS/NN/README.md
index f0784fe..1efb154 100644
--- a/CMSIS/NN/README.md
+++ b/CMSIS/NN/README.md
@@ -53,6 +53,7 @@
 ||arm_softmax_u8()| SOFTMAX | None | None | No | No | Bit exact to TFLu |
 |[SVDF](https://arm-software.github.io/CMSIS_5/NN/html/group__SVDF.html)||||| |  ||
 ||arm_svdf_s8()| SVDF | None | None | Yes | Yes | Bit exact to TFLu |
+||arm_svdf_state_s16_s8()| SVDF | None | None | Yes | Yes | Bit exact to TFLu |
 |[Misc](https://arm-software.github.io/CMSIS_5/NN/html/group__groupNN.html)||||| |  ||
 ||arm_reshape_s8()| SOFTMAX | None | None | No | No | |
 ||arm_elementwise_add_s8()| ELEMENTWISE ADD | None | None | Yes| Yes| Reshape is not done in this function <br/> Only minor improvements are expected |
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
index dbb0807..9615701 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_s8
  * Description:  Fully connected function compatible with TF Lite.
  *
- * $Date:        19. March 2021
- * $Revision:    V.3.0.0
+ * $Date:        8 April 2022
+ * $Revision:    V.3.1.0
  *
  * Target Processor:  Cortex-M and Cortex-A cores
  *
@@ -79,7 +79,8 @@
                                  filter_dims->n, /* col_dim or accum_depth */
                                  output_dims->c, /* row_dim or output_depth */
                                  fc_params->activation.min,
-                                 fc_params->activation.max);
+                                 fc_params->activation.max,
+                                 1L);
         input += filter_dims->n;
         output += output_dims->c;
         batch_cnt--;
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
index 956fae5..d794819 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2020-2022 Arm Limited or its affiliates.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_vec_mat_mult_t_s8
  * Description:  s8 vector by matrix (transposed) multiplication
  *
- * $Date:        19. August 2021
- * $Revision:    V.2.5.2
+ * $Date:        16 March 2022
+ * $Revision:    V.3.0.0
  *
  * Target Processor:  Cortex-M
  *
@@ -57,11 +57,13 @@
                                     const int32_t rhs_cols,
                                     const int32_t rhs_rows,
                                     const int32_t activation_min,
-                                    const int32_t activation_max)
+                                    const int32_t activation_max,
+                                    const int32_t address_offset)
 {
     (void)rhs_offset;
 #if defined(ARM_MATH_MVEI)
     const int32_t row_loop_cnt = rhs_rows / 3;
+    const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3};
 
     for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
     {
@@ -123,8 +125,9 @@
         acc = vaddq_s32(acc, vdupq_n_s32(dst_offset));
         acc = vmaxq_s32(acc, vdupq_n_s32(activation_min));
         acc = vminq_s32(acc, vdupq_n_s32(activation_max));
-        vstrbq_p_s32(dst, acc, p);
-        dst += 3;
+
+        vstrbq_scatter_offset_s32(dst, address_offset_array, acc);
+        dst += 3 * address_offset;
     }
 
     const int loop_cnt = rhs_rows % 3;
@@ -165,14 +168,12 @@
         // Clamp the result
         acc_0 = MAX(acc_0, activation_min);
         *dst = MIN(acc_0, activation_max);
-        dst++;
+        dst += address_offset;
     }
 
 #elif defined(ARM_MATH_DSP)
     const int32_t row_loop_cnt = rhs_rows / 2;
-
     const int16_t lhs_offset_s16 = (int16_t)lhs_offset;
-
     const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16);
 
     for (int32_t i = 0; i < row_loop_cnt; i++)
@@ -235,9 +236,9 @@
         acc_0 = MIN(acc_0, activation_max);
         acc_1 = MAX(acc_1, activation_min);
         acc_1 = MIN(acc_1, activation_max);
-
-        *dst++ = (q7_t)acc_0;
-        *dst++ = (q7_t)acc_1;
+        *dst = (int8_t)acc_0;
+        *(dst + address_offset) = (int8_t)acc_1;
+        dst += 2 * address_offset;
     }
 
     if (rhs_rows & 0x1)
@@ -281,8 +282,8 @@
         // Clamp the result
         acc_0 = MAX(acc_0, activation_min);
         acc_0 = MIN(acc_0, activation_max);
-
-        *dst++ = (q7_t)acc_0;
+        *dst = (int8_t)acc_0;
+        dst += address_offset;
     }
 
 #else
@@ -339,9 +340,10 @@
         res02 = MAX(res02, activation_min);
         res02 = MIN(res02, activation_max);
 
-        *dst++ = (q7_t)res00;
-        *dst++ = (q7_t)res01;
-        *dst++ = (q7_t)res02;
+        *dst = (q7_t)res00;
+        *(dst + address_offset) = (q7_t)res01;
+        *(dst + 2 * address_offset) = (q7_t)res02;
+        dst += 3 * address_offset;
 
         rhs += 3 * rhs_cols;
     }
@@ -380,7 +382,8 @@
         res00 = MAX(res00, activation_min);
         res00 = MIN(res00, activation_max);
 
-        *dst++ = (q7_t)res00;
+        *dst = (int8_t)res00;
+        dst += address_offset;
         rhs += rhs_cols;
     }
 #endif
diff --git a/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c b/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c
index 7108d19..d5b61e7 100644
--- a/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c
+++ b/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates.
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_svdf_s8.c
  * Description:  S8 basic SVDF layer function
  *
- * $Date:        17. August 2021
- * $Revision:    V.1.5.1
+ * $Date:        16 March 2022
+ * $Revision:    V.3.0.0
  *
  * Target Processor:  Cortex-M processors
  *
@@ -41,7 +41,7 @@
  */
 
 /*
- * S8 SVDF layer function for TensorFlow Lite
+ * S8 SVDF layer function for TensorFlow Lite with 8 bit state tensor
  *
  * Refer to header file for details.
  *
@@ -55,11 +55,11 @@
                        const cmsis_nn_dims *input_dims,
                        const q7_t *input_data,
                        const cmsis_nn_dims *state_dims,
-                       q15_t *state_data,
+                       q7_t *state_data,
                        const cmsis_nn_dims *weights_feature_dims,
                        const q7_t *weights_feature_data,
                        const cmsis_nn_dims *weights_time_dims,
-                       const q15_t *weights_time_data,
+                       const q7_t *weights_time_data,
                        const cmsis_nn_dims *bias_dims,
                        const q31_t *bias_data,
                        const cmsis_nn_dims *output_dims,
@@ -99,28 +99,30 @@
     }
     q31_t *buffer_b = (q31_t *)output_ctx->buf;
 
-    memmove((q15_t *)state_data,
-            (q15_t *)state_data + 1,
-            (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int16_t)));
+    memmove((int8_t *)state_data,
+            (int8_t *)state_data + 1,
+            (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int8_t)));
 
     for (int i_batch = 0; i_batch < input_batches; i_batch++)
     {
-        q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
+        q7_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
         const q7_t *weight = weights_feature_data;
         const q7_t *input = input_data + i_batch * input_height;
 
-        arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input,
-                                                       weight,
-                                                       res_ptr,
-                                                       -zp_in,
-                                                       0,
-                                                       time_batches,
-                                                       multiplier_in,
-                                                       shift_in,
-                                                       input_height,
-                                                       feature_batches,
-                                                       in_activation_min,
-                                                       in_activation_max);
+        arm_status res = arm_nn_vec_mat_mult_t_s8(input,
+                                                  weight,
+                                                  NULL,
+                                                  res_ptr,
+                                                  -zp_in,
+                                                  0,
+                                                  0,
+                                                  multiplier_in,
+                                                  shift_in,
+                                                  input_height,
+                                                  feature_batches,
+                                                  in_activation_min,
+                                                  in_activation_max,
+                                                  time_batches);
 
         if (res != ARM_MATH_SUCCESS)
         {
@@ -130,10 +132,10 @@
 
     {
         q31_t *ptr_a = buffer_a;
-        const q15_t *v2 = state_data;
+        const int8_t *v2 = state_data;
         for (int i_batch = 0; i_batch < input_batches; i_batch++)
         {
-            const q15_t *v1 = weights_time_data;
+            const int8_t *v1 = weights_time_data;
 
             for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
             {
@@ -145,8 +147,12 @@
                 for (int i = 0; i < block_count; i++)
                 {
                     j += 2;
-                    q31_t r1 = arm_nn_read_q15x2_ia(&v1);
-                    q31_t r2 = arm_nn_read_q15x2_ia(&v2);
+
+                    q31_t r1 = arm_nn_read_q7x4_ia(&v1);
+                    r1 = __SXTB16(r1);
+
+                    q31_t r2 = arm_nn_read_q7x4_ia(&v2);
+                    r2 = __SXTB16(r2);
 
                     sum = __SMLAD(r1, r2, sum);
                 }
diff --git a/CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c b/CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c
new file mode 100644
index 0000000..49c7149
--- /dev/null
+++ b/CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2022 Arm Limited or its affiliates.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_svdf_state_s16_s8.c
+ * Description:  S8 basic SVDF layer function with s16 state tensor
+ *
+ * $Date:        8 April 2022
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M processors
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup SVDF
+ * @{
+ */
+
+/*
+ * S8 SVDF layer function for TensorFlow Lite with 16 bit state tensor.
+ * Steps: shift the state by one element, call arm_nn_vec_mat_mult_t_svdf_s8 with a pointer into
+ * the state, accumulate state * time weights into input_ctx->buf, add bias / reduce over rank
+ * into output_ctx->buf, then requantize and clamp into output_data. Refer to header file for details.
+ */
+
+arm_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
+                                 const cmsis_nn_context *output_ctx,
+                                 const cmsis_nn_svdf_params *svdf_params,
+                                 const cmsis_nn_per_tensor_quant_params *input_quant_params,
+                                 const cmsis_nn_per_tensor_quant_params *output_quant_params,
+                                 const cmsis_nn_dims *input_dims,
+                                 const q7_t *input_data,
+                                 const cmsis_nn_dims *state_dims,
+                                 q15_t *state_data,
+                                 const cmsis_nn_dims *weights_feature_dims,
+                                 const q7_t *weights_feature_data,
+                                 const cmsis_nn_dims *weights_time_dims,
+                                 const q15_t *weights_time_data,
+                                 const cmsis_nn_dims *bias_dims,
+                                 const q31_t *bias_data,
+                                 const cmsis_nn_dims *output_dims,
+                                 q7_t *output_data)
+{
+    (void)bias_dims;   // not read by this function
+    (void)state_dims;  // not read by this function
+    (void)output_dims; // not read by this function
+
+    const q31_t multiplier_in = input_quant_params->multiplier;
+    const q31_t shift_in = input_quant_params->shift;
+    const q31_t multiplier_out = output_quant_params->multiplier;
+    const q31_t shift_2 = output_quant_params->shift; // shift used for the final output requantization
+    const int32_t zp_in = svdf_params->input_offset;
+    const int32_t zp_out = svdf_params->output_offset;
+    const int32_t in_activation_min = svdf_params->input_activation.min;
+    const int32_t in_activation_max = svdf_params->input_activation.max;
+    const int32_t out_activation_min = svdf_params->output_activation.min;
+    const int32_t out_activation_max = svdf_params->output_activation.max;
+    const int16_t rank = svdf_params->rank;
+
+    const int32_t input_batches = input_dims->n;
+    const int32_t input_height = input_dims->h;
+    const int32_t feature_batches = weights_feature_dims->n;
+    const int32_t time_batches = weights_time_dims->h;
+    const int32_t unit_count = feature_batches / rank; // NOTE(review): rank is not checked for zero - confirm callers guarantee rank >= 1
+
+    if (input_ctx->buf == NULL) // scratch buffer is mandatory
+    {
+        return ARM_MATH_ARGUMENT_ERROR;
+    }
+    q31_t *buffer_a = (q31_t *)input_ctx->buf; // receives one 32 bit sum per (batch, feature)
+
+    if (output_ctx->buf == NULL) // scratch buffer is mandatory
+    {
+        return ARM_MATH_ARGUMENT_ERROR;
+    }
+    q31_t *buffer_b = (q31_t *)output_ctx->buf; // receives the bias-added / rank-reduced sums
+
+    memmove((q15_t *)state_data, // shift the whole state tensor one element towards lower addresses
+            (q15_t *)state_data + 1,
+            (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int16_t)));
+
+    for (int i_batch = 0; i_batch < input_batches; i_batch++)
+    {
+        q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); // index (time_batches - 1) within this batch's state
+        const q7_t *weight = weights_feature_data;
+        const q7_t *input = input_data + i_batch * input_height;
+
+        arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input, // NOTE(review): helper defined elsewhere; presumably strides dst by time_batches - confirm
+                                                       weight,
+                                                       res_ptr,
+                                                       -zp_in,
+                                                       0,
+                                                       time_batches,
+                                                       multiplier_in,
+                                                       shift_in,
+                                                       input_height,
+                                                       feature_batches,
+                                                       in_activation_min,
+                                                       in_activation_max);
+
+        if (res != ARM_MATH_SUCCESS) // propagate the helper's error status unchanged
+        {
+            return res;
+        }
+    }
+
+    {
+        q31_t *ptr_a = buffer_a;
+        const q15_t *v2 = state_data; // advances sequentially through the state across all batches
+        for (int i_batch = 0; i_batch < input_batches; i_batch++)
+        {
+            const q15_t *v1 = weights_time_data; // time weights restart from the beginning for every batch
+
+            for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
+            {
+                *ptr_a = 0; // overwritten by the final sum below
+                int32_t sum = 0;
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+                int j = 0; // number of elements already consumed by the paired loop
+                int32_t block_count = time_batches >> 1; // two q15 elements are handled per iteration
+                for (int i = 0; i < block_count; i++)
+                {
+                    j += 2;
+                    q31_t r1 = arm_nn_read_q15x2_ia(&v1);
+                    q31_t r2 = arm_nn_read_q15x2_ia(&v2);
+
+                    sum = __SMLAD(r1, r2, sum); // dual 16 bit multiply, both products added to sum
+                }
+
+                // Process the trailing element left over when time_batches is odd
+                for (; j < time_batches; j++)
+                {
+                    sum += *v1 * *v2;
+                    v1++;
+                    v2++;
+                }
+#else
+                for (int j = 0; j < time_batches; j++) // plain scalar dot product over the time dimension
+                {
+                    sum += *v1 * *v2;
+                    v1++;
+                    v2++;
+                }
+#endif
+
+                *ptr_a = sum;
+                ptr_a++;
+            }
+        }
+    }
+
+    if (bias_data) // bias is optional; NULL selects the bias-free branch below
+    {
+        if (unit_count == feature_batches) // one feature per unit: plain bias add, no reduction
+        {
+            for (int i = 0; i < input_batches; i++)
+            {
+                q31_t *output_temp = buffer_b + i * feature_batches;
+                const q31_t *ptr_a = buffer_a + i * feature_batches;
+
+                const int32_t *bi = bias_data;
+                for (int j = 0; j < feature_batches; j++)
+                {
+                    output_temp[j] = ptr_a[j] + bi[j];
+                }
+            }
+        }
+        else // sum 'rank' consecutive entries per unit, seeded with that unit's bias
+        {
+            for (int i_batch = 0; i_batch < input_batches; i_batch++)
+            {
+                q31_t *output_data_temp = buffer_b + i_batch * unit_count;
+                q31_t *ptr_a = buffer_a + i_batch * feature_batches;
+
+                for (int i = 0; i < unit_count; i++)
+                {
+                    int32_t sum = bias_data[i];
+                    for (int j = 0; j < rank; j++)
+                    {
+                        sum += *ptr_a;
+                        ptr_a++;
+                    }
+                    output_data_temp[i] = sum;
+                }
+            }
+        }
+    }
+    else // no bias: same reduction over rank, starting from zero
+    {
+        for (int i_batch = 0; i_batch < input_batches; i_batch++)
+        {
+            q31_t *output_data_temp = buffer_b + i_batch * unit_count;
+            q31_t *ptr_a = buffer_a + i_batch * feature_batches;
+
+            for (int i = 0; i < unit_count; i++)
+            {
+                int32_t sum = 0;
+                for (int j = 0; j < rank; j++)
+                {
+                    sum += *ptr_a;
+                    ptr_a++;
+                }
+                output_data_temp[i] = sum;
+            }
+        }
+    }
+
+#if defined(ARM_MATH_MVEI)
+    int32_t num_elements = input_batches * unit_count;
+    const int32_t loop_count = (num_elements + 3) / 4; // four 32 bit lanes per iteration, rounded up
+    for (int i_op = 0; i_op < loop_count; i_op++)
+    {
+        mve_pred16_t p = vctp32q((uint32_t)num_elements); // tail predicate: enables only the lanes still remaining
+        int32x4_t op = vldrwq_z_s32(buffer_b, p);
+        op = arm_requantize_mve(op, multiplier_out, shift_2);
+        op = vaddq_n_s32(op, zp_out);
+        const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min);
+        const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max);
+        op = vmaxq_s32(op, min_vec);
+        op = vminq_s32(op, max_vec);
+        vstrbq_p_s32(output_data, op, p); // predicated store of each enabled lane as one byte
+        output_data += 4;
+        buffer_b += 4;
+        num_elements -= 4;
+    }
+#else
+    for (int i = 0; i < input_batches * unit_count; i++) // requantize, add output offset, clamp, narrow to q7
+    {
+        output_data[i] = (q7_t)CLAMP(
+            arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min); // NOTE(review): CLAMP is called as (x, max, min) - confirm macro parameter order
+    }
+#endif
+
+    return (ARM_MATH_SUCCESS);
+}
+
+/**
+ * @} end of SVDF group
+ */
diff --git a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
index 0cf59c6..74cf218 100644
--- a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
+++ b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
@@ -87,7 +87,7 @@
 add_subdirectory(TestCases/test_arm_softmax_s8)
 add_subdirectory(TestCases/test_arm_softmax_s8_s16)
 add_subdirectory(TestCases/test_arm_softmax_s16)
-add_subdirectory(TestCases/test_arm_svdf_s8)
+add_subdirectory(TestCases/test_arm_svdf_state_s16_s8)
 
 set(MAKE_CMD "python3")
 set(MAKE_CMD_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/unittest_targets.py")
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/CMakeLists.txt
similarity index 67%
rename from CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/CMakeLists.txt
rename to CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/CMakeLists.txt
index 2d20098..a7e5e82 100644
--- a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/CMakeLists.txt
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2010-2022 Arm Limited or its affiliates.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -16,8 +16,8 @@
 # limitations under the License.
 #
 
-add_cmsis_nn_unit_test_executable(test_arm_svdf_s8)
+add_cmsis_nn_unit_test_executable(test_arm_svdf_state_s16_s8)
 
-target_sources(test_arm_svdf_s8 PRIVATE
-    Unity/unity_test_arm_svdf_s8.c
-    Unity/TestRunner/unity_test_arm_svdf_s8_runner.c)
+target_sources(test_arm_svdf_state_s16_s8 PRIVATE
+    Unity/unity_test_arm_svdf_state_s16_s8.c
+    Unity/TestRunner/unity_test_arm_svdf_state_s16_s8_runner.c)
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/Unity/unity_test_arm_svdf_state_s16_s8.c
similarity index 73%
rename from CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c
rename to CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/Unity/unity_test_arm_svdf_state_s16_s8.c
index 8c10a8b..d666a56 100644
--- a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/Unity/unity_test_arm_svdf_state_s16_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -22,7 +22,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "../test_arm_svdf_s8.c"
+#include "../test_arm_svdf_state_s16_s8.c"
 #include "unity.h"
 
 #ifdef USING_FVP_CORSTONE_300
@@ -44,10 +44,10 @@
  */
 void tearDown(void) {}
 
-void test_svdf_arm_svdf_s8(void) { svdf_arm_svdf_s8(); }
+void test_svdf_arm_state_s16_s8(void) { svdf_arm_svdf_state_s16_s8(); }
 
-void test_svdf_1_arm_svdf_s8(void) { svdf_1_arm_svdf_s8(); }
+void test_svdf_1_arm_state_s16_s8(void) { svdf_1_arm_svdf_state_s16_s8(); }
 
-void test_svdf_2_arm_svdf_s8(void) { svdf_2_arm_svdf_s8(); }
+void test_svdf_2_arm_state_s16_s8(void) { svdf_2_arm_svdf_state_s16_s8(); }
 
-void test_svdf_3_arm_svdf_s8(void) { svdf_3_arm_svdf_s8(); }
+void test_svdf_3_arm_state_s16_s8(void) { svdf_3_arm_svdf_state_s16_s8(); }
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/test_arm_svdf_state_s16_s8.c
similarity index 70%
rename from CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c
rename to CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/test_arm_svdf_state_s16_s8.c
index 41a5f78..5db3eb1 100644
--- a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/test_arm_svdf_state_s16_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -41,7 +41,7 @@
     return null_bias;
 }
 
-void svdf_arm_svdf_s8(void)
+void svdf_arm_svdf_state_s16_s8(void)
 {
     const arm_status expected = ARM_MATH_SUCCESS;
     cmsis_nn_context input_ctx;
@@ -99,23 +99,23 @@
         for (int j = 0; j < number_inputs; j++)
         {
             memcpy(input_data, svdf_input_sequence + j * input_round_size, input_round_size);
-            arm_status result = arm_svdf_s8(&input_ctx,
-                                            &output_ctx,
-                                            &svdf_params,
-                                            &input_quant_params,
-                                            &output_quant_params,
-                                            &input_dims,
-                                            input_data,
-                                            &state_dims,
-                                            state_data,
-                                            &weights_feature_dims,
-                                            weights_feature_data,
-                                            &weights_time_dims,
-                                            weights_time_data,
-                                            &bias_dims,
-                                            null_bias == true ? NULL : svdf_biases,
-                                            &output_dims,
-                                            output_data);
+            arm_status result = arm_svdf_state_s16_s8(&input_ctx,
+                                                      &output_ctx,
+                                                      &svdf_params,
+                                                      &input_quant_params,
+                                                      &output_quant_params,
+                                                      &input_dims,
+                                                      input_data,
+                                                      &state_dims,
+                                                      state_data,
+                                                      &weights_feature_dims,
+                                                      weights_feature_data,
+                                                      &weights_time_dims,
+                                                      weights_time_data,
+                                                      &bias_dims,
+                                                      null_bias == true ? NULL : svdf_biases,
+                                                      &output_dims,
+                                                      output_data);
             TEST_ASSERT_EQUAL(expected, result);
         }
 
@@ -127,7 +127,7 @@
     free(output_ctx.buf);
 }
 
-void svdf_1_arm_svdf_s8(void)
+void svdf_1_arm_svdf_state_s16_s8(void)
 {
     const arm_status expected = ARM_MATH_SUCCESS;
     cmsis_nn_context input_ctx;
@@ -184,23 +184,23 @@
         for (int j = 0; j < number_inputs; j++)
         {
             memcpy(input_data, svdf_1_input_sequence + j * input_round_size, input_round_size);
-            arm_status result = arm_svdf_s8(&input_ctx,
-                                            &output_ctx,
-                                            &svdf_1_params,
-                                            &input_quant_params,
-                                            &output_quant_params,
-                                            &input_dims,
-                                            input_data,
-                                            &state_dims,
-                                            state_data,
-                                            &weights_feature_dims,
-                                            weights_feature_data,
-                                            &weights_time_dims,
-                                            weights_time_data,
-                                            &bias_dims,
-                                            null_bias == true ? NULL : svdf_1_biases,
-                                            &output_dims,
-                                            output_data);
+            arm_status result = arm_svdf_state_s16_s8(&input_ctx,
+                                                      &output_ctx,
+                                                      &svdf_1_params,
+                                                      &input_quant_params,
+                                                      &output_quant_params,
+                                                      &input_dims,
+                                                      input_data,
+                                                      &state_dims,
+                                                      state_data,
+                                                      &weights_feature_dims,
+                                                      weights_feature_data,
+                                                      &weights_time_dims,
+                                                      weights_time_data,
+                                                      &bias_dims,
+                                                      null_bias == true ? NULL : svdf_1_biases,
+                                                      &output_dims,
+                                                      output_data);
             TEST_ASSERT_EQUAL(expected, result);
         }
 
@@ -212,7 +212,7 @@
     free(output_ctx.buf);
 }
 
-void svdf_2_arm_svdf_s8(void)
+void svdf_2_arm_svdf_state_s16_s8(void)
 {
     const arm_status expected = ARM_MATH_SUCCESS;
     cmsis_nn_context input_ctx;
@@ -269,23 +269,23 @@
         for (int j = 0; j < number_inputs; j++)
         {
             memcpy(input_data, svdf_2_input_sequence + j * input_round_size, input_round_size);
-            arm_status result = arm_svdf_s8(&input_ctx,
-                                            &output_ctx,
-                                            &svdf_2_params,
-                                            &input_quant_params,
-                                            &output_quant_params,
-                                            &input_dims,
-                                            input_data,
-                                            &state_dims,
-                                            state_data,
-                                            &weights_feature_dims,
-                                            weights_feature_data,
-                                            &weights_time_dims,
-                                            weights_time_data,
-                                            &bias_dims,
-                                            null_bias == true ? NULL : svdf_2_biases,
-                                            &output_dims,
-                                            output_data);
+            arm_status result = arm_svdf_state_s16_s8(&input_ctx,
+                                                      &output_ctx,
+                                                      &svdf_2_params,
+                                                      &input_quant_params,
+                                                      &output_quant_params,
+                                                      &input_dims,
+                                                      input_data,
+                                                      &state_dims,
+                                                      state_data,
+                                                      &weights_feature_dims,
+                                                      weights_feature_data,
+                                                      &weights_time_dims,
+                                                      weights_time_data,
+                                                      &bias_dims,
+                                                      null_bias == true ? NULL : svdf_2_biases,
+                                                      &output_dims,
+                                                      output_data);
             TEST_ASSERT_EQUAL(expected, result);
         }
 
@@ -297,7 +297,7 @@
     free(output_ctx.buf);
 }
 
-void svdf_3_arm_svdf_s8(void)
+void svdf_3_arm_svdf_state_s16_s8(void)
 {
     const arm_status expected = ARM_MATH_SUCCESS;
     cmsis_nn_context input_ctx;
@@ -354,23 +354,23 @@
         for (int j = 0; j < number_inputs; j++)
         {
             memcpy(input_data, svdf_3_input_sequence + j * input_round_size, input_round_size);
-            arm_status result = arm_svdf_s8(&input_ctx,
-                                            &output_ctx,
-                                            &svdf_3_params,
-                                            &input_quant_params,
-                                            &output_quant_params,
-                                            &input_dims,
-                                            input_data,
-                                            &state_dims,
-                                            state_data,
-                                            &weights_feature_dims,
-                                            weights_feature_data,
-                                            &weights_time_dims,
-                                            weights_time_data,
-                                            &bias_dims,
-                                            null_bias == true ? NULL : svdf_3_biases,
-                                            &output_dims,
-                                            output_data);
+            arm_status result = arm_svdf_state_s16_s8(&input_ctx,
+                                                      &output_ctx,
+                                                      &svdf_3_params,
+                                                      &input_quant_params,
+                                                      &output_quant_params,
+                                                      &input_dims,
+                                                      input_data,
+                                                      &state_dims,
+                                                      state_data,
+                                                      &weights_feature_dims,
+                                                      weights_feature_data,
+                                                      &weights_time_dims,
+                                                      weights_time_data,
+                                                      &bias_dims,
+                                                      null_bias == true ? NULL : svdf_3_biases,
+                                                      &output_dims,
+                                                      output_data);
             TEST_ASSERT_EQUAL(expected, result);
         }