CMSIS-NN: Add SVDF state tensor with 8 bit precision (#1461)
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index 030eb6f..41c6ff2 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -23,6 +23,7 @@
- Support for int16 average and max pooling for reference implementation
- Support for elementwise add and mul int16 scalar version
- Support for softmax int16 scalar version
+ - Support for SVDF with 8 bit state tensor
CMSIS-RTOS2:
- RTX 5.5.4 (see revision history for details)
</release>
@@ -2242,6 +2243,7 @@
<file category="source" name="CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c"/>
<file category="source" name="CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c"/>
<file category="source" name="CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c"/>
+ <file category="source" name="CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c"/>
<file category="source" name="CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c"/>
<file category="source" name="CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c"/>
<file category="source" name="CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c"/>
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 02df6f8..deaade7 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
- * $Date: 7 April 2022
- * $Revision: V.8.1.2
+ * $Date: 19 April 2022
+ * $Revision: V.9.0.0
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
@@ -2430,7 +2430,7 @@
*/
/**
- * @brief s8 SVDF function
+ * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights
*
* @param[in] input_ctx Temporary scratch buffer
* @param[in] output_ctx Temporary output scratch buffer
@@ -2467,16 +2467,64 @@
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *state_dims,
- q15_t *state_data,
+ q7_t *state_data,
const cmsis_nn_dims *weights_feature_dims,
const q7_t *weights_feature_data,
const cmsis_nn_dims *weights_time_dims,
- const q15_t *weights_time_data,
+ const q7_t *weights_time_data,
const cmsis_nn_dims *bias_dims,
const q31_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
+/**
+ * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights
+ *
+ * @param[in] input_ctx Temporary scratch buffer
+ * @param[in] output_ctx Temporary output scratch buffer
+ * @param[in] svdf_params SVDF Parameters
+ * Range of svdf_params->input_offset : [-128, 127]
+ * Range of svdf_params->output_offset : [-128, 127]
+ * @param[in] input_quant_params Input quantization parameters
+ * @param[in] output_quant_params Output quantization parameters
+ * @param[in] input_dims Input tensor dimensions
+ * @param[in] input_data Pointer to input tensor
+ * @param[in] state_dims State tensor dimensions
+ * @param[in,out] state_data Pointer to state tensor. Updated in place on every call.
+ * @param[in] weights_feature_dims Weights (feature) tensor dimensions
+ * @param[in] weights_feature_data Pointer to the weights (feature) tensor
+ * @param[in] weights_time_dims Weights (time) tensor dimensions
+ * @param[in] weights_time_data Pointer to the weights (time) tensor
+ * @param[in] bias_dims Bias tensor dimensions
+ * @param[in] bias_data Pointer to bias tensor
+ * @param[in] output_dims Output tensor dimensions
+ * @param[out] output_data Pointer to the output tensor
+ *
+ * @return The function returns <code>ARM_MATH_SUCCESS</code>
+ *
+ * @details
+ * 1. Supported framework: TensorFlow Lite micro
+ * 2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
+ *
+ */
+arm_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
+ const cmsis_nn_context *output_ctx,
+ const cmsis_nn_svdf_params *svdf_params,
+ const cmsis_nn_per_tensor_quant_params *input_quant_params,
+ const cmsis_nn_per_tensor_quant_params *output_quant_params,
+ const cmsis_nn_dims *input_dims,
+ const q7_t *input_data,
+ const cmsis_nn_dims *state_dims,
+ q15_t *state_data,
+ const cmsis_nn_dims *weights_feature_dims,
+ const q7_t *weights_feature_data,
+ const cmsis_nn_dims *weights_time_dims,
+ const q15_t *weights_time_data,
+ const cmsis_nn_dims *bias_dims,
+ const q31_t *bias_data,
+ const cmsis_nn_dims *output_dims,
+ q7_t *output_data);
+
#ifdef __cplusplus
}
#endif
diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
index b5e968d..4294f26 100644
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -21,9 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
-
- * $Date: 16. March 2022
- * $Revision: V.6.2.1
+ * $Date: 16 March 2022
+ * $Revision: V.7.0.0
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
@@ -388,6 +387,8 @@
* @param[in] rhs_rows Number of rows in the right-hand side input matrix
* @param[in] activation_min Minimum value to clamp the output to. Range: int8
* @param[in] activation_max Maximum value to clamp the output to. Range: int8
+ * @param[in] address_offset Memory position offset for dst. First output is stored at 'dst', the
+ * second at 'dst + address_offset' and so on. Default value is typically 1.
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
@@ -404,7 +405,8 @@
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
- const int32_t activation_max);
+ const int32_t activation_max,
+ const int32_t address_offset);
/**
* @brief s16 Vector by Matrix (transposed) multiplication
diff --git a/CMSIS/NN/README.md b/CMSIS/NN/README.md
index f0784fe..1efb154 100644
--- a/CMSIS/NN/README.md
+++ b/CMSIS/NN/README.md
@@ -53,6 +53,7 @@
||arm_softmax_u8()| SOFTMAX | None | None | No | No | Bit exact to TFLu |
|[SVDF](https://arm-software.github.io/CMSIS_5/NN/html/group__SVDF.html)||||| | ||
||arm_svdf_s8()| SVDF | None | None | Yes | Yes | Bit exact to TFLu |
+||arm_svdf_state_s16_s8()| SVDF | None | None | Yes | Yes | Bit exact to TFLu |
|[Misc](https://arm-software.github.io/CMSIS_5/NN/html/group__groupNN.html)||||| | ||
||arm_reshape_s8()| SOFTMAX | None | None | No | No | |
||arm_elementwise_add_s8()| ELEMENTWISE ADD | None | None | Yes| Yes| Reshape is not done in this function <br/> Only minor improvements are expected |
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
index dbb0807..9615701 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -21,8 +21,8 @@
* Title: arm_fully_connected_s8
* Description: Fully connected function compatible with TF Lite.
*
- * $Date: 19. March 2021
- * $Revision: V.3.0.0
+ * $Date: 8 April 2022
+ * $Revision: V.3.1.0
*
* Target Processor: Cortex-M and Cortex-A cores
*
@@ -79,7 +79,8 @@
filter_dims->n, /* col_dim or accum_depth */
output_dims->c, /* row_dim or output_depth */
fc_params->activation.min,
- fc_params->activation.max);
+ fc_params->activation.max,
+ 1L);
input += filter_dims->n;
output += output_dims->c;
batch_cnt--;
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
index 956fae5..d794819 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2020-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -21,8 +21,8 @@
* Title: arm_nn_vec_mat_mult_t_s8
* Description: s8 vector by matrix (transposed) multiplication
*
- * $Date: 19. August 2021
- * $Revision: V.2.5.2
+ * $Date: 16 March 2022
+ * $Revision: V.3.0.0
*
* Target Processor: Cortex-M
*
@@ -57,11 +57,13 @@
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
- const int32_t activation_max)
+ const int32_t activation_max,
+ const int32_t address_offset)
{
(void)rhs_offset;
#if defined(ARM_MATH_MVEI)
const int32_t row_loop_cnt = rhs_rows / 3;
+ const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3};
for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
{
@@ -123,8 +125,9 @@
acc = vaddq_s32(acc, vdupq_n_s32(dst_offset));
acc = vmaxq_s32(acc, vdupq_n_s32(activation_min));
acc = vminq_s32(acc, vdupq_n_s32(activation_max));
- vstrbq_p_s32(dst, acc, p);
- dst += 3;
+
+ vstrbq_scatter_offset_p_s32(dst, address_offset_array, acc, p); /* predicated: only 3 of the 4 lanes hold results */
+ dst += 3 * address_offset;
}
const int loop_cnt = rhs_rows % 3;
@@ -165,14 +168,12 @@
// Clamp the result
acc_0 = MAX(acc_0, activation_min);
*dst = MIN(acc_0, activation_max);
- dst++;
+ dst += address_offset;
}
#elif defined(ARM_MATH_DSP)
const int32_t row_loop_cnt = rhs_rows / 2;
-
const int16_t lhs_offset_s16 = (int16_t)lhs_offset;
-
const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16);
for (int32_t i = 0; i < row_loop_cnt; i++)
@@ -235,9 +236,9 @@
acc_0 = MIN(acc_0, activation_max);
acc_1 = MAX(acc_1, activation_min);
acc_1 = MIN(acc_1, activation_max);
-
- *dst++ = (q7_t)acc_0;
- *dst++ = (q7_t)acc_1;
+ *dst = (int8_t)acc_0;
+ *(dst + address_offset) = (int8_t)acc_1;
+ dst += 2 * address_offset;
}
if (rhs_rows & 0x1)
@@ -281,8 +282,8 @@
// Clamp the result
acc_0 = MAX(acc_0, activation_min);
acc_0 = MIN(acc_0, activation_max);
-
- *dst++ = (q7_t)acc_0;
+ *dst = (int8_t)acc_0;
+ dst += address_offset;
}
#else
@@ -339,9 +340,10 @@
res02 = MAX(res02, activation_min);
res02 = MIN(res02, activation_max);
- *dst++ = (q7_t)res00;
- *dst++ = (q7_t)res01;
- *dst++ = (q7_t)res02;
+ *dst = (q7_t)res00;
+ *(dst + address_offset) = (q7_t)res01;
+ *(dst + 2 * address_offset) = (q7_t)res02;
+ dst += 3 * address_offset;
rhs += 3 * rhs_cols;
}
@@ -380,7 +382,8 @@
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
- *dst++ = (q7_t)res00;
+ *dst = (int8_t)res00;
+ dst += address_offset;
rhs += rhs_cols;
}
#endif
diff --git a/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c b/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c
index 7108d19..d5b61e7 100644
--- a/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c
+++ b/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates.
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -21,8 +21,8 @@
* Title: arm_svdf_s8.c
* Description: S8 basic SVDF layer function
*
- * $Date: 17. August 2021
- * $Revision: V.1.5.1
+ * $Date: 16 March 2022
+ * $Revision: V.3.0.0
*
* Target Processor: Cortex-M processors
*
@@ -41,7 +41,7 @@
*/
/*
- * S8 SVDF layer function for TensorFlow Lite
+ * S8 SVDF layer function for TensorFlow Lite with 8 bit state tensor
*
* Refer to header file for details.
*
@@ -55,11 +55,11 @@
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *state_dims,
- q15_t *state_data,
+ q7_t *state_data,
const cmsis_nn_dims *weights_feature_dims,
const q7_t *weights_feature_data,
const cmsis_nn_dims *weights_time_dims,
- const q15_t *weights_time_data,
+ const q7_t *weights_time_data,
const cmsis_nn_dims *bias_dims,
const q31_t *bias_data,
const cmsis_nn_dims *output_dims,
@@ -99,28 +99,30 @@
}
q31_t *buffer_b = (q31_t *)output_ctx->buf;
- memmove((q15_t *)state_data,
- (q15_t *)state_data + 1,
- (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int16_t)));
+ memmove((int8_t *)state_data,
+ (int8_t *)state_data + 1,
+ (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int8_t)));
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
- q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
+ q7_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
const q7_t *weight = weights_feature_data;
const q7_t *input = input_data + i_batch * input_height;
- arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input,
- weight,
- res_ptr,
- -zp_in,
- 0,
- time_batches,
- multiplier_in,
- shift_in,
- input_height,
- feature_batches,
- in_activation_min,
- in_activation_max);
+ arm_status res = arm_nn_vec_mat_mult_t_s8(input,
+ weight,
+ NULL,
+ res_ptr,
+ -zp_in,
+ 0,
+ 0,
+ multiplier_in,
+ shift_in,
+ input_height,
+ feature_batches,
+ in_activation_min,
+ in_activation_max,
+ time_batches);
if (res != ARM_MATH_SUCCESS)
{
@@ -130,10 +132,10 @@
{
q31_t *ptr_a = buffer_a;
- const q15_t *v2 = state_data;
+ const int8_t *v2 = state_data;
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
- const q15_t *v1 = weights_time_data;
+ const int8_t *v1 = weights_time_data;
for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
{
@@ -145,8 +147,12 @@
for (int i = 0; i < block_count; i++)
{
j += 2;
- q31_t r1 = arm_nn_read_q15x2_ia(&v1);
- q31_t r2 = arm_nn_read_q15x2_ia(&v2);
+
+ q31_t r1 = arm_nn_read_q7x4_ia(&v1);
+ r1 = __SXTB16(r1);
+
+ q31_t r2 = arm_nn_read_q7x4_ia(&v2);
+ r2 = __SXTB16(r2);
sum = __SMLAD(r1, r2, sum);
}
diff --git a/CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c b/CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c
new file mode 100644
index 0000000..49c7149
--- /dev/null
+++ b/CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2022 Arm Limited or its affiliates.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_svdf_state_s16_s8.c
+ * Description: S8 basic SVDF layer function with 16 bit state tensor
+ *
+ * $Date: 8 April 2022
+ * $Revision: V.1.0.0
+ *
+ * Target Processor: Cortex-M processors
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup SVDF
+ * @{
+ */
+
+/*
+ * S8 SVDF layer function for TensorFlow Lite with 16 bit state tensor and
+ * 16 bit time weights. Returns ARM_MATH_ARGUMENT_ERROR when either scratch
+ * buffer is NULL, otherwise ARM_MATH_SUCCESS or the helper's error status.
+ * Refer to header file for details.
+ */
+
+arm_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
+ const cmsis_nn_context *output_ctx,
+ const cmsis_nn_svdf_params *svdf_params,
+ const cmsis_nn_per_tensor_quant_params *input_quant_params,
+ const cmsis_nn_per_tensor_quant_params *output_quant_params,
+ const cmsis_nn_dims *input_dims,
+ const q7_t *input_data,
+ const cmsis_nn_dims *state_dims,
+ q15_t *state_data, /* updated in place */
+ const cmsis_nn_dims *weights_feature_dims,
+ const q7_t *weights_feature_data,
+ const cmsis_nn_dims *weights_time_dims,
+ const q15_t *weights_time_data,
+ const cmsis_nn_dims *bias_dims,
+ const q31_t *bias_data, /* may be NULL: no bias is added then */
+ const cmsis_nn_dims *output_dims,
+ q7_t *output_data)
+{
+ (void)bias_dims; /* not read by this function */
+ (void)state_dims; /* not read by this function */
+ (void)output_dims; /* not read by this function */
+
+ const q31_t multiplier_in = input_quant_params->multiplier;
+ const q31_t shift_in = input_quant_params->shift;
+ const q31_t multiplier_out = output_quant_params->multiplier;
+ const q31_t shift_2 = output_quant_params->shift;
+ const int32_t zp_in = svdf_params->input_offset;
+ const int32_t zp_out = svdf_params->output_offset;
+ const int32_t in_activation_min = svdf_params->input_activation.min;
+ const int32_t in_activation_max = svdf_params->input_activation.max;
+ const int32_t out_activation_min = svdf_params->output_activation.min;
+ const int32_t out_activation_max = svdf_params->output_activation.max;
+ const int16_t rank = svdf_params->rank;
+
+ const int32_t input_batches = input_dims->n;
+ const int32_t input_height = input_dims->h;
+ const int32_t feature_batches = weights_feature_dims->n;
+ const int32_t time_batches = weights_time_dims->h;
+ const int32_t unit_count = feature_batches / rank; /* each output unit sums 'rank' consecutive feature results */
+
+ if (input_ctx->buf == NULL)
+ {
+ return ARM_MATH_ARGUMENT_ERROR;
+ }
+ q31_t *buffer_a = (q31_t *)input_ctx->buf; /* one q31 dot product per (batch, feature) */
+
+ if (output_ctx->buf == NULL)
+ {
+ return ARM_MATH_ARGUMENT_ERROR;
+ }
+ q31_t *buffer_b = (q31_t *)output_ctx->buf; /* q31 results prior to requantization */
+
+ memmove((q15_t *)state_data,
+ (q15_t *)state_data + 1,
+ (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int16_t))); /* drop the first state element, shift the rest down by one */
+
+ for (int i_batch = 0; i_batch < input_batches; i_batch++)
+ {
+ q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); /* last time slot of this batch's first feature */
+ const q7_t *weight = weights_feature_data;
+ const q7_t *input = input_data + i_batch * input_height;
+
+ arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input,
+ weight,
+ res_ptr,
+ -zp_in,
+ 0,
+ time_batches, /* NOTE(review): presumably the dst stride between results -- confirm against helper */
+ multiplier_in,
+ shift_in,
+ input_height,
+ feature_batches,
+ in_activation_min,
+ in_activation_max);
+
+ if (res != ARM_MATH_SUCCESS)
+ {
+ return res;
+ }
+ }
+
+ {
+ q31_t *ptr_a = buffer_a;
+ const q15_t *v2 = state_data; /* advances across all batches without being reset */
+ for (int i_batch = 0; i_batch < input_batches; i_batch++)
+ {
+ const q15_t *v1 = weights_time_data; /* time weights are re-read from the start for each batch */
+
+ for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
+ {
+ *ptr_a = 0; /* overwritten by the store of 'sum' below */
+ int32_t sum = 0;
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+ int j = 0;
+ int32_t block_count = time_batches >> 1; /* two 16 bit products per __SMLAD */
+ for (int i = 0; i < block_count; i++)
+ {
+ j += 2;
+ q31_t r1 = arm_nn_read_q15x2_ia(&v1);
+ q31_t r2 = arm_nn_read_q15x2_ia(&v2);
+
+ sum = __SMLAD(r1, r2, sum);
+ }
+
+ // Process the remaining element when time_batches is odd
+ for (; j < time_batches; j++)
+ {
+ sum += *v1 * *v2;
+ v1++;
+ v2++;
+ }
+#else
+ for (int j = 0; j < time_batches; j++)
+ {
+ sum += *v1 * *v2;
+ v1++;
+ v2++;
+ }
+#endif
+
+ *ptr_a = sum;
+ ptr_a++;
+ }
+ }
+ }
+
+ if (bias_data)
+ {
+ if (unit_count == feature_batches) /* equal when rank == 1 (for feature_batches > 0) */
+ {
+ for (int i = 0; i < input_batches; i++)
+ {
+ q31_t *output_temp = buffer_b + i * feature_batches;
+ const q31_t *ptr_a = buffer_a + i * feature_batches;
+
+ const int32_t *bi = bias_data;
+ for (int j = 0; j < feature_batches; j++)
+ {
+ output_temp[j] = ptr_a[j] + bi[j];
+ }
+ }
+ }
+ else
+ {
+ for (int i_batch = 0; i_batch < input_batches; i_batch++)
+ {
+ q31_t *output_data_temp = buffer_b + i_batch * unit_count;
+ q31_t *ptr_a = buffer_a + i_batch * feature_batches;
+
+ for (int i = 0; i < unit_count; i++)
+ {
+ int32_t sum = bias_data[i];
+ for (int j = 0; j < rank; j++)
+ {
+ sum += *ptr_a;
+ ptr_a++;
+ }
+ output_data_temp[i] = sum;
+ }
+ }
+ }
+ }
+ else /* no bias: only the reduction over rank is performed */
+ {
+ for (int i_batch = 0; i_batch < input_batches; i_batch++)
+ {
+ q31_t *output_data_temp = buffer_b + i_batch * unit_count;
+ q31_t *ptr_a = buffer_a + i_batch * feature_batches;
+
+ for (int i = 0; i < unit_count; i++)
+ {
+ int32_t sum = 0;
+ for (int j = 0; j < rank; j++)
+ {
+ sum += *ptr_a;
+ ptr_a++;
+ }
+ output_data_temp[i] = sum;
+ }
+ }
+ }
+
+#if defined(ARM_MATH_MVEI)
+ int32_t num_elements = input_batches * unit_count;
+ const int32_t loop_count = (num_elements + 3) / 4; /* ceil(num_elements / 4) */
+ for (int i_op = 0; i_op < loop_count; i_op++)
+ {
+ mve_pred16_t p = vctp32q((uint32_t)num_elements); /* enables only the lanes still to be processed */
+ int32x4_t op = vldrwq_z_s32(buffer_b, p);
+ op = arm_requantize_mve(op, multiplier_out, shift_2);
+ op = vaddq_n_s32(op, zp_out);
+ const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min);
+ const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max);
+ op = vmaxq_s32(op, min_vec);
+ op = vminq_s32(op, max_vec);
+ vstrbq_p_s32(output_data, op, p); /* predicated store of the low byte of each active lane */
+ output_data += 4;
+ buffer_b += 4;
+ num_elements -= 4;
+ }
+#else
+ for (int i = 0; i < input_batches * unit_count; i++)
+ {
+ output_data[i] = (q7_t)CLAMP(
+ arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min);
+ }
+#endif
+
+ return (ARM_MATH_SUCCESS);
+}
+
+/**
+ * @} end of SVDF group
+ */
diff --git a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
index 0cf59c6..74cf218 100644
--- a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
+++ b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
@@ -87,7 +87,7 @@
add_subdirectory(TestCases/test_arm_softmax_s8)
add_subdirectory(TestCases/test_arm_softmax_s8_s16)
add_subdirectory(TestCases/test_arm_softmax_s16)
-add_subdirectory(TestCases/test_arm_svdf_s8)
+add_subdirectory(TestCases/test_arm_svdf_state_s16_s8)
set(MAKE_CMD "python3")
set(MAKE_CMD_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/unittest_targets.py")
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/CMakeLists.txt
similarity index 67%
rename from CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/CMakeLists.txt
rename to CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/CMakeLists.txt
index 2d20098..a7e5e82 100644
--- a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/CMakeLists.txt
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2010-2022 Arm Limited or its affiliates.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -16,8 +16,8 @@
# limitations under the License.
#
-add_cmsis_nn_unit_test_executable(test_arm_svdf_s8)
+add_cmsis_nn_unit_test_executable(test_arm_svdf_state_s16_s8)
-target_sources(test_arm_svdf_s8 PRIVATE
- Unity/unity_test_arm_svdf_s8.c
- Unity/TestRunner/unity_test_arm_svdf_s8_runner.c)
+target_sources(test_arm_svdf_state_s16_s8 PRIVATE
+ Unity/unity_test_arm_svdf_state_s16_s8.c
+ Unity/TestRunner/unity_test_arm_svdf_state_s16_s8_runner.c)
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/Unity/unity_test_arm_svdf_state_s16_s8.c
similarity index 73%
rename from CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c
rename to CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/Unity/unity_test_arm_svdf_state_s16_s8.c
index 8c10a8b..d666a56 100644
--- a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/Unity/unity_test_arm_svdf_state_s16_s8.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -22,7 +22,7 @@
#include <stdlib.h>
#include <string.h>
-#include "../test_arm_svdf_s8.c"
+#include "../test_arm_svdf_state_s16_s8.c"
#include "unity.h"
#ifdef USING_FVP_CORSTONE_300
@@ -44,10 +44,10 @@
*/
void tearDown(void) {}
-void test_svdf_arm_svdf_s8(void) { svdf_arm_svdf_s8(); }
+void test_svdf_arm_state_s16_s8(void) { svdf_arm_svdf_state_s16_s8(); }
-void test_svdf_1_arm_svdf_s8(void) { svdf_1_arm_svdf_s8(); }
+void test_svdf_1_arm_state_s16_s8(void) { svdf_1_arm_svdf_state_s16_s8(); }
-void test_svdf_2_arm_svdf_s8(void) { svdf_2_arm_svdf_s8(); }
+void test_svdf_2_arm_state_s16_s8(void) { svdf_2_arm_svdf_state_s16_s8(); }
-void test_svdf_3_arm_svdf_s8(void) { svdf_3_arm_svdf_s8(); }
+void test_svdf_3_arm_state_s16_s8(void) { svdf_3_arm_svdf_state_s16_s8(); }
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/test_arm_svdf_state_s16_s8.c
similarity index 70%
rename from CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c
rename to CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/test_arm_svdf_state_s16_s8.c
index 41a5f78..5db3eb1 100644
--- a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_svdf_state_s16_s8/test_arm_svdf_state_s16_s8.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -41,7 +41,7 @@
return null_bias;
}
-void svdf_arm_svdf_s8(void)
+void svdf_arm_svdf_state_s16_s8(void)
{
const arm_status expected = ARM_MATH_SUCCESS;
cmsis_nn_context input_ctx;
@@ -99,23 +99,23 @@
for (int j = 0; j < number_inputs; j++)
{
memcpy(input_data, svdf_input_sequence + j * input_round_size, input_round_size);
- arm_status result = arm_svdf_s8(&input_ctx,
- &output_ctx,
- &svdf_params,
- &input_quant_params,
- &output_quant_params,
- &input_dims,
- input_data,
- &state_dims,
- state_data,
- &weights_feature_dims,
- weights_feature_data,
- &weights_time_dims,
- weights_time_data,
- &bias_dims,
- null_bias == true ? NULL : svdf_biases,
- &output_dims,
- output_data);
+ arm_status result = arm_svdf_state_s16_s8(&input_ctx,
+ &output_ctx,
+ &svdf_params,
+ &input_quant_params,
+ &output_quant_params,
+ &input_dims,
+ input_data,
+ &state_dims,
+ state_data,
+ &weights_feature_dims,
+ weights_feature_data,
+ &weights_time_dims,
+ weights_time_data,
+ &bias_dims,
+ null_bias == true ? NULL : svdf_biases,
+ &output_dims,
+ output_data);
TEST_ASSERT_EQUAL(expected, result);
}
@@ -127,7 +127,7 @@
free(output_ctx.buf);
}
-void svdf_1_arm_svdf_s8(void)
+void svdf_1_arm_svdf_state_s16_s8(void)
{
const arm_status expected = ARM_MATH_SUCCESS;
cmsis_nn_context input_ctx;
@@ -184,23 +184,23 @@
for (int j = 0; j < number_inputs; j++)
{
memcpy(input_data, svdf_1_input_sequence + j * input_round_size, input_round_size);
- arm_status result = arm_svdf_s8(&input_ctx,
- &output_ctx,
- &svdf_1_params,
- &input_quant_params,
- &output_quant_params,
- &input_dims,
- input_data,
- &state_dims,
- state_data,
- &weights_feature_dims,
- weights_feature_data,
- &weights_time_dims,
- weights_time_data,
- &bias_dims,
- null_bias == true ? NULL : svdf_1_biases,
- &output_dims,
- output_data);
+ arm_status result = arm_svdf_state_s16_s8(&input_ctx,
+ &output_ctx,
+ &svdf_1_params,
+ &input_quant_params,
+ &output_quant_params,
+ &input_dims,
+ input_data,
+ &state_dims,
+ state_data,
+ &weights_feature_dims,
+ weights_feature_data,
+ &weights_time_dims,
+ weights_time_data,
+ &bias_dims,
+ null_bias == true ? NULL : svdf_1_biases,
+ &output_dims,
+ output_data);
TEST_ASSERT_EQUAL(expected, result);
}
@@ -212,7 +212,7 @@
free(output_ctx.buf);
}
-void svdf_2_arm_svdf_s8(void)
+void svdf_2_arm_svdf_state_s16_s8(void)
{
const arm_status expected = ARM_MATH_SUCCESS;
cmsis_nn_context input_ctx;
@@ -269,23 +269,23 @@
for (int j = 0; j < number_inputs; j++)
{
memcpy(input_data, svdf_2_input_sequence + j * input_round_size, input_round_size);
- arm_status result = arm_svdf_s8(&input_ctx,
- &output_ctx,
- &svdf_2_params,
- &input_quant_params,
- &output_quant_params,
- &input_dims,
- input_data,
- &state_dims,
- state_data,
- &weights_feature_dims,
- weights_feature_data,
- &weights_time_dims,
- weights_time_data,
- &bias_dims,
- null_bias == true ? NULL : svdf_2_biases,
- &output_dims,
- output_data);
+ arm_status result = arm_svdf_state_s16_s8(&input_ctx,
+ &output_ctx,
+ &svdf_2_params,
+ &input_quant_params,
+ &output_quant_params,
+ &input_dims,
+ input_data,
+ &state_dims,
+ state_data,
+ &weights_feature_dims,
+ weights_feature_data,
+ &weights_time_dims,
+ weights_time_data,
+ &bias_dims,
+ null_bias == true ? NULL : svdf_2_biases,
+ &output_dims,
+ output_data);
TEST_ASSERT_EQUAL(expected, result);
}
@@ -297,7 +297,7 @@
free(output_ctx.buf);
}
-void svdf_3_arm_svdf_s8(void)
+void svdf_3_arm_svdf_state_s16_s8(void)
{
const arm_status expected = ARM_MATH_SUCCESS;
cmsis_nn_context input_ctx;
@@ -354,23 +354,23 @@
for (int j = 0; j < number_inputs; j++)
{
memcpy(input_data, svdf_3_input_sequence + j * input_round_size, input_round_size);
- arm_status result = arm_svdf_s8(&input_ctx,
- &output_ctx,
- &svdf_3_params,
- &input_quant_params,
- &output_quant_params,
- &input_dims,
- input_data,
- &state_dims,
- state_data,
- &weights_feature_dims,
- weights_feature_data,
- &weights_time_dims,
- weights_time_data,
- &bias_dims,
- null_bias == true ? NULL : svdf_3_biases,
- &output_dims,
- output_data);
+ arm_status result = arm_svdf_state_s16_s8(&input_ctx,
+ &output_ctx,
+ &svdf_3_params,
+ &input_quant_params,
+ &output_quant_params,
+ &input_dims,
+ input_data,
+ &state_dims,
+ state_data,
+ &weights_feature_dims,
+ weights_feature_data,
+ &weights_time_dims,
+ weights_time_data,
+ &bias_dims,
+ null_bias == true ? NULL : svdf_3_biases,
+ &output_dims,
+ output_data);
TEST_ASSERT_EQUAL(expected, result);
}