CMSIS-NN: update MVE intrinsics usage and predication (#1554)
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
index adfa702..22bba53 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -21,8 +21,8 @@
* Title: arm_nn_mat_mult_s8.c
* Description: General Matrix-multiplication function
*
- * $Date: 27. October 2021
- * $Revision: V.2.0.6
+ * $Date: 16 August 2022
+ * $Revision: V.2.0.7
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
@@ -73,7 +73,7 @@
for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
{
mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
- const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
+ const int16x8_t offset = vdupq_x_n_s16(col_offset, p);
row_len_tmp -= 8;
int16x8_t c0 = vldrbq_s16(ip_c0);
@@ -133,7 +133,7 @@
for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
{
const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
- const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
+ const int16x8_t offset = vdupq_x_n_s16(col_offset, p);
row_len_tmp -= 8;
int16x8_t c0 = vldrbq_s16(ip_c0);
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
index 608019a..d708b34 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2020-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -21,8 +21,8 @@
* Title: arm_nn_vec_mat_mult_t_s8
* Description: s8 vector by matrix (transposed) multiplication
*
- * $Date: 2 May 2022
- * $Revision: V.4.0.1
+ * $Date: 16 Aug 2022
+ * $Revision: V.4.0.2
*
* Target Processor: Cortex-M
*
@@ -115,7 +115,7 @@
if (bias)
{
int32x4_t b = vldrwq_z_s32(bias, p);
- acc = vaddq_m_s32(vuninitializedq_s32(), acc, b, p);
+ acc = vaddq_x_s32(acc, b, p);
bias += 3;
}
const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0};
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
index 5fbe1f9..a9886b7 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
@@ -21,8 +21,8 @@
* Title: arm_max_pool_s16.c
* Description: Pooling function implementations
*
- * $Date: 20 June 2022
- * $Revision: V.2.1.0
+ * $Date: 16 August 2022
+ * $Revision: V.2.1.1
*
* Target Processor: Cortex-M CPUs
*
@@ -94,8 +94,8 @@
mve_pred16_t p = vctp16q((uint32_t)length);
length -= 8;
const int16x8_t src = vldrhq_z_s16(source, p);
- int16x8_t res = vmaxq_m_s16(vuninitializedq_s16(), src, min, p);
- res = vminq_m_s16(vuninitializedq_s16(), res, max, p);
+ int16x8_t res = vmaxq_x_s16(src, min, p);
+ res = vminq_x_s16(res, max, p);
vstrhq_p_s16(source, res, p);
source += 8;
}
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
index b5cdc87..6ebc788 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -21,8 +21,8 @@
* Title: arm_max_pool_s8.c
* Description: Pooling function implementations
*
- * $Date: 19 April 2022
- * $Revision: V.3.0.0
+ * $Date: 16 August 2022
+ * $Revision: V.3.0.1
*
* Target Processor: Cortex-M CPUs
*
@@ -40,7 +40,7 @@
mve_pred16_t p = vctp8q((uint32_t)length);
const int8x16_t op_1 = vldrbq_z_s8(base, p);
const int8x16_t op_2 = vldrbq_z_s8(target, p);
- const int8x16_t max = vmaxq_m_s8(vuninitializedq_s8(), op_1, op_2, p);
+ const int8x16_t max = vmaxq_x_s8(op_1, op_2, p);
vstrbq_p_s8(base, max, p);
base += 16;
target += 16;
@@ -98,15 +98,16 @@
{
#if defined(ARM_MATH_MVEI)
int32_t loop_count = (length + 15) / 16;
+ const int8x16_t vmin = vdupq_n_s8((int8_t)act_min);
+ const int8x16_t vmax = vdupq_n_s8((int8_t)act_max);
+
for (int i = 0; i < loop_count; i++)
{
mve_pred16_t p = vctp8q((uint32_t)length);
length -= 16;
const int8x16_t src = vldrbq_z_s8(source, p);
- const int8x16_t predicated_min = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_min, p);
- const int8x16_t predicated_max = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_max, p);
- int8x16_t res = vmaxq_m_s8(vuninitializedq_s8(), src, predicated_min, p);
- res = vminq_m_s8(vuninitializedq_s8(), res, predicated_max, p);
+ int8x16_t res = vmaxq_x_s8(src, vmin, p);
+ res = vminq_x_s8(res, vmax, p);
vstrbq_p_s8(source, res, p);
source += 16;
}