CMSIS-NN: Add support for int16 DSP avg pooling (#1486)
* CMSIS-NN: Add support for int16 DSP avg pooling
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index c9f1d3f..b2813bd 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -12,6 +12,7 @@
Active development ...
CMSIS-NN: 4.0.0 (see revision history for details)
- Changed return types of all API's
+ - Added DSP implementation of int16 average pooling
</release>
<release version="5.9.0" date="2022-05-02">
CMSIS-Core(M): 5.6.0
diff --git a/CMSIS/DoxyGen/NN/src/history.txt b/CMSIS/DoxyGen/NN/src/history.txt
index 6314993..1d86be2 100644
--- a/CMSIS/DoxyGen/NN/src/history.txt
+++ b/CMSIS/DoxyGen/NN/src/history.txt
@@ -11,6 +11,7 @@
<td>
<ul>
<li> Replaced arm_status with arm_cmsis_nn_status struct </li>
+ <li> Added DSP support in arm_avgpool_s16.c </li>
</ul>
</td>
</tr>
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 59bb636..bbc61e6 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
- * $Date: 19 April 2022
- * $Revision: V.10.0.0
+ * $Date: 17 May 2022
+ * $Revision: V.10.0.1
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
@@ -1936,6 +1936,7 @@
* @param[in, out] output_data Output data pointer. Data type: int16
* @return The function returns
* <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
+ * <code>ARM_CMSIS_NN_ARG_ERROR</code> - In case of invalid arguments
*
* @details
* - Supported Framework: TensorFlow Lite
diff --git a/CMSIS/NN/README.md b/CMSIS/NN/README.md
index f5cf2cb..8d46779 100644
--- a/CMSIS/NN/README.md
+++ b/CMSIS/NN/README.md
@@ -41,8 +41,8 @@
|| arm_fully_connected_s8() |FULLY CONNECTED & <br/> MAT MUL | None | No | Yes | Yes | |
|| arm_fully_connected_s16() |FULLY CONNECTED & <br/> MAT MUL | None | No | Yes | No | |
|[Pooling](https://arm-software.github.io/CMSIS_5/NN/html/group__Pooling.html)||||| | ||
-|| arm_avgpool_s8() | AVERAGE POOL | None | input_ch * 2<br/>(DSP only) | Yes| Yes| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
-|| arm_avgpool_s16() | AVERAGE POOL | None | None | No| No| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
+|| arm_avgpool_s8() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| Yes| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
+|| arm_avgpool_s16() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| No| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
|| arm_maxpool_s8() | MAX POOL | None | None | Yes| Yes| |
|| arm_maxpool_s16() | MAX POOL | None | None | No| No| |
|[Softmax](https://arm-software.github.io/CMSIS_5/NN/html/group__Softmax.html)||||| | ||
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c b/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c
index 980eb8c..659fe64 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c
@@ -21,8 +21,8 @@
* Title: arm_avgpool_s16.c
* Description: Pooling function implementations
*
- * $Date: 19 April 2022
- * $Revision: V.2.0.0
+ * $Date: 17 May 2022
+ * $Revision: V.2.1.0
*
* Target Processor: Cortex-M CPUs
*
@@ -31,8 +31,32 @@
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+/*
+ * Divide each q31 entry of buffer by count with rounding, clamp the result
+ * to [act_min, act_max] and store it as q15 in target.
+ *
+ * Only compiled when ARM_MATH_DSP is defined and ARM_MATH_MVEI is not.
+ *
+ * buffer  - input values; entries 0 .. length - 1 are read
+ * target  - output values; entries 0 .. length - 1 are written
+ * length  - number of entries to process
+ * count   - divisor applied to every entry; it is used as a divisor with
+ *           no zero check inside this helper
+ * act_min - lower bound applied after the division
+ * act_max - upper bound applied after the division
+ */
+static void scale_q31_to_q15_and_clamp(const q31_t *buffer,
+ q15_t *target,
+ int32_t length,
+ const int32_t count,
+ const int act_min,
+ const int act_max)
+{
+ const int half_count = count / 2;
+ /* Per entry: offset by half the divisor in the direction of the sign so the
+ * truncating integer division rounds to nearest, then clamp and narrow. */
+ for (int i = 0; i < length; i++)
+ {
+ int32_t sum = buffer[i] > 0 ? (buffer[i] + half_count) : (buffer[i] - half_count);
+ sum = sum / count;
+ sum = MAX(sum, act_min);
+ sum = MIN(sum, act_max);
+
+ target[i] = (q15_t)sum;
+ }
+}
+#endif /* ARM_MATH_DSP && !ARM_MATH_MVEI */
+
/**
* @ingroup groupNN
+
*/
/**
@@ -54,7 +78,6 @@
const cmsis_nn_dims *output_dims,
q15_t *dst)
{
- (void)ctx;
const int32_t input_y = input_dims->h;
const int32_t input_x = input_dims->w;
const int32_t output_y = output_dims->h;
@@ -69,9 +92,74 @@
const int32_t act_max = pool_params->activation.max;
const int32_t ch_src = input_dims->c;
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+
+ q31_t *buffer = (q31_t *)ctx->buf;
+
+ if (buffer == NULL)
+ {
+ return ARM_CMSIS_NN_ARG_ERROR;
+ }
+
+ /* Run the following code for CPUs with DSP extension
+ */
+ for (int i_y = 0, idx_y = -pad_y; i_y < output_y; idx_y += stride_y, i_y++)
+ {
+ for (int i_x = 0, idx_x = -pad_x; i_x < output_x; idx_x += stride_x, i_x++)
+ {
+ /* Condition for kernel start dimension:
+ (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */
+ const int32_t kernel_y_start = MAX(0, -idx_y);
+ const int32_t kernel_x_start = MAX(0, -idx_x);
+
+ /* Condition for kernel end dimension:
+ (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */
+ const int32_t kernel_y_end = MIN(kernel_y, input_y - idx_y);
+ const int32_t kernel_x_end = MIN(kernel_x, input_x - idx_x);
+
+ int count = 0;
+
+ for (int k_y = kernel_y_start; k_y < kernel_y_end; k_y++)
+ {
+ for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++)
+ {
+ const q15_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x);
+
+ if (count == 0)
+ {
+ for (int i = 0; i < ch_src; i++)
+ {
+ buffer[i] = start[i];
+ }
+ }
+ else
+ {
+ for (int i = 0; i < ch_src; i++)
+ {
+ buffer[i] = __QADD(start[i], buffer[i]);
+ }
+ }
+ count++;
+ }
+ }
+
+ // Guard the division by count in the scaling step (static analysis finding: DIVIDE_BY_ZERO).
+ if (count == 0)
+ {
+ return ARM_CMSIS_NN_ARG_ERROR;
+ }
+
+ scale_q31_to_q15_and_clamp(buffer, dst, ch_src, count, act_min, act_max);
+ dst += ch_src;
+ }
+ }
+
+#else
/* Reference C code adapted from CMSIS-NN arm_avgpool_s8.c.
*/
+ (void)ctx;
+
for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++)
{
for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++)
@@ -112,6 +200,7 @@
}
}
}
+#endif
return ARM_CMSIS_NN_SUCCESS;
}
@@ -119,7 +208,11 @@
int32_t arm_avgpool_s16_get_buffer_size(const int output_x, const int ch_src)
{
(void)output_x;
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) /* only this build configuration reports a non-zero size */
+ return (ch_src * (int32_t)sizeof(int32_t)); /* size in bytes: ch_src entries of sizeof(int32_t) each */
+#else /* no scratch buffer requested: ch_src is unused and the function returns 0 */
(void)ch_src;
+#endif /* ARM_MATH_DSP && !ARM_MATH_MVEI */
return 0;
}
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c b/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
index b8421ea..7b35fc4 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
@@ -21,8 +21,8 @@
* Title: arm_avgpool_s8.c
* Description: Pooling function implementations
*
- * $Date: 19 April 2022
- * $Revision: V.3.0.0
+ * $Date: 17 May 2022
+ * $Revision: V.3.0.1
*
* Target Processor: Cortex-M CPUs
*
@@ -42,12 +42,6 @@
{
const int half_count = count / 2;
- // Prevent static code issue DIVIDE_BY_ZERO.
- if (count == 0)
- {
- return;
- }
-
for (int i = 0; i < length; i++)
{
int32_t sum = buffer[i] > 0 ? (buffer[i] + half_count) : (buffer[i] - half_count);