CMSIS-NN: Add support for int16 DSP avg pooling (#1486)

* CMSIS-NN: Add support for int16 DSP avg pooling


diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index c9f1d3f..b2813bd 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -12,6 +12,7 @@
       Active development ...
       CMSIS-NN: 4.0.0 (see revision history for details)
        - Changed return types of all API's
+       - Support for int16 average pooling DSP implementation
     </release>
     <release version="5.9.0" date="2022-05-02">
       CMSIS-Core(M): 5.6.0
diff --git a/CMSIS/DoxyGen/NN/src/history.txt b/CMSIS/DoxyGen/NN/src/history.txt
index 6314993..1d86be2 100644
--- a/CMSIS/DoxyGen/NN/src/history.txt
+++ b/CMSIS/DoxyGen/NN/src/history.txt
@@ -11,6 +11,7 @@
     <td>
     <ul>
       <li> Replaced arm_status with arm_cmsis_nn_status struct </li>
+      <li> Added DSP support in arm_avgpool_s16.c </li>
       </ul>
     </td>
   </tr>
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 59bb636..bbc61e6 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
  * Title:        arm_nnfunctions.h
  * Description:  Public header file for CMSIS NN Library
  *
- * $Date:        19 April 2022
- * $Revision:    V.10.0.0
+ * $Date:        17 May 2022
+ * $Revision:    V.10.0.1
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -1936,6 +1936,7 @@
  * @param[in, out] output_data    Output data pointer. Data type: int16
  * @return                        The function returns
  *                                    <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
+ *                                    <code>ARM_CMSIS_NN_ARG_ERROR</code> - In case of invalid arguments
  *
  * @details
  *    - Supported Framework: TensorFlow Lite
diff --git a/CMSIS/NN/README.md b/CMSIS/NN/README.md
index f5cf2cb..8d46779 100644
--- a/CMSIS/NN/README.md
+++ b/CMSIS/NN/README.md
@@ -41,8 +41,8 @@
 || arm_fully_connected_s8() |FULLY CONNECTED & <br/> MAT MUL  | None | No | Yes | Yes | |
 || arm_fully_connected_s16() |FULLY CONNECTED & <br/> MAT MUL  | None | No | Yes | No | |
 |[Pooling](https://arm-software.github.io/CMSIS_5/NN/html/group__Pooling.html)||||| |  ||
-|| arm_avgpool_s8() | AVERAGE POOL | None | input_ch * 2<br/>(DSP only) | Yes| Yes| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
-|| arm_avgpool_s16() | AVERAGE POOL | None | None | No| No| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
+|| arm_avgpool_s8() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| Yes| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
+|| arm_avgpool_s16() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| No| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
 || arm_maxpool_s8() | MAX POOL | None | None | Yes| Yes|  |
 || arm_maxpool_s16() | MAX POOL | None | None | No| No|  |
 |[Softmax](https://arm-software.github.io/CMSIS_5/NN/html/group__Softmax.html)||||| |  ||
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c b/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c
index 980eb8c..659fe64 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c
@@ -21,8 +21,8 @@
  * Title:        arm_avgpool_s16.c
  * Description:  Pooling function implementations
  *
- * $Date:        19 April 2022
- * $Revision:    V.2.0.0
+ * $Date:        17 May 2022
+ * $Revision:    V.2.1.0
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -31,8 +31,32 @@
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+/* Divides `length` q31 sums by `count`, rounding half away from zero, then clamps and narrows each to q15. */
+static void scale_q31_to_q15_and_clamp(const q31_t *buffer, /* accumulated sums, read only */
+                                       q15_t *target, /* receives `length` results */
+                                       int32_t length, /* number of elements to process */
+                                       const int32_t count, /* divisor; must be non-zero (not checked here) */
+                                       const int act_min, /* lower clamp bound */
+                                       const int act_max) /* upper clamp bound */
+{
+    const int half_count = count / 2; /* rounding offset, applied away from zero below */
+
+    for (int i = 0; i < length; i++)
+    {
+        int32_t sum = buffer[i] > 0 ? (buffer[i] + half_count) : (buffer[i] - half_count);
+        sum = sum / count; /* truncating division; the offset above turns it into rounding */
+        sum = MAX(sum, act_min); /* clamp to [act_min, act_max] */
+        sum = MIN(sum, act_max);
+
+        target[i] = (q15_t)sum; /* narrow the clamped value to q15 */
+    }
+}
+#endif
+
 /**
  *  @ingroup groupNN
+
  */
 
 /**
@@ -54,7 +78,6 @@
                                     const cmsis_nn_dims *output_dims,
                                     q15_t *dst)
 {
-    (void)ctx;
     const int32_t input_y = input_dims->h;
     const int32_t input_x = input_dims->w;
     const int32_t output_y = output_dims->h;
@@ -69,9 +92,74 @@
     const int32_t act_max = pool_params->activation.max;
     const int32_t ch_src = input_dims->c;
 
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+
+    q31_t *buffer = (q31_t *)ctx->buf;
+
+    if (buffer == NULL)
+    {
+        return ARM_CMSIS_NN_ARG_ERROR;
+    }
+
+    /* Run the following code for CPUs with the DSP extension
+     */
+    for (int i_y = 0, idx_y = -pad_y; i_y < output_y; idx_y += stride_y, i_y++)
+    {
+        for (int i_x = 0, idx_x = -pad_x; i_x < output_x; idx_x += stride_x, i_x++)
+        {
+            /* Condition for kernel start dimension:
+                      (idx_<x,y> + kernel_<x,y>_start) >= 0 */
+            const int32_t kernel_y_start = MAX(0, -idx_y);
+            const int32_t kernel_x_start = MAX(0, -idx_x);
+
+            /* Condition for kernel end dimension (end is exclusive):
+                   (idx_<x,y> + kernel_<x,y>_end) <= input_<x,y> */
+            const int32_t kernel_y_end = MIN(kernel_y, input_y - idx_y);
+            const int32_t kernel_x_end = MIN(kernel_x, input_x - idx_x);
+
+            int count = 0;
+
+            for (int k_y = kernel_y_start; k_y < kernel_y_end; k_y++)
+            {
+                for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++)
+                {
+                    const q15_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x);
+
+                    if (count == 0)
+                    {
+                        for (int i = 0; i < ch_src; i++)
+                        {
+                            buffer[i] = start[i];
+                        }
+                    }
+                    else
+                    {
+                        for (int i = 0; i < ch_src; i++)
+                        {
+                            buffer[i] = __QADD(start[i], buffer[i]);
+                        }
+                    }
+                    count++;
+                }
+            }
+
+            // No input element fell inside the window: bail out before dividing by count (static analysis: DIVIDE_BY_ZERO).
+            if (count == 0)
+            {
+                return ARM_CMSIS_NN_ARG_ERROR;
+            }
+
+            scale_q31_to_q15_and_clamp(buffer, dst, ch_src, count, act_min, act_max);
+            dst += ch_src;
+        }
+    }
+
+#else
     /* Reference C code adapted from CMSIS-NN arm_avgpool_s8.c.
      */
 
+    (void)ctx;
+
     for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++)
     {
         for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++)
@@ -112,6 +200,7 @@
             }
         }
     }
+#endif
 
     return ARM_CMSIS_NN_SUCCESS;
 }
@@ -119,7 +208,11 @@
 int32_t arm_avgpool_s16_get_buffer_size(const int output_x, const int ch_src)
 {
     (void)output_x;
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) /* same condition that selects the DSP path in arm_avgpool_s16 */
+    return (ch_src * (int32_t)sizeof(int32_t)); /* bytes for ch_src 32-bit accumulators */
+#else /* any other build: ch_src is unused and the size returned below is 0 */
     (void)ch_src;
+#endif /* defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) */
     return 0;
 }
 
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c b/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
index b8421ea..7b35fc4 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
@@ -21,8 +21,8 @@
  * Title:        arm_avgpool_s8.c
  * Description:  Pooling function implementations
  *
- * $Date:        19 April 2022
- * $Revision:    V.3.0.0
+ * $Date:        17 May 2022
+ * $Revision:    V.3.0.1
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -42,12 +42,6 @@
 {
     const int half_count = count / 2;
 
-    // Prevent static code issue DIVIDE_BY_ZERO.
-    if (count == 0)
-    {
-        return;
-    }
-
     for (int i = 0; i < length; i++)
     {
         int32_t sum = buffer[i] > 0 ? (buffer[i] + half_count) : (buffer[i] - half_count);