CMSIS-NN: Add int16 support for depthwise_conv (#1406)

Adds new function arm_depthwise_conv_s16 including dilation support and
corresponding unit tests.

Change-Id: Ia393679990fdcb94edd062c53b76346c67e3405b
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index b6692de..0938273 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -16,6 +16,7 @@
        - Support for DSP extension optimization for int16 convolution and fully connected
        - Support dilation for int8 convolution
        - Support dilation for int8 depthwise convolution
+       - Support for int16 depthwise conv for reference implementation including dilation
       CMSIS-RTOS2:
         - RTX 5.5.4 (see revision history for details)
     </release>
@@ -2820,6 +2821,7 @@
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c"/>
+        <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c"/>
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index f0e2b39..8c10637 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates.
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nnfunctions.h
  * Description:  Public header file for CMSIS NN Library
  *
- * $Date:        17 August 2021
- * $Revision:    V.7.3.1
+ * $Date:        20 January 2022
+ * $Revision:    V.7.4.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -1130,14 +1130,14 @@
  * @param[in]      quant_params   Per-channel quantization info.
  *                               It contains the multiplier and shift values to be applied to each
  *                               output channel
- * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
+ * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
  *                                Batch argument N is not used.
  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
  * @param[in]      filter_dims    Filter tensor dimensions. Format: [1, H, W, C_OUT]
  * @param[in]      filter_data    Filter data pointer. Data type: int8
  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
  * @param[in]      bias_data      Bias data pointer. Data type: int32
- * @param[in]      output_dims    Output tensor dimensions. Format: [1, H, W, C_OUT]
+ * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
  * @param[in, out] output_data    Output data pointer. Data type: int8
  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
  *
@@ -1158,6 +1158,47 @@
                                  q7_t *output_data);
 
 /**
+ * @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
+ *
+ * @param[in, out] ctx            Function context (e.g. temporary buffer). Check the function
+ *                                definition file to see if an additional buffer is required.
+ *                                Optional function {API}_get_buffer_size() provides the buffer
+ *                                size if an additional buffer is required.
+ *                                This function does not use the context (ctx).
+ * @param[in]      dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
+ *                                dw_conv_params->input_offset  : Not used
+ *                                dw_conv_params->output_offset : Not used
+ * @param[in]      quant_params   Per-channel quantization info.
+ *                               It contains the multiplier and shift values to be applied to each
+ *                               output channel
+ * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ *                                N is the number of batches; each batch is processed in turn.
+ * @param[in]      input_data     Input (activation) data pointer. Data type: int16
+ * @param[in]      filter_dims    Filter tensor dimensions. Format: [1, H, W, C_OUT]
+ * @param[in]      filter_data    Filter data pointer. Data type: int8
+ * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
+ * @param[in]      bias_data      Bias data pointer. Data type: int64
+ * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
+ * @param[in, out] output_data    Output data pointer. Data type: int16
+ * @return     The function returns <code>ARM_MATH_SUCCESS</code>
+ *
+ * @details
+ *    - Supported framework: TensorFlow Lite
+ *    - q15 is used as data type even though it is s16 data. It is done so to be consistent with existing APIs.
+ */
+arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
+                                  const cmsis_nn_dw_conv_params *dw_conv_params,
+                                  const cmsis_nn_per_channel_quant_params *quant_params,
+                                  const cmsis_nn_dims *input_dims,
+                                  const q15_t *input,
+                                  const cmsis_nn_dims *filter_dims,
+                                  const q7_t *kernel,
+                                  const cmsis_nn_dims *bias_dims,
+                                  const int64_t *bias,
+                                  const cmsis_nn_dims *output_dims,
+                                  q15_t *output);
+
+/**
  * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on
  *        the input arguments(documented below). Refer arm_depthwise_conv_s8() for function
  *        argument details.
diff --git a/CMSIS/NN/README.md b/CMSIS/NN/README.md
index c29c19f..942d22d 100644
--- a/CMSIS/NN/README.md
+++ b/CMSIS/NN/README.md
@@ -34,6 +34,7 @@
 ||arm_convolve_wrapper_s16()|CONV|None|n.a.| Yes | No |The additional memory required depends on the optimal convolution function called|
 ||arm_convolve_s16()|CONV|None|No| No | No ||
 ||arm_convolve_fast_s16()|CONV|dilation = 1, <br/> ker_x * ker_y * input_ch < 512 <br/> |4 * ker_x * ker_y * input_ch| Yes | No ||
+|| arm_depthwise_conv_s16() | DEPTHWISE_CONV | None | No|No|No||
 |[Fully Connected](https://arm-software.github.io/CMSIS_5/NN/html/group__FC.html)||||| |  | |
 || arm_fully_connected_s8() |FULLY CONNECTED & <br/> MAT MUL  | None | No | Yes | Yes | |
 || arm_fully_connected_s16() |FULLY CONNECTED & <br/> MAT MUL  | None | No | Yes | No | |
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt b/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
index 6d9575d..30be0fe 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
+++ b/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021 Arm Limited.
+# Copyright (c) 2019-2022 Arm Limited.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -17,7 +17,8 @@
 #
 
 file(GLOB SRC "./*_s8*.c")
-target_sources(cmsis-nn PRIVATE arm_convolve_s16.c arm_convolve_wrapper_s16.c arm_convolve_fast_s16.c ${SRC})
+file(GLOB SRC_S16 "./*_s16*.c")
+target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16})
 
 
 
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c
new file mode 100644
index 0000000..42e4bbd
--- /dev/null
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright (C) 2022 Arm Limited or its affiliates.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_depthwise_conv_s16.c
+ * Description:  s16 version of depthwise convolution.
+ *
+ * $Date:        26. Jan 2022
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M CPUs
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ *  @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+/* Currently unused 4-outputs-per-iteration variant; assumes ch_mult % 4 == 0 (TODO confirm). No batch/dilation support. */
+static void __attribute__((unused)) depthwise_conv_s16_mult_4_s16(const int16_t *input,
+                                                                  const int32_t input_x,
+                                                                  const int32_t input_y,
+                                                                  const int32_t input_ch,
+                                                                  const int8_t *kernel,
+                                                                  const int32_t output_ch,
+                                                                  const int32_t ch_mult,
+                                                                  const int32_t kernel_x,
+                                                                  const int32_t kernel_y,
+                                                                  const int32_t pad_x,
+                                                                  const int32_t pad_y,
+                                                                  const int32_t stride_x,
+                                                                  const int32_t stride_y,
+                                                                  const int64_t *bias,
+                                                                  int16_t *output,
+                                                                  const int32_t *output_shift,
+                                                                  const int32_t *output_mult,
+                                                                  const int32_t output_x,
+                                                                  const int32_t output_y,
+                                                                  const int32_t output_activation_min,
+                                                                  const int32_t output_activation_max)
+{
+    for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
+    {
+        for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
+        {
+            for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch;
+                 ++in_ch, out_ch += ch_mult)
+            {
+                for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
+                {
+                    int32_t out_buff32[4] = {REDUCE_MULTIPLIER(output_mult[out_ch + 0 + mult_tile]),
+                                             REDUCE_MULTIPLIER(output_mult[out_ch + 1 + mult_tile]),
+                                             REDUCE_MULTIPLIER(output_mult[out_ch + 2 + mult_tile]),
+                                             REDUCE_MULTIPLIER(output_mult[out_ch + 3 + mult_tile])};
+                    /* out_buff32 first holds the reduced multipliers; it is later overwritten with the results. */
+                    int64_t out_buff[4] = {0, 0, 0, 0};
+                    /* 64-bit accumulators start from the per-channel bias when a bias pointer is supplied. */
+                    if (bias)
+                    {
+                        out_buff[0] = bias[out_ch + 0 + mult_tile];
+                        out_buff[1] = bias[out_ch + 1 + mult_tile];
+                        out_buff[2] = bias[out_ch + 2 + mult_tile];
+                        out_buff[3] = bias[out_ch + 3 + mult_tile];
+                    }
+                    /* Visit only kernel taps that overlap the input; taps falling in the padding are skipped. */
+                    for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h)
+                    {
+                        int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
+                        int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
+#pragma clang loop unroll(disable)
+#endif
+                        for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w);
+                             ++ker_w, ker_idx += output_ch)
+                        {
+                            // TODO: Unroll of 4 with 64 bit accumulator will probably result in too much register
+                            // spills. Try with unroll of 2 when enabling this.
+                            int32_t in_val = input[in_idx + ker_w * input_ch];
+                            out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile];
+                            out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile];
+                            out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile];
+                            out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile];
+                        }
+                    }
+                    /* Requantize each sum with its channel's multiplier and shift, then clamp to the activation range. */
+                    out_buff32[0] =
+                        arm_nn_requantize_s64(out_buff[0], out_buff32[0], output_shift[out_ch + 0 + mult_tile]);
+                    out_buff32[1] =
+                        arm_nn_requantize_s64(out_buff[1], out_buff32[1], output_shift[out_ch + 1 + mult_tile]);
+                    out_buff32[2] =
+                        arm_nn_requantize_s64(out_buff[2], out_buff32[2], output_shift[out_ch + 2 + mult_tile]);
+                    out_buff32[3] =
+                        arm_nn_requantize_s64(out_buff[3], out_buff32[3], output_shift[out_ch + 3 + mult_tile]);
+
+                    out_buff32[0] = MIN(MAX(out_buff32[0], output_activation_min), output_activation_max);
+                    out_buff32[1] = MIN(MAX(out_buff32[1], output_activation_min), output_activation_max);
+                    out_buff32[2] = MIN(MAX(out_buff32[2], output_activation_min), output_activation_max);
+                    out_buff32[3] = MIN(MAX(out_buff32[3], output_activation_min), output_activation_max);
+
+                    output[out_idx++] = (int16_t)out_buff32[0];
+                    output[out_idx++] = (int16_t)out_buff32[1];
+                    output[out_idx++] = (int16_t)out_buff32[2];
+                    output[out_idx++] = (int16_t)out_buff32[3];
+                }
+            }
+        }
+    }
+}
+
+static void depthwise_conv_s16_generic_s16(const int16_t *input,
+                                           const uint16_t input_batches,
+                                           const uint16_t input_x,
+                                           const uint16_t input_y,
+                                           const uint16_t input_ch,
+                                           const int8_t *kernel,
+                                           const uint16_t ch_mult,
+                                           const uint16_t kernel_x,
+                                           const uint16_t kernel_y,
+                                           const uint16_t pad_x,
+                                           const uint16_t pad_y,
+                                           const uint16_t stride_x,
+                                           const uint16_t stride_y,
+                                           const int64_t *bias,
+                                           int16_t *output,
+                                           const int32_t *output_shift,
+                                           const int32_t *output_mult,
+                                           const uint16_t output_x,
+                                           const uint16_t output_y,
+                                           const int32_t output_activation_min,
+                                           const int32_t output_activation_max,
+                                           const uint16_t dilation_x,
+                                           const uint16_t dilation_y)
+/* Reference kernel: nested loops over batch, output y/x, input channel and channel multiplier; supports dilation. */
+{
+    for (int i_batch = 0; i_batch < input_batches; i_batch++)
+    {
+        for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
+        {
+            const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; /* NOTE(review): narrowed to int16_t - confirm range */
+            for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
+            {
+                const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; /* NOTE(review): same narrowing as base_idx_y */
+                for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
+                {
+                    for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
+                    {
+                        const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
+
+                        const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[idx_out_ch]);
+                        int64_t acc_0 = 0;
+
+                        int ker_y_start;
+                        int ker_x_start;
+                        int ker_y_end;
+                        int ker_x_end;
+                        /* Clip the kernel range to taps inside the input; dilated bounds add (dilation - 1) to round up. */
+                        if (dilation_x > 1)
+                        {
+                            const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x;
+                            ker_x_start = MAX(0, start_x_max);
+                            const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x;
+                            ker_x_end = MIN(kernel_x, end_min_x);
+                        }
+                        else
+                        {
+                            ker_x_start = MAX(0, -base_idx_x);
+                            ker_x_end = MIN(kernel_x, input_x - base_idx_x);
+                        }
+
+                        if (dilation_y > 1)
+                        {
+                            const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y;
+                            ker_y_start = MAX(0, start_y_max);
+                            const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y;
+                            ker_y_end = MIN(kernel_y, end_min_y);
+                        }
+                        else
+                        {
+                            ker_y_start = MAX(0, -base_idx_y);
+                            ker_y_end = MIN(kernel_y, input_y - base_idx_y);
+                        }
+                        /* The 64-bit accumulator starts from the channel's bias when a bias pointer is supplied. */
+                        if (bias)
+                        {
+                            acc_0 = bias[idx_out_ch];
+                        }
+
+                        for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
+                        {
+                            const int32_t idx_y = base_idx_y + dilation_y * i_ker_y;
+                            for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
+                            {
+                                const int32_t idx_x = base_idx_x + dilation_x * i_ker_x;
+                                int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
+                                int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;
+
+                                acc_0 += input[idx_0] * kernel[ker_idx_0];
+                            }
+                        }
+
+                        /* Requantize and clamp output to provided range */
+                        int32_t result = arm_nn_requantize_s64(acc_0, reduced_multiplier, output_shift[idx_out_ch]);
+                        result = MAX(result, output_activation_min);
+                        result = MIN(result, output_activation_max);
+                        *output++ = (int16_t)result;
+                    }
+                }
+            }
+        }
+        /* Advance to the next batch; the output pointer has already moved on via the writes above */
+        input += (input_x * input_y * input_ch);
+    }
+}
+
+/*
+ *  Basic s16 depthwise convolution function. Unpacks the parameter structs and
+ *  forwards everything to depthwise_conv_s16_generic_s16(), including the batch
+ *  count and dilation. ctx and bias_dims are unused; always returns ARM_MATH_SUCCESS.
+ *  Refer header file for details.
+ */
+arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
+                                  const cmsis_nn_dw_conv_params *dw_conv_params,
+                                  const cmsis_nn_per_channel_quant_params *quant_params,
+                                  const cmsis_nn_dims *input_dims,
+                                  const q15_t *input,
+                                  const cmsis_nn_dims *filter_dims,
+                                  const q7_t *kernel,
+                                  const cmsis_nn_dims *bias_dims,
+                                  const int64_t *bias,
+                                  const cmsis_nn_dims *output_dims,
+                                  q15_t *output)
+{
+    const uint16_t dilation_x = dw_conv_params->dilation.w;
+    const uint16_t dilation_y = dw_conv_params->dilation.h;
+
+    (void)bias_dims; /* unused: silence unused-parameter warnings */
+    (void)ctx;       /* unused: no scratch buffer is needed here */
+
+    depthwise_conv_s16_generic_s16(input,
+                                   input_dims->n,
+                                   input_dims->w,
+                                   input_dims->h,
+                                   input_dims->c,
+                                   kernel,
+                                   dw_conv_params->ch_mult,
+                                   filter_dims->w,
+                                   filter_dims->h,
+                                   dw_conv_params->padding.w,
+                                   dw_conv_params->padding.h,
+                                   dw_conv_params->stride.w,
+                                   dw_conv_params->stride.h,
+                                   bias,
+                                   output,
+                                   quant_params->shift,
+                                   quant_params->multiplier,
+                                   output_dims->w,
+                                   output_dims->h,
+                                   dw_conv_params->activation.min,
+                                   dw_conv_params->activation.max,
+                                   dilation_x,
+                                   dilation_y);
+
+    /* Return to application */
+    return ARM_MATH_SUCCESS;
+}
+
+/**
+ * @} end of NNConv group
+ */
diff --git a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
index 257f975..5ebc907 100644
--- a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
+++ b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021 Arm Limited.
+# Copyright (c) 2019-2022 Arm Limited.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -74,6 +74,7 @@
 add_subdirectory(TestCases/test_arm_depthwise_conv_3x3_s8)
 add_subdirectory(TestCases/test_arm_depthwise_conv_s8)
 add_subdirectory(TestCases/test_arm_depthwise_conv_s8_opt)
+add_subdirectory(TestCases/test_arm_depthwise_conv_s16)
 add_subdirectory(TestCases/test_arm_fully_connected_s8)
 add_subdirectory(TestCases/test_arm_fully_connected_s16)
 add_subdirectory(TestCases/test_arm_max_pool_s8)
diff --git a/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8/bias.txt b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8/bias.txt
new file mode 100644
index 0000000..7a2d8e1
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8/bias.txt
@@ -0,0 +1,2 @@
+# 8
+2.982800000000000000e+04,-1.661600000000000000e+04,-3.065100000000000000e+04,-1.166800000000000000e+04,1.840400000000000000e+04,1.216300000000000000e+04,5.951000000000000000e+03,2.316500000000000000e+04
diff --git a/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8/input.txt b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8/input.txt
new file mode 100644
index 0000000..16e362c
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8/input.txt
@@ -0,0 +1,46 @@
+# 1,5,9,4
+-3.127200000000000000e+04,-7.747000000000000000e+03,-2.087200000000000000e+04,1.132300000000000000e+04
+1.253200000000000000e+04,-2.800600000000000000e+04,4.641000000000000000e+03,-1.899300000000000000e+04
+2.693700000000000000e+04,2.359900000000000000e+04,-1.341300000000000000e+04,-2.112800000000000000e+04
+9.067000000000000000e+03,2.733000000000000000e+04,1.907200000000000000e+04,-2.921400000000000000e+04
+2.215000000000000000e+03,-2.876000000000000000e+04,1.588900000000000000e+04,-1.911100000000000000e+04
+-2.901600000000000000e+04,1.736000000000000000e+04,2.597200000000000000e+04,2.373200000000000000e+04
+3.010100000000000000e+04,-8.538000000000000000e+03,-9.263000000000000000e+03,8.245000000000000000e+03
+-1.567300000000000000e+04,9.282000000000000000e+03,1.823600000000000000e+04,2.352800000000000000e+04
+3.966000000000000000e+03,5.270000000000000000e+03,1.091600000000000000e+04,-1.125600000000000000e+04
+-1.617700000000000000e+04,-4.326000000000000000e+03,2.181200000000000000e+04,-1.821000000000000000e+03
+5.136000000000000000e+03,-1.117100000000000000e+04,5.400000000000000000e+03,-1.866200000000000000e+04
+1.184500000000000000e+04,2.408900000000000000e+04,-3.239800000000000000e+04,-9.535000000000000000e+03
+1.813500000000000000e+04,1.491500000000000000e+04,2.260000000000000000e+04,2.264700000000000000e+04
+-1.934200000000000000e+04,-9.410000000000000000e+03,1.688800000000000000e+04,-3.021200000000000000e+04
+-2.102500000000000000e+04,-1.855300000000000000e+04,-1.800000000000000000e+04,2.176300000000000000e+04
+2.901200000000000000e+04,-2.197400000000000000e+04,7.414000000000000000e+03,1.501500000000000000e+04
+-2.791700000000000000e+04,-1.871000000000000000e+03,2.650700000000000000e+04,3.269000000000000000e+04
+-1.236600000000000000e+04,-2.839200000000000000e+04,-7.318000000000000000e+03,-4.925000000000000000e+03
+8.718000000000000000e+03,4.905000000000000000e+03,1.517400000000000000e+04,-2.038500000000000000e+04
+-3.105100000000000000e+04,-6.500000000000000000e+02,-2.726900000000000000e+04,2.997800000000000000e+04
+2.294500000000000000e+04,-1.659500000000000000e+04,1.376600000000000000e+04,-6.699000000000000000e+03
+2.818600000000000000e+04,1.391900000000000000e+04,-2.301100000000000000e+04,2.274500000000000000e+04
+-2.955800000000000000e+04,-5.496000000000000000e+03,3.191700000000000000e+04,-1.309500000000000000e+04
+8.373000000000000000e+03,-1.977300000000000000e+04,-2.103500000000000000e+04,3.203200000000000000e+04
+1.608800000000000000e+04,3.028300000000000000e+04,1.095300000000000000e+04,-2.722200000000000000e+04
+-1.997000000000000000e+03,-1.699800000000000000e+04,-6.534000000000000000e+03,-2.432000000000000000e+04
+2.057800000000000000e+04,8.070000000000000000e+03,1.502000000000000000e+03,8.771000000000000000e+03
+-3.163700000000000000e+04,-1.955200000000000000e+04,3.218600000000000000e+04,1.930500000000000000e+04
+3.178400000000000000e+04,2.120200000000000000e+04,2.752300000000000000e+04,-3.032500000000000000e+04
+7.806000000000000000e+03,2.046500000000000000e+04,1.389200000000000000e+04,-1.132400000000000000e+04
+8.090000000000000000e+03,1.747400000000000000e+04,1.176000000000000000e+04,-1.959700000000000000e+04
+-1.989500000000000000e+04,2.135600000000000000e+04,1.652300000000000000e+04,-9.791000000000000000e+03
+-1.856000000000000000e+03,-1.221500000000000000e+04,1.209700000000000000e+04,3.098700000000000000e+04
+-1.460500000000000000e+04,-1.212400000000000000e+04,-6.613000000000000000e+03,1.577400000000000000e+04
+1.162700000000000000e+04,2.471000000000000000e+04,-2.268900000000000000e+04,2.454300000000000000e+04
+2.609700000000000000e+04,-1.072400000000000000e+04,3.131500000000000000e+04,-1.269300000000000000e+04
+2.756500000000000000e+04,3.149000000000000000e+03,1.910200000000000000e+04,1.857900000000000000e+04
+3.479000000000000000e+03,-2.629700000000000000e+04,1.219300000000000000e+04,2.986200000000000000e+04
+2.908800000000000000e+04,-3.228100000000000000e+04,-1.065800000000000000e+04,-1.952000000000000000e+03
+7.832000000000000000e+03,1.108500000000000000e+04,3.221400000000000000e+04,-6.342000000000000000e+03
+5.380000000000000000e+03,2.441800000000000000e+04,-5.636000000000000000e+03,-1.902700000000000000e+04
+9.984000000000000000e+03,-4.049000000000000000e+03,1.847800000000000000e+04,-3.116800000000000000e+04
+-2.311200000000000000e+04,1.655400000000000000e+04,-1.482900000000000000e+04,3.011000000000000000e+04
+9.078000000000000000e+03,1.944600000000000000e+04,1.622200000000000000e+04,2.120700000000000000e+04
+-1.322000000000000000e+03,3.246400000000000000e+04,2.085200000000000000e+04,-3.700000000000000000e+02
diff --git a/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8/kernel.txt b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8/kernel.txt
new file mode 100644
index 0000000..10f1340
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8/kernel.txt
@@ -0,0 +1,49 @@
+# 4,3,4,2
+-2.549400000000000000e+04,4.536000000000000000e+03
+-1.558500000000000000e+04,6.842000000000000000e+03
+-3.173600000000000000e+04,-2.232000000000000000e+03
+-2.939700000000000000e+04,-1.447000000000000000e+04
+2.987700000000000000e+04,3.907000000000000000e+03
+1.905200000000000000e+04,1.957400000000000000e+04
+2.564600000000000000e+04,-2.410500000000000000e+04
+2.587100000000000000e+04,-1.815900000000000000e+04
+2.480000000000000000e+03,6.527000000000000000e+03
+1.479900000000000000e+04,-2.943500000000000000e+04
+-3.616000000000000000e+03,-7.591000000000000000e+03
+2.678400000000000000e+04,7.252000000000000000e+03
+-5.295000000000000000e+03,2.360700000000000000e+04
+1.351500000000000000e+04,-9.980000000000000000e+03
+-1.293300000000000000e+04,-3.079800000000000000e+04
+-2.806000000000000000e+03,1.699400000000000000e+04
+2.968100000000000000e+04,-7.461000000000000000e+03
+-4.394000000000000000e+03,-2.294900000000000000e+04
+-8.775000000000000000e+03,-6.773000000000000000e+03
+2.015400000000000000e+04,-2.114600000000000000e+04
+-1.672000000000000000e+04,1.779000000000000000e+03
+-6.890000000000000000e+02,-1.744000000000000000e+04
+2.300900000000000000e+04,-1.729600000000000000e+04
+-1.516300000000000000e+04,2.934500000000000000e+04
+3.162900000000000000e+04,9.363000000000000000e+03
+-2.547000000000000000e+04,-1.498600000000000000e+04
+-3.218100000000000000e+04,-1.277900000000000000e+04
+-5.992000000000000000e+03,-1.700100000000000000e+04
+-3.212300000000000000e+04,-5.900000000000000000e+01
+9.540000000000000000e+03,3.155600000000000000e+04
+-1.960400000000000000e+04,4.146000000000000000e+03
+-2.451600000000000000e+04,1.873800000000000000e+04
+2.503000000000000000e+03,-2.225500000000000000e+04
+2.758400000000000000e+04,-2.599200000000000000e+04
+-2.219700000000000000e+04,2.299100000000000000e+04
+-2.359000000000000000e+04,-1.134900000000000000e+04
+-2.286600000000000000e+04,3.392000000000000000e+03
+-2.309900000000000000e+04,1.174200000000000000e+04
+1.349600000000000000e+04,-1.895400000000000000e+04
+5.585000000000000000e+03,-1.187700000000000000e+04
+2.663800000000000000e+04,3.179400000000000000e+04
+2.235900000000000000e+04,2.397600000000000000e+04
+-1.107400000000000000e+04,-5.145000000000000000e+03
+1.395300000000000000e+04,-9.516000000000000000e+03
+-1.579900000000000000e+04,-2.128000000000000000e+04
+-2.984800000000000000e+04,1.232300000000000000e+04
+1.877800000000000000e+04,3.090600000000000000e+04
+-1.628200000000000000e+04,1.331000000000000000e+04
diff --git a/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_dilation/bias.txt b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_dilation/bias.txt
new file mode 100644
index 0000000..c36251d
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_dilation/bias.txt
@@ -0,0 +1,2 @@
+# 8
+2.326300000000000000e+04,3.024400000000000000e+04,-5.510000000000000000e+02,1.642900000000000000e+04,2.370300000000000000e+04,-2.586200000000000000e+04,1.082000000000000000e+04,-2.185900000000000000e+04
diff --git a/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_dilation/input.txt b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_dilation/input.txt
new file mode 100644
index 0000000..a5c4313
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_dilation/input.txt
@@ -0,0 +1,46 @@
+# 1,5,9,4
+2.245100000000000000e+04,-2.812700000000000000e+04,-1.384600000000000000e+04,-1.854300000000000000e+04
+-3.057900000000000000e+04,-4.546000000000000000e+03,-2.075300000000000000e+04,2.265200000000000000e+04
+2.401400000000000000e+04,-2.838000000000000000e+04,-6.297000000000000000e+03,-1.374900000000000000e+04
+2.266600000000000000e+04,-2.645400000000000000e+04,-1.575000000000000000e+04,1.961300000000000000e+04
+-1.646100000000000000e+04,-1.995600000000000000e+04,1.114900000000000000e+04,2.349500000000000000e+04
+-2.964000000000000000e+03,-2.778800000000000000e+04,1.957500000000000000e+04,1.134000000000000000e+03
+1.001100000000000000e+04,-1.489500000000000000e+04,-3.450000000000000000e+02,1.998000000000000000e+04
+-1.378900000000000000e+04,3.048100000000000000e+04,1.830800000000000000e+04,1.239700000000000000e+04
+-7.340000000000000000e+02,2.516600000000000000e+04,2.751000000000000000e+04,5.664000000000000000e+03
+-1.563300000000000000e+04,-1.039800000000000000e+04,-1.481600000000000000e+04,3.190100000000000000e+04
+2.344100000000000000e+04,2.268700000000000000e+04,-2.332600000000000000e+04,-4.507000000000000000e+03
+-3.885000000000000000e+03,3.268900000000000000e+04,-1.320800000000000000e+04,2.057800000000000000e+04
+-2.823400000000000000e+04,-7.725000000000000000e+03,2.171100000000000000e+04,-2.280000000000000000e+04
+-2.391800000000000000e+04,-2.822600000000000000e+04,-9.705000000000000000e+03,-1.268000000000000000e+03
+-2.609000000000000000e+04,-6.391000000000000000e+03,-1.991300000000000000e+04,-1.843100000000000000e+04
+-3.206900000000000000e+04,-3.011900000000000000e+04,-1.715200000000000000e+04,-1.275200000000000000e+04
+-1.287400000000000000e+04,-1.972500000000000000e+04,1.450900000000000000e+04,-1.713600000000000000e+04
+-1.841000000000000000e+04,2.800700000000000000e+04,-4.493000000000000000e+03,-1.956800000000000000e+04
+-2.419100000000000000e+04,-1.118700000000000000e+04,5.532000000000000000e+03,1.385000000000000000e+04
+1.279800000000000000e+04,2.417300000000000000e+04,-2.798000000000000000e+03,-2.040000000000000000e+02
+8.340000000000000000e+02,1.112100000000000000e+04,1.299900000000000000e+04,2.944000000000000000e+04
+7.309000000000000000e+03,-1.603600000000000000e+04,-2.206000000000000000e+04,-2.693100000000000000e+04
+-1.718800000000000000e+04,-2.084800000000000000e+04,-4.300000000000000000e+01,-1.937100000000000000e+04
+-1.171600000000000000e+04,-2.977100000000000000e+04,1.491600000000000000e+04,2.716900000000000000e+04
+-2.813900000000000000e+04,-1.304600000000000000e+04,1.630900000000000000e+04,-1.369100000000000000e+04
+-3.012900000000000000e+04,-1.003400000000000000e+04,-3.905000000000000000e+03,1.255600000000000000e+04
+1.619100000000000000e+04,-3.121100000000000000e+04,-5.861000000000000000e+03,-1.376100000000000000e+04
+2.385400000000000000e+04,-3.240000000000000000e+04,-1.633300000000000000e+04,-3.276300000000000000e+04
+-1.751000000000000000e+03,-3.009700000000000000e+04,-2.181700000000000000e+04,2.074700000000000000e+04
+8.616000000000000000e+03,-1.397200000000000000e+04,-1.251200000000000000e+04,-9.105000000000000000e+03
+2.612100000000000000e+04,1.505400000000000000e+04,-1.584800000000000000e+04,3.081600000000000000e+04
+6.712000000000000000e+03,-2.696600000000000000e+04,4.734000000000000000e+03,-2.794200000000000000e+04
+-1.283600000000000000e+04,2.921400000000000000e+04,-2.371100000000000000e+04,-2.844000000000000000e+04
+7.570000000000000000e+02,1.694800000000000000e+04,-2.184800000000000000e+04,-9.721000000000000000e+03
+1.885000000000000000e+03,-1.259900000000000000e+04,9.509000000000000000e+03,1.834900000000000000e+04
+-2.495200000000000000e+04,2.045000000000000000e+03,-3.092600000000000000e+04,-2.718300000000000000e+04
+-1.258600000000000000e+04,-2.774600000000000000e+04,1.109100000000000000e+04,-1.114400000000000000e+04
+-1.371800000000000000e+04,-2.997600000000000000e+04,-3.087100000000000000e+04,-1.817100000000000000e+04
+1.808000000000000000e+04,2.851100000000000000e+04,-1.267800000000000000e+04,-1.916900000000000000e+04
+2.256300000000000000e+04,-9.642000000000000000e+03,-5.862000000000000000e+03,1.719100000000000000e+04
+-2.778600000000000000e+04,-1.937900000000000000e+04,4.749000000000000000e+03,1.322400000000000000e+04
+-2.240300000000000000e+04,-1.563800000000000000e+04,-3.176200000000000000e+04,-2.328800000000000000e+04
+2.354800000000000000e+04,1.926000000000000000e+03,-2.661400000000000000e+04,1.144600000000000000e+04
+6.810000000000000000e+03,1.144200000000000000e+04,-7.535000000000000000e+03,-1.312600000000000000e+04
+-1.175600000000000000e+04,7.695000000000000000e+03,-1.646100000000000000e+04,2.326200000000000000e+04
diff --git a/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_dilation/kernel.txt b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_dilation/kernel.txt
new file mode 100644
index 0000000..96a2a51
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_dilation/kernel.txt
@@ -0,0 +1,65 @@
+# 4,4,4,2
+1.547500000000000000e+04,2.301100000000000000e+04
+-2.753300000000000000e+04,-1.367400000000000000e+04
+2.940000000000000000e+04,1.689200000000000000e+04
+-5.753000000000000000e+03,2.514000000000000000e+03
+-2.854600000000000000e+04,1.028000000000000000e+04
+-2.264000000000000000e+04,2.997000000000000000e+03
+-2.306600000000000000e+04,-2.716900000000000000e+04
+2.341500000000000000e+04,1.068400000000000000e+04
+1.540600000000000000e+04,7.210000000000000000e+03
+-3.152100000000000000e+04,-3.093700000000000000e+04
+2.372600000000000000e+04,1.953500000000000000e+04
+-3.215000000000000000e+03,-7.766000000000000000e+03
+3.469000000000000000e+03,2.030300000000000000e+04
+-2.877300000000000000e+04,-1.444900000000000000e+04
+-1.983400000000000000e+04,-1.113200000000000000e+04
+2.386500000000000000e+04,1.567000000000000000e+04
+1.461300000000000000e+04,1.461000000000000000e+03
+-1.106300000000000000e+04,3.220400000000000000e+04
+-1.427500000000000000e+04,4.433000000000000000e+03
+2.327400000000000000e+04,-6.877000000000000000e+03
+-1.321800000000000000e+04,-1.717000000000000000e+04
+-3.226000000000000000e+03,1.252900000000000000e+04
+-3.170800000000000000e+04,3.086800000000000000e+04
+-8.160000000000000000e+02,1.859000000000000000e+04
+8.010000000000000000e+03,-2.930800000000000000e+04
+1.623100000000000000e+04,-1.482500000000000000e+04
+-2.657000000000000000e+04,2.635700000000000000e+04
+2.471000000000000000e+03,2.679800000000000000e+04
+9.508000000000000000e+03,-9.457000000000000000e+03
+2.981800000000000000e+04,3.265100000000000000e+04
+-4.409000000000000000e+03,-1.206800000000000000e+04
+1.988700000000000000e+04,9.155000000000000000e+03
+2.804300000000000000e+04,-2.336300000000000000e+04
+3.141500000000000000e+04,2.188600000000000000e+04
+2.726400000000000000e+04,-5.060000000000000000e+03
+-1.044800000000000000e+04,1.611100000000000000e+04
+-2.524400000000000000e+04,-2.923200000000000000e+04
+3.032100000000000000e+04,7.989000000000000000e+03
+2.515900000000000000e+04,1.172600000000000000e+04
+-2.428500000000000000e+04,-2.048200000000000000e+04
+2.717100000000000000e+04,-1.199800000000000000e+04
+2.064300000000000000e+04,-8.400000000000000000e+01
+-1.660100000000000000e+04,9.368000000000000000e+03
+3.124800000000000000e+04,-1.902600000000000000e+04
+-1.903400000000000000e+04,-2.442700000000000000e+04
+-2.629900000000000000e+04,-2.947400000000000000e+04
+2.791000000000000000e+03,2.417200000000000000e+04
+1.751300000000000000e+04,3.450000000000000000e+03
+4.306000000000000000e+03,8.994000000000000000e+03
+-3.149800000000000000e+04,-3.036200000000000000e+04
+1.795300000000000000e+04,2.452400000000000000e+04
+-1.931300000000000000e+04,-2.563200000000000000e+04
+1.334900000000000000e+04,-1.101900000000000000e+04
+-2.293000000000000000e+04,1.491200000000000000e+04
+2.395400000000000000e+04,-7.796000000000000000e+03
+1.200300000000000000e+04,3.309000000000000000e+03
+2.212000000000000000e+04,-2.899800000000000000e+04
+2.020700000000000000e+04,2.212500000000000000e+04
+-2.756100000000000000e+04,-2.093900000000000000e+04
+-2.674400000000000000e+04,2.788100000000000000e+04
+-2.929200000000000000e+04,-3.224700000000000000e+04
+7.653000000000000000e+03,1.954300000000000000e+04
+2.698000000000000000e+03,-1.963100000000000000e+04
+-2.152400000000000000e+04,-3.009400000000000000e+04
diff --git a/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_mult4/bias.txt b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_mult4/bias.txt
new file mode 100644
index 0000000..d1967d5
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_mult4/bias.txt
@@ -0,0 +1,2 @@
+# 8
+1.735000000000000000e+03,-2.972400000000000000e+04,2.543300000000000000e+04,1.310000000000000000e+04,-2.080600000000000000e+04,1.646500000000000000e+04,-2.109200000000000000e+04,-2.065200000000000000e+04
diff --git a/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_mult4/input.txt b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_mult4/input.txt
new file mode 100644
index 0000000..d9d2bea
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_mult4/input.txt
@@ -0,0 +1,21 @@
+# 1,5,4,2
+-1.748500000000000000e+04,2.181900000000000000e+04
+-2.631200000000000000e+04,-1.444600000000000000e+04
+3.232600000000000000e+04,2.648200000000000000e+04
+-4.390000000000000000e+03,-1.862100000000000000e+04
+-3.218600000000000000e+04,1.503700000000000000e+04
+-7.461000000000000000e+03,3.031200000000000000e+04
+-1.899000000000000000e+04,1.560500000000000000e+04
+-2.006400000000000000e+04,3.712000000000000000e+03
+-2.556700000000000000e+04,-2.221600000000000000e+04
+-7.666000000000000000e+03,4.861000000000000000e+03
+3.091800000000000000e+04,-1.954000000000000000e+03
+1.495000000000000000e+04,6.892000000000000000e+03
+-2.162800000000000000e+04,-1.036900000000000000e+04
+1.090100000000000000e+04,3.197900000000000000e+04
+-2.423300000000000000e+04,1.429700000000000000e+04
+1.544400000000000000e+04,-3.017500000000000000e+04
+-3.206100000000000000e+04,-2.922700000000000000e+04
+2.882600000000000000e+04,-9.789000000000000000e+03
+5.445000000000000000e+03,-3.202200000000000000e+04
+1.991300000000000000e+04,9.312000000000000000e+03
diff --git a/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_mult4/kernel.txt b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_mult4/kernel.txt
new file mode 100644
index 0000000..4ef976d
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_mult4/kernel.txt
@@ -0,0 +1,25 @@
+# 4,3,2,4
+-2.204300000000000000e+04,8.290000000000000000e+02,1.209100000000000000e+04,9.927000000000000000e+03
+1.876000000000000000e+03,-7.717000000000000000e+03,-2.111600000000000000e+04,1.725100000000000000e+04
+-3.754000000000000000e+03,-2.688400000000000000e+04,-1.348100000000000000e+04,2.715300000000000000e+04
+1.864900000000000000e+04,-9.133000000000000000e+03,-3.135900000000000000e+04,1.076600000000000000e+04
+2.053200000000000000e+04,-3.000000000000000000e+01,-2.513000000000000000e+04,-1.872000000000000000e+03
+1.954000000000000000e+03,-6.241000000000000000e+03,-1.730000000000000000e+03,2.911100000000000000e+04
+-1.843600000000000000e+04,-2.133700000000000000e+04,-5.508000000000000000e+03,-9.538000000000000000e+03
+2.880300000000000000e+04,1.326400000000000000e+04,2.713000000000000000e+03,1.715800000000000000e+04
+-1.026300000000000000e+04,2.984000000000000000e+03,-1.666000000000000000e+04,-2.379400000000000000e+04
+-2.750000000000000000e+04,-2.577000000000000000e+04,1.090000000000000000e+03,2.097600000000000000e+04
+7.494000000000000000e+03,5.668000000000000000e+03,-1.014800000000000000e+04,-2.330300000000000000e+04
+4.114000000000000000e+03,2.963300000000000000e+04,3.131800000000000000e+04,1.411800000000000000e+04
+-1.181100000000000000e+04,-2.866600000000000000e+04,-6.682000000000000000e+03,2.747700000000000000e+04
+7.446000000000000000e+03,-1.548300000000000000e+04,6.605000000000000000e+03,1.252800000000000000e+04
+2.012900000000000000e+04,8.510000000000000000e+02,3.077100000000000000e+04,-8.977000000000000000e+03
+-1.844600000000000000e+04,2.006400000000000000e+04,-4.102000000000000000e+03,1.073100000000000000e+04
+3.235100000000000000e+04,-6.595000000000000000e+03,7.492000000000000000e+03,-2.963400000000000000e+04
+2.971000000000000000e+03,-4.701000000000000000e+03,3.272000000000000000e+03,1.595000000000000000e+03
+2.967200000000000000e+04,3.008000000000000000e+03,1.705500000000000000e+04,2.425300000000000000e+04
+-1.778900000000000000e+04,-1.688600000000000000e+04,-3.005800000000000000e+04,-2.047000000000000000e+03
+3.013600000000000000e+04,-9.795000000000000000e+03,6.234000000000000000e+03,-2.966200000000000000e+04
+-1.078500000000000000e+04,-1.464100000000000000e+04,-1.599900000000000000e+04,1.948900000000000000e+04
+2.809200000000000000e+04,2.544100000000000000e+04,-1.237000000000000000e+03,9.296000000000000000e+03
+-2.618200000000000000e+04,8.015000000000000000e+03,5.585000000000000000e+03,-4.513000000000000000e+03
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/biases_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/biases_data.h
new file mode 100644
index 0000000..c27064a
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/biases_data.h
@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const int64_t dw_int16xint8_biases[8] = {3864101, -2174812, -4273364, -1538703, 2379869, 1637715, 842417, 3285024};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/config_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/config_data.h
new file mode 100644
index 0000000..b98262c
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/config_data.h
@@ -0,0 +1,24 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#define DW_INT16XINT8_OUT_CH 8
+#define DW_INT16XINT8_IN_CH 4
+#define DW_INT16XINT8_INPUT_W 9
+#define DW_INT16XINT8_INPUT_H 5
+#define DW_INT16XINT8_DST_SIZE 72
+#define DW_INT16XINT8_INPUT_SIZE 180
+#define DW_INT16XINT8_OUT_ACTIVATION_MIN -21111
+#define DW_INT16XINT8_OUT_ACTIVATION_MAX 32767
+#define DW_INT16XINT8_INPUT_BATCHES 1
+#define DW_INT16XINT8_FILTER_X 3
+#define DW_INT16XINT8_FILTER_Y 4
+#define DW_INT16XINT8_STRIDE_X 3
+#define DW_INT16XINT8_STRIDE_Y 2
+#define DW_INT16XINT8_PAD_X 0
+#define DW_INT16XINT8_PAD_Y 1
+#define DW_INT16XINT8_OUTPUT_W 3
+#define DW_INT16XINT8_OUTPUT_H 3
+#define DW_INT16XINT8_CH_MULT 2
+#define DW_INT16XINT8_INPUT_OFFSET 0
+#define DW_INT16XINT8_OUTPUT_OFFSET 0
+#define DW_INT16XINT8_DILATION_X 1
+#define DW_INT16XINT8_DILATION_Y 1
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/input_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/input_data.h
new file mode 100644
index 0000000..93c17dc
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/input_data.h
@@ -0,0 +1,18 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_input[180] = {
+    -31272, -7747,  -20872, 11323,  12532,  -28006, 4641,   -18993, 26937,  23599,  -13413, -21128, 9067,   27330,
+    19072,  -29214, 2215,   -28760, 15889,  -19111, -29016, 17360,  25972,  23732,  30101,  -8538,  -9263,  8245,
+    -15673, 9282,   18236,  23528,  3966,   5270,   10916,  -11256, -16177, -4326,  21812,  -1821,  5136,   -11171,
+    5400,   -18662, 11845,  24089,  -32398, -9535,  18135,  14915,  22600,  22647,  -19342, -9410,  16888,  -30212,
+    -21025, -18553, -18000, 21763,  29012,  -21974, 7414,   15015,  -27917, -1871,  26507,  32690,  -12366, -28392,
+    -7318,  -4925,  8718,   4905,   15174,  -20385, -31051, -650,   -27269, 29978,  22945,  -16595, 13766,  -6699,
+    28186,  13919,  -23011, 22745,  -29558, -5496,  31917,  -13095, 8373,   -19773, -21035, 32032,  16088,  30283,
+    10953,  -27222, -1997,  -16998, -6534,  -24320, 20578,  8070,   1502,   8771,   -31637, -19552, 32186,  19305,
+    31784,  21202,  27523,  -30325, 7806,   20465,  13892,  -11324, 8090,   17474,  11760,  -19597, -19895, 21356,
+    16523,  -9791,  -1856,  -12215, 12097,  30987,  -14605, -12124, -6613,  15774,  11627,  24710,  -22689, 24543,
+    26097,  -10724, 31315,  -12693, 27565,  3149,   19102,  18579,  3479,   -26297, 12193,  29862,  29088,  -32281,
+    -10658, -1952,  7832,   11085,  32214,  -6342,  5380,   24418,  -5636,  -19027, 9984,   -4049,  18478,  -31168,
+    -23112, 16554,  -14829, 30110,  9078,   19446,  16222,  21207,  -1322,  32464,  20852,  -370};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/output_mult_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/output_mult_data.h
new file mode 100644
index 0000000..153839d
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/output_mult_data.h
@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_output_mult[8] =
+    {1814563076, 1795978464, 1686052907, 1782534381, 1817839397, 1745817143, 1660576855, 1657639441};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/output_ref_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/output_ref_data.h
new file mode 100644
index 0000000..38d8faa
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/output_ref_data.h
@@ -0,0 +1,10 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_output_ref[72] = {
+    -12590, -21111, 403,   -15091, 15704,  6791,   15782, 5124,   13868,  -10195, -17592, -7150, -11747, -21111, -12457,
+    5852,   17333,  5061,  -21111, 5956,   6110,   1181,  -7691,  14996,  -21111, -12003, 11163, -21111, -18704, -10357,
+    22395,  -18556, -7600, 4244,   -21111, 23562,  -2820, 14575,  -21111, 31088,  -14324, -8578, -16985, 32734,  12942,
+    13438,  -117,   -130,  29034,  2148,   2462,   5599,  -4725,  -12125, -11525, 7517,   1607,  -5720,  -13112, -1061,
+    8747,   -18178, 19050, 10784,  28408,  -13156, -5073, -10973, 8473,   9277,   4904,   570};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/output_shift_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/output_shift_data.h
new file mode 100644
index 0000000..73cf72c
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/output_shift_data.h
@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_output_shift[8] = {-8, -8, -8, -8, -8, -8, -8, -8};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/test_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/test_data.h
new file mode 100644
index 0000000..99f0472
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/test_data.h
@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#include "biases_data.h"
+#include "config_data.h"
+#include "input_data.h"
+#include "output_mult_data.h"
+#include "output_ref_data.h"
+#include "output_shift_data.h"
+#include "weights_data.h"
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/weights_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/weights_data.h
new file mode 100644
index 0000000..60d343c
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8/weights_data.h
@@ -0,0 +1,10 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const q7_t dw_int16xint8_weights[96] = {
+    -101, 18,  -66,  28,  -125, -9,  -127, -63,  118, 16,   81,   79,  101,  -99, 112, -79, 10,   26,  63, -118,
+    -14,  -31, 116,  31,  -21,  94,  58,   -40,  -51, -127, -12,  74,  117,  -30, -19, -92, -35,  -28, 87, -92,
+    -66,  7,   -3,   -70, 91,   -71, -66,  127,  125, 37,   -108, -60, -127, -53, -26, -74, -127, 0,   41, 127,
+    -77,  17,  -106, 81,  10,   -89, 117,  -105, -88, 94,   -102, -49, -90,  14,  -98, 47,  53,   -78, 24, -51,
+    105,  127, 95,   96,  -44,  -21, 60,   -41,  -62, -85,  -127, 50,  74,   127, -70, 58};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/biases_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/biases_data.h
new file mode 100644
index 0000000..853848e
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/biases_data.h
@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const int64_t dw_int16xint8_dilation_biases[8] =
+    {3304891, 3902926, -72743, 2093896, 3110820, -3486535, 1440939, -3022670};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/config_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/config_data.h
new file mode 100644
index 0000000..05c6820
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/config_data.h
@@ -0,0 +1,24 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#define DW_INT16XINT8_DILATION_OUT_CH 8
+#define DW_INT16XINT8_DILATION_IN_CH 4
+#define DW_INT16XINT8_DILATION_INPUT_W 9
+#define DW_INT16XINT8_DILATION_INPUT_H 5
+#define DW_INT16XINT8_DILATION_DST_SIZE 360
+#define DW_INT16XINT8_DILATION_INPUT_SIZE 180
+#define DW_INT16XINT8_DILATION_OUT_ACTIVATION_MIN -32700
+#define DW_INT16XINT8_DILATION_OUT_ACTIVATION_MAX 32767
+#define DW_INT16XINT8_DILATION_INPUT_BATCHES 1
+#define DW_INT16XINT8_DILATION_FILTER_X 4
+#define DW_INT16XINT8_DILATION_FILTER_Y 4
+#define DW_INT16XINT8_DILATION_STRIDE_X 1
+#define DW_INT16XINT8_DILATION_STRIDE_Y 1
+#define DW_INT16XINT8_DILATION_PAD_X 4
+#define DW_INT16XINT8_DILATION_PAD_Y 3
+#define DW_INT16XINT8_DILATION_OUTPUT_W 9
+#define DW_INT16XINT8_DILATION_OUTPUT_H 5
+#define DW_INT16XINT8_DILATION_CH_MULT 2
+#define DW_INT16XINT8_DILATION_INPUT_OFFSET 0
+#define DW_INT16XINT8_DILATION_OUTPUT_OFFSET 0
+#define DW_INT16XINT8_DILATION_DILATION_X 3
+#define DW_INT16XINT8_DILATION_DILATION_Y 2
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/input_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/input_data.h
new file mode 100644
index 0000000..ecefafe
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/input_data.h
@@ -0,0 +1,18 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_dilation_input[180] = {
+    22451,  -28127, -13846, -18543, -30579, -4546,  -20753, 22652,  24014,  -28380, -6297,  -13749, 22666,  -26454,
+    -15750, 19613,  -16461, -19956, 11149,  23495,  -2964,  -27788, 19575,  1134,   10011,  -14895, -345,   19980,
+    -13789, 30481,  18308,  12397,  -734,   25166,  27510,  5664,   -15633, -10398, -14816, 31901,  23441,  22687,
+    -23326, -4507,  -3885,  32689,  -13208, 20578,  -28234, -7725,  21711,  -22800, -23918, -28226, -9705,  -1268,
+    -26090, -6391,  -19913, -18431, -32069, -30119, -17152, -12752, -12874, -19725, 14509,  -17136, -18410, 28007,
+    -4493,  -19568, -24191, -11187, 5532,   13850,  12798,  24173,  -2798,  -204,   834,    11121,  12999,  29440,
+    7309,   -16036, -22060, -26931, -17188, -20848, -43,    -19371, -11716, -29771, 14916,  27169,  -28139, -13046,
+    16309,  -13691, -30129, -10034, -3905,  12556,  16191,  -31211, -5861,  -13761, 23854,  -32400, -16333, -32763,
+    -1751,  -30097, -21817, 20747,  8616,   -13972, -12512, -9105,  26121,  15054,  -15848, 30816,  6712,   -26966,
+    4734,   -27942, -12836, 29214,  -23711, -28440, 757,    16948,  -21848, -9721,  1885,   -12599, 9509,   18349,
+    -24952, 2045,   -30926, -27183, -12586, -27746, 11091,  -11144, -13718, -29976, -30871, -18171, 18080,  28511,
+    -12678, -19169, 22563,  -9642,  -5862,  17191,  -27786, -19379, 4749,   13224,  -22403, -15638, -31762, -23288,
+    23548,  1926,   -26614, 11446,  6810,   11442,  -7535,  -13126, -11756, 7695,   -16461, 23262};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/output_mult_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/output_mult_data.h
new file mode 100644
index 0000000..ccef8d2
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/output_mult_data.h
@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_dilation_output_mult[8] =
+    {1549993836, 1706358422, 1667941929, 1727736113, 1677837094, 1633388220, 1653496022, 1592431848};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/output_ref_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/output_ref_data.h
new file mode 100644
index 0000000..959a6a0
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/output_ref_data.h
@@ -0,0 +1,31 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_dilation_output_ref[360] = {
+    10529,  11233,  4690,   6046,   7445,   -4422,  9342,   -3487,  11467,  9322,   9004,   9332,   -110,  -2071,
+    -14492, 404,    -988,   4094,   4001,   68,     -1458,  -5983,  4474,   -11772, 6505,   18010,  6267,  2740,
+    6270,   -590,   1041,   -5276,  3851,   15086,  -1112,  11195,  6908,   -3069,  8818,   7170,   10981, 7041,
+    4567,   6366,   -5390,  -9017,  -8133,  -3354,  742,    18350,  7397,   12875,  277,    -3247,  3302,  -1586,
+    5492,   17180,  -12438, -455,   773,    -8419,  1175,   -10052, 2858,   12008,  -2245,  2411,   7282,  -3584,
+    9428,   2374,   13640,  4843,   988,    7696,   5764,   745,    16811,  -8377,  10586,  -749,   -4089, -1226,
+    15498,  -5354,  -8825,  -430,   -1895,  20961,  9363,   9848,   524,    -7675,  -190,   9575,   -880,  7462,
+    80,     16246,  5646,   2300,   1634,   -21187, 56,     -66,    1025,   -663,   9764,   -4034,  947,   12239,
+    698,    13313,  13020,  5634,   -80,    -4731,  16036,  1159,   8692,   12072,  -4805,  -12354, -1180, 4788,
+    -11530, 4866,   13978,  8844,   -1926,  -3867,  485,    -3429,  7445,   -6348,  5791,   13226,  -3086, 3080,
+    -962,   -634,   2303,   -9771,  5894,   9532,   -4507,  -6244,  8293,   -10032, -4544,  -1614,  3260,  8312,
+    -12976, -7458,  3290,   -8514,  9852,   -4927,  2016,   8133,   -13871, 4484,   6759,   -12098, -6385, -5010,
+    2080,   15297,  4477,   10929,  10592,  -17347, -7565,  -2336,  4172,   6492,   -3540,  -196,   1644,  -6118,
+    2260,   -16319, 7009,   10012,  -15423, 2037,   2453,   -3737,  7625,   -3739,  4805,   14941,  3518,  6223,
+    7861,   -13103, 4513,   -4732,  9104,   5314,   7048,   2290,   179,    -8899,  -1532,  -615,   4434,  5898,
+    -5321,  -7988,  5032,   -2171,  1096,   -11298, 11857,  9275,   12029,  6958,   -294,   -11694, 206,   4456,
+    6697,   9635,   1096,   5356,   8351,   -11894, 6078,   -8585,  1229,   8639,   -15189, 2516,   2364,  -4932,
+    11821,  -4878,  -4673,  8560,   -2034,  3120,   1350,   -3570,  496,    7800,   -2076,  11148,  4053,  1524,
+    9193,   -7723,  7444,   -14780, 7420,   13849,  -12612, -3345,  -662,   -6830,  1417,   -5119,  13787, 9861,
+    3974,   6167,   -3566,  -6414,  15691,  -6896,  8222,   5493,   6251,   -136,   -4359,  -5964,  -2233, -2911,
+    584,    8648,   -953,   -1372,  4574,   -8506,  1420,   1974,   3424,   1848,   -1343,  4792,   7908,  -5635,
+    -4525,  -10220, 3463,   -7344,  13322,  6717,   18371,  -2963,  4737,   -1635,  -1389,  3753,   1308,  8402,
+    8067,   -7506,  769,    -7442,  -283,   5211,   -4792,  -683,   11366,  -8909,  -1085,  -11245, 5146,  -1966,
+    12096,  2940,   4571,   -19281, -6910,  -1480,  9236,   6262,   5110,   -2252,  3455,   -2396,  5843,  -4738,
+    7679,   8977,   -9549,  -5515,  15199,  -13524, -2377,  -11595, 9857,   242,    3986,   7628,   16677, -4646,
+    5580,   -7965,  5095,   1744,   9455,   -998,   -1292,  -6668,  -4312,  -2268};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/output_shift_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/output_shift_data.h
new file mode 100644
index 0000000..034b1ce
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/output_shift_data.h
@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_dilation_output_shift[8] = {-9, -9, -9, -9, -9, -9, -9, -9};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/test_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/test_data.h
new file mode 100644
index 0000000..99f0472
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/test_data.h
@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#include "biases_data.h"
+#include "config_data.h"
+#include "input_data.h"
+#include "output_mult_data.h"
+#include "output_ref_data.h"
+#include "output_shift_data.h"
+#include "weights_data.h"
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/weights_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/weights_data.h
new file mode 100644
index 0000000..7371514
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_dilation/weights_data.h
@@ -0,0 +1,12 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const q7_t dw_int16xint8_dilation_weights[128] = {
+    67,   91,  -111, -53,  118,  69,  -23,  11,   -124, 40,  -91, 12,   -92, -112, 95,   45,   67,   28,  -127,
+    -120, 95,  80,   -13,  -33,  15,  80,   -116, -56,  -79, -46, 97,   66,  63,   6,    -45,  125,  -57, 18,
+    95,   -29, -57,  -68,  -13,  49,  -127, 127,  -3,   78,  35,  -115, 65,  -58,  -106, 108,  10,   113, 41,
+    -37,  120, 127,  -18,  -50,  81,  39,   122,  -92,  127, 85,  109,  -21, -42,  68,   -109, -115, 122, 31,
+    101,  48,  -99,  -86,  118,  -47, 83,   0,    -66,  39,  127, -80,  -83, -96,  -106, -115, 11,   99,  71,
+    15,   19,  35,   -127, -118, 72,  101,  -78,  -108, 58,  -43, -92,  58,  96,   -32,  49,   14,   96,  -114,
+    81,   86,  -110, -86,  -109, 118, -127, -127, 31,   76,  11,  -81,  -87, -127};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/biases_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/biases_data.h
new file mode 100644
index 0000000..c911b4a
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/biases_data.h
@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const int64_t dw_int16xint8_mult4_biases[8] =
+    {223178, -4314998, 3439509, 1837855, -3006016, 2312206, -2798955, -2952197};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/config_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/config_data.h
new file mode 100644
index 0000000..23de6a2
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/config_data.h
@@ -0,0 +1,24 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#define DW_INT16XINT8_MULT4_OUT_CH 8
+#define DW_INT16XINT8_MULT4_IN_CH 2
+#define DW_INT16XINT8_MULT4_INPUT_W 4
+#define DW_INT16XINT8_MULT4_INPUT_H 5
+#define DW_INT16XINT8_MULT4_DST_SIZE 8
+#define DW_INT16XINT8_MULT4_INPUT_SIZE 40
+#define DW_INT16XINT8_MULT4_OUT_ACTIVATION_MIN -32767
+#define DW_INT16XINT8_MULT4_OUT_ACTIVATION_MAX 32767
+#define DW_INT16XINT8_MULT4_INPUT_BATCHES 1
+#define DW_INT16XINT8_MULT4_FILTER_X 3
+#define DW_INT16XINT8_MULT4_FILTER_Y 4
+#define DW_INT16XINT8_MULT4_STRIDE_X 3
+#define DW_INT16XINT8_MULT4_STRIDE_Y 2
+#define DW_INT16XINT8_MULT4_PAD_X 0
+#define DW_INT16XINT8_MULT4_PAD_Y 0
+#define DW_INT16XINT8_MULT4_OUTPUT_W 1
+#define DW_INT16XINT8_MULT4_OUTPUT_H 1
+#define DW_INT16XINT8_MULT4_CH_MULT 4
+#define DW_INT16XINT8_MULT4_INPUT_OFFSET 0
+#define DW_INT16XINT8_MULT4_OUTPUT_OFFSET 0
+#define DW_INT16XINT8_MULT4_DILATION_X 1
+#define DW_INT16XINT8_MULT4_DILATION_Y 1
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/input_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/input_data.h
new file mode 100644
index 0000000..1269b99
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/input_data.h
@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_mult4_input[40] = {
+    -17485, 21819, -26312, -14446, 32326,  26482,  -4390, -18621, -32186, 15037,  -7461,  30312,  -18990, 15605,
+    -20064, 3712,  -25567, -22216, -7666,  4861,   30918, -1954,  14950,  6892,   -21628, -10369, 10901,  31979,
+    -24233, 14297, 15444,  -30175, -32061, -29227, 28826, -9789,  5445,   -32022, 19913,  9312};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/output_mult_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/output_mult_data.h
new file mode 100644
index 0000000..2798c14
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/output_mult_data.h
@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_mult4_output_mult[8] =
+    {1106943712, 1961710526, 2105762695, 2029870097, 1971085854, 2027885518, 2146001571, 1992163366};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/output_ref_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/output_ref_data.h
new file mode 100644
index 0000000..6694984
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/output_ref_data.h
@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_mult4_output_ref[8] = {14964, 113, 4445, -17539, -15735, 3848, -3806, 14080};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/output_shift_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/output_shift_data.h
new file mode 100644
index 0000000..4a9d395
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/output_shift_data.h
@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_mult4_output_shift[8] = {-8, -9, -9, -9, -9, -9, -9, -9};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/test_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/test_data.h
new file mode 100644
index 0000000..99f0472
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/test_data.h
@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#include "biases_data.h"
+#include "config_data.h"
+#include "input_data.h"
+#include "output_mult_data.h"
+#include "output_ref_data.h"
+#include "output_shift_data.h"
+#include "weights_data.h"
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/weights_data.h b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/weights_data.h
new file mode 100644
index 0000000..0a71cd7
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_mult4/weights_data.h
@@ -0,0 +1,10 @@
+// Generated by generate_test_data.py using TFL version 2.6.0 as reference.
+#pragma once
+#include <stdint.h>
+
+const q7_t dw_int16xint8_mult4_weights[96] = {
+    -87, 4,   50,  43,   8,   -33, -86, 75,   -15, -119, -56, 116, 82,   -39, -127, 47,   81,   0,    -104, -8,
+    9,   -27, -7,  127,  -72, -95, -23, -41,  127, 57,   11,  75,  -40,  13,  -69,  -102, -121, -110, 4,    92,
+    29,  25,  -42, -100, 18,  127, 127, 62,   -46, -127, -28, 118, 33,   -66, 27,   55,   79,   4,    127,  -38,
+    -81, 86,  -17, 47,   127, -29, 31,  -127, 13,  -20,  13,  7,   116,  13,  70,   104,  -78,  -72,  -122, -9,
+    118, -43, 26,  -127, -48, -63, -65, 85,   110, 113,  -5,  40,  -115, 34,  23,   -20};
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_1x1_s8_fast/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_1x1_s8_fast/CMakeLists.txt
index 24dbab4..1149de7 100644
--- a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_1x1_s8_fast/CMakeLists.txt
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_1x1_s8_fast/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2010-2022 Arm Limited or its affiliates.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -19,7 +19,7 @@
 add_cmsis_nn_unit_test_executable(test_arm_convolve_1x1_s8_fast)
 
 target_sources(test_arm_convolve_1x1_s8_fast PRIVATE
-    Unity/unity_test_arm_convolve_1x1_s8_fast
+    Unity/unity_test_arm_convolve_1x1_s8_fast.c
     Unity/TestRunner/unity_test_arm_convolve_1x1_s8_fast_runner.c)
 
 
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_s16/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_s16/CMakeLists.txt
new file mode 100644
index 0000000..f38f7e6
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_s16/CMakeLists.txt
@@ -0,0 +1,23 @@
+#
+# Copyright (C) 2010-2022 Arm Limited or its affiliates.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+add_cmsis_nn_unit_test_executable(test_arm_depthwise_conv_s16)
+# The Unity wrapper #includes ../test_arm_depthwise_conv_s16.c, so that file is not listed as a source.
+target_sources(test_arm_depthwise_conv_s16 PRIVATE
+    Unity/unity_test_arm_depthwise_conv_s16.c
+    Unity/TestRunner/unity_test_arm_depthwise_conv_s16_runner.c)
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_s16/Unity/unity_test_arm_depthwise_conv_s16.c b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_s16/Unity/unity_test_arm_depthwise_conv_s16.c
new file mode 100644
index 0000000..df6d289
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_s16/Unity/unity_test_arm_depthwise_conv_s16.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../test_arm_depthwise_conv_s16.c"
+#include "unity.h"
+
+#ifdef USING_FVP_CORSTONE_300
+extern void uart_init(void);
+#endif
+
+/* Called from the autogenerated file before each test.
+ * The name must be exactly setUp.
+ */
+void setUp(void)
+{ /* Runs before EACH test; calls uart_init() only when USING_FVP_CORSTONE_300 is defined. */
+#ifdef USING_FVP_CORSTONE_300
+    uart_init();
+#endif
+}
+
+/* Called from the autogenerated file, so the name must be exactly tearDown. Nothing to do here. */
+void tearDown(void) {}
+
+/* Test entry points: each one forwards to the function of the same name without the test_ prefix,
+ * defined in ../test_arm_depthwise_conv_s16.c (included at the top of this file). */
+void test_dw_int16xint8_arm_depthwise_conv_s16(void) { dw_int16xint8_arm_depthwise_conv_s16(); }
+void test_dw_int16xint8_dilation_arm_depthwise_conv_s16(void) { dw_int16xint8_dilation_arm_depthwise_conv_s16(); }
+void test_dw_int16xint8_mult4_arm_depthwise_conv_s16(void) { dw_int16xint8_mult4_arm_depthwise_conv_s16(); }
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_s16/test_arm_depthwise_conv_s16.c b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_s16/test_arm_depthwise_conv_s16.c
new file mode 100644
index 0000000..d0044ea
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_s16/test_arm_depthwise_conv_s16.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arm_nnfunctions.h>
+#include <unity.h>
+
+#include "../TestData/dw_int16xint8/test_data.h"
+#include "../TestData/dw_int16xint8_dilation/test_data.h"
+#include "../TestData/dw_int16xint8_mult4/test_data.h"
+#include "../Utils/validate.h"
+
+/* Runs arm_depthwise_conv_s16 on the dw_int16xint8 data set; asserts the status and validate_s16(). */
+void dw_int16xint8_arm_depthwise_conv_s16(void)
+{
+    const arm_status expected = ARM_MATH_SUCCESS;
+    q15_t output[DW_INT16XINT8_DST_SIZE] = {0};
+
+    /* Zero-initialise every parameter struct: filter_dims.n/.c and output_dims.n are not assigned
+     * below and bias_dims is passed to the kernel without ever being written. */
+    cmsis_nn_context ctx = {.buf = NULL, .size = 0}; /* No scratch buffer is supplied. */
+    cmsis_nn_dw_conv_params dw_conv_params = {0};
+    cmsis_nn_per_channel_quant_params quant_params = {0};
+    cmsis_nn_dims input_dims = {0};
+    cmsis_nn_dims filter_dims = {0};
+    cmsis_nn_dims bias_dims = {0};
+    cmsis_nn_dims output_dims = {0};
+
+    const q63_t *bias_data = dw_int16xint8_biases;
+    const q15_t *input_data = dw_int16xint8_input;
+
+    input_dims.n = DW_INT16XINT8_INPUT_BATCHES;
+    input_dims.w = DW_INT16XINT8_INPUT_W;
+    input_dims.h = DW_INT16XINT8_INPUT_H;
+    input_dims.c = DW_INT16XINT8_IN_CH;
+    filter_dims.w = DW_INT16XINT8_FILTER_X;
+    filter_dims.h = DW_INT16XINT8_FILTER_Y;
+    output_dims.w = DW_INT16XINT8_OUTPUT_W;
+    output_dims.h = DW_INT16XINT8_OUTPUT_H;
+    output_dims.c = DW_INT16XINT8_OUT_CH;
+
+    dw_conv_params.padding.w = DW_INT16XINT8_PAD_X;
+    dw_conv_params.padding.h = DW_INT16XINT8_PAD_Y;
+    dw_conv_params.stride.w = DW_INT16XINT8_STRIDE_X;
+    dw_conv_params.stride.h = DW_INT16XINT8_STRIDE_Y;
+    dw_conv_params.dilation.w = DW_INT16XINT8_DILATION_X;
+    dw_conv_params.dilation.h = DW_INT16XINT8_DILATION_Y;
+
+    dw_conv_params.ch_mult = DW_INT16XINT8_CH_MULT;
+
+    dw_conv_params.input_offset = DW_INT16XINT8_INPUT_OFFSET;
+    dw_conv_params.output_offset = DW_INT16XINT8_OUTPUT_OFFSET;
+    dw_conv_params.activation.min = DW_INT16XINT8_OUT_ACTIVATION_MIN;
+    dw_conv_params.activation.max = DW_INT16XINT8_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)dw_int16xint8_output_mult;
+    quant_params.shift = (int32_t *)dw_int16xint8_output_shift;
+
+    arm_status result = arm_depthwise_conv_s16(&ctx,
+                                               &dw_conv_params,
+                                               &quant_params,
+                                               &input_dims,
+                                               input_data,
+                                               &filter_dims,
+                                               dw_int16xint8_weights,
+                                               &bias_dims,
+                                               bias_data,
+                                               &output_dims,
+                                               output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, dw_int16xint8_output_ref, DW_INT16XINT8_DST_SIZE));
+}
+
+/* Runs arm_depthwise_conv_s16 on the dw_int16xint8_dilation data set; asserts status and validate_s16(). */
+void dw_int16xint8_dilation_arm_depthwise_conv_s16(void)
+{
+    const arm_status expected = ARM_MATH_SUCCESS;
+    q15_t output[DW_INT16XINT8_DILATION_DST_SIZE] = {0};
+
+    /* Zero-initialise every parameter struct: filter_dims.n/.c and output_dims.n are not assigned
+     * below and bias_dims is passed to the kernel without ever being written. */
+    cmsis_nn_context ctx = {.buf = NULL, .size = 0}; /* No scratch buffer is supplied. */
+    cmsis_nn_dw_conv_params dw_conv_params = {0};
+    cmsis_nn_per_channel_quant_params quant_params = {0};
+    cmsis_nn_dims input_dims = {0};
+    cmsis_nn_dims filter_dims = {0};
+    cmsis_nn_dims bias_dims = {0};
+    cmsis_nn_dims output_dims = {0};
+
+    const q63_t *bias_data = dw_int16xint8_dilation_biases;
+    const q15_t *input_data = dw_int16xint8_dilation_input;
+
+    input_dims.n = DW_INT16XINT8_DILATION_INPUT_BATCHES;
+    input_dims.w = DW_INT16XINT8_DILATION_INPUT_W;
+    input_dims.h = DW_INT16XINT8_DILATION_INPUT_H;
+    input_dims.c = DW_INT16XINT8_DILATION_IN_CH;
+    filter_dims.w = DW_INT16XINT8_DILATION_FILTER_X;
+    filter_dims.h = DW_INT16XINT8_DILATION_FILTER_Y;
+    output_dims.w = DW_INT16XINT8_DILATION_OUTPUT_W;
+    output_dims.h = DW_INT16XINT8_DILATION_OUTPUT_H;
+    output_dims.c = DW_INT16XINT8_DILATION_OUT_CH;
+
+    dw_conv_params.padding.w = DW_INT16XINT8_DILATION_PAD_X;
+    dw_conv_params.padding.h = DW_INT16XINT8_DILATION_PAD_Y;
+    dw_conv_params.stride.w = DW_INT16XINT8_DILATION_STRIDE_X;
+    dw_conv_params.stride.h = DW_INT16XINT8_DILATION_STRIDE_Y;
+    dw_conv_params.dilation.w = DW_INT16XINT8_DILATION_DILATION_X;
+    dw_conv_params.dilation.h = DW_INT16XINT8_DILATION_DILATION_Y;
+
+    dw_conv_params.ch_mult = DW_INT16XINT8_DILATION_CH_MULT;
+
+    dw_conv_params.input_offset = DW_INT16XINT8_DILATION_INPUT_OFFSET;
+    dw_conv_params.output_offset = DW_INT16XINT8_DILATION_OUTPUT_OFFSET;
+    dw_conv_params.activation.min = DW_INT16XINT8_DILATION_OUT_ACTIVATION_MIN;
+    dw_conv_params.activation.max = DW_INT16XINT8_DILATION_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)dw_int16xint8_dilation_output_mult;
+    quant_params.shift = (int32_t *)dw_int16xint8_dilation_output_shift;
+
+    arm_status result = arm_depthwise_conv_s16(&ctx,
+                                               &dw_conv_params,
+                                               &quant_params,
+                                               &input_dims,
+                                               input_data,
+                                               &filter_dims,
+                                               dw_int16xint8_dilation_weights,
+                                               &bias_dims,
+                                               bias_data,
+                                               &output_dims,
+                                               output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, dw_int16xint8_dilation_output_ref, DW_INT16XINT8_DILATION_DST_SIZE));
+}
+
+/* Runs arm_depthwise_conv_s16 on the dw_int16xint8_mult4 data set; asserts status and validate_s16(). */
+void dw_int16xint8_mult4_arm_depthwise_conv_s16(void)
+{
+    const arm_status expected = ARM_MATH_SUCCESS;
+    q15_t output[DW_INT16XINT8_MULT4_DST_SIZE] = {0};
+
+    /* Zero-initialise every parameter struct: filter_dims.n/.c and output_dims.n are not assigned
+     * below and bias_dims is passed to the kernel without ever being written. */
+    cmsis_nn_context ctx = {.buf = NULL, .size = 0}; /* No scratch buffer is supplied. */
+    cmsis_nn_dw_conv_params dw_conv_params = {0};
+    cmsis_nn_per_channel_quant_params quant_params = {0};
+    cmsis_nn_dims input_dims = {0};
+    cmsis_nn_dims filter_dims = {0};
+    cmsis_nn_dims bias_dims = {0};
+    cmsis_nn_dims output_dims = {0};
+
+    const q63_t *bias_data = dw_int16xint8_mult4_biases;
+    const q15_t *input_data = dw_int16xint8_mult4_input;
+
+    input_dims.n = DW_INT16XINT8_MULT4_INPUT_BATCHES;
+    input_dims.w = DW_INT16XINT8_MULT4_INPUT_W;
+    input_dims.h = DW_INT16XINT8_MULT4_INPUT_H;
+    input_dims.c = DW_INT16XINT8_MULT4_IN_CH;
+    filter_dims.w = DW_INT16XINT8_MULT4_FILTER_X;
+    filter_dims.h = DW_INT16XINT8_MULT4_FILTER_Y;
+    output_dims.w = DW_INT16XINT8_MULT4_OUTPUT_W;
+    output_dims.h = DW_INT16XINT8_MULT4_OUTPUT_H;
+    output_dims.c = DW_INT16XINT8_MULT4_OUT_CH;
+
+    dw_conv_params.padding.w = DW_INT16XINT8_MULT4_PAD_X;
+    dw_conv_params.padding.h = DW_INT16XINT8_MULT4_PAD_Y;
+    dw_conv_params.stride.w = DW_INT16XINT8_MULT4_STRIDE_X;
+    dw_conv_params.stride.h = DW_INT16XINT8_MULT4_STRIDE_Y;
+    dw_conv_params.dilation.w = DW_INT16XINT8_MULT4_DILATION_X;
+    dw_conv_params.dilation.h = DW_INT16XINT8_MULT4_DILATION_Y;
+
+    dw_conv_params.ch_mult = DW_INT16XINT8_MULT4_CH_MULT;
+
+    dw_conv_params.input_offset = DW_INT16XINT8_MULT4_INPUT_OFFSET;
+    dw_conv_params.output_offset = DW_INT16XINT8_MULT4_OUTPUT_OFFSET;
+    dw_conv_params.activation.min = DW_INT16XINT8_MULT4_OUT_ACTIVATION_MIN;
+    dw_conv_params.activation.max = DW_INT16XINT8_MULT4_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)dw_int16xint8_mult4_output_mult;
+    quant_params.shift = (int32_t *)dw_int16xint8_mult4_output_shift;
+
+    arm_status result = arm_depthwise_conv_s16(&ctx,
+                                               &dw_conv_params,
+                                               &quant_params,
+                                               &input_dims,
+                                               input_data,
+                                               &filter_dims,
+                                               dw_int16xint8_mult4_weights,
+                                               &bias_dims,
+                                               bias_data,
+                                               &output_dims,
+                                               output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, dw_int16xint8_mult4_output_ref, DW_INT16XINT8_MULT4_DST_SIZE));
+}
diff --git a/CMSIS/NN/Tests/UnitTest/generate_test_data.py b/CMSIS/NN/Tests/UnitTest/generate_test_data.py
index 58e0ce5..db992ed 100755
--- a/CMSIS/NN/Tests/UnitTest/generate_test_data.py
+++ b/CMSIS/NN/Tests/UnitTest/generate_test_data.py
@@ -424,14 +424,11 @@
 
         self.scaling_factors = []
 
-        if self.test_type == 'conv':
-            self.quantized_dimension = 0
-        elif self.test_type == 'depthwise_conv':
-            self.quantized_dimension = 3
+        if self.test_type == 'depthwise_conv':
             self.channel_multiplier = self.output_ch // self.input_ch
             if self.output_ch % self.input_ch != 0:
                 raise RuntimeError("out channel ({}) is not multiple of in channel ({})".format(out_ch, in_ch))
-        else:
+        elif self.test_type != 'conv':
             raise RuntimeError("Invalid test type {}".format(self.test_type))
 
     def write_c_config_header(self):
@@ -1111,6 +1108,21 @@
                                               w_y=4, stride_x=2, stride_y=2, pad=True,
                                               out_activation_min=-70, out_activation_max=127, dilation_x=2,
                                               dilation_y=3)
+    dataset = 'dw_int16xint8'
+    ALL_TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=4, out_ch=8, x_in=9, y_in=5, w_x=3,
+                                              w_y=4, stride_x=3, stride_y=2, pad=True, randmin=INT16_MIN,
+                                              randmax=INT16_MAX, out_activation_min=-21111,
+                                              out_activation_max=32767, int16xint8=True)
+    dataset = 'dw_int16xint8_dilation'
+    ALL_TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=4, out_ch=8, x_in=9, y_in=5, w_x=4,
+                                              w_y=4, stride_x=1, stride_y=1, pad=True, randmin=INT16_MIN,
+                                              randmax=INT16_MAX, out_activation_min=-32700, dilation_x=3, dilation_y=2,
+                                              out_activation_max=32767, int16xint8=True)
+    dataset = 'dw_int16xint8_mult4'
+    ALL_TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=2, out_ch=8, x_in=4, y_in=5, w_x=3,
+                                              w_y=4, stride_x=3, stride_y=2, pad=False, randmin=INT16_MIN,
+                                              randmax=INT16_MAX, out_activation_min=-32767,
+                                              out_activation_max=32767, int16xint8=True)
 
     type_of_test = 'fully_connected'
     dataset = 'fully_connected'