CMSIS NN: Fix data type for bias of s8 conv functions

The per-output-channel bias parameter of the s8 convolution, depthwise convolution and matrix-multiplication kernel functions was declared as int8 (q7_t). It is corrected to int32, matching the range of bias values produced by int8 quantization, and the now-redundant narrowing casts when loading the accumulators are removed.
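
For illustration only (this sketch is not part of the patch and the bias values
are made up): in the int8 quantization scheme the per-channel bias is quantized
with scale = input_scale * weight_scale and stored as int32, so it routinely
falls outside the [-128, 127] range a q7_t parameter could carry. The kernels
seed the accumulator directly from the bias, as shown below. In CMSIS-NN,
q31_t is int32_t and q7_t is int8_t.

    /* Minimal stand-alone sketch (not CMSIS code; hypothetical values). */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Per output channel bias as produced by int8 quantization; these
         * values cannot be represented in an int8 (q7_t) operand. */
        const int32_t bias[2] = { 19342, -4087 };

        for (int ch = 0; ch < 2; ch++)
        {
            /* Same pattern as arm_convolve_s8(): load the accumulator with
             * the bias before the multiply-accumulate loop. With an int32
             * bias pointer no narrowing cast is needed. */
            int32_t sum = bias[ch];
            printf("channel %d accumulator init: %ld\n", ch, (long)sum);
        }
        return 0;
    }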
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index f73244f..9cbac58 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -135,7 +135,7 @@
    * @param[in]       pad_y      padding along height
    * @param[in]       stride_x   convolution stride x
    * @param[in]       stride_y   convolution stride y
-   * @param[in]       bias       pointer to per output channel bias. Range: int8
+   * @param[in]       bias       pointer to per output channel bias. Range: int32
    * @param[in,out]   output     pointer to output tensor. format: [H, W, out_ch]
    * @param[in]       output_shift    pointer to per output channel requantization shift parameter.
    * @param[in]       output_mult     pointer to per output channel requantization multiplier parameter.
@@ -168,7 +168,7 @@
                                const uint16_t pad_y,
                                const uint16_t stride_x,
                                const uint16_t stride_y,
-                               const q7_t *bias,
+                               const int32_t *bias,
                                q7_t *output,
                                const int32_t *output_shift,
                                const int32_t *output_mult,
@@ -457,7 +457,7 @@
    * @param[in]      pad_y         padding size y
    * @param[in]      stride_x      convolution stride x
    * @param[in]      stride_y      convolution stride y
-   * @param[in]      bias          pointer to bias
+   * @param[in]      bias          pointer to per channel bias. Range : int32
    * @param[in,out]  output        pointer to output tensor.  Format: [H, W, out_ch]
    * @param[in]      output_shift  pointer to per output channel requantization shift parameter.
    * @param[in]      output_mult   pointer to per output channel requantization multiplier parameter.
@@ -492,7 +492,7 @@
                                         const uint16_t pad_y,
                                         const uint16_t stride_x,
                                         const uint16_t stride_y,
-                                        const q7_t *bias,
+                                        const int32_t *bias,
                                         q7_t *output,
                                         const int32_t *output_shift,
                                         const int32_t *output_mult,
@@ -758,7 +758,7 @@
    * @param[in]       pad_y      padding along height
    * @param[in]       stride_x   convolution stride along width
    * @param[in]       stride_y   convolution stride along height
-   * @param[in]       bias       pointer to per output channel bias. Range: int8
+   * @param[in]       bias       pointer to per output channel bias. Range: int32
    * @param[in,out]   output     pointer to output tensor. Format: [H, W, out_ch]
    * @param[in]       output_shift pointer to per output channel requantization shift parameter.
    * @param[in]       output_mult  pointer to per output channel requantization multiplier parameter.
@@ -794,7 +794,7 @@
                                      const uint16_t pad_y,
                                      const uint16_t stride_x,
                                      const uint16_t stride_y,
-                                     const q7_t *bias,
+                                     const int32_t *bias,
                                      q7_t *output,
                                      const int32_t *output_shift,
                                      const int32_t *output_mult,
@@ -860,7 +860,7 @@
                                        const uint16_t pad_y,
                                        const uint16_t stride_x,
                                        const uint16_t stride_y,
-                                       const q7_t *bias,
+                                       const int32_t *bias,
                                        q7_t *output,
                                        const int32_t *output_shift,
                                        const int32_t *output_mult,
@@ -1137,7 +1137,7 @@
    * @param[in]       activation_min   minimum value to clamp the output to. Range : int8
    * @param[in]       activation_max   maximum value to clamp the output to. Range : int8
    * @param[in]       num_col_a   number of columns of A
-   * @param[in]       output_bias per output channel bias
+   * @param[in]       output_bias per output channel bias. Range : int32
    * @param[in,out]   out_0       pointer to output
    * @return     The function returns one of the two
    *              1. The incremented output pointer for a successful operation or
@@ -1157,7 +1157,7 @@
                                         const int16_t activation_min,
                                         const int16_t activation_max,
                                         const uint16_t num_col_a,
-                                        const q7_t *const output_bias,
+                                        const int32_t *const output_bias,
                                         q7_t *out_0);
 
    /**
@@ -1177,7 +1177,7 @@
                                                   const int16_t activation_min,
                                                   const int16_t activation_max,
                                                   const uint16_t num_col_a,
-                                                  const q7_t *const output_bias,
+                                                  const int32_t *const output_bias,
                                                   q7_t *out_0);
 
     /**
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c
index 2f5ebb7..89f1976 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c
@@ -61,7 +61,7 @@
                                     const uint16_t pad_y,
                                     const uint16_t stride_x,
                                     const uint16_t stride_y,
-                                    const q7_t *bias,
+                                    const int32_t *bias,
                                     q7_t *output,
                                     const int32_t *output_shift,
                                     const int32_t *output_mult,
@@ -124,7 +124,7 @@
         const q7_t *ker_a = kernel;
         for (i_ch_out = 0; i_ch_out < output_ch; i_ch_out++)
         {
-            q31_t sum = (q31_t)bias[i_ch_out];
+            q31_t sum = bias[i_ch_out];
 
             /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
             q15_t *ip_as_col = buffer_a;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
index be5f16a..5966297 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
@@ -60,7 +60,7 @@
                            const uint16_t pad_y,
                            const uint16_t stride_x,
                            const uint16_t stride_y,
-                           const q7_t *bias,
+                           const int32_t *bias,
                            q7_t *output,
                            const int32_t *output_shift,
                            const int32_t *output_mult,
@@ -134,7 +134,7 @@
         for (i = 0; i < output_ch; i++)
         {
             /* Load the accumulator with bias first */
-            q31_t sum = (q31_t)bias[i];
+            q31_t sum = bias[i];
 
             /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
             q15_t *ip_as_col = buffer_a;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c
index 7a224a7..02e4164 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c
@@ -60,7 +60,7 @@
                                  const uint16_t pad_y,
                                  const uint16_t stride_x,
                                  const uint16_t stride_y,
-                                 const q7_t *bias,
+                                 const int32_t *bias,
                                  q7_t *output,
                                  const int32_t *output_shift,
                                  const int32_t *output_mult,
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c
index 114bcf7..0b80f5b 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c
@@ -60,7 +60,7 @@
                                      const uint16_t pad_y,
                                      const uint16_t stride_x,
                                      const uint16_t stride_y,
-                                     const q7_t *bias,
+                                     const int32_t *bias,
                                      q7_t *output,
                                      const int32_t *output_shift,
                                      const int32_t *output_mult,
@@ -91,7 +91,7 @@
     int16_t i_ker_y, i_ker_x;
     q15_t *const col_buffer_start = buffer_a;
     q15_t *col_buffer = col_buffer_start;
-    const q7_t *const bias_start_pos = bias;
+    const int32_t *const bias_start_pos = bias;
     const q31_t *const out_mult_start_pos = output_mult;
     const q31_t *const out_shift_start_pos = output_shift;
     uint16_t row_count;
@@ -151,10 +151,10 @@
 
             while (row_count)
             {
-                q31_t sum = (q31_t)(*bias++);
-                q31_t sum_2 = (q31_t)(*bias++);
-                q31_t sum_3 = (q31_t)(*bias++);
-                q31_t sum_4 = (q31_t)(*bias++);
+                q31_t sum =   *bias++;
+                q31_t sum_2 = *bias++;
+                q31_t sum_3 = *bias++;
+                q31_t sum_4 = *bias++;
 
                 uint16_t col_count = (kernel_x * kernel_y) / 2;
                 q15_t *col_pos = col_buffer_start + row_shift;
@@ -255,7 +255,7 @@
             {
                 q15_t *col_pos = col_buffer_start + row_shift;
                 const q7_t *row_pos = kernel + row_shift;
-                q31_t sum = (q31_t)*bias++;
+                q31_t sum = *bias++;
                 const uint16_t col_count = (kernel_x * kernel_y);
                 row_shift += 1;
 
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c
index a299436..314a579 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c
@@ -45,13 +45,13 @@
                                     const int16_t activation_min,
                                     const int16_t activation_max,
                                     const uint16_t num_col_a,
-                                    const q7_t *const output_bias,
+                                    const int32_t *const output_bias,
                                     q7_t *out_0)
 {
 #if defined(ARM_MATH_LOOPUNROLL) && defined(ARM_MATH_DSP)
     /* set up the second output pointers */
     q7_t *out_1 = out_0 + output_ch;
-    const q7_t *bias = output_bias;
+    const int32_t *bias = output_bias;
 
     uint16_t row_count = output_ch / 2;
     const q7_t *ip_a0 = input_a;
@@ -66,10 +66,10 @@
         const q7_t *ip_a1 = ip_a0 + num_col_a;
 
         /* Init accumulator with bias for channel N and N + 1 */
-        q31_t ch_0_out_0 = (q31_t)*bias;
-        q31_t ch_0_out_1 = (q31_t)*bias++;
-        q31_t ch_1_out_0 = (q31_t)*bias;
-        q31_t ch_1_out_1 = (q31_t)*bias++;
+        q31_t ch_0_out_0 = *bias;
+        q31_t ch_0_out_1 = *bias++;
+        q31_t ch_1_out_0 = *bias;
+        q31_t ch_1_out_1 = *bias++;
 
         uint16_t col_count = num_col_a / 4;
         /* accumulate over the vector */
@@ -153,8 +153,8 @@
         const q15_t *ip_b1 = ip_b0 + num_col_a;
 
         /* load the bias */
-        q31_t ch_0_out_0 = (q31_t)*bias;
-        q31_t ch_0_out_1 = (q31_t)*bias++;
+        q31_t ch_0_out_0 = *bias;
+        q31_t ch_0_out_1 = *bias++;
 
         uint16_t col_count = num_col_a >> 2;
         while (col_count)
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
index 75b1d6f..4708261 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
@@ -49,13 +49,13 @@
                                               const int16_t activation_min,
                                               const int16_t activation_max,
                                               const uint16_t num_col_a,
-                                              const q7_t *const output_bias,
+                                              const int32_t *const output_bias,
                                               q7_t *out_0)
 {
 #if defined(ARM_MATH_LOOPUNROLL) && defined(ARM_MATH_DSP)
     /* set up the second output pointers */
     q7_t *out_1 = out_0 + output_ch;
-    const q7_t *bias = output_bias;
+    const int32_t *bias = output_bias;
 
     uint16_t row_count = output_ch / 2;
     const q7_t *ip_a0 = input_a;
@@ -70,10 +70,10 @@
         const q7_t *ip_a1 = ip_a0 + num_col_a;
 
         /* Init accumulator with bias for channel N and N + 1 */
-        q31_t ch_0_out_0 = (q31_t)*bias;
-        q31_t ch_0_out_1 = (q31_t)*bias++;
-        q31_t ch_1_out_0 = (q31_t)*bias;
-        q31_t ch_1_out_1 = (q31_t)*bias++;
+        q31_t ch_0_out_0 = *bias;
+        q31_t ch_0_out_1 = *bias++;
+        q31_t ch_1_out_0 = *bias;
+        q31_t ch_1_out_1 = *bias++;
 
         uint16_t col_count = num_col_a / 4;
         /* accumulate over the vector */
@@ -157,8 +157,8 @@
         const q15_t *ip_b1 = ip_b0 + num_col_a;
 
         /* load the bias */
-        q31_t ch_0_out_0 = (q31_t)*bias;
-        q31_t ch_0_out_1 = (q31_t)*bias++;
+        q31_t ch_0_out_0 = *bias;
+        q31_t ch_0_out_1 = *bias++;
 
         uint16_t col_count = num_col_a >> 2;
         while (col_count)