Убрано лишнее из CMSIS библиотеки

Добавлено подключение DSP в конфиг периферии
2025-11-15 08:22:07 +03:00
parent 5a03fbb513
commit d7dec9df35
158 changed files with 106 additions and 36689 deletions
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_armcc.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_armcc.h
@@ -1,563 +0,0 @@
 /**************************************************************************//**
 * @file     cmsis_armcc.h
 * @brief    CMSIS compiler specific macros, functions, instructions
 * @version  V1.0.5
 * @date     05. May 2021
 ******************************************************************************/
 /*
 * Copyright (c) 2009-2021 Arm Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef __CMSIS_ARMCC_H
 #define __CMSIS_ARMCC_H
 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 400677)
  #error "Please use Arm Compiler Toolchain V4.0.677 or later!"
 #endif
 /* CMSIS compiler control architecture macros */
 /* Publish __ARM_ARCH_7A__ when the toolchain's own __TARGET_ARCH_7_A macro is
    defined and equal to 1. */
 #if (defined (__TARGET_ARCH_7_A ) && (__TARGET_ARCH_7_A  == 1))
  #define __ARM_ARCH_7A__           1
 #endif
 /* CMSIS compiler specific defines */
 /* Every macro below is guarded by #ifndef, so a definition supplied before
    this header is included is left untouched. */
 /* Inline / embedded assembler keyword */
 #ifndef   __ASM
  #define __ASM                                  __asm
 #endif
 /* Inline hint */
 #ifndef   __INLINE
  #define __INLINE                               __inline
 #endif
 /* Forced inlining */
 #ifndef   __FORCEINLINE
  #define __FORCEINLINE                          __forceinline
 #endif
 /* Inline function with internal linkage */
 #ifndef   __STATIC_INLINE
  #define __STATIC_INLINE                        static __inline
 #endif
 /* Forced-inline function with internal linkage */
 #ifndef   __STATIC_FORCEINLINE
  #define __STATIC_FORCEINLINE                   static __forceinline
 #endif
 /* Function does not return to its caller */
 #ifndef   __NO_RETURN
  #define __NO_RETURN                            __declspec(noreturn)
 #endif
 /* Marks a symbol as deprecated */
 #ifndef   CMSIS_DEPRECATED
  #define CMSIS_DEPRECATED                       __attribute__((deprecated))
 #endif
 /* Symbol attributes: used / weak / packed */
 #ifndef   __USED
  #define __USED                                 __attribute__((used))
 #endif
 #ifndef   __WEAK
  #define __WEAK                                 __attribute__((weak))
 #endif
 #ifndef   __PACKED
  #define __PACKED                               __attribute__((packed))
 #endif
 #ifndef   __PACKED_STRUCT
  #define __PACKED_STRUCT                        __packed struct
 #endif
 /* Unaligned access helpers: the address is cast to a __packed-qualified
    pointer of the requested width and then dereferenced. */
 #ifndef   __UNALIGNED_UINT16_WRITE
  #define __UNALIGNED_UINT16_WRITE(addr, val)    ((*((__packed uint16_t *)(addr))) = (val))
 #endif
 #ifndef   __UNALIGNED_UINT16_READ
  #define __UNALIGNED_UINT16_READ(addr)          (*((const __packed uint16_t *)(addr)))
 #endif
 #ifndef   __UNALIGNED_UINT32_WRITE
  #define __UNALIGNED_UINT32_WRITE(addr, val)    ((*((__packed uint32_t *)(addr))) = (val))
 #endif
 #ifndef   __UNALIGNED_UINT32_READ
  #define __UNALIGNED_UINT32_READ(addr)          (*((const __packed uint32_t *)(addr)))
 #endif
 /* Minimum alignment, in bytes, of the annotated object */
 #ifndef   __ALIGNED
  #define __ALIGNED(x)                           __attribute__((aligned(x)))
 #endif
 /* Compiler barrier, mapped to the __memory_changed() intrinsic */
 #ifndef   __COMPILER_BARRIER
  #define __COMPILER_BARRIER()                   __memory_changed()
 #endif
 /* ##########################  Core Instruction Access  ######################### */
 /* The macros in this group are plain aliases for compiler intrinsics; the
    three barrier macros additionally pass the fixed option value 0xF. */
 /**
  \brief   No Operation
 */
 #define __NOP                             __nop
 /**
  \brief   Wait For Interrupt
 */
 #define __WFI                             __wfi
 /**
  \brief   Wait For Event
 */
 #define __WFE                             __wfe
 /**
  \brief   Send Event
 */
 #define __SEV                             __sev
 /**
  \brief   Instruction Synchronization Barrier
 */
 #define __ISB()                           __isb(0xF)
 /**
  \brief   Data Synchronization Barrier
 */
 #define __DSB()                           __dsb(0xF)
 /**
  \brief   Data Memory Barrier
 */
 #define __DMB()                           __dmb(0xF)
 /**
  \brief   Reverse byte order (32 bit)
  \details Reverses the byte order in an unsigned integer value. For example, 0x12345678 becomes 0x78563412.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 #define __REV                             __rev
 /**
  \brief   Reverse byte order (16 bit)
  \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
           Written as an embedded-assembler function (REV16 on r0, then BX lr) that is
           placed in section ".rev16_text". The function is not provided at all when
           __NO_EMBEDDED_ASM is defined.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 #ifndef __NO_EMBEDDED_ASM
 __attribute__((section(".rev16_text"))) __STATIC_INLINE __ASM uint32_t __REV16(uint32_t value)
 {
  rev16 r0, r0
  bx lr
 }
 #endif
 /**
  \brief   Reverse byte order (16 bit)
  \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
           Written as an embedded-assembler function (REVSH on r0, then BX lr) that is
           placed in section ".revsh_text". The function is not provided at all when
           __NO_EMBEDDED_ASM is defined.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 #ifndef __NO_EMBEDDED_ASM
 __attribute__((section(".revsh_text"))) __STATIC_INLINE __ASM int16_t __REVSH(int16_t value)
 {
  revsh r0, r0
  bx lr
 }
 #endif
 /**
  \brief   Rotate Right in unsigned value (32 bit)
  \details Alias for the compiler intrinsic __ror.
  \param [in]    op1  Value to rotate
  \param [in]    op2  Number of Bits to rotate
  \return               Rotated value
 */
 #define __ROR                             __ror
 /**
  \brief   Breakpoint
  \details Forwards its argument to the compiler intrinsic __breakpoint.
  \param [in]    value  is ignored by the processor.
                 If required, a debugger can use it to store additional information about the breakpoint.
 */
 #define __BKPT(value)                     __breakpoint(value)
 /**
  \brief   Reverse bit order of value
  \details Alias for the compiler intrinsic __rbit.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 #define __RBIT                            __rbit
 /**
  \brief   Count leading zeros
  \details Alias for the compiler intrinsic __clz.
  \param [in]  value  Value to count the leading zeros
  \return             number of leading zeros in value
 */
 #define __CLZ                             __clz
 /* Exclusive load/store wrappers around the __ldrex / __strex intrinsics.
    Each macro has two variants selected on __ARMCC_VERSION: below 5060020 the
    intrinsic is called directly; otherwise (including when __ARMCC_VERSION is
    not defined) the call is wrapped in push / diag_suppress 3731 / pop pragmas.
    NOTE(review): diagnostic 3731 is presumably the deprecation warning for
    these intrinsics - confirm against the compiler documentation. */
 /**
  \brief   LDR Exclusive (8 bit)
  \details Executes an exclusive LDR instruction for an 8 bit value.
  \param [in]    ptr  Pointer to data
  \return             value of type uint8_t at (*ptr)
 */
 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
  #define __LDREXB(ptr)                                                        ((uint8_t ) __ldrex(ptr))
 #else
  #define __LDREXB(ptr)          _Pragma("push") _Pragma("diag_suppress 3731") ((uint8_t ) __ldrex(ptr))  _Pragma("pop")
 #endif
 /**
  \brief   LDR Exclusive (16 bit)
  \details Executes an exclusive LDR instruction for 16 bit values.
  \param [in]    ptr  Pointer to data
  \return        value of type uint16_t at (*ptr)
 */
 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
  #define __LDREXH(ptr)                                                        ((uint16_t) __ldrex(ptr))
 #else
  #define __LDREXH(ptr)          _Pragma("push") _Pragma("diag_suppress 3731") ((uint16_t) __ldrex(ptr))  _Pragma("pop")
 #endif
 /**
  \brief   LDR Exclusive (32 bit)
  \details Executes an exclusive LDR instruction for 32 bit values.
  \param [in]    ptr  Pointer to data
  \return        value of type uint32_t at (*ptr)
 */
 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
  #define __LDREXW(ptr)                                                        ((uint32_t ) __ldrex(ptr))
 #else
  #define __LDREXW(ptr)          _Pragma("push") _Pragma("diag_suppress 3731") ((uint32_t ) __ldrex(ptr))  _Pragma("pop")
 #endif
 /**
  \brief   STR Exclusive (8 bit)
  \details Executes an exclusive STR instruction for 8 bit values.
  \param [in]  value  Value to store
  \param [in]    ptr  Pointer to location
  \return          0  Function succeeded
  \return          1  Function failed
 */
 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
  #define __STREXB(value, ptr)                                                 __strex(value, ptr)
 #else
  #define __STREXB(value, ptr)   _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr)        _Pragma("pop")
 #endif
 /**
  \brief   STR Exclusive (16 bit)
  \details Executes an exclusive STR instruction for 16 bit values.
  \param [in]  value  Value to store
  \param [in]    ptr  Pointer to location
  \return          0  Function succeeded
  \return          1  Function failed
 */
 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
  #define __STREXH(value, ptr)                                                 __strex(value, ptr)
 #else
  #define __STREXH(value, ptr)   _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr)        _Pragma("pop")
 #endif
 /**
  \brief   STR Exclusive (32 bit)
  \details Executes an exclusive STR instruction for 32 bit values.
  \param [in]  value  Value to store
  \param [in]    ptr  Pointer to location
  \return          0  Function succeeded
  \return          1  Function failed
 */
 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
  #define __STREXW(value, ptr)                                                 __strex(value, ptr)
 #else
  #define __STREXW(value, ptr)   _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr)        _Pragma("pop")
 #endif
 /**
  \brief   Remove the exclusive lock
  \details Removes the exclusive lock which is created by LDREX.
           Alias for the compiler intrinsic __clrex.
 */
 #define __CLREX                           __clrex
 /**
  \brief   Signed Saturate
  \details Saturates a signed value. Alias for the compiler intrinsic __ssat.
  \param [in]  value  Value to be saturated
  \param [in]    sat  Bit position to saturate to (1..32)
  \return             Saturated value
 */
 #define __SSAT                            __ssat
 /**
  \brief   Unsigned Saturate
  \details Saturates an unsigned value. Alias for the compiler intrinsic __usat.
  \param [in]  value  Value to be saturated
  \param [in]    sat  Bit position to saturate to (0..31)
  \return             Saturated value
 */
 #define __USAT                            __usat
 /* ###########################  Core Function Access  ########################### */
 /* __enable_irq / __disable_irq are not defined here: the comments below only
    document them as compiler intrinsics. The FIQ helpers are aliases for the
    __enable_fiq / __disable_fiq intrinsics. */
 /**
  \brief   Enable IRQ Interrupts
  \details Enables IRQ interrupts by clearing the I-bit in the CPSR.
           Can only be executed in Privileged modes.
 */
 /* intrinsic void __enable_irq(); */
 /**
  \brief   Disable IRQ Interrupts
  \details Disables IRQ interrupts by setting the I-bit in the CPSR.
           Can only be executed in Privileged modes.
 */
 /* intrinsic void __disable_irq(void); */
 /**
  \brief   Enable FIQ
  \details Enables FIQ interrupts by clearing the F-bit in the CPSR.
           Can only be executed in Privileged modes.
 */
 #define __enable_fault_irq                __enable_fiq
 /**
  \brief   Disable FIQ
  \details Disables FIQ interrupts by setting the F-bit in the CPSR.
           Can only be executed in Privileged modes.
 */
 #define __disable_fault_irq               __disable_fiq
 /**
  \brief   Read the Floating Point Status/Control register (FPSCR)
  \details The register is only accessed when both __FPU_PRESENT and __FPU_USED
           are defined as 1U; in every other configuration 0U is returned.
  \return  FPSCR value, or 0U when the FPU is absent or unused
 */
 __STATIC_INLINE uint32_t __get_FPSCR(void)
 {
 #if !((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
       (defined (__FPU_USED   ) && (__FPU_USED    == 1U)))
  /* FPU absent or not used: nothing to read. */
  return 0U;
 #else
  /* Named register variable bound to "fpscr"; reading it reads the register. */
  register uint32_t fpscr_reg __ASM("fpscr");
  return fpscr_reg;
 #endif
 }
 /**
  \brief   Write the Floating Point Status/Control register (FPSCR)
  \details The register is only written when both __FPU_PRESENT and __FPU_USED
           are defined as 1U; otherwise the argument is discarded.
  \param [in]    fpscr  Floating Point Status/Control value to set
 */
 __STATIC_INLINE void __set_FPSCR(uint32_t fpscr)
 {
 #if !((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
       (defined (__FPU_USED   ) && (__FPU_USED    == 1U)))
  /* FPU absent or not used: consume the argument so it is not flagged unused. */
  (void)fpscr;
 #else
  /* Named register variable bound to "fpscr"; assigning it writes the register. */
  register uint32_t fpscr_reg __ASM("fpscr");
  fpscr_reg = fpscr;
 #endif
 }
 /** \brief  Read the Current Program Status Register (CPSR)
     \details Uses a named register variable bound to "cpsr".
     \return               CPSR Register value
 */
 __STATIC_INLINE uint32_t __get_CPSR(void)
 {
  register uint32_t cpsr_reg __ASM("cpsr");
  return cpsr_reg;
 }
 /** \brief  Set CPSR (Current Program Status Register)
    \param [in]    cpsr  CPSR value to set
 */
 __STATIC_INLINE void __set_CPSR(uint32_t cpsr)
 {
  register uint32_t __regCPSR          __ASM("cpsr");
  __regCPSR = cpsr;
 }
 /** \brief  Get the current processor mode
     \details Reads CPSR via __get_CPSR() and keeps only its five lowest bits (mask 0x1F).
     \return                Processor Mode
 */
 __STATIC_INLINE uint32_t __get_mode(void)
 {
  const uint32_t status = __get_CPSR();
  return status & 0x1FU;
 }
 /** \brief  Set Mode
     \details Embedded-assembler routine: copies LR to r1, writes r0 to CPSR_C and
              returns with BX r1. The mode argument is expected in r0 (first argument
              register of the procedure call standard).
              NOTE(review): LR is presumably saved to r1 first because LR is banked per
              mode and would no longer hold the caller's return address after the
              switch - confirm against the architecture manual.
     \param [in]    mode  Mode value to set
 */
 __STATIC_INLINE __ASM void __set_mode(uint32_t mode)
 {
  MOV  r1, lr
  MSR  CPSR_C, r0
  BX   r1
 }
 /** \brief  Get Stack Pointer
     \details Embedded-assembler routine: copies SP into r0 and returns with BX lr.
     \return Stack Pointer
 */
 __STATIC_INLINE __ASM uint32_t __get_SP(void)
 {
  MOV  r0, sp
  BX   lr
 }
 /** \brief  Set Stack Pointer
     \details Embedded-assembler routine: copies r0 into SP and returns with BX lr.
              The new value is expected in r0 (first argument register of the
              procedure call standard).
     \param [in]    stack  Stack Pointer value to set
 */
 __STATIC_INLINE __ASM void __set_SP(uint32_t stack)
 {
  MOV  sp, r0
  BX   lr
 }
 /** \brief  Get USR/SYS Stack Pointer
     \details Embedded-assembler routine, assembled as ARM code. Sequence: save CPSR in
              R1, issue CPS #0x1F, copy SP to R0, restore the CPSR control field from
              R1, ISB, return. Both the CPS and the restoring MSR are marked in the
              code as having no effect in USR mode.
              NOTE(review): mode 0x1F is presumably System mode, which shares its SP
              with User mode - confirm against the architecture manual.
     \return USR/SYS Stack Pointer
 */
 __STATIC_INLINE __ASM uint32_t __get_SP_usr(void)
 {
  ARM
  PRESERVE8
  MRS     R1, CPSR
  CPS     #0x1F       ;no effect in USR mode
  MOV     R0, SP
  MSR     CPSR_c, R1  ;no effect in USR mode
  ISB
  BX      LR
 }
 /** \brief  Set USR/SYS Stack Pointer
     \details Embedded-assembler routine, assembled as ARM code. Sequence: save CPSR in
              R1, issue CPS #0x1F, copy R0 into SP, restore the CPSR control field from
              R1, ISB, return. Both the CPS and the restoring MSR are marked in the
              code as having no effect in USR mode.
              NOTE(review): mode 0x1F is presumably System mode, which shares its SP
              with User mode - confirm against the architecture manual.
     \param [in]    topOfProcStack  USR/SYS Stack Pointer value to set
 */
 __STATIC_INLINE __ASM void __set_SP_usr(uint32_t topOfProcStack)
 {
  ARM
  PRESERVE8
  MRS     R1, CPSR
  CPS     #0x1F       ;no effect in USR mode
  MOV     SP, R0
  MSR     CPSR_c, R1  ;no effect in USR mode
  ISB
  BX      LR
 }
 /** \brief  Get FPEXC (Floating Point Exception Control Register)
     \details Reads FPEXC through a named register variable when __FPU_PRESENT is
              defined as 1. In any other configuration no register access is made.
              The guard tests defined(__FPU_PRESENT) first so that an undefined
              macro is never evaluated, matching the FPSCR accessors.
     \return               Floating Point Exception Control Register value, or 0U without an FPU
 */
 __STATIC_INLINE uint32_t __get_FPEXC(void)
 {
 #if (defined (__FPU_PRESENT) && (__FPU_PRESENT == 1))
  register uint32_t __regfpexc         __ASM("fpexc");
  return(__regfpexc);
 #else
  return(0U);
 #endif
 }
 /** \brief  Set FPEXC (Floating Point Exception Control Register)
     \details Writes FPEXC through a named register variable when __FPU_PRESENT is
              defined as 1. In any other configuration the argument is explicitly
              discarded so the parameter is not reported as unused, matching
              __set_FPSCR.
     \param [in]    fpexc  Floating Point Exception Control value to set
 */
 __STATIC_INLINE void __set_FPEXC(uint32_t fpexc)
 {
 #if (defined (__FPU_PRESENT) && (__FPU_PRESENT == 1))
  register uint32_t __regfpexc         __ASM("fpexc");
  __regfpexc = (fpexc);
 #else
  (void)fpexc;
 #endif
 }
 /*
 * Include common core functions to access Coprocessor 15 registers
 */
 /* 32-bit coprocessor register read: declares a named register variable whose
    name string is assembled from the arguments as "cp<cp>:<op1>:c<CRn>:c<CRm>:<op2>"
    and copies its value into Rt. */
 #define __get_CP(cp, op1, Rt, CRn, CRm, op2) do { register volatile uint32_t tmp __ASM("cp" # cp ":" # op1 ":c" # CRn ":c" # CRm ":" # op2); (Rt) = tmp; } while(0)
 /* 32-bit coprocessor register write: same named register variable as above,
    assigned from Rt. */
 #define __set_CP(cp, op1, Rt, CRn, CRm, op2) do { register volatile uint32_t tmp __ASM("cp" # cp ":" # op1 ":c" # CRn ":c" # CRm ":" # op2); tmp = (Rt); } while(0)
 /* 64-bit coprocessor register read: an MRRC instruction fills the two local
    32-bit variables ltmp (low word) and htmp (high word), which are then
    combined into a 64-bit value stored in Rt. */
 #define __get_CP64(cp, op1, Rt, CRm) \
  do { \
    uint32_t ltmp, htmp; \
    __ASM volatile("MRRC p" # cp ", " # op1 ", ltmp, htmp, c" # CRm); \
    (Rt) = ((((uint64_t)htmp) << 32U) | ((uint64_t)ltmp)); \
  } while(0)
 /* 64-bit coprocessor register write: Rt is evaluated once, split into low and
    high 32-bit words, and handed to an MCRR instruction. */
 #define __set_CP64(cp, op1, Rt, CRm) \
  do { \
    const uint64_t tmp = (Rt); \
    const uint32_t ltmp = (uint32_t)(tmp); \
    const uint32_t htmp = (uint32_t)(tmp >> 32U); \
    __ASM volatile("MCRR p" # cp ", " # op1 ", ltmp, htmp, c" # CRm); \
  } while(0)
 #include "cmsis_cp15.h"
 /** \brief  Enable Floating Point Unit
     \details Embedded-assembler routine, assembled as ARM code. Steps, in order:
              1. set bits 0x00F00000 in the CP15 register read with
                 "MRC p15,0,R1,c1,c0,2" (CPACR according to the inline comment),
              2. ISB,
              3. set bit 0x40000000 in FPEXC,
              4. zero D0-D15, and D16-D31 as well when the assembler variable
                 {TARGET_FEATURE_EXTENSION_REGISTER_COUNT} equals 32,
              5. AND FPSCR with the mask 0x00086060.
              Uses R1 and R2 as scratch registers.
              Critical section, called from undef handler, so systick is disabled
 */
 __STATIC_INLINE __ASM void __FPU_Enable(void)
 {
        ARM
        //Permit access to VFP/NEON, registers by modifying CPACR
        MRC     p15,0,R1,c1,c0,2
        ORR     R1,R1,#0x00F00000
        MCR     p15,0,R1,c1,c0,2
        //Ensure that subsequent instructions occur in the context of VFP/NEON access permitted
        ISB
        //Enable VFP/NEON
        VMRS    R1,FPEXC
        ORR     R1,R1,#0x40000000
        VMSR    FPEXC,R1
        //Initialise VFP/NEON registers to 0
        MOV     R2,#0
        //Initialise D16 registers to 0
        VMOV    D0, R2,R2
        VMOV    D1, R2,R2
        VMOV    D2, R2,R2
        VMOV    D3, R2,R2
        VMOV    D4, R2,R2
        VMOV    D5, R2,R2
        VMOV    D6, R2,R2
        VMOV    D7, R2,R2
        VMOV    D8, R2,R2
        VMOV    D9, R2,R2
        VMOV    D10,R2,R2
        VMOV    D11,R2,R2
        VMOV    D12,R2,R2
        VMOV    D13,R2,R2
        VMOV    D14,R2,R2
        VMOV    D15,R2,R2
  IF {TARGET_FEATURE_EXTENSION_REGISTER_COUNT} == 32
        //Initialise D32 registers to 0
        VMOV    D16,R2,R2
        VMOV    D17,R2,R2
        VMOV    D18,R2,R2
        VMOV    D19,R2,R2
        VMOV    D20,R2,R2
        VMOV    D21,R2,R2
        VMOV    D22,R2,R2
        VMOV    D23,R2,R2
        VMOV    D24,R2,R2
        VMOV    D25,R2,R2
        VMOV    D26,R2,R2
        VMOV    D27,R2,R2
        VMOV    D28,R2,R2
        VMOV    D29,R2,R2
        VMOV    D30,R2,R2
        VMOV    D31,R2,R2
  ENDIF
        //Initialise FPSCR to a known state
        VMRS    R1,FPSCR
        LDR     R2,=0x00086060 //Mask off all bits that do not have to be preserved. Non-preserved bits can/should be zero.
        AND     R1,R1,R2
        VMSR    FPSCR,R1
        BX      LR
 }
 #endif /* __CMSIS_ARMCC_H */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_armclang.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_armclang.h
@@ -1,614 +0,0 @@
 /**************************************************************************//**
 * @file     cmsis_armclang.h
 * @brief    CMSIS compiler specific macros, functions, instructions
 * @version  V1.2.1
 * @date     05. May 2021
 ******************************************************************************/
 /*
 * Copyright (c) 2009-2021 Arm Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef __CMSIS_ARMCLANG_H
 #define __CMSIS_ARMCLANG_H
 #pragma clang system_header   /* treat file as system include file */
 /* CMSIS compiler specific defines */
 /* Every macro below is guarded by #ifndef, so a definition supplied before
    this header is included is left untouched. */
 /* Inline assembler keyword */
 #ifndef   __ASM
  #define __ASM                                  __asm
 #endif
 /* Inline hint */
 #ifndef   __INLINE
  #define __INLINE                               __inline
 #endif
 /* Forced inlining */
 #ifndef   __FORCEINLINE
  #define __FORCEINLINE                          __attribute__((always_inline))
 #endif
 /* Inline function with internal linkage */
 #ifndef   __STATIC_INLINE
  #define __STATIC_INLINE                        static __inline
 #endif
 /* Forced-inline function with internal linkage */
 #ifndef   __STATIC_FORCEINLINE
  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static __inline
 #endif
 /* Function does not return to its caller */
 #ifndef   __NO_RETURN
  #define __NO_RETURN                            __attribute__((__noreturn__))
 #endif
 /* Marks a symbol as deprecated */
 #ifndef   CMSIS_DEPRECATED
  #define CMSIS_DEPRECATED                       __attribute__((deprecated))
 #endif
 /* Symbol attributes: used / weak / packed */
 #ifndef   __USED
  #define __USED                                 __attribute__((used))
 #endif
 #ifndef   __WEAK
  #define __WEAK                                 __attribute__((weak))
 #endif
 #ifndef   __PACKED
  #define __PACKED                               __attribute__((packed, aligned(1)))
 #endif
 #ifndef   __PACKED_STRUCT
  #define __PACKED_STRUCT                        struct __attribute__((packed, aligned(1)))
 #endif
 /* Unaligned access helpers: each one declares a packed single-member struct
    and accesses the target through a pointer to that struct. -Wpacked is
    silenced only around the struct declarations. */
 #ifndef   __UNALIGNED_UINT16_WRITE
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wpacked"
 /*lint -esym(9058, T_UINT16_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_WRITE */
  __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
  #pragma clang diagnostic pop
  #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
 #endif
 #ifndef   __UNALIGNED_UINT16_READ
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wpacked"
 /*lint -esym(9058, T_UINT16_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_READ */
  __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
  #pragma clang diagnostic pop
  #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
 #endif
 #ifndef   __UNALIGNED_UINT32_WRITE
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wpacked"
 /*lint -esym(9058, T_UINT32_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_WRITE */
  __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
  #pragma clang diagnostic pop
  #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
 #endif
 #ifndef   __UNALIGNED_UINT32_READ
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wpacked"
  __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
  #pragma clang diagnostic pop
  #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
 #endif
 /* Minimum alignment, in bytes, of the annotated object */
 #ifndef   __ALIGNED
  #define __ALIGNED(x)                           __attribute__((aligned(x)))
 #endif
 /* Compiler-only barrier: an empty asm statement with a "memory" clobber */
 #ifndef   __COMPILER_BARRIER
  #define __COMPILER_BARRIER()                   __ASM volatile("":::"memory")
 #endif
 /* ##########################  Core Instruction Access  ######################### */
 /* Most macros in this group are aliases for compiler builtins; the three
    barrier macros additionally pass the fixed option value 0xF. */
 /**
  \brief   No Operation
 */
 #define __NOP                             __builtin_arm_nop
 /**
  \brief   Wait For Interrupt
 */
 #define __WFI                             __builtin_arm_wfi
 /**
  \brief   Wait For Event
 */
 #define __WFE                             __builtin_arm_wfe
 /**
  \brief   Send Event
 */
 #define __SEV                             __builtin_arm_sev
 /**
  \brief   Instruction Synchronization Barrier
 */
 #define __ISB()                           __builtin_arm_isb(0xF)
 /**
  \brief   Data Synchronization Barrier
 */
 #define __DSB()                           __builtin_arm_dsb(0xF)
 /**
  \brief   Data Memory Barrier
 */
 #define __DMB()                           __builtin_arm_dmb(0xF)
 /**
  \brief   Reverse byte order (32 bit)
  \details Reverses the byte order in an unsigned integer value. For example, 0x12345678 becomes 0x78563412.
           Implemented with __builtin_bswap32.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 #define __REV(value)   __builtin_bswap32(value)
 /**
  \brief   Reverse byte order (16 bit)
  \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
           Implemented as a full 32-bit byte reversal (__REV) followed by a rotate
           right by 16 bits (__ROR), which swaps the two halfwords back into place.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 #define __REV16(value) __ROR(__REV(value), 16)
 /**
  \brief   Reverse byte order (16 bit)
  \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
           Implemented with __builtin_bswap16 and a cast to int16_t. The whole
           expansion is parenthesized so that the macro behaves as a single
           operand in any surrounding expression (e.g. as the operand of sizeof).
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 #define __REVSH(value) ((int16_t)__builtin_bswap16(value))
 /**
  \brief   Rotate a 32-bit unsigned value to the right
  \details The rotate count is first reduced modulo 32. A reduced count of zero
           returns the input unchanged, which also avoids a left shift by 32 bits.
  \param [in]    op1  Value to rotate
  \param [in]    op2  Number of Bits to rotate
  \return               Rotated value
 */
 __STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
 {
  const uint32_t shift = op2 % 32U;

  return (shift != 0U) ? ((op1 >> shift) | (op1 << (32U - shift))) : op1;
 }
 /**
  \brief   Breakpoint
  \details Emits a "bkpt" instruction through inline assembly; the argument is
           stringified into the instruction text, so it has to be a literal the
           assembler accepts as an immediate.
  \param [in]    value  is ignored by the processor.
                 If required, a debugger can use it to store additional information about the breakpoint.
 */
 #define __BKPT(value)   __ASM volatile ("bkpt "#value)
 /**
  \brief   Reverse bit order of value
  \details Alias for the compiler builtin __builtin_arm_rbit.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 #define __RBIT          __builtin_arm_rbit
 /**
  \brief   Count leading zeros
  \details Returns 32 for an input of zero and the result of __builtin_clz otherwise.
  \param [in]  value  Value to count the leading zeros
  \return             number of leading zeros in value
 */
 __STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
 {
  /* __builtin_clz(0) is formally undefined behaviour even though it yields a
     CLZ instruction on ARM, so zero is answered explicitly. That keeps results
     ARM-compatible when compiled for a non-ARM target and stops the compiler
     from assuming "value was passed to __builtin_clz, hence it is non-zero".
     ARM Compiler 6.10 and possibly earlier will optimise this test away, leaving
     a single CLZ instruction.
   */
  return (value != 0U) ? (uint8_t)__builtin_clz(value) : (uint8_t)32U;
 }
 /* Exclusive access and saturation helpers. The LDREX / STREX macros are
    object-like: they expand to a cast followed by the builtin name, so the
    cast applies to the result of the call written after the macro name. */
 /**
  \brief   LDR Exclusive (8 bit)
  \details Executes an exclusive LDR instruction for an 8 bit value.
  \param [in]    ptr  Pointer to data
  \return             value of type uint8_t at (*ptr)
 */
 #define __LDREXB        (uint8_t)__builtin_arm_ldrex
 /**
  \brief   LDR Exclusive (16 bit)
  \details Executes an exclusive LDR instruction for 16 bit values.
  \param [in]    ptr  Pointer to data
  \return        value of type uint16_t at (*ptr)
 */
 #define __LDREXH        (uint16_t)__builtin_arm_ldrex
 /**
  \brief   LDR Exclusive (32 bit)
  \details Executes an exclusive LDR instruction for 32 bit values.
  \param [in]    ptr  Pointer to data
  \return        value of type uint32_t at (*ptr)
 */
 #define __LDREXW        (uint32_t)__builtin_arm_ldrex
 /**
  \brief   STR Exclusive (8 bit)
  \details Executes an exclusive STR instruction for 8 bit values.
  \param [in]  value  Value to store
  \param [in]    ptr  Pointer to location
  \return          0  Function succeeded
  \return          1  Function failed
 */
 #define __STREXB        (uint32_t)__builtin_arm_strex
 /**
  \brief   STR Exclusive (16 bit)
  \details Executes an exclusive STR instruction for 16 bit values.
  \param [in]  value  Value to store
  \param [in]    ptr  Pointer to location
  \return          0  Function succeeded
  \return          1  Function failed
 */
 #define __STREXH        (uint32_t)__builtin_arm_strex
 /**
  \brief   STR Exclusive (32 bit)
  \details Executes an exclusive STR instruction for 32 bit values.
  \param [in]  value  Value to store
  \param [in]    ptr  Pointer to location
  \return          0  Function succeeded
  \return          1  Function failed
 */
 #define __STREXW        (uint32_t)__builtin_arm_strex
 /**
  \brief   Remove the exclusive lock
  \details Removes the exclusive lock which is created by LDREX.
 */
 #define __CLREX             __builtin_arm_clrex
 /**
  \brief   Signed Saturate
  \details Saturates a signed value.
  \param [in]  value  Value to be saturated
  \param [in]    sat  Bit position to saturate to (1..32)
  \return             Saturated value
 */
 #define __SSAT             __builtin_arm_ssat
 /**
  \brief   Unsigned Saturate
  \details Saturates an unsigned value.
  \param [in]  value  Value to be saturated
  \param [in]    sat  Bit position to saturate to (0..31)
  \return             Saturated value
 */
 #define __USAT             __builtin_arm_usat
 /* ###################  Compiler specific Intrinsics  ########################### */
 /** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
  Access to dedicated SIMD instructions
  @{
 */
 #if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
 /* SIMD intrinsic names mapped one-to-one onto compiler builtins.
    Each name is defined exactly once. */
 #define     __SADD8                 __builtin_arm_sadd8
 #define     __SADD16                __builtin_arm_sadd16
 #define     __QADD8                 __builtin_arm_qadd8
 #define     __QSUB8                 __builtin_arm_qsub8
 #define     __QADD16                __builtin_arm_qadd16
 #define     __SHADD16               __builtin_arm_shadd16
 #define     __QSUB16                __builtin_arm_qsub16
 #define     __SHSUB16               __builtin_arm_shsub16
 #define     __QASX                  __builtin_arm_qasx
 #define     __SHASX                 __builtin_arm_shasx
 #define     __QSAX                  __builtin_arm_qsax
 #define     __SHSAX                 __builtin_arm_shsax
 #define     __SXTB16                __builtin_arm_sxtb16
 #define     __SMUAD                 __builtin_arm_smuad
 #define     __SMUADX                __builtin_arm_smuadx
 #define     __SMLAD                 __builtin_arm_smlad
 #define     __SMLADX                __builtin_arm_smladx
 #define     __SMLALD                __builtin_arm_smlald
 #define     __SMLALDX               __builtin_arm_smlaldx
 #define     __SMUSD                 __builtin_arm_smusd
 #define     __SMUSDX                __builtin_arm_smusdx
 #define     __SMLSDX                __builtin_arm_smlsdx
 #define     __USAT16                __builtin_arm_usat16
 #define     __SSUB8                 __builtin_arm_ssub8
 #define     __SXTAB16               __builtin_arm_sxtab16
 /**
  \brief   Signed add performed by the QADD instruction
  \param [in]    op1  First operand
  \param [in]    op2  Second operand
  \return             Value produced by "qadd" for op1 and op2
 */
 __STATIC_FORCEINLINE int32_t __QADD(int32_t op1, int32_t op2)
 {
  int32_t sum;

  __ASM volatile ("qadd %0, %1, %2" : "=r" (sum) : "r" (op1), "r" (op2) );
  return sum;
 }
 /**
  \brief   Signed subtract performed by the QSUB instruction
  \param [in]    op1  First operand
  \param [in]    op2  Second operand
  \return             Value produced by "qsub" for op1 and op2
 */
 __STATIC_FORCEINLINE int32_t __QSUB(int32_t op1, int32_t op2)
 {
  int32_t difference;

  __ASM volatile ("qsub %0, %1, %2" : "=r" (difference) : "r" (op1), "r" (op2) );
  return difference;
 }
 /* Pack halfwords, bottom/top: bits [15:0] come from ARG1, bits [31:16] come
    from ARG2 shifted left by ARG3. */
 #define __PKHBT(ARG1,ARG2,ARG3) \
   ( ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL) | \
     ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) )
 /* Pack halfwords, top/bottom: bits [31:16] come from ARG1, bits [15:0] come
    from ARG2 shifted right (as an unsigned value) by ARG3. */
 #define __PKHTB(ARG1,ARG2,ARG3) \
   ( ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL) | \
     ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) )
 /**
  \brief   Three-operand operation performed by the SMMLA instruction
  \param [in]    op1  First operand of "smmla"
  \param [in]    op2  Second operand of "smmla"
  \param [in]    op3  Third operand of "smmla"
  \return             Value produced by "smmla" for the three operands
 */
 __STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
 {
  int32_t accumulated;

  __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (accumulated): "r"  (op1), "r" (op2), "r" (op3) );
  return accumulated;
 }
 #endif /* (__ARM_FEATURE_DSP == 1) */
 /* ###########################  Core Function Access  ########################### */
 /**
  \brief   Enable IRQ Interrupts
  \details Enables IRQ interrupts by clearing the I-bit in the CPSR.
           Can only be executed in Privileged modes.
           Issues "cpsie i"; the "memory" clobber keeps the compiler from moving
           memory accesses across the instruction.
 */
 __STATIC_FORCEINLINE void __enable_irq(void)
 {
  __ASM volatile ("cpsie i" : : : "memory");
 }
 /**
  \brief   Disable IRQ Interrupts
  \details Disables IRQ interrupts by setting the I-bit in the CPSR.
           Can only be executed in Privileged modes.
           Issues "cpsid i"; the "memory" clobber keeps the compiler from moving
           memory accesses across the instruction.
 */
 __STATIC_FORCEINLINE void __disable_irq(void)
 {
  __ASM volatile ("cpsid i" : : : "memory");
 }
 /**
  \brief   Enable FIQ
  \details Enables FIQ interrupts by clearing the F-bit in the CPSR.
           Can only be executed in Privileged modes.
           Issues "cpsie f"; the "memory" clobber keeps the compiler from moving
           memory accesses across the instruction.
 */
 __STATIC_FORCEINLINE void __enable_fault_irq(void)
 {
  __ASM volatile ("cpsie f" : : : "memory");
 }
 /**
  \brief   Disable FIQ
  \details Disables FIQ interrupts by setting the F-bit in the CPSR.
           Can only be executed in Privileged modes.
           Issues "cpsid f"; the "memory" clobber keeps the compiler from moving
           memory accesses across the instruction.
 */
 __STATIC_FORCEINLINE void __disable_fault_irq(void)
 {
  __ASM volatile ("cpsid f" : : : "memory");
 }
 /**
  \brief   Get FPSCR
  \details Returns the current value of the Floating Point Status/Control register.
           Alias for the compiler builtin __builtin_arm_get_fpscr.
  \return               Floating Point Status/Control register value
 */
 #define __get_FPSCR      __builtin_arm_get_fpscr
 /**
  \brief   Set FPSCR
  \details Assigns the given value to the Floating Point Status/Control register.
           Alias for the compiler builtin __builtin_arm_set_fpscr.
  \param [in]    fpscr  Floating Point Status/Control value to set
 */
 #define __set_FPSCR      __builtin_arm_set_fpscr
 /** \brief  Read the CPSR register
     \details Issues "MRS <reg>, cpsr" and returns the register that was written.
     \return               CPSR Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_CPSR(void)
 {
  uint32_t cpsr_value;

  __ASM volatile("MRS %0, cpsr" : "=r" (cpsr_value) );
  return cpsr_value;
 }
 /** \brief  Set CPSR Register
     \details Issues "MSR cpsr, <reg>". The statement declares "cc" and "memory"
              clobbers, so the compiler treats condition flags and memory as
              modified by it.
     \param [in]    cpsr  CPSR value to set
 */
 __STATIC_FORCEINLINE void __set_CPSR(uint32_t cpsr)
 {
  __ASM volatile ("MSR cpsr, %0" : : "r" (cpsr) : "cc", "memory");
 }
 /** \brief  Get Mode
    Reads the CPSR through __get_CPSR() and keeps only its five least
    significant bits (mask 0x1F).
    \return                Processor Mode
 */
 __STATIC_FORCEINLINE uint32_t __get_mode(void)
 {
  const uint32_t cpsr = __get_CPSR();
  return cpsr & 0x1FU;
 }
 /** \brief  Set Mode
    Writes the given value to the CPSR control field with `MSR cpsr_c`.
    The value is passed to the instruction as-is: this function neither masks
    it nor merges it with the current CPSR contents.
    \param [in]    mode  Mode value to set
 */
 __STATIC_FORCEINLINE void __set_mode(uint32_t mode)
 {
  __ASM volatile("MSR  cpsr_c, %0" : : "r" (mode) : "memory");
 }
 /** \brief  Get Stack Pointer
    Copies the current `sp` register into a general purpose register.
    \return Stack Pointer value
 */
 __STATIC_FORCEINLINE uint32_t __get_SP(void)
 {
  uint32_t sp_value;
  __ASM volatile("MOV  %0, sp" : "=r" (sp_value) : : "memory");
  return sp_value;
 }
 /** \brief  Set Stack Pointer
    Moves the given value into the current `sp` register with a single `MOV`.
    \param [in]    stack  Stack Pointer value to set
 */
 __STATIC_FORCEINLINE void __set_SP(uint32_t stack)
 {
  __ASM volatile("MOV  sp, %0" : : "r" (stack) : "memory");
 }
 /** \brief  Get USR/SYS Stack Pointer
    Sequence: save the CPSR, switch with `CPS #0x1F`, copy `sp` into the
    result, restore the saved control bits through `cpsr_c`, then issue an ISB.
    Both asm operands are outputs, so they are always given distinct registers.
    \return USR/SYS Stack Pointer value
 */
 __STATIC_FORCEINLINE uint32_t __get_SP_usr(void)
 {
  /* Scratch copy of the CPSR taken before the mode switch. */
  uint32_t cpsr;
  /* Stack pointer value read while the CPS #0x1F switch is in effect. */
  uint32_t result;
  __ASM volatile(
    "MRS     %0, cpsr   \n"
    "CPS     #0x1F      \n" // no effect in USR mode
    "MOV     %1, sp     \n"
    "MSR     cpsr_c, %0 \n" // no effect in USR mode
    "ISB" :  "=r"(cpsr), "=r"(result) : : "memory"
   );
  return result;
 }
 /** \brief  Set USR/SYS Stack Pointer
    Sequence: save the CPSR, switch with `CPS #0x1F`, write `sp`, restore the
    saved control bits through `cpsr_c`, then issue an ISB.
    \param [in]    topOfProcStack  USR/SYS Stack Pointer value to set
 */
 __STATIC_FORCEINLINE void __set_SP_usr(uint32_t topOfProcStack)
 {
  /* Scratch copy of the CPSR taken before the mode switch. */
  uint32_t cpsr;
  /* cpsr (%0) is written by the first instruction, before topOfProcStack (%1)
     is consumed by the MOV. It therefore has to be an early-clobber output
     ("=&r"); with a plain "=r" the compiler may assign both operands to the
     same register, and SP would then be loaded with the CPSR value. */
  __ASM volatile(
    "MRS     %0, cpsr   \n"
    "CPS     #0x1F      \n" // no effect in USR mode
    "MOV     sp, %1     \n"
    "MSR     cpsr_c, %0 \n" // no effect in USR mode
    "ISB" : "=&r"(cpsr) : "r" (topOfProcStack) : "memory"
   );
 }
 /** \brief  Get FPEXC
    Reads FPEXC with `VMRS` when __FPU_PRESENT is 1; otherwise no instruction
    is emitted and 0 is returned.
    \return               Floating Point Exception Control register value
 */
 __STATIC_FORCEINLINE uint32_t __get_FPEXC(void)
 {
 #if (__FPU_PRESENT == 1)
  uint32_t fpexc;
  __ASM volatile("VMRS %0, fpexc" : "=r" (fpexc) : : "memory");
  return fpexc;
 #else
  return 0;
 #endif
 }
 /** \brief  Set FPEXC
    Writes FPEXC with `VMSR` when __FPU_PRESENT is 1; otherwise the function
    body is empty and the argument is ignored.
    \param [in]    fpexc  Floating Point Exception Control value to set
 */
 __STATIC_FORCEINLINE void __set_FPEXC(uint32_t fpexc)
 {
 #if (__FPU_PRESENT == 1)
  __ASM volatile ("VMSR fpexc, %0" : : "r" (fpexc) : "memory");
 #endif
 }
 /*
 * Include common core functions to access Coprocessor 15 registers
 *
 * The four macros below must be defined before cmsis_cp15.h is included,
 * because that header implements its accessors in terms of them.
 *
 * __get_CP / __set_CP      build an MRC / MCR instruction string by
 *                          stringizing cp, op1, CRn, CRm and op2; Rt is the
 *                          32-bit C operand bound to %0 (output for get,
 *                          input for set).
 * __get_CP64 / __set_CP64  build an MRRC / MCRR instruction string; Rt is a
 *                          64-bit C operand whose two 32-bit halves are
 *                          selected with the %Q0 / %R0 operand modifiers.
 *
 * Every variant is volatile and declares a "memory" clobber.
 */
 #define __get_CP(cp, op1, Rt, CRn, CRm, op2) __ASM volatile("MRC p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : "=r" (Rt) : : "memory" )
 #define __set_CP(cp, op1, Rt, CRn, CRm, op2) __ASM volatile("MCR p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : : "r" (Rt) : "memory" )
 #define __get_CP64(cp, op1, Rt, CRm)         __ASM volatile("MRRC p" # cp ", " # op1 ", %Q0, %R0, c" # CRm  : "=r" (Rt) : : "memory" )
 #define __set_CP64(cp, op1, Rt, CRm)         __ASM volatile("MCRR p" # cp ", " # op1 ", %Q0, %R0, c" # CRm  : : "r" (Rt) : "memory" )
 #include "cmsis_cp15.h"
 /** \brief  Enable Floating Point Unit
  Critical section, called from undef handler, so systick is disabled

  Steps performed by the single asm statement, in order:
   1. set bits 0x00F00000 in CPACR (CP15 c1,c0,2) and issue an ISB,
   2. set bit 0x40000000 in FPEXC,
   3. write zero to D0-D15, and to D16-D31 when __ARM_NEON is 1,
   4. AND the FPSCR with 0x00086060 and write it back.
  R1 and R2 are used as scratch registers and are listed as clobbers together
  with the condition flags.
 */
 __STATIC_INLINE void __FPU_Enable(void)
 {
  __ASM volatile(
    //Permit access to VFP/NEON, registers by modifying CPACR
    "        MRC     p15,0,R1,c1,c0,2  \n"
    "        ORR     R1,R1,#0x00F00000 \n"
    "        MCR     p15,0,R1,c1,c0,2  \n"
    //Ensure that subsequent instructions occur in the context of VFP/NEON access permitted
    "        ISB                       \n"
    //Enable VFP/NEON
    "        VMRS    R1,FPEXC          \n"
    "        ORR     R1,R1,#0x40000000 \n"
    "        VMSR    FPEXC,R1          \n"
    //Initialise VFP/NEON registers to 0 (R2 holds the zero source value)
    "        MOV     R2,#0             \n"
    //Initialise registers D0-D15 to 0
    "        VMOV    D0, R2,R2         \n"
    "        VMOV    D1, R2,R2         \n"
    "        VMOV    D2, R2,R2         \n"
    "        VMOV    D3, R2,R2         \n"
    "        VMOV    D4, R2,R2         \n"
    "        VMOV    D5, R2,R2         \n"
    "        VMOV    D6, R2,R2         \n"
    "        VMOV    D7, R2,R2         \n"
    "        VMOV    D8, R2,R2         \n"
    "        VMOV    D9, R2,R2         \n"
    "        VMOV    D10,R2,R2         \n"
    "        VMOV    D11,R2,R2         \n"
    "        VMOV    D12,R2,R2         \n"
    "        VMOV    D13,R2,R2         \n"
    "        VMOV    D14,R2,R2         \n"
    "        VMOV    D15,R2,R2         \n"
 #if (defined(__ARM_NEON) && (__ARM_NEON == 1))
    //Initialise registers D16-D31 to 0 (only assembled when __ARM_NEON == 1)
    "        VMOV    D16,R2,R2         \n"
    "        VMOV    D17,R2,R2         \n"
    "        VMOV    D18,R2,R2         \n"
    "        VMOV    D19,R2,R2         \n"
    "        VMOV    D20,R2,R2         \n"
    "        VMOV    D21,R2,R2         \n"
    "        VMOV    D22,R2,R2         \n"
    "        VMOV    D23,R2,R2         \n"
    "        VMOV    D24,R2,R2         \n"
    "        VMOV    D25,R2,R2         \n"
    "        VMOV    D26,R2,R2         \n"
    "        VMOV    D27,R2,R2         \n"
    "        VMOV    D28,R2,R2         \n"
    "        VMOV    D29,R2,R2         \n"
    "        VMOV    D30,R2,R2         \n"
    "        VMOV    D31,R2,R2         \n"
 #endif
    //Initialise FPSCR to a known state
    "        VMRS    R1,FPSCR          \n"
    "        LDR     R2,=0x00086060    \n" //Mask off all bits that do not have to be preserved. Non-preserved bits can/should be zero.
    "        AND     R1,R1,R2          \n"
    "        VMSR    FPSCR,R1            "
    : : : "cc", "r1", "r2"
  );
 }
 #endif /* __CMSIS_ARMCLANG_H */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_compiler.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_compiler.h
@@ -1,213 +0,0 @@
 /**************************************************************************//**
 * @file     cmsis_compiler.h
 * @brief    CMSIS compiler specific macros, functions, instructions
 * @version  V1.0.2
 * @date     10. January 2018
 ******************************************************************************/
 /*
 * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef __CMSIS_COMPILER_H
 #define __CMSIS_COMPILER_H
 #include <stdint.h>
 /*
 * Toolchain dispatch: the first branch whose predefined macro matches pulls
 * in the corresponding compiler-specific header.
 *
 * Arm Compiler 4/5
 */
 #if   defined ( __CC_ARM )
  #include "cmsis_armcc.h"
 /*
 * Arm Compiler 6 (armclang)
 * Tested before the generic __GNUC__ branch below.
 */
 #elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
  #include "cmsis_armclang.h"
 /*
 * GNU Compiler
 */
 #elif defined ( __GNUC__ )
  #include "cmsis_gcc.h"
 /*
 * IAR Compiler
 */
 #elif defined ( __ICCARM__ )
  #include "cmsis_iccarm.h"
 /*
 * TI Arm Compiler
 *
 * Besides including <cmsis_ccs.h>, this branch supplies a fallback for every
 * CMSIS compiler macro that header has not already defined. Each fallback is
 * individually guarded with #ifndef.
 */
 #elif defined ( __TI_ARM__ )
  #include <cmsis_ccs.h>
  #ifndef   __ASM
    #define __ASM                     __asm
  #endif
  #ifndef   __INLINE
    #define __INLINE                  inline
  #endif
  /* Defined once: a second, identical guard for __STATIC_INLINE could never
     take effect and has been dropped. */
  #ifndef   __STATIC_INLINE
    #define __STATIC_INLINE           static inline
  #endif
  #ifndef   __STATIC_FORCEINLINE
    #define __STATIC_FORCEINLINE      __STATIC_INLINE
  #endif
  #ifndef   __NO_RETURN
    #define __NO_RETURN               __attribute__((noreturn))
  #endif
  #ifndef   CMSIS_DEPRECATED
    #define CMSIS_DEPRECATED          __attribute__((deprecated))
  #endif
  #ifndef   __USED
    #define __USED                    __attribute__((used))
  #endif
  #ifndef   __WEAK
    #define __WEAK                    __attribute__((weak))
  #endif
  /* Unaligned 32-bit access goes through a packed single-member struct. */
  #ifndef   __UNALIGNED_UINT32
    struct __attribute__((packed)) T_UINT32 { uint32_t v; };
    #define __UNALIGNED_UINT32(x)     (((struct T_UINT32 *)(x))->v)
  #endif
  #ifndef   __ALIGNED
    #define __ALIGNED(x)              __attribute__((aligned(x)))
  #endif
  #ifndef   __PACKED
    #define __PACKED                  __attribute__((packed))
  #endif
  /* No barrier implementation is provided here: the macro expands to a no-op
     expression and a build warning is raised. */
  #ifndef   __COMPILER_BARRIER
    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
    #define __COMPILER_BARRIER()      (void)0
  #endif
 /*
 * TASKING Compiler
 *
 * No compiler header is included in this branch; it only supplies #ifndef
 * guarded fallbacks for the CMSIS compiler macros.
 */
 #elif defined ( __TASKING__ )
  /*
   * The CMSIS functions have been implemented as intrinsics in the compiler.
   * Please use "carm -?i" to get an up to date list of all intrinsics,
   * Including the CMSIS ones.
   */
  #ifndef   __ASM
    #define __ASM                     __asm
  #endif
  #ifndef   __INLINE
    #define __INLINE                  inline
  #endif
  #ifndef   __STATIC_INLINE
    #define __STATIC_INLINE           static inline
  #endif
  #ifndef   __STATIC_FORCEINLINE
    #define __STATIC_FORCEINLINE      __STATIC_INLINE
  #endif
  #ifndef   __NO_RETURN
    #define __NO_RETURN               __attribute__((noreturn))
  #endif
  #ifndef   CMSIS_DEPRECATED
    #define CMSIS_DEPRECATED          __attribute__((deprecated))
  #endif
  #ifndef   __USED
    #define __USED                    __attribute__((used))
  #endif
  #ifndef   __WEAK
    #define __WEAK                    __attribute__((weak))
  #endif
  /* Unaligned 32-bit access goes through a __packed__ single-member struct. */
  #ifndef   __UNALIGNED_UINT32
    struct __packed__ T_UINT32 { uint32_t v; };
    #define __UNALIGNED_UINT32(x)     (((struct T_UINT32 *)(x))->v)
  #endif
  #ifndef   __ALIGNED
    #define __ALIGNED(x)              __align(x)
  #endif
  #ifndef   __PACKED
    #define __PACKED                  __packed__
  #endif
  /* No barrier implementation is provided here: the macro expands to a no-op
     expression and a build warning is raised. */
  #ifndef   __COMPILER_BARRIER
    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
    #define __COMPILER_BARRIER()      (void)0
  #endif
 /*
 * COSMIC Compiler
 *
 * Besides including <cmsis_csm.h>, this branch supplies #ifndef guarded
 * fallbacks for the CMSIS compiler macros. __USED, CMSIS_DEPRECATED and
 * __ALIGNED expand to nothing here and raise a build warning.
 */
 #elif defined ( __CSMC__ )
   #include <cmsis_csm.h>
 #ifndef   __ASM
    #define __ASM                     _asm
  #endif
  #ifndef   __INLINE
    #define __INLINE                  inline
  #endif
  #ifndef   __STATIC_INLINE
    #define __STATIC_INLINE           static inline
  #endif
  #ifndef   __STATIC_FORCEINLINE
    #define __STATIC_FORCEINLINE      __STATIC_INLINE
  #endif
  #ifndef   __NO_RETURN
    // NO RETURN is automatically detected hence no warning here
    #define __NO_RETURN
  #endif
  #ifndef   __USED
    #warning No compiler specific solution for __USED. __USED is ignored.
    #define __USED
  #endif
  #ifndef   CMSIS_DEPRECATED
    #warning No compiler specific solution for CMSIS_DEPRECATED. CMSIS_DEPRECATED is ignored.
    #define CMSIS_DEPRECATED
  #endif
  #ifndef   __WEAK
    #define __WEAK                    __weak
  #endif
  /* Unaligned 32-bit access goes through an @packed single-member struct. */
  #ifndef   __UNALIGNED_UINT32
    @packed struct T_UINT32 { uint32_t v; };
    #define __UNALIGNED_UINT32(x)     (((struct T_UINT32 *)(x))->v)
  #endif
  #ifndef   __ALIGNED
    #warning No compiler specific solution for __ALIGNED. __ALIGNED is ignored.
    #define __ALIGNED(x)
  #endif
  #ifndef   __PACKED
    #define __PACKED                  @packed
  #endif
  /* No barrier implementation is provided here: the macro expands to a no-op
     expression and a build warning is raised. */
  #ifndef   __COMPILER_BARRIER
    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
    #define __COMPILER_BARRIER()      (void)0
  #endif
 /* None of the toolchain macros tested above matched: stop the build. */
 #else
  #error Unknown compiler.
 #endif
 #endif /* __CMSIS_COMPILER_H */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_cp15.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_cp15.h
@@ -1,514 +0,0 @@
 /**************************************************************************//**
 * @file     cmsis_cp15.h
 * @brief    CMSIS compiler specific macros, functions, instructions
 * @version  V1.0.1
 * @date     07. Sep 2017
 ******************************************************************************/
 /*
 * Copyright (c) 2009-2017 ARM Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #if   defined ( __ICCARM__ )
  #pragma system_include         /* treat file as system include file for MISRA check */
 #elif defined (__clang__)
  #pragma clang system_header   /* treat file as system include file */
 #endif
 #ifndef __CMSIS_CP15_H
 #define __CMSIS_CP15_H
 /** \brief  Get ACTLR
    CP15 read with op1=0, CRn=c1, CRm=c0, op2=1.
    \return               Auxiliary Control register value
 */
 __STATIC_FORCEINLINE uint32_t __get_ACTLR(void)
 {
  uint32_t actlr;
  __get_CP(15, 0, actlr, 1, 0, 1);
  return actlr;
 }
 /** \brief  Set ACTLR
    CP15 write with op1=0, CRn=c1, CRm=c0, op2=1.
    \param [in]    actlr  Auxiliary Control value to set
 */
 __STATIC_FORCEINLINE void __set_ACTLR(uint32_t actlr)
 {
  __set_CP(15, 0, actlr, 1, 0, 1);
 }
 /** \brief  Get CPACR
    CP15 read with op1=0, CRn=c1, CRm=c0, op2=2.
    \return               Coprocessor Access Control register value
 */
 __STATIC_FORCEINLINE uint32_t __get_CPACR(void)
 {
  uint32_t cpacr;
  __get_CP(15, 0, cpacr, 1, 0, 2);
  return cpacr;
 }
 /** \brief  Set CPACR
    CP15 write with op1=0, CRn=c1, CRm=c0, op2=2.
    \param [in]    cpacr  Coprocessor Access Control value to set
 */
 __STATIC_FORCEINLINE void __set_CPACR(uint32_t cpacr)
 {
  __set_CP(15, 0, cpacr, 1, 0, 2);
 }
 /** \brief  Get DFSR
    CP15 read with op1=0, CRn=c5, CRm=c0, op2=0.
    \return               Data Fault Status Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_DFSR(void)
 {
  uint32_t dfsr;
  __get_CP(15, 0, dfsr, 5, 0, 0);
  return dfsr;
 }
 /** \brief  Set DFSR
    CP15 write with op1=0, CRn=c5, CRm=c0, op2=0.
    \param [in]    dfsr  Data Fault Status value to set
 */
 __STATIC_FORCEINLINE void __set_DFSR(uint32_t dfsr)
 {
  __set_CP(15, 0, dfsr, 5, 0, 0);
 }
 /** \brief  Get IFSR
    CP15 read with op1=0, CRn=c5, CRm=c0, op2=1.
    \return               Instruction Fault Status Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_IFSR(void)
 {
  uint32_t ifsr;
  __get_CP(15, 0, ifsr, 5, 0, 1);
  return ifsr;
 }
 /** \brief  Set IFSR
    CP15 write with op1=0, CRn=c5, CRm=c0, op2=1.
    \param [in]    ifsr  Instruction Fault Status value to set
 */
 __STATIC_FORCEINLINE void __set_IFSR(uint32_t ifsr)
 {
  __set_CP(15, 0, ifsr, 5, 0, 1);
 }
 /** \brief  Get ISR
    CP15 read with op1=0, CRn=c12, CRm=c1, op2=0.
    \return               Interrupt Status Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_ISR(void)
 {
  uint32_t isr;
  __get_CP(15, 0, isr, 12, 1, 0);
  return isr;
 }
 /** \brief  Get CBAR
    CP15 read with op1=4, CRn=c15, CRm=c0, op2=0.
    \return               Configuration Base Address register value
 */
 __STATIC_FORCEINLINE uint32_t __get_CBAR(void)
 {
  uint32_t cbar;
  __get_CP(15, 4, cbar, 15, 0, 0);
  return cbar;
 }
 /** \brief  Get TTBR0
    Returns the value of the Translation Table Base Register 0
    (CP15 read with op1=0, CRn=c2, CRm=c0, op2=0).
    \return               Translation Table Base Register 0 value
 */
 __STATIC_FORCEINLINE uint32_t __get_TTBR0(void)
 {
  uint32_t ttbr0;
  __get_CP(15, 0, ttbr0, 2, 0, 0);
  return ttbr0;
 }
 /** \brief  Set TTBR0
    This function assigns the given value to the Translation Table Base Register 0
    (CP15 write with op1=0, CRn=c2, CRm=c0, op2=0).
    \param [in]    ttbr0  Translation Table Base Register 0 value to set
 */
 __STATIC_FORCEINLINE void __set_TTBR0(uint32_t ttbr0)
 {
  __set_CP(15, 0, ttbr0, 2, 0, 0);
 }
 /** \brief  Get DACR
    Returns the value of the Domain Access Control Register
    (CP15 read with op1=0, CRn=c3, CRm=c0, op2=0).
    \return               Domain Access Control Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_DACR(void)
 {
  uint32_t dacr;
  __get_CP(15, 0, dacr, 3, 0, 0);
  return dacr;
 }
 /** \brief  Set DACR
    This function assigns the given value to the Domain Access Control Register
    (CP15 write with op1=0, CRn=c3, CRm=c0, op2=0).
    \param [in]    dacr   Domain Access Control Register value to set
 */
 __STATIC_FORCEINLINE void __set_DACR(uint32_t dacr)
 {
  __set_CP(15, 0, dacr, 3, 0, 0);
 }
 /** \brief  Set SCTLR
    This function assigns the given value to the System Control Register
    (CP15 write with op1=0, CRn=c1, CRm=c0, op2=0).
    \param [in]    sctlr  System Control Register value to set
 */
 __STATIC_FORCEINLINE void __set_SCTLR(uint32_t sctlr)
 {
  __set_CP(15, 0, sctlr, 1, 0, 0);
 }
 /** \brief  Get SCTLR
    CP15 read with op1=0, CRn=c1, CRm=c0, op2=0.
    \return               System Control Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_SCTLR(void)
 {
  uint32_t sctlr;
  __get_CP(15, 0, sctlr, 1, 0, 0);
  return sctlr;
 }
 /** \brief  Set ACTRL
    CP15 write with op1=0, CRn=c1, CRm=c0, op2=1, i.e. the same encoding
    that __set_ACTLR uses.
    \param [in]    actrl  Auxiliary Control Register value to set
 */
 __STATIC_FORCEINLINE void __set_ACTRL(uint32_t actrl)
 {
  __set_CP(15, 0, actrl, 1, 0, 1);
 }
 /** \brief  Get ACTRL
    CP15 read with op1=0, CRn=c1, CRm=c0, op2=1, i.e. the same encoding
    that __get_ACTLR uses.
    \return               Auxiliary Control Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_ACTRL(void)
 {
  uint32_t actrl;
  __get_CP(15, 0, actrl, 1, 0, 1);
  return actrl;
 }
 /** \brief  Get MPIDR
    Returns the value of the Multiprocessor Affinity Register
    (CP15 read with op1=0, CRn=c0, CRm=c0, op2=5).
    \return               Multiprocessor Affinity Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_MPIDR(void)
 {
  uint32_t mpidr;
  __get_CP(15, 0, mpidr, 0, 0, 5);
  return mpidr;
 }
 /** \brief  Get VBAR
    Returns the value of the Vector Base Address Register
    (CP15 read with op1=0, CRn=c12, CRm=c0, op2=0).
    \return               Vector Base Address Register
 */
 __STATIC_FORCEINLINE uint32_t __get_VBAR(void)
 {
  uint32_t vbar;
  __get_CP(15, 0, vbar, 12, 0, 0);
  return vbar;
 }
 /** \brief  Set VBAR
    This function assigns the given value to the Vector Base Address Register
    (CP15 write with op1=0, CRn=c12, CRm=c0, op2=0).
    \param [in]    vbar  Vector Base Address Register value to set
 */
 __STATIC_FORCEINLINE void __set_VBAR(uint32_t vbar)
 {
  __set_CP(15, 0, vbar, 12, 0, 0);
 }
 /** \brief  Get MVBAR
    Returns the value of the Monitor Vector Base Address Register
    (CP15 read with op1=0, CRn=c12, CRm=c0, op2=1).
    \return               Monitor Vector Base Address Register
 */
 __STATIC_FORCEINLINE uint32_t __get_MVBAR(void)
 {
  uint32_t mvbar;
  __get_CP(15, 0, mvbar, 12, 0, 1);
  return mvbar;
 }
 /** \brief  Set MVBAR
    This function assigns the given value to the Monitor Vector Base Address Register
    (CP15 write with op1=0, CRn=c12, CRm=c0, op2=1).
    \param [in]    mvbar  Monitor Vector Base Address Register value to set
 */
 __STATIC_FORCEINLINE void __set_MVBAR(uint32_t mvbar)
 {
  __set_CP(15, 0, mvbar, 12, 0, 1);
 }
 #if (defined(__CORTEX_A) && (__CORTEX_A == 7U) && \
    defined(__TIM_PRESENT) && (__TIM_PRESENT == 1U)) || \
    defined(DOXYGEN)
 /** \brief  Set CNTFRQ
    This function assigns the given value to PL1 Physical Timer Counter Frequency Register (CNTFRQ)
    (CP15 write with op1=0, CRn=c14, CRm=c0, op2=0).
    \param [in]    value  CNTFRQ Register value to set
 */
 __STATIC_FORCEINLINE void __set_CNTFRQ(uint32_t value)
 {
  __set_CP(15, 0, value, 14, 0, 0);
 }
 /** \brief  Get CNTFRQ
    Returns the value of the PL1 Physical Timer Counter Frequency Register (CNTFRQ)
    (CP15 read with op1=0, CRn=c14, CRm=c0, op2=0).
    \return               CNTFRQ Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_CNTFRQ(void)
 {
  uint32_t cntfrq;
  __get_CP(15, 0, cntfrq, 14, 0, 0);
  return cntfrq;
 }
 /** \brief  Set CNTP_TVAL
    This function assigns the given value to PL1 Physical Timer Value Register (CNTP_TVAL)
    (CP15 write with op1=0, CRn=c14, CRm=c2, op2=0).
    \param [in]    value  CNTP_TVAL Register value to set
 */
 __STATIC_FORCEINLINE void __set_CNTP_TVAL(uint32_t value)
 {
  __set_CP(15, 0, value, 14, 2, 0);
 }
 /** \brief  Get CNTP_TVAL
    Returns the value of the PL1 Physical Timer Value Register (CNTP_TVAL)
    (CP15 read with op1=0, CRn=c14, CRm=c2, op2=0).
    \return               CNTP_TVAL Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_CNTP_TVAL(void)
 {
  uint32_t tval;
  __get_CP(15, 0, tval, 14, 2, 0);
  return tval;
 }
 /** \brief  Get CNTPCT
    Returns the value of the 64 bits PL1 Physical Count Register (CNTPCT)
    (64-bit CP15 read with op1=0, CRm=c14).
    \return               CNTPCT Register value
 */
 __STATIC_FORCEINLINE uint64_t __get_CNTPCT(void)
 {
  uint64_t count;
  __get_CP64(15, 0, count, 14);
  return count;
 }
 /** \brief  Set CNTP_CVAL
    This function assigns the given value to 64bits PL1 Physical Timer CompareValue Register (CNTP_CVAL)
    (64-bit CP15 write with op1=2, CRm=c14).
    \param [in]    value  CNTP_CVAL Register value to set
 */
 __STATIC_FORCEINLINE void __set_CNTP_CVAL(uint64_t value)
 {
  __set_CP64(15, 2, value, 14);
 }
 /** \brief  Get CNTP_CVAL
    Returns the value of the 64 bits PL1 Physical Timer CompareValue Register (CNTP_CVAL)
    (64-bit CP15 read with op1=2, CRm=c14).
    \return               CNTP_CVAL Register value
 */
 __STATIC_FORCEINLINE uint64_t __get_CNTP_CVAL(void)
 {
  uint64_t cval;
  __get_CP64(15, 2, cval, 14);
  return cval;
 }
 /** \brief  Set CNTP_CTL
    This function assigns the given value to PL1 Physical Timer Control Register (CNTP_CTL)
    (CP15 write with op1=0, CRn=c14, CRm=c2, op2=1).
    \param [in]    value  CNTP_CTL Register value to set
 */
 __STATIC_FORCEINLINE void __set_CNTP_CTL(uint32_t value)
 {
  __set_CP(15, 0, value, 14, 2, 1);
 }
 /** \brief  Get CNTP_CTL register
    CP15 read with op1=0, CRn=c14, CRm=c2, op2=1.
    \return               CNTP_CTL Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_CNTP_CTL(void)
 {
  uint32_t ctl;
  __get_CP(15, 0, ctl, 14, 2, 1);
  return ctl;
 }
 #endif
 /** \brief  Set TLBIALL
    TLB Invalidate All
    (CP15 write with op1=0, CRn=c8, CRm=c7, op2=0).
    \param [in]    value  value placed in the source register of the write
 */
 __STATIC_FORCEINLINE void __set_TLBIALL(uint32_t value)
 {
  __set_CP(15, 0, value, 8, 7, 0);
 }
 /** \brief  Set BPIALL.
    Branch Predictor Invalidate All
    (CP15 write with op1=0, CRn=c7, CRm=c5, op2=6).
    \param [in]    value  value placed in the source register of the write
 */
 __STATIC_FORCEINLINE void __set_BPIALL(uint32_t value)
 {
  __set_CP(15, 0, value, 7, 5, 6);
 }
 /** \brief  Set ICIALLU
    Instruction Cache Invalidate All
    (CP15 write with op1=0, CRn=c7, CRm=c5, op2=0).
    \param [in]    value  value placed in the source register of the write
 */
 __STATIC_FORCEINLINE void __set_ICIALLU(uint32_t value)
 {
  __set_CP(15, 0, value, 7, 5, 0);
 }
 /** \brief  Set DCCMVAC
    Data cache clean
    (CP15 write with op1=0, CRn=c7, CRm=c10, op2=1).
    \param [in]    value  value placed in the source register of the write
 */
 __STATIC_FORCEINLINE void __set_DCCMVAC(uint32_t value)
 {
  __set_CP(15, 0, value, 7, 10, 1);
 }
 /** \brief  Set DCIMVAC
    Data cache invalidate
    (CP15 write with op1=0, CRn=c7, CRm=c6, op2=1).
    \param [in]    value  value placed in the source register of the write
 */
 __STATIC_FORCEINLINE void __set_DCIMVAC(uint32_t value)
 {
  __set_CP(15, 0, value, 7, 6, 1);
 }
 /** \brief  Set DCCIMVAC
    Data cache clean and invalidate
    (CP15 write with op1=0, CRn=c7, CRm=c14, op2=1).
    \param [in]    value  value placed in the source register of the write
 */
 __STATIC_FORCEINLINE void __set_DCCIMVAC(uint32_t value)
 {
  __set_CP(15, 0, value, 7, 14, 1);
 }
 /** \brief  Set CSSELR
    CP15 write with op1=2, CRn=c0, CRm=c0, op2=0.
    \param [in]    value  CSSELR Register value to set
 */
 __STATIC_FORCEINLINE void __set_CSSELR(uint32_t value)
 {
  /* Expands to: MCR p15, 2, <value>, c0, c0, 0 */
  __set_CP(15, 2, value, 0, 0, 0);
 }
 /** \brief  Get CSSELR
    CP15 read with op1=2, CRn=c0, CRm=c0, op2=0
    (expands to: MRC p15, 2, <Rt>, c0, c0, 0).
    \return CSSELR Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_CSSELR(void)
 {
  uint32_t csselr;
  __get_CP(15, 2, csselr, 0, 0, 0);
  return csselr;
 }
 /** \brief  Set CCSIDR
    Kept only for source compatibility: the call is forwarded unchanged to
    __set_CSSELR().
    \deprecated CCSIDR itself is read-only. Use __set_CSSELR to select cache level instead.
    \param [in]    value  value forwarded to __set_CSSELR
 */
 CMSIS_DEPRECATED
 __STATIC_FORCEINLINE void __set_CCSIDR(uint32_t value)
 {
  __set_CSSELR(value);
 }
 /** \brief  Get CCSIDR
    CP15 read with op1=1, CRn=c0, CRm=c0, op2=0
    (expands to: MRC p15, 1, <Rt>, c0, c0, 0).
    \return CCSIDR Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_CCSIDR(void)
 {
  uint32_t ccsidr;
  __get_CP(15, 1, ccsidr, 0, 0, 0);
  return ccsidr;
 }
 /** \brief  Get CLIDR
    CP15 read with op1=1, CRn=c0, CRm=c0, op2=1
    (expands to: MRC p15, 1, <Rt>, c0, c0, 1).
    \return CLIDR Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_CLIDR(void)
 {
  uint32_t clidr;
  __get_CP(15, 1, clidr, 0, 0, 1);
  return clidr;
 }
 /** \brief  Set DCISW
    CP15 write with op1=0, CRn=c7, CRm=c6, op2=2.
    \param [in]    value  value placed in the source register of the write
 */
 __STATIC_FORCEINLINE void __set_DCISW(uint32_t value)
 {
  /* Expands to: MCR p15, 0, <value>, c7, c6, 2 */
  __set_CP(15, 0, value, 7, 6, 2);
 }
 /** \brief  Set DCCSW
    CP15 write with op1=0, CRn=c7, CRm=c10, op2=2.
    \param [in]    value  value placed in the source register of the write
 */
 __STATIC_FORCEINLINE void __set_DCCSW(uint32_t value)
 {
  /* Expands to: MCR p15, 0, <value>, c7, c10, 2 */
  __set_CP(15, 0, value, 7, 10, 2);
 }
 /** \brief  Set DCCISW
    CP15 write with op1=0, CRn=c7, CRm=c14, op2=2.
    \param [in]    value  value placed in the source register of the write
 */
 __STATIC_FORCEINLINE void __set_DCCISW(uint32_t value)
 {
  /* Expands to: MCR p15, 0, <value>, c7, c14, 2 */
  __set_CP(15, 0, value, 7, 14, 2);
 }
 #endif
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_gcc.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_gcc.h
@@ -1,917 +0,0 @@
 /**************************************************************************//**
 * @file     cmsis_gcc.h
 * @brief    CMSIS compiler specific macros, functions, instructions
 * @version  V1.3.2
 * @date     24. March 2022
 ******************************************************************************/
 /*
 * Copyright (c) 2009-2022 Arm Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef __CMSIS_GCC_H
 #define __CMSIS_GCC_H
 /* ignore some GCC warnings
    The previous diagnostic state is saved with `push` first, then
    -Wsign-conversion, -Wconversion and -Wunused-parameter are silenced. */
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wsign-conversion"
 #pragma GCC diagnostic ignored "-Wconversion"
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 /* Fallback for __has_builtin
    When the compiler does not provide __has_builtin, every query made through
    it evaluates to 0. */
 #ifndef __has_builtin
  #define __has_builtin(x) (0)
 #endif
 /* CMSIS compiler specific defines
    Each macro is only defined when it is not defined already, so a project
    can supply its own definition before this header is included. */
 #ifndef   __ASM
  #define __ASM                                  __asm
 #endif
 #ifndef   __INLINE
  #define __INLINE                               inline
 #endif
 #ifndef   __FORCEINLINE
  #define __FORCEINLINE                          __attribute__((always_inline))
 #endif
 #ifndef   __STATIC_INLINE
  #define __STATIC_INLINE                        static inline
 #endif
 #ifndef   __STATIC_FORCEINLINE
  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static inline
 #endif
 #ifndef   __NO_RETURN
  #define __NO_RETURN                            __attribute__((__noreturn__))
 #endif
 #ifndef   CMSIS_DEPRECATED
  #define CMSIS_DEPRECATED                       __attribute__((deprecated))
 #endif
 #ifndef   __USED
  #define __USED                                 __attribute__((used))
 #endif
 #ifndef   __WEAK
  #define __WEAK                                 __attribute__((weak))
 #endif
 /* Both packed variants also force an alignment of 1. */
 #ifndef   __PACKED
  #define __PACKED                               __attribute__((packed, aligned(1)))
 #endif
 #ifndef   __PACKED_STRUCT
  #define __PACKED_STRUCT                        struct __attribute__((packed, aligned(1)))
 #endif
 /* Unaligned access helpers.
    Each helper declares a packed struct with a single member `v` and accesses
    the target address through a pointer to that struct. -Wpacked is
    suppressed only around each struct declaration (push/pop). */
 /* Store the 16-bit value `val` at address `addr`. */
 #ifndef   __UNALIGNED_UINT16_WRITE
  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wpacked"
 /*lint -esym(9058, T_UINT16_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_WRITE */
  __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
  #pragma GCC diagnostic pop
  #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
 #endif
 /* Load a 16-bit value from address `addr`. */
 #ifndef   __UNALIGNED_UINT16_READ
  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wpacked"
 /*lint -esym(9058, T_UINT16_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_READ */
  __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
  #pragma GCC diagnostic pop
  #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
 #endif
 /* Store the 32-bit value `val` at address `addr`. */
 #ifndef   __UNALIGNED_UINT32_WRITE
  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wpacked"
 /*lint -esym(9058, T_UINT32_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_WRITE */
  __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
  #pragma GCC diagnostic pop
  #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
 #endif
 /* Load a 32-bit value from address `addr`. */
 #ifndef   __UNALIGNED_UINT32_READ
  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wpacked"
  __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
  #pragma GCC diagnostic pop
  #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
 #endif
 /* Request an alignment of x bytes for the annotated declaration. */
 #ifndef   __ALIGNED
  #define __ALIGNED(x)                           __attribute__((aligned(x)))
 #endif
 /* Empty asm statement with a "memory" clobber: emits no instruction but
    keeps the compiler from reordering memory accesses across it. */
 #ifndef   __COMPILER_BARRIER
  #define __COMPILER_BARRIER()                   __ASM volatile("":::"memory")
 #endif
 /**
  \brief   QSUB16 intrinsic
  \details Executes the `qsub16` instruction with op1 and op2 as source operands.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2)
 {
  uint32_t rd;
  __ASM volatile ("qsub16 %0, %1, %2" : "=r" (rd) : "r" (op1), "r" (op2) );
  return rd;
 }
 /**
  \brief   QSUB8 intrinsic
  \details Executes the `qsub8` instruction with op1 and op2 as source operands.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2)
 {
  uint32_t rd;
  __ASM volatile ("qsub8 %0, %1, %2" : "=r" (rd) : "r" (op1), "r" (op2) );
  return rd;
 }
 /**
  \brief   QADD16 intrinsic
  \details Executes the `qadd16` instruction with op1 and op2 as source operands.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __QADD16(uint32_t op1, uint32_t op2)
 {
  uint32_t rd;
  __ASM volatile ("qadd16 %0, %1, %2" : "=r" (rd) : "r" (op1), "r" (op2) );
  return rd;
 }
 /**
  \brief   QADD8 intrinsic
  \details Executes the `qadd8` instruction with op1 and op2 as source operands.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __QADD8(uint32_t op1, uint32_t op2)
 {
  uint32_t rd;
  __ASM volatile ("qadd8 %0, %1, %2" : "=r" (rd) : "r" (op1), "r" (op2) );
  return rd;
 }
 /**
  \brief   QADD intrinsic
  \details Executes the `qadd` instruction with op1 and op2 as source operands.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE  int32_t __QADD( int32_t op1,  int32_t op2)
 {
  int32_t rd;
  __ASM volatile ("qadd %0, %1, %2" : "=r" (rd) : "r" (op1), "r" (op2) );
  return rd;
 }
 /**
  \brief   QSAX intrinsic
  \details Executes the `qsax` instruction with op1 and op2 as source operands.
           The asm statement is not volatile.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __QSAX(uint32_t op1, uint32_t op2)
 {
  uint32_t rd;
  __ASM ("qsax %0, %1, %2" : "=r" (rd) : "r" (op1), "r" (op2) );
  return rd;
 }
 /**
  \brief   SHSAX intrinsic
  \details Executes the `shsax` instruction with op1 and op2 as source operands.
           The asm statement is not volatile.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2)
 {
  uint32_t rd;
  __ASM ("shsax %0, %1, %2" : "=r" (rd) : "r" (op1), "r" (op2) );
  return rd;
 }
 /**
  \brief   SMLALD intrinsic
  \details Executes the `smlald` instruction. The 64-bit accumulator is split
           into two 32-bit words through a union; both words are passed in and
           read back through tied operands ("0" / "1").
  \param [in]    op1  first source operand
  \param [in]    op2  second source operand
  \param [in]    acc  64-bit accumulator input
  \return             64-bit accumulator after the instruction
 */
 __STATIC_FORCEINLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc)
 {
  /* View of the accumulator as one 64-bit value or two 32-bit words. */
  union llreg_u{
    uint32_t w32[2];
    uint64_t w64;
  } llr;
  llr.w64 = acc;
  /* Which word is bound to %0 and which to %1 depends on the byte order:
     w32[0] goes to %0 on little endian, w32[1] goes to %0 on big endian. */
 #ifndef __ARMEB__   /* Little endian */
  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
 #else               /* Big endian */
  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
 #endif
  return(llr.w64);
 }
 /**
  \brief   QSUB intrinsic
  \details Executes the `qsub` instruction with op1 and op2 as source operands.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE  int32_t __QSUB( int32_t op1,  int32_t op2)
 {
  int32_t rd;
  __ASM volatile ("qsub %0, %1, %2" : "=r" (rd) : "r" (op1), "r" (op2) );
  return rd;
 }
 /**
  \brief   SXTB16 intrinsic
  \details Executes the `sxtb16` instruction with op1 as its source operand.
           The asm statement is not volatile.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t op1)
 {
  uint32_t rd;
  __ASM ("sxtb16 %0, %1" : "=r" (rd) : "r" (op1));
  return rd;
 }
 /**
  \brief   SMUAD intrinsic
  \details Executes the `smuad` instruction with op1 and op2 as source operands.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __SMUAD  (uint32_t op1, uint32_t op2)
 {
  uint32_t rd;
  __ASM volatile ("smuad %0, %1, %2" : "=r" (rd) : "r" (op1), "r" (op2) );
  return rd;
 }
 /* __PKHBT: combine the low 16 bits of ARG1 with the high 16 bits of
    (ARG2 << ARG3). Plain C expression on uint32_t; no instruction is emitted
    explicitly. */
 #define __PKHBT(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |  \
                                           ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
 /* __PKHTB: combine the high 16 bits of ARG1 with the low 16 bits of
    (ARG2 >> ARG3). The shift is applied to a uint32_t operand. */
 #define __PKHTB(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |  \
                                           ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)  )
 /**
  \brief   SMLAD intrinsic
  \details Executes the `smlad` instruction with op1, op2 and op3 as source operands.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3)
 {
  uint32_t rd;
  __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (rd) : "r" (op1), "r" (op2), "r" (op3) );
  return rd;
 }
 /**
  \brief   SMUADX intrinsic
  \details Executes the `smuadx` instruction with op1 and op2 as source operands.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2)
 {
  uint32_t rd;
  __ASM volatile ("smuadx %0, %1, %2" : "=r" (rd) : "r" (op1), "r" (op2) );
  return rd;
 }
 /**
  \brief   SMLADX intrinsic
  \details Executes the `smladx` instruction with op1, op2 and op3 as source operands.
  \return  value left in the destination register
 */
 __STATIC_FORCEINLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3)
 {
  uint32_t rd;
  __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (rd) : "r" (op1), "r" (op2), "r" (op3) );
  return rd;
 }
 /**
  \brief   SMLALDX intrinsic
  \details Executes the `smlaldx` instruction. The 64-bit accumulator is split
           into two 32-bit words through a union; both words are passed in and
           read back through tied operands ("0" / "1").
  \param [in]    op1  first source operand
  \param [in]    op2  second source operand
  \param [in]    acc  64-bit accumulator input
  \return             64-bit accumulator after the instruction
 */
 __STATIC_FORCEINLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc)
 {
  /* View of the accumulator as one 64-bit value or two 32-bit words. */
  union llreg_u{
    uint32_t w32[2];
    uint64_t w64;
  } llr;
  llr.w64 = acc;
  /* Which word is bound to %0 and which to %1 depends on the byte order:
     w32[0] goes to %0 on little endian, w32[1] goes to %0 on big endian. */
 #ifndef __ARMEB__   /* Little endian */
  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
 #else               /* Big endian */
  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
 #endif
  return(llr.w64);
 }
 /**
  \brief   SMMLA instruction wrapper
  \details Issues "smmla" with three signed 32-bit source operands.
  \param [in]    op1  First source operand
  \param [in]    op2  Second source operand
  \param [in]    op3  Third source operand
  \return             Value written by the instruction
 */
 __STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
 {
   int32_t res;
   __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (res): "r"  (op1), "r" (op2), "r" (op3) );
   return res;
 }
 /**
  \brief   SMUSD instruction wrapper
  \details Issues "smusd" on the two operands via volatile inline assembly.
  \param [in]    op1  First source operand
  \param [in]    op2  Second source operand
  \return             Value written by the instruction
 */
 __STATIC_FORCEINLINE uint32_t __SMUSD  (uint32_t op1, uint32_t op2)
 {
   uint32_t res;
   __ASM volatile ("smusd %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
   return res;
 }
 /**
  \brief   SMUSDX instruction wrapper
  \details Issues "smusdx" on the two operands via volatile inline assembly.
  \param [in]    op1  First source operand
  \param [in]    op2  Second source operand
  \return             Value written by the instruction
 */
 __STATIC_FORCEINLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2)
 {
   uint32_t res;
   __ASM volatile ("smusdx %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
   return res;
 }
 /**
  \brief   QASX instruction wrapper
  \details Issues "qasx" on the two operands. The asm is not volatile, so the compiler may
           eliminate or combine calls whose result is unused or repeated.
  \param [in]    op1  First source operand
  \param [in]    op2  Second source operand
  \return             Value written by the instruction
 */
 __STATIC_FORCEINLINE uint32_t __QASX(uint32_t op1, uint32_t op2)
 {
   uint32_t res;
   __ASM ("qasx %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
   return res;
 }
 /**
  \brief   SHADD16 instruction wrapper
  \details Issues "shadd16" on the two operands (non-volatile asm).
  \param [in]    op1  First source operand
  \param [in]    op2  Second source operand
  \return             Value written by the instruction
 */
 __STATIC_FORCEINLINE uint32_t __SHADD16(uint32_t op1, uint32_t op2)
 {
   uint32_t res;
   __ASM ("shadd16 %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
   return res;
 }
 /**
  \brief   SHSUB16 instruction wrapper
  \details Issues "shsub16" on the two operands (non-volatile asm).
  \param [in]    op1  First source operand
  \param [in]    op2  Second source operand
  \return             Value written by the instruction
 */
 __STATIC_FORCEINLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2)
 {
   uint32_t res;
   __ASM ("shsub16 %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
   return res;
 }
 /**
  \brief   SHASX instruction wrapper
  \details Issues "shasx" on the two operands (non-volatile asm).
  \param [in]    op1  First source operand
  \param [in]    op2  Second source operand
  \return             Value written by the instruction
 */
 __STATIC_FORCEINLINE uint32_t __SHASX(uint32_t op1, uint32_t op2)
 {
   uint32_t res;
   __ASM ("shasx %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
   return res;
 }
 /**
  \brief   SMLSDX instruction wrapper
  \details Issues "smlsdx" with op1, op2 and op3 as the three source registers.
  \param [in]    op1  First source operand
  \param [in]    op2  Second source operand
  \param [in]    op3  Third source operand
  \return             Value written by the instruction
 */
 __STATIC_FORCEINLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3)
 {
   uint32_t res;
   __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (res) : "r" (op1), "r" (op2), "r" (op3) );
   return res;
 }
 /* ##########################  Core Instruction Access  ######################### */
 /**
  \brief   No Operation
  \details Expands to a volatile "nop" asm statement (no clobbers).
 */
 #define __NOP()                             __ASM volatile ("nop")
 /**
  \brief   Wait For Interrupt
  \details Expands to a volatile "wfi" asm statement. The "memory" clobber keeps the
           compiler from moving memory accesses across it.
 */
 #define __WFI()                             __ASM volatile ("wfi":::"memory")
 /**
  \brief   Wait For Event
  \details Expands to a volatile "wfe" asm statement with a "memory" clobber.
 */
 #define __WFE()                             __ASM volatile ("wfe":::"memory")
 /**
  \brief   Send Event
  \details Expands to a volatile "sev" asm statement (no clobbers).
 */
 #define __SEV()                             __ASM volatile ("sev")
 /**
  \brief   Instruction Synchronization Barrier
  \details Instruction Synchronization Barrier flushes the pipeline in the processor,
           so that all instructions following the ISB are fetched from cache or memory,
           after the instruction has been completed.
           Emitted as "isb 0xF"; the "memory" clobber additionally stops the compiler
           from reordering memory accesses across the barrier.
 */
 __STATIC_FORCEINLINE  void __ISB(void)
 {
  __ASM volatile ("isb 0xF":::"memory");
 }
 /**
  \brief   Data Synchronization Barrier
  \details Acts as a special kind of Data Memory Barrier.
           It completes when all explicit memory accesses before this instruction complete.
           Emitted as "dsb 0xF"; the "memory" clobber additionally stops the compiler
           from reordering memory accesses across the barrier.
 */
 __STATIC_FORCEINLINE  void __DSB(void)
 {
  __ASM volatile ("dsb 0xF":::"memory");
 }
 /**
  \brief   Data Memory Barrier
  \details Ensures the apparent order of the explicit memory operations before
           and after the instruction, without ensuring their completion.
           Emitted as "dmb 0xF"; the "memory" clobber additionally stops the compiler
           from reordering memory accesses across the barrier.
 */
 __STATIC_FORCEINLINE  void __DMB(void)
 {
  __ASM volatile ("dmb 0xF":::"memory");
 }
 /**
  \brief   Reverse byte order (32 bit)
  \details Swaps the four bytes of a 32-bit word, e.g. 0x12345678 becomes 0x78563412.
           GCC 4.5 and newer use the byte-swap builtin; older compilers fall back to the
           "rev" instruction through inline assembly.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 __STATIC_FORCEINLINE  uint32_t __REV(uint32_t value)
 {
 #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
   return __builtin_bswap32(value);
 #else
   uint32_t swapped;
   __ASM ("rev %0, %1" : "=r" (swapped) : "r" (value) );
   return swapped;
 #endif
 }
 /**
  \brief   Reverse byte order within each halfword
  \details Swaps the two bytes inside each 16-bit half of a word, e.g. 0x12345678 becomes 0x34127856.
           Implemented with the "rev16" instruction.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 __STATIC_FORCEINLINE uint32_t __REV16(uint32_t value)
 {
   uint32_t swapped;
   __ASM ("rev16 %0, %1" : "=r" (swapped) : "r" (value));
   return swapped;
 }
 /**
  \brief   Reverse byte order (signed 16 bit)
  \details Swaps the two bytes of a 16-bit value and returns the result as a signed 16-bit
           quantity, e.g. 0x0080 becomes 0x8000. GCC 4.8 and newer use the 16-bit byte-swap
           builtin; older compilers fall back to the "revsh" instruction.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 __STATIC_FORCEINLINE  int16_t __REVSH(int16_t value)
 {
 #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
   return (int16_t)__builtin_bswap16(value);
 #else
   int16_t swapped;
   __ASM ("revsh %0, %1" : "=r" (swapped) : "r" (value) );
   return swapped;
 #endif
 }
 /**
  \brief   Rotate Right in unsigned value (32 bit)
  \details Rotates op1 right by op2 bit positions in plain C. The count is reduced modulo 32
           first, and a resulting count of zero returns op1 unchanged so that no shift by
           the full register width is ever performed.
  \param [in]    op1  Value to rotate
  \param [in]    op2  Number of Bits to rotate
  \return               Rotated value
 */
 __STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
 {
   const uint32_t count = op2 & 31U;
   return (count != 0U) ? ((op1 >> count) | (op1 << (32U - count))) : op1;
 }
 /**
  \brief   Breakpoint
  \details Expands to a volatile "bkpt" asm statement; the argument is stringified and
           appended to the mnemonic, so it must be a literal token the assembler accepts.
  \param [in]    value  is ignored by the processor.
                 If required, a debugger can use it to store additional information about the breakpoint.
 */
 #define __BKPT(value)   __ASM volatile ("bkpt "#value)
 /**
  \brief   Reverse bit order of value
  \details Mirrors the bit order of a 32-bit word using the "rbit" instruction.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
 __STATIC_FORCEINLINE  uint32_t __RBIT(uint32_t value)
 {
   uint32_t mirrored;
   __ASM ("rbit %0, %1" : "=r" (mirrored) : "r" (value) );
   return mirrored;
 }
 /**
  \brief   Count leading zeros
  \details A zero input is answered with 32 without consulting the builtin, because
           __builtin_clz(0) is formally undefined. Handling it explicitly gives
           ARM-compatible results on any target and prevents the compiler from assuming
           the argument is non-zero. Per the original note, ARM GCC 7.3 (and possibly
           earlier) folds the check away and emits a single CLZ instruction.
  \param [in]  value  Value to count the leading zeros
  \return             number of leading zeros in value
 */
 __STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
 {
   return (value != 0U) ? (uint8_t)__builtin_clz(value) : (uint8_t)32U;
 }
 /**
  \brief   LDR Exclusive (8 bit)
  \details Executes an exclusive LDR instruction ("ldrexb") for an 8 bit value.
           The instruction fills a full 32-bit register; the result is narrowed on return.
  \param [in]   addr  Pointer to data
  \return             value of type uint8_t at (*addr)
 */
 __STATIC_FORCEINLINE  uint8_t __LDREXB(volatile uint8_t *addr)
 {
   uint32_t loaded;
 #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
   __ASM volatile ("ldrexb %0, %1" : "=r" (loaded) : "Q" (*addr) );
 #else
   /* Before GCC 4.8 the "Q" constraint expands to [rx, #0], which the assembler
      rejects, so the address is passed in a register with a "memory" clobber instead. */
   __ASM volatile ("ldrexb %0, [%1]" : "=r" (loaded) : "r" (addr) : "memory" );
 #endif
   return (uint8_t)loaded;   /* explicit narrowing cast */
 }
 /**
  \brief   LDR Exclusive (16 bit)
  \details Executes an exclusive LDR instruction ("ldrexh") for a 16 bit value.
           The instruction fills a full 32-bit register; the result is narrowed on return.
  \param [in]   addr  Pointer to data
  \return             value of type uint16_t at (*addr)
 */
 __STATIC_FORCEINLINE  uint16_t __LDREXH(volatile uint16_t *addr)
 {
   uint32_t loaded;
 #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
   __ASM volatile ("ldrexh %0, %1" : "=r" (loaded) : "Q" (*addr) );
 #else
   /* Before GCC 4.8 the "Q" constraint expands to [rx, #0], which the assembler
      rejects, so the address is passed in a register with a "memory" clobber instead. */
   __ASM volatile ("ldrexh %0, [%1]" : "=r" (loaded) : "r" (addr) : "memory" );
 #endif
   return (uint16_t)loaded;   /* explicit narrowing cast */
 }
 /**
  \brief   LDR Exclusive (32 bit)
  \details Executes an exclusive LDR instruction ("ldrex") for a 32 bit value.
  \param [in]   addr  Pointer to data
  \return             value of type uint32_t at (*addr)
 */
 __STATIC_FORCEINLINE  uint32_t __LDREXW(volatile uint32_t *addr)
 {
   uint32_t loaded;
   __ASM volatile ("ldrex %0, %1" : "=r" (loaded) : "Q" (*addr) );
   return loaded;
 }
 /**
  \brief   STR Exclusive (8 bit)
  \details Executes an exclusive STR instruction ("strexb") for an 8 bit value.
           The status register is an early-clobber output ("=&r"), so it never shares a
           register with the value being stored.
  \param [in]  value  Value to store
  \param [in]   addr  Pointer to location
  \return          0  Function succeeded
  \return          1  Function failed
 */
 __STATIC_FORCEINLINE  uint32_t __STREXB(uint8_t value, volatile uint8_t *addr)
 {
   uint32_t status;
   __ASM volatile ("strexb %0, %2, %1" : "=&r" (status), "=Q" (*addr) : "r" ((uint32_t)value) );
   return status;
 }
 /**
  \brief   STR Exclusive (16 bit)
  \details Executes an exclusive STR instruction ("strexh") for a 16 bit value.
           The status register is an early-clobber output ("=&r"), so it never shares a
           register with the value being stored.
  \param [in]  value  Value to store
  \param [in]   addr  Pointer to location
  \return          0  Function succeeded
  \return          1  Function failed
 */
 __STATIC_FORCEINLINE  uint32_t __STREXH(uint16_t value, volatile uint16_t *addr)
 {
   uint32_t status;
   __ASM volatile ("strexh %0, %2, %1" : "=&r" (status), "=Q" (*addr) : "r" ((uint32_t)value) );
   return status;
 }
 /**
  \brief   STR Exclusive (32 bit)
  \details Executes an exclusive STR instruction ("strex") for a 32 bit value.
           The status register is an early-clobber output ("=&r"), so it never shares a
           register with the value being stored.
  \param [in]  value  Value to store
  \param [in]   addr  Pointer to location
  \return          0  Function succeeded
  \return          1  Function failed
 */
 __STATIC_FORCEINLINE  uint32_t __STREXW(uint32_t value, volatile uint32_t *addr)
 {
   uint32_t status;
   __ASM volatile ("strex %0, %2, %1" : "=&r" (status), "=Q" (*addr) : "r" (value) );
   return status;
 }
 /**
  \brief   Remove the exclusive lock
  \details Removes the exclusive lock which is created by LDREX.
           Emitted as "clrex" with a "memory" clobber, so the compiler does not move
           memory accesses across it.
 */
 __STATIC_FORCEINLINE  void __CLREX(void)
 {
  __ASM volatile ("clrex" ::: "memory");
 }
 /**
  \brief   Signed Saturate
  \details Saturates a signed value using the "ssat" instruction. Implemented as a GNU
           statement expression; ARG1 is evaluated once into a local int32_t.
  \param [in]  ARG1  Value to be saturated
  \param [in]  ARG2  Bit position to saturate to (1..32). Bound with the "I" constraint,
                     so it must be a compile-time constant.
  \return             Saturated value
 */
 #define __SSAT(ARG1, ARG2) \
 __extension__ \
 ({                          \
  int32_t __RES, __ARG1 = (ARG1); \
  __ASM volatile ("ssat %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) : "cc" ); \
  __RES; \
 })
 /**
  \brief   Unsigned Saturate
  \details Saturates an unsigned value using the "usat" instruction. Implemented as a GNU
           statement expression; ARG1 is evaluated once into a local uint32_t.
  \param [in]  ARG1  Value to be saturated
  \param [in]  ARG2  Bit position to saturate to (0..31). Bound with the "I" constraint,
                     so it must be a compile-time constant.
  \return             Saturated value
 */
 #define __USAT(ARG1, ARG2) \
 __extension__ \
 ({                          \
  uint32_t __RES, __ARG1 = (ARG1); \
  __ASM volatile ("usat %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) : "cc" ); \
  __RES; \
 })
 /* ###########################  Core Function Access  ########################### */
 /**
  \brief   Enable IRQ Interrupts
  \details Enables IRQ interrupts by clearing the I-bit in the CPSR.
           Can only be executed in Privileged modes.
           Emitted as "cpsie i" with a "memory" clobber (compiler barrier).
 */
 __STATIC_FORCEINLINE void __enable_irq(void)
 {
  __ASM volatile ("cpsie i" : : : "memory");
 }
 /**
  \brief   Disable IRQ Interrupts
  \details Disables IRQ interrupts by setting the I-bit in the CPSR.
           Can only be executed in Privileged modes.
           Emitted as "cpsid i" with a "memory" clobber (compiler barrier).
 */
 __STATIC_FORCEINLINE void __disable_irq(void)
 {
  __ASM volatile ("cpsid i" : : : "memory");
 }
 /**
  \brief   Enable FIQ
  \details Enables FIQ interrupts by clearing the F-bit in the CPSR.
           Can only be executed in Privileged modes.
           Emitted as "cpsie f" with a "memory" clobber (compiler barrier).
 */
 __STATIC_FORCEINLINE void __enable_fault_irq(void)
 {
  __ASM volatile ("cpsie f" : : : "memory");
 }
 /**
  \brief   Disable FIQ
  \details Disables FIQ interrupts by setting the F-bit in the CPSR.
           Can only be executed in Privileged modes.
           Emitted as "cpsid f" with a "memory" clobber (compiler barrier).
 */
 __STATIC_FORCEINLINE void __disable_fault_irq(void)
 {
  __ASM volatile ("cpsid f" : : : "memory");
 }
 /**
  \brief   Get FPSCR
  \details Returns the current value of the Floating Point Status/Control register.
           Three build-time variants exist:
           - FPU present and used, compiler offers __builtin_arm_get_fpscr: the builtin is called;
           - FPU present and used, no builtin: the register is read with "VMRS";
           - otherwise (either __FPU_PRESENT or __FPU_USED is not defined as 1): returns 0.
  \return               Floating Point Status/Control register value
 */
 __STATIC_FORCEINLINE  uint32_t __get_FPSCR(void)
 {
  #if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
       (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
  #if __has_builtin(__builtin_arm_get_fpscr) 
  // Re-enable using built-in when GCC has been fixed
  // || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)
    /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
    return __builtin_arm_get_fpscr();
  #else
    uint32_t result;
    __ASM volatile ("VMRS %0, fpscr" : "=r" (result) );
    return(result);
  #endif
  #else
    /* No usable FPU in this build: report a zero status word. */
    return(0U);
  #endif
 }
 /**
  \brief   Set FPSCR
  \details Assigns the given value to the Floating Point Status/Control register.
           Uses __builtin_arm_set_fpscr when the compiler offers it, otherwise a "VMSR"
           instruction with "vfpcc" and "memory" clobbers. When either __FPU_PRESENT or
           __FPU_USED is not defined as 1 the call is a no-op.
  \param [in]    fpscr  Floating Point Status/Control value to set
 */
 __STATIC_FORCEINLINE void __set_FPSCR(uint32_t fpscr)
 {
  #if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
       (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
  #if __has_builtin(__builtin_arm_set_fpscr)
  // Re-enable using built-in when GCC has been fixed
  // || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)
    /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
    __builtin_arm_set_fpscr(fpscr);
  #else
    __ASM volatile ("VMSR fpscr, %0" : : "r" (fpscr) : "vfpcc", "memory");
  #endif
  #else
    /* No usable FPU in this build: silence the unused-parameter warning. */
    (void)fpscr;
  #endif
 }
 /**
  \brief   Get CPSR Register
  \details Reads the Current Program Status Register with "MRS".
  \return               CPSR Register value
 */
 __STATIC_FORCEINLINE uint32_t __get_CPSR(void)
 {
   uint32_t cpsr_value;
   __ASM volatile("MRS %0, cpsr" : "=r" (cpsr_value) );
   return cpsr_value;
 }
 /** \brief  Set CPSR Register
    \details Writes the value to the CPSR with "MSR cpsr"; the asm declares the condition
             flags ("cc") and memory as clobbered.
    \param [in]    cpsr  CPSR value to set
 */
 __STATIC_FORCEINLINE void __set_CPSR(uint32_t cpsr)
 {
  __ASM volatile ("MSR cpsr, %0" : : "r" (cpsr) : "cc", "memory");
 }
 /**
  \brief   Get Mode
  \details Reads the CPSR and keeps only its five least significant bits (mask 0x1F).
  \return                Processor Mode
 */
 __STATIC_FORCEINLINE uint32_t __get_mode(void)
 {
   const uint32_t cpsr_value = __get_CPSR();
   return cpsr_value & 0x1FU;
 }
 /** \brief  Set Mode
    \details Writes the value to the control field of the CPSR ("MSR cpsr_c") with a
             "memory" clobber.
    \param [in]    mode  Mode value to set
 */
 __STATIC_FORCEINLINE void __set_mode(uint32_t mode)
 {
  __ASM volatile("MSR  cpsr_c, %0" : : "r" (mode) : "memory");
 }
 /**
  \brief   Get Stack Pointer
  \details Copies the current sp register into a general-purpose register with "MOV".
  \return  Stack Pointer value
 */
 __STATIC_FORCEINLINE uint32_t __get_SP(void)
 {
   uint32_t sp_value;
   __ASM volatile("MOV  %0, sp" : "=r" (sp_value) : : "memory");
   return sp_value;
 }
 /** \brief  Set Stack Pointer
    \details Loads the current sp register from the argument with "MOV" (memory clobber).
    \param [in]    stack  Stack Pointer value to set
 */
 __STATIC_FORCEINLINE void __set_SP(uint32_t stack)
 {
  __ASM volatile("MOV  sp, %0" : : "r" (stack) : "memory");
 }
 /** \brief  Get USR/SYS Stack Pointer
    \details Saves the current CPSR, switches to mode 0x1F with "CPS", copies sp while in
             that mode, then restores the saved CPSR and issues an ISB.
    \return USR/SYS Stack Pointer value
 */
 __STATIC_FORCEINLINE uint32_t __get_SP_usr(void)
 {
  /* Remember the caller's mode/flags so they can be restored afterwards. */
  uint32_t cpsr = __get_CPSR();
  uint32_t result;
  __ASM volatile(
    "CPS     #0x1F  \n"
    "MOV     %0, sp   " : "=r"(result) : : "memory"
   );
  /* Back to the original mode; ISB makes the mode change take effect before continuing. */
  __set_CPSR(cpsr);
  __ISB();
  return result;
 }
 /** \brief  Set USR/SYS Stack Pointer
    \details Saves the current CPSR, switches to mode 0x1F with "CPS", loads sp from the
             argument while in that mode, then restores the saved CPSR and issues an ISB.
    \param [in]    topOfProcStack  USR/SYS Stack Pointer value to set
 */
 __STATIC_FORCEINLINE void __set_SP_usr(uint32_t topOfProcStack)
 {
  /* Remember the caller's mode/flags so they can be restored afterwards. */
  uint32_t cpsr = __get_CPSR();
  __ASM volatile(
    "CPS     #0x1F  \n"
    "MOV     sp, %0   " : : "r" (topOfProcStack) : "memory"
   );
  /* Back to the original mode; ISB makes the mode change take effect before continuing. */
  __set_CPSR(cpsr);
  __ISB();
 }
 /**
  \brief   Get FPEXC
  \details Reads the Floating Point Exception Control register with "VMRS" when
           __FPU_PRESENT is 1; otherwise returns 0.
  \return               Floating Point Exception Control register value
 */
 __STATIC_FORCEINLINE uint32_t __get_FPEXC(void)
 {
 #if (__FPU_PRESENT == 1)
   uint32_t fpexc_value;
   __ASM volatile("VMRS %0, fpexc" : "=r" (fpexc_value) : : "memory");
   return fpexc_value;
 #else
   return 0;
 #endif
 }
 /** \brief  Set FPEXC
    \details Writes the Floating Point Exception Control register with "VMSR" when
             __FPU_PRESENT is 1; otherwise the function body is empty.
    \param [in]    fpexc  Floating Point Exception Control value to set
 */
 __STATIC_FORCEINLINE void __set_FPEXC(uint32_t fpexc)
 {
 #if (__FPU_PRESENT == 1)
  __ASM volatile ("VMSR fpexc, %0" : : "r" (fpexc) : "memory");
 #endif
 }
 /*
 * Include common core functions to access Coprocessor 15 registers
 *
 * The four helper macros below build the coprocessor instruction text by stringifying
 * their arguments, so cp/op1/CRn/CRm/op2 must be literal tokens:
 *   __get_CP   - "MRC":  read a 32-bit coprocessor register into Rt
 *   __set_CP   - "MCR":  write Rt to a 32-bit coprocessor register
 *   __get_CP64 - "MRRC": read a 64-bit coprocessor register into Rt (%Q0/%R0 select the two halves)
 *   __set_CP64 - "MCRR": write Rt to a 64-bit coprocessor register
 * They are defined before cmsis_cp15.h is included.
 */
 #define __get_CP(cp, op1, Rt, CRn, CRm, op2) __ASM volatile("MRC p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : "=r" (Rt) : : "memory" )
 #define __set_CP(cp, op1, Rt, CRn, CRm, op2) __ASM volatile("MCR p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : : "r" (Rt) : "memory" )
 #define __get_CP64(cp, op1, Rt, CRm)         __ASM volatile("MRRC p" # cp ", " # op1 ", %Q0, %R0, c" # CRm  : "=r" (Rt) : : "memory" )
 #define __set_CP64(cp, op1, Rt, CRm)         __ASM volatile("MCRR p" # cp ", " # op1 ", %Q0, %R0, c" # CRm  : : "r" (Rt) : "memory" )
 #include "cmsis_cp15.h"
 /** \brief  Enable Floating Point Unit
  Critical section, called from undef handler, so systick is disabled
  \details Steps performed by the assembly sequence:
           1. ORs 0x00F00000 into the register read via "MRC p15,0,R1,c1,c0,2" (CPACR) and writes it back.
           2. ISB.
           3. ORs 0x40000000 into FPEXC.
           4. Zeroes D0-D15, and D16-D31 as well when __ARM_NEON is defined as 1.
           5. ANDs FPSCR with 0x00086060.
           R1 and R2 are used as scratch registers and declared as clobbers together with "cc".
 */
 __STATIC_INLINE void __FPU_Enable(void)
 {
  __ASM volatile(
    //Permit access to VFP/NEON, registers by modifying CPACR
    "        MRC     p15,0,R1,c1,c0,2  \n"
    "        ORR     R1,R1,#0x00F00000 \n"
    "        MCR     p15,0,R1,c1,c0,2  \n"
    //Ensure that subsequent instructions occur in the context of VFP/NEON access permitted
    "        ISB                       \n"
    //Enable VFP/NEON
    "        VMRS    R1,FPEXC          \n"
    "        ORR     R1,R1,#0x40000000 \n"
    "        VMSR    FPEXC,R1          \n"
    //Initialise VFP/NEON registers to 0
    "        MOV     R2,#0             \n"
    //Initialise D16 registers to 0
    "        VMOV    D0, R2,R2         \n"
    "        VMOV    D1, R2,R2         \n"
    "        VMOV    D2, R2,R2         \n"
    "        VMOV    D3, R2,R2         \n"
    "        VMOV    D4, R2,R2         \n"
    "        VMOV    D5, R2,R2         \n"
    "        VMOV    D6, R2,R2         \n"
    "        VMOV    D7, R2,R2         \n"
    "        VMOV    D8, R2,R2         \n"
    "        VMOV    D9, R2,R2         \n"
    "        VMOV    D10,R2,R2         \n"
    "        VMOV    D11,R2,R2         \n"
    "        VMOV    D12,R2,R2         \n"
    "        VMOV    D13,R2,R2         \n"
    "        VMOV    D14,R2,R2         \n"
    "        VMOV    D15,R2,R2         \n"
 #if (defined(__ARM_NEON) && (__ARM_NEON == 1))
    //Initialise D32 registers to 0
    "        VMOV    D16,R2,R2         \n"
    "        VMOV    D17,R2,R2         \n"
    "        VMOV    D18,R2,R2         \n"
    "        VMOV    D19,R2,R2         \n"
    "        VMOV    D20,R2,R2         \n"
    "        VMOV    D21,R2,R2         \n"
    "        VMOV    D22,R2,R2         \n"
    "        VMOV    D23,R2,R2         \n"
    "        VMOV    D24,R2,R2         \n"
    "        VMOV    D25,R2,R2         \n"
    "        VMOV    D26,R2,R2         \n"
    "        VMOV    D27,R2,R2         \n"
    "        VMOV    D28,R2,R2         \n"
    "        VMOV    D29,R2,R2         \n"
    "        VMOV    D30,R2,R2         \n"
    "        VMOV    D31,R2,R2         \n"
 #endif
    //Initialise FPSCR to a known state
    "        VMRS    R1,FPSCR          \n"
    "        LDR     R2,=0x00086060    \n" //Mask off all bits that do not have to be preserved. Non-preserved bits can/should be zero.
    "        AND     R1,R1,R2          \n"
    "        VMSR    FPSCR,R1            "
    : : : "cc", "r1", "r2"
  );
 }
 #pragma GCC diagnostic pop
 #endif /* __CMSIS_GCC_H */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_iccarm.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/cmsis_iccarm.h
@@ -1,573 +0,0 @@
 /**************************************************************************//**
 * @file     cmsis_iccarm.h
 * @brief    CMSIS compiler ICCARM (IAR Compiler for Arm) header file
 * @version  V5.0.7
 * @date     15. May 2019
 ******************************************************************************/
 //------------------------------------------------------------------------------
 //
 // Copyright (c) 2017-2018 IAR Systems
 // Copyright (c) 2018-2019 Arm Limited 
 //
 // SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License")
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 //------------------------------------------------------------------------------
 #ifndef __CMSIS_ICCARM_H__
 #define __CMSIS_ICCARM_H__
 #ifndef __ICCARM__
  #error This file should only be compiled by ICCARM
 #endif
 #pragma system_include
 /* Prefix for the inline helper functions in this header: forced inlining plus __intrinsic.
    Undefined again at the end of the file. */
 #define __IAR_FT _Pragma("inline=forced") __intrinsic
 /* __ICCARM_V8 is 1 when __VER__ reports 8000000 or later, else 0. */
 #if (__VER__ >= 8000000)
  #define __ICCARM_V8 1
 #else
  #define __ICCARM_V8 0
 #endif
 #pragma language=extended
 /* __ALIGNED(x): alignment attribute. Available for __VER__ >= 7080000; on older
    compilers a warning is issued and the macro expands to nothing. */
 #ifndef __ALIGNED
  #if __ICCARM_V8
    #define __ALIGNED(x) __attribute__((aligned(x)))
  #elif (__VER__ >= 7080000)
    /* Needs IAR language extensions */
    #define __ALIGNED(x) __attribute__((aligned(x)))
  #else
    #warning No compiler specific solution for __ALIGNED.__ALIGNED is ignored.
    #define __ALIGNED(x)
  #endif
 #endif
 /* Define compiler macros for CPU architecture, used in CMSIS 5.
 */
 #if __ARM_ARCH_7A__
 /* Macro already defined */
 #else
  #if defined(__ARM7A__)
    #define __ARM_ARCH_7A__ 1
  #endif
 #endif
 #ifndef __ASM
  #define __ASM __asm
 #endif
 #ifndef   __COMPILER_BARRIER
  #define __COMPILER_BARRIER() __ASM volatile("":::"memory")
 #endif
 #ifndef __INLINE
  #define __INLINE inline
 #endif
 #ifndef   __NO_RETURN
  #if __ICCARM_V8
    #define __NO_RETURN __attribute__((__noreturn__))
  #else
    #define __NO_RETURN _Pragma("object_attribute=__noreturn")
  #endif
 #endif
 /* Packing helpers: on V8 they use __attribute__((packed, aligned(1))), on earlier
    versions the IAR __packed keyword. */
 #ifndef   __PACKED
  /* Needs IAR language extensions */
  #if __ICCARM_V8
    #define __PACKED __attribute__((packed, aligned(1)))
  #else
    #define __PACKED __packed
  #endif
 #endif
 #ifndef   __PACKED_STRUCT
  /* Needs IAR language extensions */
  #if __ICCARM_V8
    #define __PACKED_STRUCT struct __attribute__((packed, aligned(1)))
  #else
    #define __PACKED_STRUCT __packed struct
  #endif
 #endif
 #ifndef   __PACKED_UNION
  /* Needs IAR language extensions */
  #if __ICCARM_V8
    #define __PACKED_UNION union __attribute__((packed, aligned(1)))
  #else
    #define __PACKED_UNION __packed union
  #endif
 #endif
 /* restrict qualifier spelling differs between V8 and earlier versions. */
 #ifndef   __RESTRICT
  #if __ICCARM_V8
    #define __RESTRICT            __restrict
  #else
    /* Needs IAR language extensions */
    #define __RESTRICT            restrict
  #endif
 #endif
 /* Inline function qualifiers; __STATIC_FORCEINLINE combines the forced-inline pragma
    with "static inline". */
 #ifndef   __STATIC_INLINE
  #define __STATIC_INLINE       static inline
 #endif
 #ifndef   __FORCEINLINE
  #define __FORCEINLINE         _Pragma("inline=forced")
 #endif
 #ifndef   __STATIC_FORCEINLINE
  #define __STATIC_FORCEINLINE  __FORCEINLINE __STATIC_INLINE
 #endif
 #ifndef   CMSIS_DEPRECATED
  #define CMSIS_DEPRECATED      __attribute__((deprecated))
 #endif
 /* __UNALIGNED_UINT16_READ(PTR): loads a uint16_t through a __packed-qualified pointer.
    The helper is compiled with IAR language extensions enabled; the previous language
    setting is saved and restored around it. */
 #ifndef __UNALIGNED_UINT16_READ
  #pragma language=save
  #pragma language=extended
  __IAR_FT uint16_t __iar_uint16_read(void const *ptr)
  {
    return *(__packed uint16_t*)(ptr);
  }
  #pragma language=restore
  #define __UNALIGNED_UINT16_READ(PTR) __iar_uint16_read(PTR)
 #endif
 /* __UNALIGNED_UINT16_WRITE(PTR,VAL): stores a uint16_t through a __packed-qualified pointer.
    NOTE(review): ptr is declared "void const *" although the function writes through it
    (the cast removes the qualifier); the trailing ";;" leaves a harmless empty statement. */
 #ifndef __UNALIGNED_UINT16_WRITE
  #pragma language=save
  #pragma language=extended
  __IAR_FT void __iar_uint16_write(void const *ptr, uint16_t val)
  {
    *(__packed uint16_t*)(ptr) = val;;
  }
  #pragma language=restore
  #define __UNALIGNED_UINT16_WRITE(PTR,VAL) __iar_uint16_write(PTR,VAL)
 #endif
 /* __UNALIGNED_UINT32_READ(PTR): loads a uint32_t through a __packed-qualified pointer.
    The helper is compiled with IAR language extensions enabled; the previous language
    setting is saved and restored around it. */
 #ifndef __UNALIGNED_UINT32_READ
  #pragma language=save
  #pragma language=extended
  __IAR_FT uint32_t __iar_uint32_read(void const *ptr)
  {
    return *(__packed uint32_t*)(ptr);
  }
  #pragma language=restore
  #define __UNALIGNED_UINT32_READ(PTR) __iar_uint32_read(PTR)
 #endif
 /* __UNALIGNED_UINT32_WRITE(PTR,VAL): stores a uint32_t through a __packed-qualified pointer.
    NOTE(review): ptr is declared "void const *" although the function writes through it
    (the cast removes the qualifier); the trailing ";;" leaves a harmless empty statement. */
 #ifndef __UNALIGNED_UINT32_WRITE
  #pragma language=save
  #pragma language=extended
  __IAR_FT void __iar_uint32_write(void const *ptr, uint32_t val)
  {
    *(__packed uint32_t*)(ptr) = val;;
  }
  #pragma language=restore
  #define __UNALIGNED_UINT32_WRITE(PTR,VAL) __iar_uint32_write(PTR,VAL)
 #endif
 #if 0
 #ifndef __UNALIGNED_UINT32   /* deprecated */
  #pragma language=save
  #pragma language=extended
  __packed struct  __iar_u32 { uint32_t v; };
  #pragma language=restore
  #define __UNALIGNED_UINT32(PTR) (((struct __iar_u32 *)(PTR))->v)
 #endif
 #endif
 /* __USED / __WEAK: attribute form on V8, pragma form (__root / __weak) on earlier versions. */
 #ifndef   __USED
  #if __ICCARM_V8
    #define __USED __attribute__((used))
  #else
    #define __USED _Pragma("__root")
  #endif
 #endif
 #ifndef   __WEAK
  #if __ICCARM_V8
    #define __WEAK __attribute__((weak))
  #else
    #define __WEAK _Pragma("__weak")
  #endif
 #endif
 /* Default to 0 when the toolchain does not announce an intrinsics version; only the
    value 2 selects the iccarm_builtin.h based definitions below. */
 #ifndef __ICCARM_INTRINSICS_VERSION__
  #define __ICCARM_INTRINSICS_VERSION__  0
 #endif
 /* Intrinsics version 2: drop any existing definitions of the names that are re-defined
    further down in this branch, then pull in the builtin declarations. */
 #if __ICCARM_INTRINSICS_VERSION__ == 2
  #if defined(__CLZ)
    #undef __CLZ
  #endif
  #if defined(__REVSH)
    #undef __REVSH
  #endif
  #if defined(__RBIT)
    #undef __RBIT
  #endif
  #if defined(__SSAT)
    #undef __SSAT
  #endif
  #if defined(__USAT)
    #undef __USAT
  #endif
  #include "iccarm_builtin.h"
  /* Interrupt enable/disable map directly onto the IAR builtins. */
  #define __enable_irq        __iar_builtin_enable_interrupt
  #define __disable_irq       __iar_builtin_disable_interrupt
  #define __enable_fault_irq    __iar_builtin_enable_fiq
  #define __disable_fault_irq   __iar_builtin_disable_fiq
  /* Generic "read/write system register by name" builtins used by the accessors below. */
  #define __arm_rsr           __iar_builtin_rsr
  #define __arm_wsr           __iar_builtin_wsr
  /* __get_FPSCR() yields 0 when __FPU_PRESENT is zero/undefined.
     NOTE(review): __set_FPSCR is defined unconditionally, unlike __get_FPSCR. */
  #if __FPU_PRESENT
    #define __get_FPSCR()             (__arm_rsr("FPSCR"))
  #else
    #define __get_FPSCR()             ( 0 )
  #endif
  #define __set_FPSCR(VALUE)          (__arm_wsr("FPSCR", VALUE))
  #define __get_CPSR()                (__arm_rsr("CPSR"))
  /* Mode = low five bits of the CPSR. */
  #define __get_mode()                (__get_CPSR() & 0x1FU)
  #define __set_CPSR(VALUE)           (__arm_wsr("CPSR", (VALUE)))
  #define __set_mode(VALUE)           (__arm_wsr("CPSR_c", (VALUE)))
  #define __get_FPEXC()       (__arm_rsr("FPEXC"))
  #define __set_FPEXC(VALUE)    (__arm_wsr("FPEXC", VALUE))
  /* 32-bit coprocessor accessors build a "pN:op1:cN:cM:op2" register name for rsr/wsr;
     the 64-bit variants use MRRC/MCRR inline assembly. Arguments are stringified, so
     they must be literal tokens. */
  #define __get_CP(cp, op1, RT, CRn, CRm, op2) \
    ((RT) = __arm_rsr("p" # cp ":" # op1 ":c" # CRn ":c" # CRm ":" # op2))
  #define __set_CP(cp, op1, RT, CRn, CRm, op2) \
    (__arm_wsr("p" # cp ":" # op1 ":c" # CRn ":c" # CRm ":" # op2, (RT)))
  #define __get_CP64(cp, op1, Rt, CRm) \
    __ASM volatile("MRRC p" # cp ", " # op1 ", %Q0, %R0, c" # CRm  : "=r" (Rt) : : "memory" )
  #define __set_CP64(cp, op1, Rt, CRm) \
    __ASM volatile("MCRR p" # cp ", " # op1 ", %Q0, %R0, c" # CRm  : : "r" (Rt) : "memory" )
  #include "cmsis_cp15.h"
  /* Core instruction intrinsics: each CMSIS name is an alias for the IAR builtin of the
     same purpose. */
  #define __NOP     __iar_builtin_no_operation
  #define __CLZ     __iar_builtin_CLZ
  #define __CLREX   __iar_builtin_CLREX
  #define __DMB     __iar_builtin_DMB
  #define __DSB     __iar_builtin_DSB
  #define __ISB     __iar_builtin_ISB
  #define __LDREXB  __iar_builtin_LDREXB
  #define __LDREXH  __iar_builtin_LDREXH
  #define __LDREXW  __iar_builtin_LDREX
  #define __RBIT    __iar_builtin_RBIT
  #define __REV     __iar_builtin_REV
  #define __REV16   __iar_builtin_REV16
  /* __REVSH is a real function rather than an alias so that the builtin's result is
     explicitly converted to int16_t. */
  __IAR_FT int16_t __REVSH(int16_t val)
  {
    return (int16_t) __iar_builtin_REVSH(val);
  }
  #define __ROR     __iar_builtin_ROR
  #define __RRX     __iar_builtin_RRX
  #define __SEV     __iar_builtin_SEV
  #define __SSAT    __iar_builtin_SSAT
  #define __STREXB  __iar_builtin_STREXB
  #define __STREXH  __iar_builtin_STREXH
  #define __STREXW  __iar_builtin_STREX
  #define __USAT    __iar_builtin_USAT
  #define __WFE     __iar_builtin_WFE
  #define __WFI     __iar_builtin_WFI
  /* SIMD/DSP intrinsics: one-to-one aliases for the identically named IAR builtins. */
  #define __SADD8   __iar_builtin_SADD8
  #define __QADD8   __iar_builtin_QADD8
  #define __SHADD8  __iar_builtin_SHADD8
  #define __UADD8   __iar_builtin_UADD8
  #define __UQADD8  __iar_builtin_UQADD8
  #define __UHADD8  __iar_builtin_UHADD8
  #define __SSUB8   __iar_builtin_SSUB8
  #define __QSUB8   __iar_builtin_QSUB8
  #define __SHSUB8  __iar_builtin_SHSUB8
  #define __USUB8   __iar_builtin_USUB8
  #define __UQSUB8  __iar_builtin_UQSUB8
  #define __UHSUB8  __iar_builtin_UHSUB8
  #define __SADD16  __iar_builtin_SADD16
  #define __QADD16  __iar_builtin_QADD16
  #define __SHADD16 __iar_builtin_SHADD16
  #define __UADD16  __iar_builtin_UADD16
  #define __UQADD16 __iar_builtin_UQADD16
  #define __UHADD16 __iar_builtin_UHADD16
  #define __SSUB16  __iar_builtin_SSUB16
  #define __QSUB16  __iar_builtin_QSUB16
  #define __SHSUB16 __iar_builtin_SHSUB16
  #define __USUB16  __iar_builtin_USUB16
  #define __UQSUB16 __iar_builtin_UQSUB16
  #define __UHSUB16 __iar_builtin_UHSUB16
  #define __SASX    __iar_builtin_SASX
  #define __QASX    __iar_builtin_QASX
  #define __SHASX   __iar_builtin_SHASX
  #define __UASX    __iar_builtin_UASX
  #define __UQASX   __iar_builtin_UQASX
  #define __UHASX   __iar_builtin_UHASX
  #define __SSAX    __iar_builtin_SSAX
  #define __QSAX    __iar_builtin_QSAX
  #define __SHSAX   __iar_builtin_SHSAX
  #define __USAX    __iar_builtin_USAX
  #define __UQSAX   __iar_builtin_UQSAX
  #define __UHSAX   __iar_builtin_UHSAX
  #define __USAD8   __iar_builtin_USAD8
  #define __USADA8  __iar_builtin_USADA8
  #define __SSAT16  __iar_builtin_SSAT16
  #define __USAT16  __iar_builtin_USAT16
  #define __UXTB16  __iar_builtin_UXTB16
  #define __UXTAB16 __iar_builtin_UXTAB16
  #define __SXTB16  __iar_builtin_SXTB16
  #define __SXTAB16 __iar_builtin_SXTAB16
  #define __SMUAD   __iar_builtin_SMUAD
  #define __SMUADX  __iar_builtin_SMUADX
  #define __SMMLA   __iar_builtin_SMMLA
  #define __SMLAD   __iar_builtin_SMLAD
  #define __SMLADX  __iar_builtin_SMLADX
  #define __SMLALD  __iar_builtin_SMLALD
  #define __SMLALDX __iar_builtin_SMLALDX
  #define __SMUSD   __iar_builtin_SMUSD
  #define __SMUSDX  __iar_builtin_SMUSDX
  #define __SMLSD   __iar_builtin_SMLSD
  #define __SMLSDX  __iar_builtin_SMLSDX
  #define __SMLSLD  __iar_builtin_SMLSLD
  #define __SMLSLDX __iar_builtin_SMLSLDX
  #define __SEL     __iar_builtin_SEL
  #define __QADD    __iar_builtin_QADD
  #define __QSUB    __iar_builtin_QSUB
  #define __PKHBT   __iar_builtin_PKHBT
  #define __PKHTB   __iar_builtin_PKHTB
 #else /* __ICCARM_INTRINSICS_VERSION__ == 2 */
  /* Without an FPU, __get_FPSCR is temporarily renamed while <intrinsics.h> is included
     and afterwards replaced by a macro that yields 0. */
  #if !__FPU_PRESENT
  #define __get_FPSCR __cmsis_iar_get_FPSR_not_active
  #endif
  /* This header must be seen before <intrinsics.h>; a prior inclusion is a hard error. */
  #ifdef __INTRINSICS_INCLUDED
  #error intrinsics.h is already included previously!
  #endif
  #include <intrinsics.h>
  #if !__FPU_PRESENT
  #define __get_FPSCR() (0)
  #endif
  /* Diagnostics Pe940 and Pe177 are suppressed here and reset to their defaults at the
     end of this header. */
  #pragma diag_suppress=Pe940
  #pragma diag_suppress=Pe177
  /* Map the CMSIS names onto the intrinsics.h names. */
  #define __enable_irq        __enable_interrupt
  #define __disable_irq       __disable_interrupt
  #define __enable_fault_irq    __enable_fiq
  #define __disable_fault_irq   __disable_fiq
  #define __NOP               __no_operation
  #define __get_xPSR          __get_PSR
  /** \brief  Set Mode
      \details Writes the value to the control field of the CPSR ("MSR cpsr_c") with a
               "memory" clobber.
      \param [in]    mode  Mode value to set
  */
  __IAR_FT void __set_mode(uint32_t mode)
  {
    __ASM volatile("MSR  cpsr_c, %0" : : "r" (mode) : "memory");
  }
  /** \brief  LDR Exclusive (32 bit)
      \details Forwards to the __LDREX intrinsic; the pointer is cast to "unsigned long *"
               to match the type used in that call (the cast also drops the volatile qualifier).
      \param [in]    ptr  Pointer to data
      \return             Value returned by __LDREX
  */
  __IAR_FT uint32_t __LDREXW(uint32_t volatile *ptr)
  {
    return __LDREX((unsigned long *)ptr);
  }
  /** \brief  STR Exclusive (32 bit)
      \details Forwards to the __STREX intrinsic; the pointer is cast to "unsigned long *"
               to match the type used in that call (the cast also drops the volatile qualifier).
      \param [in]  value  Value to store
      \param [in]    ptr  Pointer to location
      \return             Status returned by __STREX
  */
  __IAR_FT uint32_t __STREXW(uint32_t value, uint32_t volatile *ptr)
  {
    return __STREX(value, (unsigned long *)ptr);
  }
  /** \brief  Rotate Right with Extend (32 bit)
      \details Issues the "RRX" instruction on the operand; the asm declares the condition
               flags ("cc") as clobbered.
      \param [in]    value  Value to rotate
      \return               Value written by the instruction
  */
  __IAR_FT uint32_t __RRX(uint32_t value)
  {
    uint32_t rotated;
    __ASM("RRX      %0, %1" : "=r"(rotated) : "r" (value) : "cc");
    return rotated;
  }
  /** \brief  Rotate Right in unsigned value (32 bit)
      \details Rotates op1 right by op2 bit positions in plain C. The count is reduced
               modulo 32 first and a resulting count of zero returns op1 unchanged, so the
               expression never shifts by 32 or more bits (which is undefined in C and
               produced wrong results for counts above 32).
      \param [in]    op1  Value to rotate
      \param [in]    op2  Number of bits to rotate
      \return             Rotated value
  */
  __IAR_FT uint32_t __ROR(uint32_t op1, uint32_t op2)
  {
    op2 %= 32U;
    if (op2 == 0U)
    {
      return op1;
    }
    return (op1 >> op2) | (op1 << (32U - op2));
  }
  /** \brief  Get FPEXC
      \details Reads the Floating Point Exception Control register with "VMRS" when
               __FPU_PRESENT is 1; otherwise returns 0.
      \return               Floating Point Exception Control register value
  */
  __IAR_FT uint32_t __get_FPEXC(void)
  {
  #if (__FPU_PRESENT == 1)
    uint32_t fpexc_value;
    __ASM volatile("VMRS %0, fpexc" : "=r" (fpexc_value) : : "memory");
    return fpexc_value;
  #else
    return 0;
  #endif
  }
  /** \brief  Set FPEXC
      \details Writes the Floating Point Exception Control register with "VMSR" when
               __FPU_PRESENT is 1; otherwise the function body is empty.
      \param [in]    fpexc  Floating Point Exception Control value to set
  */
  __IAR_FT void __set_FPEXC(uint32_t fpexc)
  {
  #if (__FPU_PRESENT == 1)
    __ASM volatile ("VMSR fpexc, %0" : : "r" (fpexc) : "memory");
  #endif
  }
  /* Coprocessor access helpers used by cmsis_cp15.h. The instruction text is built by
     stringifying the arguments, so cp/op1/CRn/CRm/op2 must be literal tokens:
       __get_CP   - "MRC":  read a 32-bit coprocessor register into Rt
       __set_CP   - "MCR":  write Rt to a 32-bit coprocessor register
       __get_CP64 - "MRRC": read a 64-bit coprocessor register into Rt
       __set_CP64 - "MCRR": write Rt to a 64-bit coprocessor register */
  #define __get_CP(cp, op1, Rt, CRn, CRm, op2) \
    __ASM volatile("MRC p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : "=r" (Rt) : : "memory" )
  #define __set_CP(cp, op1, Rt, CRn, CRm, op2) \
    __ASM volatile("MCR p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : : "r" (Rt) : "memory" )
  #define __get_CP64(cp, op1, Rt, CRm) \
    __ASM volatile("MRRC p" # cp ", " # op1 ", %Q0, %R0, c" # CRm  : "=r" (Rt) : : "memory" )
  #define __set_CP64(cp, op1, Rt, CRm) \
    __ASM volatile("MCRR p" # cp ", " # op1 ", %Q0, %R0, c" # CRm  : : "r" (Rt) : "memory" )
  #include "cmsis_cp15.h"
 #endif   /* __ICCARM_INTRINSICS_VERSION__ == 2 */
 /* Breakpoint: emits "BKPT" with value passed as an immediate ("i" constraint), so value
    must be a compile-time constant. */
 #define __BKPT(value)    __asm volatile ("BKPT     %0" : : "i"(value))
 /** \brief  Get USR/SYS Stack Pointer
     \details Saves the CPSR, switches to mode 0x1F with "CPS", copies sp while in that
              mode, restores the control field of the CPSR and issues an ISB, all inside
              one asm statement.
              The saved CPSR lives in output operand %0, which is also the operand used to
              restore it. Both outputs are marked early-clobber ("=&r") because they are
              written before the sequence has finished. No uninitialised C variable is
              passed into the asm.
     \return USR/SYS Stack Pointer value
 */
 __IAR_FT uint32_t __get_SP_usr(void)
 {
   uint32_t cpsr;
   uint32_t result;
   __ASM volatile(
     "MRS     %0, cpsr   \n"
     "CPS     #0x1F      \n" // no effect in USR mode
     "MOV     %1, sp     \n"
     "MSR     cpsr_c, %0 \n" // no effect in USR mode
     "ISB" :  "=&r"(cpsr), "=&r"(result) : : "memory"
    );
   return result;
 }
 /** \brief  Set USR/SYS Stack Pointer
     \details Saves the CPSR, switches to mode 0x1F with "CPS", loads sp from the argument
              while in that mode, restores the control field of the CPSR and issues an ISB,
              all inside one asm statement.
              The saved CPSR lives in output operand %0, which is also the operand used to
              restore it. It is marked early-clobber ("=&r") so that it can never be
              allocated to the register holding topOfProcStack, which is still needed after
              %0 has been written. No uninitialised C variable is passed into the asm.
     \param [in]    topOfProcStack  USR/SYS Stack Pointer value to set
 */
 __IAR_FT void __set_SP_usr(uint32_t topOfProcStack)
 {
   uint32_t cpsr;
   __ASM volatile(
     "MRS     %0, cpsr   \n"
     "CPS     #0x1F      \n" // no effect in USR mode
     "MOV     sp, %1     \n"
     "MSR     cpsr_c, %0 \n" // no effect in USR mode
     "ISB" : "=&r"(cpsr) : "r" (topOfProcStack) : "memory"
    );
 }
 /* Processor mode = low five bits of the CPSR. */
 #define __get_mode()                (__get_CPSR() & 0x1FU)
 /** \brief  Enable Floating Point Unit
     \details Steps performed by the assembly sequence:
              1. ORs 0x00F00000 into the register read via "MRC p15,0,R1,c1,c0,2" (CPACR) and writes it back.
              2. ISB.
              3. ORs 0x40000000 into FPEXC.
              4. Zeroes D0-D15, and D16-D31 as well when __ARM_ADVANCED_SIMD__ is defined.
              5. ANDs FPSCR with 0x00086060.
              R1 and R2 are used as scratch registers and declared as clobbers together with "cc".
 */
 __STATIC_INLINE
 void __FPU_Enable(void)
 {
  __ASM volatile(
    //Permit access to VFP/NEON, registers by modifying CPACR
    "        MRC     p15,0,R1,c1,c0,2  \n"
    "        ORR     R1,R1,#0x00F00000 \n"
    "        MCR     p15,0,R1,c1,c0,2  \n"
    //Ensure that subsequent instructions occur in the context of VFP/NEON access permitted
    "        ISB                       \n"
    //Enable VFP/NEON
    "        VMRS    R1,FPEXC          \n"
    "        ORR     R1,R1,#0x40000000 \n"
    "        VMSR    FPEXC,R1          \n"
    //Initialise VFP/NEON registers to 0
    "        MOV     R2,#0             \n"
    //Initialise D16 registers to 0
    "        VMOV    D0, R2,R2         \n"
    "        VMOV    D1, R2,R2         \n"
    "        VMOV    D2, R2,R2         \n"
    "        VMOV    D3, R2,R2         \n"
    "        VMOV    D4, R2,R2         \n"
    "        VMOV    D5, R2,R2         \n"
    "        VMOV    D6, R2,R2         \n"
    "        VMOV    D7, R2,R2         \n"
    "        VMOV    D8, R2,R2         \n"
    "        VMOV    D9, R2,R2         \n"
    "        VMOV    D10,R2,R2         \n"
    "        VMOV    D11,R2,R2         \n"
    "        VMOV    D12,R2,R2         \n"
    "        VMOV    D13,R2,R2         \n"
    "        VMOV    D14,R2,R2         \n"
    "        VMOV    D15,R2,R2         \n"
 #ifdef __ARM_ADVANCED_SIMD__
    //Initialise D32 registers to 0
    "        VMOV    D16,R2,R2         \n"
    "        VMOV    D17,R2,R2         \n"
    "        VMOV    D18,R2,R2         \n"
    "        VMOV    D19,R2,R2         \n"
    "        VMOV    D20,R2,R2         \n"
    "        VMOV    D21,R2,R2         \n"
    "        VMOV    D22,R2,R2         \n"
    "        VMOV    D23,R2,R2         \n"
    "        VMOV    D24,R2,R2         \n"
    "        VMOV    D25,R2,R2         \n"
    "        VMOV    D26,R2,R2         \n"
    "        VMOV    D27,R2,R2         \n"
    "        VMOV    D28,R2,R2         \n"
    "        VMOV    D29,R2,R2         \n"
    "        VMOV    D30,R2,R2         \n"
    "        VMOV    D31,R2,R2         \n"
 #endif
    //Initialise FPSCR to a known state
    "        VMRS    R1,FPSCR          \n"
    "        MOV32   R2,#0x00086060    \n" //Mask off all bits that do not have to be preserved. Non-preserved bits can/should be zero.
    "        AND     R1,R1,R2          \n"
    "        VMSR    FPSCR,R1          \n"
    : : : "cc", "r1", "r2"
  );
 }
 #undef __IAR_FT
 #undef __ICCARM_V8
 #pragma diag_default=Pe940
 #pragma diag_default=Pe177
 #endif /* __CMSIS_ICCARM_H__ */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/core_ca.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/core_ca.h
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/irq_ctrl.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Include/irq_ctrl.h
@@ -1,192 +0,0 @@
 /**************************************************************************//**
 * @file     irq_ctrl.h
 * @brief    Interrupt Controller API header file
 * @version  V1.1.0
 * @date     03. March 2020
 ******************************************************************************/
 /*
 * Copyright (c) 2017-2020 ARM Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #if   defined ( __ICCARM__ )
  #pragma system_include         /* treat file as system include file for MISRA check */
 #elif defined (__clang__)
  #pragma clang system_header   /* treat file as system include file */
 #endif
 #ifndef IRQ_CTRL_H_
 #define IRQ_CTRL_H_
 #include <stdint.h>
 #ifndef IRQHANDLER_T
 #define IRQHANDLER_T
 /// Interrupt handler data type: pointer to a function taking no arguments and returning nothing
 typedef void (*IRQHandler_t) (void);
 #endif
 #ifndef IRQN_ID_T
 #define IRQN_ID_T
 /// Interrupt ID number data type (signed 32-bit integer)
 typedef int32_t IRQn_ID_t;
 #endif
 /* Interrupt mode bit-masks */
 /* Layout of the mode word, as given by the *_Pos / *_Msk pairs below:
  *   bits  2..0  trigger          (IRQ_MODE_TRIG_*)
  *   bit   3     type             (IRQ_MODE_TYPE_*)
  *   bit   4     domain           (IRQ_MODE_DOMAIN_*)
  *   bits 12..5  CPU targets, one bit per CPU; all bits clear means "all CPUs" (IRQ_MODE_CPU_*)
  *   bit  13     handling model   (IRQ_MODE_MODEL_*)
  *   bit  31     error indication (IRQ_MODE_ERROR)
  * The trigger field starts at bit 0, so its values are written without the (commented-out) shift.
  */
 #define IRQ_MODE_TRIG_Pos           (0U)
 #define IRQ_MODE_TRIG_Msk           (0x07UL /*<< IRQ_MODE_TRIG_Pos*/)
 #define IRQ_MODE_TRIG_LEVEL         (0x00UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: level triggered interrupt
 #define IRQ_MODE_TRIG_LEVEL_LOW     (0x01UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: low level triggered interrupt
 #define IRQ_MODE_TRIG_LEVEL_HIGH    (0x02UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: high level triggered interrupt
 #define IRQ_MODE_TRIG_EDGE          (0x04UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: edge triggered interrupt
 #define IRQ_MODE_TRIG_EDGE_RISING   (0x05UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: rising edge triggered interrupt
 #define IRQ_MODE_TRIG_EDGE_FALLING  (0x06UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: falling edge triggered interrupt
 #define IRQ_MODE_TRIG_EDGE_BOTH     (0x07UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: rising and falling edge triggered interrupt
 #define IRQ_MODE_TYPE_Pos           (3U)
 #define IRQ_MODE_TYPE_Msk           (0x01UL << IRQ_MODE_TYPE_Pos)
 #define IRQ_MODE_TYPE_IRQ           (0x00UL << IRQ_MODE_TYPE_Pos)     ///< Type: interrupt source triggers CPU IRQ line
 #define IRQ_MODE_TYPE_FIQ           (0x01UL << IRQ_MODE_TYPE_Pos)     ///< Type: interrupt source triggers CPU FIQ line
 #define IRQ_MODE_DOMAIN_Pos         (4U)
 #define IRQ_MODE_DOMAIN_Msk         (0x01UL << IRQ_MODE_DOMAIN_Pos)
 #define IRQ_MODE_DOMAIN_NONSECURE   (0x00UL << IRQ_MODE_DOMAIN_Pos)   ///< Domain: interrupt is targeting non-secure domain
 #define IRQ_MODE_DOMAIN_SECURE      (0x01UL << IRQ_MODE_DOMAIN_Pos)   ///< Domain: interrupt is targeting secure domain
 #define IRQ_MODE_CPU_Pos            (5U)
 #define IRQ_MODE_CPU_Msk            (0xFFUL << IRQ_MODE_CPU_Pos)
 #define IRQ_MODE_CPU_ALL            (0x00UL << IRQ_MODE_CPU_Pos)      ///< CPU: interrupt targets all CPUs
 #define IRQ_MODE_CPU_0              (0x01UL << IRQ_MODE_CPU_Pos)      ///< CPU: interrupt targets CPU 0
 #define IRQ_MODE_CPU_1              (0x02UL << IRQ_MODE_CPU_Pos)      ///< CPU: interrupt targets CPU 1
 #define IRQ_MODE_CPU_2              (0x04UL << IRQ_MODE_CPU_Pos)      ///< CPU: interrupt targets CPU 2
 #define IRQ_MODE_CPU_3              (0x08UL << IRQ_MODE_CPU_Pos)      ///< CPU: interrupt targets CPU 3
 #define IRQ_MODE_CPU_4              (0x10UL << IRQ_MODE_CPU_Pos)      ///< CPU: interrupt targets CPU 4
 #define IRQ_MODE_CPU_5              (0x20UL << IRQ_MODE_CPU_Pos)      ///< CPU: interrupt targets CPU 5
 #define IRQ_MODE_CPU_6              (0x40UL << IRQ_MODE_CPU_Pos)      ///< CPU: interrupt targets CPU 6
 #define IRQ_MODE_CPU_7              (0x80UL << IRQ_MODE_CPU_Pos)      ///< CPU: interrupt targets CPU 7
 // Encoding in some early GIC implementations
 #define IRQ_MODE_MODEL_Pos          (13U)
 #define IRQ_MODE_MODEL_Msk          (0x1UL << IRQ_MODE_MODEL_Pos)
 #define IRQ_MODE_MODEL_NN           (0x0UL << IRQ_MODE_MODEL_Pos)     ///< Corresponding interrupt is handled using the N-N model
 #define IRQ_MODE_MODEL_1N           (0x1UL << IRQ_MODE_MODEL_Pos)     ///< Corresponding interrupt is handled using the 1-N model
 #define IRQ_MODE_ERROR              (0x80000000UL)                    ///< Bit indicating mode value error
 /* Interrupt priority bit-masks */
 /* Priority values occupy the low 16 bits; bit 31 is the error indication. */
 #define IRQ_PRIORITY_Msk            (0x0000FFFFUL)                    ///< Interrupt priority value bit-mask
 #define IRQ_PRIORITY_ERROR          (0x80000000UL)                    ///< Bit indicating priority value error
 /*
  * Interrupt controller API.
  * Return conventions, as stated on the individual declarations below:
  *  - functions returning int32_t report 0 on success and -1 on error;
  *  - IRQ_GetMode signals failure through the IRQ_MODE_ERROR bit;
  *  - IRQ_GetPriority, IRQ_GetPriorityMask and IRQ_GetPriorityGroupBits signal failure
  *    through the IRQ_PRIORITY_ERROR bit.
  */
 /// Initialize interrupt controller.
 /// \return 0 on success, -1 on error.
 int32_t IRQ_Initialize (void);
 /// Register interrupt handler.
 /// \param[in]     irqn          interrupt ID number
 /// \param[in]     handler       interrupt handler function address
 /// \return 0 on success, -1 on error.
 int32_t IRQ_SetHandler (IRQn_ID_t irqn, IRQHandler_t handler);
 /// Get the registered interrupt handler.
 /// \param[in]     irqn          interrupt ID number
 /// \return registered interrupt handler function address.
 IRQHandler_t IRQ_GetHandler (IRQn_ID_t irqn);
 /// Enable interrupt.
 /// \param[in]     irqn          interrupt ID number
 /// \return 0 on success, -1 on error.
 int32_t IRQ_Enable (IRQn_ID_t irqn);
 /// Disable interrupt.
 /// \param[in]     irqn          interrupt ID number
 /// \return 0 on success, -1 on error.
 int32_t IRQ_Disable (IRQn_ID_t irqn);
 /// Get interrupt enable state.
 /// \param[in]     irqn          interrupt ID number
 /// \return 0 - interrupt is disabled, 1 - interrupt is enabled.
 uint32_t IRQ_GetEnableState (IRQn_ID_t irqn);
 /// Configure interrupt request mode.
 /// \param[in]     irqn          interrupt ID number
 /// \param[in]     mode          mode configuration (combination of IRQ_MODE_* bit-masks)
 /// \return 0 on success, -1 on error.
 int32_t IRQ_SetMode (IRQn_ID_t irqn, uint32_t mode);
 /// Get interrupt mode configuration.
 /// \param[in]     irqn          interrupt ID number
 /// \return current interrupt mode configuration with optional IRQ_MODE_ERROR bit set.
 uint32_t IRQ_GetMode (IRQn_ID_t irqn);
 /// Get ID number of current interrupt request (IRQ).
 /// \return interrupt ID number.
 IRQn_ID_t IRQ_GetActiveIRQ (void);
 /// Get ID number of current fast interrupt request (FIQ).
 /// \return interrupt ID number.
 IRQn_ID_t IRQ_GetActiveFIQ (void);
 /// Signal end of interrupt processing.
 /// \param[in]     irqn          interrupt ID number
 /// \return 0 on success, -1 on error.
 int32_t IRQ_EndOfInterrupt (IRQn_ID_t irqn);
 /// Set interrupt pending flag.
 /// \param[in]     irqn          interrupt ID number
 /// \return 0 on success, -1 on error.
 int32_t IRQ_SetPending (IRQn_ID_t irqn);
 /// Get interrupt pending flag.
 /// \param[in]     irqn          interrupt ID number
 /// \return 0 - interrupt is not pending, 1 - interrupt is pending.
 uint32_t IRQ_GetPending (IRQn_ID_t irqn);
 /// Clear interrupt pending flag.
 /// \param[in]     irqn          interrupt ID number
 /// \return 0 on success, -1 on error.
 int32_t IRQ_ClearPending (IRQn_ID_t irqn);
 /// Set interrupt priority value.
 /// \param[in]     irqn          interrupt ID number
 /// \param[in]     priority      interrupt priority value
 /// \return 0 on success, -1 on error.
 int32_t IRQ_SetPriority (IRQn_ID_t irqn, uint32_t priority);
 /// Get interrupt priority.
 /// \param[in]     irqn          interrupt ID number
 /// \return current interrupt priority value with optional IRQ_PRIORITY_ERROR bit set.
 uint32_t IRQ_GetPriority (IRQn_ID_t irqn);
 /// Set priority masking threshold.
 /// \param[in]     priority      priority masking threshold value
 /// \return 0 on success, -1 on error.
 int32_t IRQ_SetPriorityMask (uint32_t priority);
 /// Get priority masking threshold
 /// \return current priority masking threshold value with optional IRQ_PRIORITY_ERROR bit set.
 uint32_t IRQ_GetPriorityMask (void);
 /// Set priority grouping field split point
 /// \param[in]     bits          number of MSB bits included in the group priority field comparison
 /// \return 0 on success, -1 on error.
 int32_t IRQ_SetPriorityGroupBits (uint32_t bits);
 /// Get priority grouping field split point
 /// \return current number of MSB bits included in the group priority field comparison with
 ///         optional IRQ_PRIORITY_ERROR bit set.
 uint32_t IRQ_GetPriorityGroupBits (void);
 #endif  // IRQ_CTRL_H_
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Source/irq_ctrl_gic.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/Core_A/Source/irq_ctrl_gic.c
@@ -1,418 +0,0 @@
 /**************************************************************************//**
 * @file     irq_ctrl_gic.c
 * @brief    Interrupt controller handling implementation for GIC
 * @version  V1.1.1
 * @date     29. March 2021
 ******************************************************************************/
 /*
 * Copyright (c) 2017-2021 ARM Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <stddef.h>
 #include "RTE_Components.h"
 #include CMSIS_device_header
 #include "irq_ctrl.h"
 #if defined(__GIC_PRESENT) && (__GIC_PRESENT == 1U)
 /// Number of implemented interrupt lines.
 /// Defaults to 1020 unless IRQ_GIC_LINE_COUNT is already defined before this point.
 #ifndef IRQ_GIC_LINE_COUNT
 #define IRQ_GIC_LINE_COUNT      (1020U)
 #endif
 /// Registered handlers, indexed by interrupt ID (written by IRQ_SetHandler, read by IRQ_GetHandler).
 static IRQHandler_t IRQTable[IRQ_GIC_LINE_COUNT] = { 0U };
 /// Bookkeeping for interrupt ID 0: set to 1 by IRQ_GetActiveIRQ when ID 0 is seen active for the
 /// first time, and cleared again by IRQ_EndOfInterrupt for ID 0.
 static uint32_t     IRQ_ID0;
 /// Initialize interrupt controller.
 /// Clears every entry of the handler table, then enables the GIC.
 /// \return always 0.
 __WEAK int32_t IRQ_Initialize (void) {
  uint32_t line = 0U;
  // Forget all previously registered handlers
  while (line < IRQ_GIC_LINE_COUNT) {
    IRQTable[line] = (IRQHandler_t)NULL;
    line++;
  }
  GIC_Enable();
  return (0);
 }
 /// Register interrupt handler.
 /// Stores \a handler in the handler table slot of \a irqn.
 /// \return 0 on success, -1 when \a irqn is outside [0, IRQ_GIC_LINE_COUNT).
 __WEAK int32_t IRQ_SetHandler (IRQn_ID_t irqn, IRQHandler_t handler) {
  // Reject IDs that do not map to a table slot
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }
  IRQTable[irqn] = handler;
  return (0);
 }
 /// Get the registered interrupt handler.
 /// Only the low 10 bits of \a irqn are used for the lookup.
 /// \return handler stored for the interrupt, or a null handler when the ID is out of range.
 __WEAK IRQHandler_t IRQ_GetHandler (IRQn_ID_t irqn) {
  // Ignore CPUID field (software generated interrupts)
  irqn &= 0x3FFU;
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return ((IRQHandler_t)0);
  }
  return (IRQTable[irqn]);
 }
 /// Enable interrupt.
 /// \return 0 on success, -1 when \a irqn is outside [0, IRQ_GIC_LINE_COUNT).
 __WEAK int32_t IRQ_Enable (IRQn_ID_t irqn) {
  int32_t status = -1;
  if ((irqn >= 0) && (irqn < (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    GIC_EnableIRQ ((IRQn_Type)irqn);
    status = 0;
  }
  return (status);
 }
 /// Disable interrupt.
 /// \return 0 on success, -1 when \a irqn is outside [0, IRQ_GIC_LINE_COUNT).
 __WEAK int32_t IRQ_Disable (IRQn_ID_t irqn) {
  int32_t status = -1;
  if ((irqn >= 0) && (irqn < (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    GIC_DisableIRQ ((IRQn_Type)irqn);
    status = 0;
  }
  return (status);
 }
 /// Get interrupt enable state.
 /// \return value reported by GIC_GetEnableIRQ, or 0 when \a irqn is out of range.
 __WEAK uint32_t IRQ_GetEnableState (IRQn_ID_t irqn) {
  // Out-of-range IDs are reported as disabled
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (0U);
  }
  return (GIC_GetEnableIRQ((IRQn_Type)irqn));
 }
 /// Configure interrupt request mode.
 /// All fields of \a mode are validated first; the GIC is only programmed when none is rejected.
 /// Accepted values: trigger LEVEL or EDGE, type IRQ, domain NONSECURE (SECURE only when the
 /// distributor reports security extensions). IRQ_MODE_CPU_ALL is translated to target mask 0xFF.
 /// \param[in]     irqn          interrupt ID number
 /// \param[in]     mode          mode configuration (combination of IRQ_MODE_* bit-masks)
 /// \return 0 on success, -1 on error (interrupt ID out of range or unsupported mode).
 __WEAK int32_t IRQ_SetMode (IRQn_ID_t irqn, uint32_t mode) {
  uint32_t val;
  uint8_t cfg;
  uint8_t secure;
  uint8_t cpu;
  int32_t status = 0;
  if ((irqn >= 0) && (irqn < (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    // Check triggering mode
    val = (mode & IRQ_MODE_TRIG_Msk);
    if (val == IRQ_MODE_TRIG_LEVEL) {
      cfg = 0x00U;
    } else if (val == IRQ_MODE_TRIG_EDGE) {
      cfg = 0x02U;
    } else {
      // Polarity-specific trigger modes are not supported here
      cfg = 0x00U;
      status = -1;
    }
    val = (mode & IRQ_MODE_MODEL_Msk);
    if (val == IRQ_MODE_MODEL_1N) {
      cfg |= 1;   // 1-N model
    }
    // Check interrupt type
    val = mode & IRQ_MODE_TYPE_Msk;
    if (val != IRQ_MODE_TYPE_IRQ) {
      status = -1;
    }
    // Check interrupt domain
    val = mode & IRQ_MODE_DOMAIN_Msk;
    if (val == IRQ_MODE_DOMAIN_NONSECURE) {
      secure = 0U;
    } else {
      // Check security extensions support
      val = GIC_DistributorInfo() & (1UL << 10U);
      if (val != 0U) {
        // Security extensions are supported
        secure = 1U;
      } else {
        secure = 0U;
        status = -1;
      }
    }
    // Check interrupt CPU targets
    val = mode & IRQ_MODE_CPU_Msk;
    if (val == IRQ_MODE_CPU_ALL) {
      cpu = 0xFFU;
    } else {
      cpu = (uint8_t)(val >> IRQ_MODE_CPU_Pos);
    }
    // Apply configuration if no mode error
    if (status == 0) {
      GIC_SetConfiguration((IRQn_Type)irqn, cfg);
      GIC_SetTarget       ((IRQn_Type)irqn, cpu);
      if (secure != 0U) {
        GIC_SetGroup ((IRQn_Type)irqn, secure);
      }
    }
  } else {
    // Interrupt ID out of range: nothing was configured, so do not report success
    status = -1;
  }
  return (status);
 }
 /// Get interrupt mode configuration.
 /// Rebuilds the mode word from the GIC configuration (trigger and model bits) and the CPU target
 /// mask; the type field is always reported as IRQ.
 /// \return mode configuration, or IRQ_MODE_ERROR when \a irqn is out of range.
 __WEAK uint32_t IRQ_GetMode (IRQn_ID_t irqn) {
  uint32_t cfg;
  uint32_t mode;
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (IRQ_MODE_ERROR);
  }
  mode = IRQ_MODE_TYPE_IRQ;
  cfg  = GIC_GetConfiguration((IRQn_Type)irqn);
  // Bit 1 of the configuration selects edge (set) or level (clear) triggering
  mode |= ((cfg & 2U) != 0U) ? IRQ_MODE_TRIG_EDGE : IRQ_MODE_TRIG_LEVEL;
  // Bit 0 of the configuration selects the 1-N model
  if ((cfg & 1U) != 0U) {
    mode |= IRQ_MODE_MODEL_1N;
  }
  // Get interrupt CPU targets
  mode |= GIC_GetTarget ((IRQn_Type)irqn) << IRQ_MODE_CPU_Pos;
  return (mode);
 }
 /// Get ID number of current interrupt request (IRQ).
 /// Acknowledges the pending interrupt at the GIC and applies the GIC-390 errata workarounds
 /// described in the comments below. The statement order (reads, barriers, dummy write) is part
 /// of the workaround and must be kept.
 /// \return interrupt ID number as returned by GIC_AcknowledgePending.
 __WEAK IRQn_ID_t IRQ_GetActiveIRQ (void) {
  IRQn_ID_t irqn;
  uint32_t prio;
  /* Dummy read to avoid GIC 390 errata 801120 */
  GIC_GetHighPendingIRQ();
  irqn = GIC_AcknowledgePending();
  __DSB();
  /* Workaround GIC 390 errata 733075 (GIC-390_Errata_Notice_v6.pdf, 09-Jul-2014)  */
  /* The following workaround code is for a single-core system.  It would be       */
  /* different in a multi-core system.                                             */
  /* If the ID is 0 or 0x3FE or 0x3FF, then the GIC CPU interface may be locked-up */
  /* so unlock it, otherwise service the interrupt as normal.                      */
  /* Special IDs 1020=0x3FC and 1021=0x3FD are reserved values in GICv1 and GICv2  */
  /* so will not occur here.                                                       */
  if ((irqn == 0) || (irqn >= 0x3FE)) {
    /* Unlock the CPU interface with a dummy write to Interrupt Priority Register */
    /* (the priority of interrupt 0 is read and written back unchanged)           */
    prio = GIC_GetPriority((IRQn_Type)0);
    GIC_SetPriority ((IRQn_Type)0, prio);
    __DSB();
    if ((irqn == 0U) && ((GIC_GetIRQStatus ((IRQn_Type)irqn) & 1U) != 0U) && (IRQ_ID0 == 0U)) {
      /* If the ID is 0, is active and has not been seen before */
      /* Remember it; IRQ_EndOfInterrupt clears the flag again for ID 0 */
      IRQ_ID0 = 1U;
    }
    /* End of Workaround GIC 390 errata 733075 */
  }
  return (irqn);
 }
 /// Get ID number of current fast interrupt request (FIQ).
 /// This implementation does not query the GIC.
 /// \return always -1.
 __WEAK IRQn_ID_t IRQ_GetActiveFIQ (void) {
  const IRQn_ID_t none = (IRQn_ID_t)-1;
  return (none);
 }
 /// Signal end of interrupt processing.
 /// The unmasked \a irqn value is passed to the GIC; only its low 10 bits are range-checked.
 /// \return 0 on success, -1 when the masked ID is outside [0, IRQ_GIC_LINE_COUNT).
 __WEAK int32_t IRQ_EndOfInterrupt (IRQn_ID_t irqn) {
  // Capture the full value before the ID is masked
  const IRQn_Type raw = (IRQn_Type)irqn;
  irqn &= 0x3FFU;
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }
  GIC_EndInterrupt (raw);
  // Interrupt 0 has been serviced: reset its "seen" flag
  if (irqn == 0) {
    IRQ_ID0 = 0U;
  }
  return (0);
 }
 /// Set interrupt pending flag.
 /// \return 0 on success, -1 when \a irqn is outside [0, IRQ_GIC_LINE_COUNT).
 __WEAK int32_t IRQ_SetPending (IRQn_ID_t irqn) {
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }
  GIC_SetPendingIRQ ((IRQn_Type)irqn);
  return (0);
 }
 /// Get interrupt pending flag.
 /// Only IDs in [16, IRQ_GIC_LINE_COUNT) are queried; any other ID is reported as not pending.
 /// \return 0 - interrupt is not pending, 1 - interrupt is pending.
 __WEAK uint32_t IRQ_GetPending (IRQn_ID_t irqn) {
  uint32_t pending = 0U;
  if ((irqn >= 16) && (irqn < (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    pending = GIC_GetPendingIRQ ((IRQn_Type)irqn);
  }
  // Reduce the result to a single flag bit
  return (pending & 1U);
 }
 /// Clear interrupt pending flag.
 /// \return 0 on success, -1 when \a irqn is outside [16, IRQ_GIC_LINE_COUNT).
 __WEAK int32_t IRQ_ClearPending (IRQn_ID_t irqn) {
  // IDs below 16 are rejected
  if ((irqn < 16) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }
  GIC_ClearPendingIRQ ((IRQn_Type)irqn);
  return (0);
 }
 /// Set interrupt priority value.
 /// \return 0 on success, -1 when \a irqn is outside [0, IRQ_GIC_LINE_COUNT).
 __WEAK int32_t IRQ_SetPriority (IRQn_ID_t irqn, uint32_t priority) {
  int32_t status = -1;
  if ((irqn >= 0) && (irqn < (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    GIC_SetPriority ((IRQn_Type)irqn, priority);
    status = 0;
  }
  return (status);
 }
 /// Get interrupt priority.
 /// \return priority reported by GIC_GetPriority, or IRQ_PRIORITY_ERROR when \a irqn is out of range.
 __WEAK uint32_t IRQ_GetPriority (IRQn_ID_t irqn) {
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (IRQ_PRIORITY_ERROR);
  }
  return (GIC_GetPriority ((IRQn_Type)irqn));
 }
 /// Set priority masking threshold.
 /// Forwards \a priority unchanged to the GIC CPU interface priority mask.
 /// \return always 0.
 __WEAK int32_t IRQ_SetPriorityMask (uint32_t priority) {
  const int32_t status = 0;
  GIC_SetInterfacePriorityMask (priority);
  return (status);
 }
 /// Get priority masking threshold
 /// \return value reported by GIC_GetInterfacePriorityMask, unmodified.
 __WEAK uint32_t IRQ_GetPriorityMask (void) {
  const uint32_t mask = GIC_GetInterfacePriorityMask();
  return (mask);
 }
 /// Set priority grouping field split point
 /// IRQ_PRIORITY_Msk is accepted as an alias for 7 bits; the binary point written is 7 - bits.
 /// \return 0 on success, -1 when \a bits is 8 or more (and not IRQ_PRIORITY_Msk).
 __WEAK int32_t IRQ_SetPriorityGroupBits (uint32_t bits) {
  const uint32_t msb = (bits == IRQ_PRIORITY_Msk) ? 7U : bits;
  if (msb >= 8U) {
    return (-1);
  }
  GIC_SetBinaryPoint (7U - msb);
  return (0);
 }
 /// Get priority grouping field split point
 /// \return 7 minus the low three bits of the GIC binary point.
 __WEAK uint32_t IRQ_GetPriorityGroupBits (void) {
  return (7U - (GIC_GetBinaryPoint() & 0x07U));
 }
 #endif
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Config/DAP_config.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Config/DAP_config.h
@@ -1,561 +0,0 @@
 /*
 * Copyright (c) 2013-2021 ARM Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * ----------------------------------------------------------------------
 *
 * $Date:        16. June 2021
 * $Revision:    V2.1.0
 *
 * Project:      CMSIS-DAP Configuration
 * Title:        DAP_config.h CMSIS-DAP Configuration File (Template)
 *
 *---------------------------------------------------------------------------*/
 #ifndef __DAP_CONFIG_H__
 #define __DAP_CONFIG_H__
 //**************************************************************************************************
 /**
 \defgroup DAP_Config_Debug_gr CMSIS-DAP Debug Unit Information
 \ingroup DAP_ConfigIO_gr
@{
 Provides definitions about the hardware and configuration of the Debug Unit.
 This information includes:
 - Definition of Cortex-M processor parameters used in CMSIS-DAP Debug Unit.
 - Debug Unit Identification strings (Vendor, Product, Serial Number).
 - Debug Unit communication packet size.
 - Debug Access Port supported modes and settings (JTAG/SWD and SWO).
 - Optional information about a connected Target Device (for Evaluation Boards).
 */
 #ifdef _RTE_
 #include "RTE_Components.h"
 #include CMSIS_device_header
 #else
 #include "device.h"                             // Debug Unit Cortex-M Processor Header File
 #endif
 /// Processor Clock of the Cortex-M MCU used in the Debug Unit.
 /// This value is used to calculate the SWD/JTAG clock speed.
 #define CPU_CLOCK               100000000U      ///< Specifies the CPU Clock in Hz.
 /// Number of processor cycles for I/O Port write operations.
 /// This value is used to calculate the SWD/JTAG clock speed that is generated with I/O
 /// Port write operations in the Debug Unit by a Cortex-M MCU. Most Cortex-M processors
 /// require 2 processor cycles for an I/O Port Write operation.  If the Debug Unit uses
 /// a Cortex-M0+ processor with high-speed peripheral I/O only 1 processor cycle might be
 /// required.
 #define IO_PORT_WRITE_CYCLES    2U              ///< I/O Cycles: 2=default, 1=Cortex-M0+ fast I/0.
 /// Indicate that Serial Wire Debug (SWD) communication mode is available at the Debug Access Port.
 /// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
 #define DAP_SWD                 1               ///< SWD Mode:  1 = available, 0 = not available.
 /// Indicate that JTAG communication mode is available at the Debug Port.
 /// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
 #define DAP_JTAG                1               ///< JTAG Mode: 1 = available, 0 = not available.
 /// Configure maximum number of JTAG devices on the scan chain connected to the Debug Access Port.
 /// This setting impacts the RAM requirements of the Debug Unit. Valid range is 1 .. 255.
 #define DAP_JTAG_DEV_CNT        8U              ///< Maximum number of JTAG devices on scan chain.
 /// Default communication mode on the Debug Access Port.
 /// Used for the command \ref DAP_Connect when Port Default mode is selected.
 #define DAP_DEFAULT_PORT        1U              ///< Default JTAG/SWJ Port Mode: 1 = SWD, 2 = JTAG.
 /// Default communication speed on the Debug Access Port for SWD and JTAG mode.
 /// Used to initialize the default SWD/JTAG clock frequency.
 /// The command \ref DAP_SWJ_Clock can be used to overwrite this default setting.
 #define DAP_DEFAULT_SWJ_CLOCK   1000000U        ///< Default SWD/JTAG clock frequency in Hz.
 /// Maximum Package Size for Command and Response data.
 /// This configuration setting is used to optimize the communication performance with the
 /// debugger and depends on the USB peripheral. Typical values are 64 for Full-speed USB HID or WinUSB,
 /// 1024 for High-speed USB HID and 512 for High-speed USB WinUSB.
 #define DAP_PACKET_SIZE         512U            ///< Specifies Packet Size in bytes.
 /// Maximum Package Buffers for Command and Response data.
 /// This configuration setting is used to optimize the communication performance with the
 /// debugger and depends on the USB peripheral. For devices with limited RAM or USB buffer the
 /// setting can be reduced (valid range is 1 .. 255).
 #define DAP_PACKET_COUNT        8U              ///< Specifies number of packets buffered.
 /// Indicate that UART Serial Wire Output (SWO) trace is available.
 /// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
 #define SWO_UART                1               ///< SWO UART:  1 = available, 0 = not available.
 /// USART Driver instance number for the UART SWO.
 #define SWO_UART_DRIVER         0               ///< USART Driver instance number (Driver_USART#).
 /// Maximum SWO UART Baudrate.
 #define SWO_UART_MAX_BAUDRATE   10000000U       ///< SWO UART Maximum Baudrate in Hz.
 /// Indicate that Manchester Serial Wire Output (SWO) trace is available.
 /// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
 #define SWO_MANCHESTER          0               ///< SWO Manchester:  1 = available, 0 = not available.
 /// SWO Trace Buffer Size.
 #define SWO_BUFFER_SIZE         4096U           ///< SWO Trace Buffer Size in bytes (must be 2^n).
 /// SWO Streaming Trace.
 #define SWO_STREAM              0               ///< SWO Streaming Trace: 1 = available, 0 = not available.
 /// Clock frequency of the Test Domain Timer. Timer value is returned with \ref TIMESTAMP_GET.
 #define TIMESTAMP_CLOCK         100000000U      ///< Timestamp clock in Hz (0 = timestamps not supported).
 /// Indicate that UART Communication Port is available.
 /// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
 #define DAP_UART                1               ///< DAP UART:  1 = available, 0 = not available.
 /// USART Driver instance number for the UART Communication Port.
 #define DAP_UART_DRIVER         1               ///< USART Driver instance number (Driver_USART#).
 /// UART Receive Buffer Size.
 #define DAP_UART_RX_BUFFER_SIZE 1024U           ///< Uart Receive Buffer Size in bytes (must be 2^n).
 /// UART Transmit Buffer Size.
 #define DAP_UART_TX_BUFFER_SIZE 1024U           ///< Uart Transmit Buffer Size in bytes (must be 2^n).
 /// Indicate that UART Communication via USB COM Port is available.
 /// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
 #define DAP_UART_USB_COM_PORT   1               ///< USB COM Port:  1 = available, 0 = not available.
 /// Debug Unit is connected to fixed Target Device.
 /// The Debug Unit may be part of an evaluation board and always connected to a fixed
 /// known device. In this case a Device Vendor, Device Name, Board Vendor and Board Name strings
 /// are stored and may be used by the debugger or IDE to configure device parameters.
 #define TARGET_FIXED            0               ///< Target: 1 = known, 0 = unknown;
 #define TARGET_DEVICE_VENDOR    "Arm"           ///< String indicating the Silicon Vendor
 #define TARGET_DEVICE_NAME      "Cortex-M"      ///< String indicating the Target Device
 #define TARGET_BOARD_VENDOR     "Arm"           ///< String indicating the Board Vendor
 #define TARGET_BOARD_NAME       "Arm board"     ///< String indicating the Board Name
 #if TARGET_FIXED != 0
 #include <string.h>
 static const char TargetDeviceVendor [] = TARGET_DEVICE_VENDOR;
 static const char TargetDeviceName   [] = TARGET_DEVICE_NAME;
 static const char TargetBoardVendor  [] = TARGET_BOARD_VENDOR;
 static const char TargetBoardName    [] = TARGET_BOARD_NAME;
 #endif
 /** Get Vendor Name string.
 Template default: no vendor string is provided and the buffer is left untouched.
 \param str Pointer to buffer to store the string (max 60 characters).
 \return String length (including terminating NULL character) or 0 (no string).
 */
 __STATIC_INLINE uint8_t DAP_GetVendorString (char *str) {
  (void)str;                 // parameter intentionally unused
  return ((uint8_t)0U);
 }
 /** Get Product Name string.
 Template default: no product string is provided and the buffer is left untouched.
 \param str Pointer to buffer to store the string (max 60 characters).
 \return String length (including terminating NULL character) or 0 (no string).
 */
 __STATIC_INLINE uint8_t DAP_GetProductString (char *str) {
  (void)str;                 // parameter intentionally unused
  return ((uint8_t)0U);
 }
 /** Get Serial Number string.
 Template default: no serial number is provided and the buffer is left untouched.
 \param str Pointer to buffer to store the string (max 60 characters).
 \return String length (including terminating NULL character) or 0 (no string).
 */
 __STATIC_INLINE uint8_t DAP_GetSerNumString (char *str) {
  (void)str;                 // parameter intentionally unused
  return ((uint8_t)0U);
 }
 /** Get Target Device Vendor string.
 Copies TARGET_DEVICE_VENDOR into the buffer when TARGET_FIXED is non-zero; otherwise the
 buffer is left untouched.
 \param str Pointer to buffer to store the string (max 60 characters).
 \return String length (including terminating NULL character) or 0 (no string).
 */
 __STATIC_INLINE uint8_t DAP_GetTargetDeviceVendorString (char *str) {
 #if TARGET_FIXED == 0
  (void)str;
  return (0U);
 #else
  strcpy(str, TargetDeviceVendor);
  // Reported length counts the terminating NUL as well
  return ((uint8_t)(strlen(TargetDeviceVendor) + 1U));
 #endif
 }
 /** Get Target Device Name string.
 Copies TARGET_DEVICE_NAME into the buffer when TARGET_FIXED is non-zero; otherwise the
 buffer is left untouched.
 \param str Pointer to buffer to store the string (max 60 characters).
 \return String length (including terminating NULL character) or 0 (no string).
 */
 __STATIC_INLINE uint8_t DAP_GetTargetDeviceNameString (char *str) {
 #if TARGET_FIXED == 0
  (void)str;
  return (0U);
 #else
  strcpy(str, TargetDeviceName);
  // Reported length counts the terminating NUL as well
  return ((uint8_t)(strlen(TargetDeviceName) + 1U));
 #endif
 }
 /** Get Target Board Vendor string.
 Copies TARGET_BOARD_VENDOR into the buffer when TARGET_FIXED is non-zero; otherwise the
 buffer is left untouched.
 \param str Pointer to buffer to store the string (max 60 characters).
 \return String length (including terminating NULL character) or 0 (no string).
 */
 __STATIC_INLINE uint8_t DAP_GetTargetBoardVendorString (char *str) {
 #if TARGET_FIXED == 0
  (void)str;
  return (0U);
 #else
  strcpy(str, TargetBoardVendor);
  // Reported length counts the terminating NUL as well
  return ((uint8_t)(strlen(TargetBoardVendor) + 1U));
 #endif
 }
 /** Get Target Board Name string.
 Copies TARGET_BOARD_NAME into the buffer when TARGET_FIXED is non-zero; otherwise the
 buffer is left untouched.
 \param str Pointer to buffer to store the string (max 60 characters).
 \return String length (including terminating NULL character) or 0 (no string).
 */
 __STATIC_INLINE uint8_t DAP_GetTargetBoardNameString (char *str) {
 #if TARGET_FIXED == 0
  (void)str;
  return (0U);
 #else
  strcpy(str, TargetBoardName);
  // Reported length counts the terminating NUL as well
  return ((uint8_t)(strlen(TargetBoardName) + 1U));
 #endif
 }
 /** Get Product Firmware Version string.
 Template default: no firmware version string is provided and the buffer is left untouched.
 \param str Pointer to buffer to store the string (max 60 characters).
 \return String length (including terminating NULL character) or 0 (no string).
 */
 __STATIC_INLINE uint8_t DAP_GetProductFirmwareVersionString (char *str) {
  (void)str;                 // parameter intentionally unused
  return ((uint8_t)0U);
 }
 ///@}
 //**************************************************************************************************
 /**
 \defgroup DAP_Config_PortIO_gr CMSIS-DAP Hardware I/O Pin Access
 \ingroup DAP_ConfigIO_gr
@{
 Standard I/O Pins of the CMSIS-DAP Hardware Debug Port support standard JTAG mode
 and Serial Wire Debug (SWD) mode. In SWD mode only 2 pins are required to implement the debug
 interface of a device. The following I/O Pins are provided:
 JTAG I/O Pin                 | SWD I/O Pin          | CMSIS-DAP Hardware pin mode
 ---------------------------- | -------------------- | ---------------------------------------------
 TCK: Test Clock              | SWCLK: Clock         | Output Push/Pull
 TMS: Test Mode Select        | SWDIO: Data I/O      | Output Push/Pull; Input (for receiving data)
 TDI: Test Data Input         |                      | Output Push/Pull
 TDO: Test Data Output        |                      | Input
 nTRST: Test Reset (optional) |                      | Output Open Drain with pull-up resistor
 nRESET: Device Reset         | nRESET: Device Reset | Output Open Drain with pull-up resistor
 DAP Hardware I/O Pin Access Functions
 -------------------------------------
 The various I/O Pins are accessed by functions that implement the Read, Write, Set, or Clear to
 these I/O Pins.
 For the SWDIO I/O Pin there are additional functions that are called in SWD I/O mode only.
 These functions are provided to achieve faster I/O that is possible with some advanced GPIO
 peripherals that can independently write/read a single I/O pin without affecting any other pins
 of the same I/O port. The following SWDIO I/O Pin functions are provided:
 - \ref PIN_SWDIO_OUT_ENABLE to enable the output mode from the DAP hardware.
 - \ref PIN_SWDIO_OUT_DISABLE to enable the input mode to the DAP hardware.
 - \ref PIN_SWDIO_IN to read from the SWDIO I/O pin with utmost possible speed.
 - \ref PIN_SWDIO_OUT to write to the SWDIO I/O pin with utmost possible speed.
 */
 // Configure DAP I/O pins ------------------------------
 /** Setup JTAG I/O pins: TCK, TMS, TDI, TDO, nTRST, and nRESET.
 Contract: configure the DAP Hardware I/O pins for JTAG mode:
 - TCK, TMS, TDI, nTRST, nRESET to output mode and set to high level.
 - TDO to input mode.
 \note This implementation is an empty stub; no pin is configured.
 */
 __STATIC_INLINE void PORT_JTAG_SETUP (void) {
  /* intentionally empty */
 }
 /** Setup SWD I/O pins: SWCLK, SWDIO, and nRESET.
 Contract: configure the DAP Hardware I/O pins for Serial Wire Debug (SWD) mode:
 - SWCLK, SWDIO, nRESET to output mode and set to default high level.
 - TDI, nTRST to HighZ mode (pins are unused in SWD mode).
 \note This implementation is an empty stub; no pin is configured.
 */
 __STATIC_INLINE void PORT_SWD_SETUP (void) {
  /* intentionally empty */
 }
 /** Disable JTAG/SWD I/O Pins.
 Contract: disable the DAP Hardware I/O pins, which configures:
 - TCK/SWCLK, TMS/SWDIO, TDI, TDO, nTRST, nRESET to High-Z mode.
 \note This implementation is an empty stub; no pin is reconfigured.
 */
 __STATIC_INLINE void PORT_OFF (void) {
  /* intentionally empty */
 }
 // SWCLK/TCK I/O pin -------------------------------------
 /** SWCLK/TCK I/O pin: Get Input.
 \return Current status of the SWCLK/TCK DAP hardware I/O pin.
 \note This implementation is a stub: no pin is sampled and 0 is always returned.
 */
 __STATIC_FORCEINLINE uint32_t PIN_SWCLK_TCK_IN  (void) {
  const uint32_t level = 0U;
  return level;
 }
 /** SWCLK/TCK I/O pin: Set Output to High.
 Contract: set the SWCLK/TCK DAP hardware I/O pin to high level.
 \note This implementation is an empty stub; no pin is driven.
 */
 __STATIC_FORCEINLINE void     PIN_SWCLK_TCK_SET (void) {
  /* intentionally empty */
 }
 /** SWCLK/TCK I/O pin: Set Output to Low.
 Contract: set the SWCLK/TCK DAP hardware I/O pin to low level.
 \note This implementation is an empty stub; no pin is driven.
 */
 __STATIC_FORCEINLINE void     PIN_SWCLK_TCK_CLR (void) {
  /* intentionally empty */
 }
 // SWDIO/TMS Pin I/O --------------------------------------
 /** SWDIO/TMS I/O pin: Get Input.
 \return Current status of the SWDIO/TMS DAP hardware I/O pin.
 \note This implementation is a stub: no pin is sampled and 0 is always returned.
 */
 __STATIC_FORCEINLINE uint32_t PIN_SWDIO_TMS_IN  (void) {
  const uint32_t level = 0U;
  return level;
 }
 /** SWDIO/TMS I/O pin: Set Output to High.
 Contract: set the SWDIO/TMS DAP hardware I/O pin to high level.
 \note This implementation is an empty stub; no pin is driven.
 */
 __STATIC_FORCEINLINE void     PIN_SWDIO_TMS_SET (void) {
  /* intentionally empty */
 }
 /** SWDIO/TMS I/O pin: Set Output to Low.
 Contract: set the SWDIO/TMS DAP hardware I/O pin to low level.
 \note This implementation is an empty stub; no pin is driven.
 */
 __STATIC_FORCEINLINE void     PIN_SWDIO_TMS_CLR (void) {
  /* intentionally empty */
 }
 /** SWDIO I/O pin: Get Input (used in SWD mode only).
 \return Current status of the SWDIO DAP hardware I/O pin.
 \note This implementation is a stub: no pin is sampled and 0 is always returned.
 */
 __STATIC_FORCEINLINE uint32_t PIN_SWDIO_IN      (void) {
  const uint32_t level = 0U;
  return level;
 }
 /** SWDIO I/O pin: Set Output (used in SWD mode only).
 \param bit Output value for the SWDIO DAP hardware I/O pin.
 \note This implementation is an empty stub; the value is discarded and no pin
       is driven.
 */
 __STATIC_FORCEINLINE void     PIN_SWDIO_OUT     (uint32_t bit) {
  (void)bit;      // explicitly unused: avoids unused-parameter diagnostics
 }
 /** SWDIO I/O pin: Switch to Output mode (used in SWD mode only).
 Contract: configure the SWDIO DAP hardware I/O pin to output mode. This function
 is called prior to \ref PIN_SWDIO_OUT function calls.
 \note This implementation is an empty stub; no pin is reconfigured.
 */
 __STATIC_FORCEINLINE void     PIN_SWDIO_OUT_ENABLE  (void) {
  /* intentionally empty */
 }
 /** SWDIO I/O pin: Switch to Input mode (used in SWD mode only).
 Contract: configure the SWDIO DAP hardware I/O pin to input mode. This function
 is called prior to \ref PIN_SWDIO_IN function calls.
 \note This implementation is an empty stub; no pin is reconfigured.
 */
 __STATIC_FORCEINLINE void     PIN_SWDIO_OUT_DISABLE (void) {
  /* intentionally empty */
 }
 // TDI Pin I/O ---------------------------------------------
 /** TDI I/O pin: Get Input.
 \return Current status of the TDI DAP hardware I/O pin.
 \note This implementation is a stub: no pin is sampled and 0 is always returned.
 */
 __STATIC_FORCEINLINE uint32_t PIN_TDI_IN  (void) {
  const uint32_t level = 0U;
  return level;
 }
 /** TDI I/O pin: Set Output.
 \param bit Output value for the TDI DAP hardware I/O pin.
 \note This implementation is an empty stub; the value is discarded and no pin
       is driven.
 */
 __STATIC_FORCEINLINE void     PIN_TDI_OUT (uint32_t bit) {
  (void)bit;      // explicitly unused: avoids unused-parameter diagnostics
 }
 // TDO Pin I/O ---------------------------------------------
 /** TDO I/O pin: Get Input.
 \return Current status of the TDO DAP hardware I/O pin.
 \note This implementation is a stub: no pin is sampled and 0 is always returned.
 */
 __STATIC_FORCEINLINE uint32_t PIN_TDO_IN  (void) {
  const uint32_t level = 0U;
  return level;
 }
 // nTRST Pin I/O -------------------------------------------
 /** nTRST I/O pin: Get Input.
 \return Current status of the nTRST DAP hardware I/O pin.
 \note This implementation is a stub: no pin is sampled and 0 is always returned.
 */
 __STATIC_FORCEINLINE uint32_t PIN_nTRST_IN   (void) {
  const uint32_t level = 0U;
  return level;
 }
 /** nTRST I/O pin: Set Output.
 \param bit JTAG TRST Test Reset pin status:
           - 0: issue a JTAG TRST Test Reset.
           - 1: release JTAG TRST Test Reset.
 \note This implementation is an empty stub; the value is discarded and no pin
       is driven.
 */
 __STATIC_FORCEINLINE void     PIN_nTRST_OUT  (uint32_t bit) {
  (void)bit;      // explicitly unused: avoids unused-parameter diagnostics
 }
 // nRESET Pin I/O------------------------------------------
 /** nRESET I/O pin: Get Input.
 \return Current status of the nRESET DAP hardware I/O pin.
 \note This implementation is a stub: no pin is sampled and 0 is always returned.
 */
 __STATIC_FORCEINLINE uint32_t PIN_nRESET_IN  (void) {
  const uint32_t level = 0U;
  return level;
 }
 /** nRESET I/O pin: Set Output.
 \param bit target device hardware reset pin status:
           - 0: issue a device hardware reset.
           - 1: release device hardware reset.
 \note This implementation is an empty stub; the value is discarded and no pin
       is driven.
 */
 __STATIC_FORCEINLINE void     PIN_nRESET_OUT (uint32_t bit) {
  (void)bit;      // explicitly unused: avoids unused-parameter diagnostics
 }
 ///@}
 //**************************************************************************************************
 /**
 \defgroup DAP_Config_LEDs_gr CMSIS-DAP Hardware Status LEDs
 \ingroup DAP_ConfigIO_gr
@{
 CMSIS-DAP Hardware may provide LEDs that indicate the status of the CMSIS-DAP Debug Unit.
 It is recommended to provide the following LEDs for status indication:
 - Connect LED: is active when the DAP hardware is connected to a debugger.
 - Running LED: is active when the debugger has put the target device into running state.
 */
 /** Debug Unit: Set status of Connected LED.
 \param bit status of the Connect LED.
           - 1: Connect LED ON: debugger is connected to CMSIS-DAP Debug Unit.
           - 0: Connect LED OFF: debugger is not connected to CMSIS-DAP Debug Unit.
 \note This implementation is an empty stub; the value is discarded and no LED
       is driven.
 */
 __STATIC_INLINE void LED_CONNECTED_OUT (uint32_t bit) {
  (void)bit;      // explicitly unused: avoids unused-parameter diagnostics
 }
 /** Debug Unit: Set status of Target Running LED.
 \param bit status of the Target Running LED.
           - 1: Target Running LED ON: program execution in target started.
           - 0: Target Running LED OFF: program execution in target stopped.
 \note This implementation is an empty stub; the value is discarded and no LED
       is driven.
 */
 __STATIC_INLINE void LED_RUNNING_OUT (uint32_t bit) {
  (void)bit;      // explicitly unused: avoids unused-parameter diagnostics
 }
 ///@}
 //**************************************************************************************************
 /**
 \defgroup DAP_Config_Timestamp_gr CMSIS-DAP Timestamp
 \ingroup DAP_ConfigIO_gr
@{
 Access function for Test Domain Timer.
 The value of the Test Domain Timer in the Debug Unit is returned by the function \ref TIMESTAMP_GET. By
 default, the DWT timer is used.  The frequency of this timer is configured with \ref TIMESTAMP_CLOCK.
 */
 /** Get timestamp of Test Domain Timer.
 Performs a single read of the DWT cycle counter register (DWT->CYCCNT).
 \return Current timestamp value.
 */
 __STATIC_INLINE uint32_t TIMESTAMP_GET (void) {
  const uint32_t now = DWT->CYCCNT;
  return now;
 }
 ///@}
 //**************************************************************************************************
 /**
 \defgroup DAP_Config_Initialization_gr CMSIS-DAP Initialization
 \ingroup DAP_ConfigIO_gr
@{
 CMSIS-DAP Hardware I/O and LED Pins are initialized with the function \ref DAP_SETUP.
 */
 /** Setup of the Debug Unit I/O pins and LEDs (called when Debug Unit is initialized).
 Contract: initialize the CMSIS-DAP Hardware I/O Pins and the Status LEDs. In
 detail the operation of Hardware I/O and LED pins are enabled and set:
 - I/O clock system enabled.
 - all I/O pins: input buffer enabled, output pins are set to HighZ mode.
 - for nTRST, nRESET a weak pull-up (if available) is enabled.
 - LED output pins are enabled and LEDs are turned off.
 \note This implementation is an empty stub; nothing is initialized.
 */
 __STATIC_INLINE void DAP_SETUP (void) {
  /* intentionally empty */
 }
 /** Reset Target Device with custom specific I/O pin or command sequence.
 This function allows the optional implementation of a device specific reset sequence.
 It is called when the command \ref DAP_ResetTarget is executed and is, for example,
 required when a device needs a time-critical unlock sequence that enables the debug port.
 \return 0 = no device specific reset sequence is implemented.\n
        1 = a device specific reset sequence is implemented.
 \note This implementation performs no reset sequence and always returns 0.
 */
 __STATIC_INLINE uint8_t RESET_TARGET (void) {
  const uint8_t implemented = 0U;  // change to '1' when a device reset sequence is implemented
  return implemented;
 }
 ///@}
 #endif /* __DAP_CONFIG_H__ */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Include/DAP.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Include/DAP.h
@@ -1,367 +0,0 @@
 /*
 * Copyright (c) 2013-2022 ARM Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * ----------------------------------------------------------------------
 *
 * $Date:        26. April 2022
 * $Revision:    V2.1.1
 *
 * Project:      CMSIS-DAP Include
 * Title:        DAP.h Definitions
 *
 *---------------------------------------------------------------------------*/
 #ifndef __DAP_H__
 #define __DAP_H__
 // DAP Firmware Version
 // Version string literal: "1.3.0" when DAP_FW_V1 is defined at build time,
 // "2.1.1" otherwise.
 #ifdef  DAP_FW_V1
 #define DAP_FW_VER                      "1.3.0"
 #else
 #define DAP_FW_VER                      "2.1.1"
 #endif
 // DAP Command IDs
 // One-byte command identifiers. Entries are grouped by function, not sorted by
 // value (e.g. ID_DAP_SWD_Sequence 0x1DU, ID_DAP_SWO_ExtendedStatus 0x1EU and
 // ID_DAP_UART_Transfer 0x21U appear out of numerical order).
 #define ID_DAP_Info                     0x00U
 #define ID_DAP_HostStatus               0x01U
 #define ID_DAP_Connect                  0x02U
 #define ID_DAP_Disconnect               0x03U
 #define ID_DAP_TransferConfigure        0x04U
 #define ID_DAP_Transfer                 0x05U
 #define ID_DAP_TransferBlock            0x06U
 #define ID_DAP_TransferAbort            0x07U
 #define ID_DAP_WriteABORT               0x08U
 #define ID_DAP_Delay                    0x09U
 #define ID_DAP_ResetTarget              0x0AU
 #define ID_DAP_SWJ_Pins                 0x10U
 #define ID_DAP_SWJ_Clock                0x11U
 #define ID_DAP_SWJ_Sequence             0x12U
 #define ID_DAP_SWD_Configure            0x13U
 #define ID_DAP_SWD_Sequence             0x1DU
 #define ID_DAP_JTAG_Sequence            0x14U
 #define ID_DAP_JTAG_Configure           0x15U
 #define ID_DAP_JTAG_IDCODE              0x16U
 #define ID_DAP_SWO_Transport            0x17U
 #define ID_DAP_SWO_Mode                 0x18U
 #define ID_DAP_SWO_Baudrate             0x19U
 #define ID_DAP_SWO_Control              0x1AU
 #define ID_DAP_SWO_Status               0x1BU
 #define ID_DAP_SWO_ExtendedStatus       0x1EU
 #define ID_DAP_SWO_Data                 0x1CU
 #define ID_DAP_UART_Transport           0x1FU
 #define ID_DAP_UART_Configure           0x20U
 #define ID_DAP_UART_Control             0x22U
 #define ID_DAP_UART_Status              0x23U
 #define ID_DAP_UART_Transfer            0x21U
 #define ID_DAP_QueueCommands            0x7EU
 #define ID_DAP_ExecuteCommands          0x7FU
 // DAP Vendor Command IDs
 // 32 consecutive identifiers, 0x80U (Vendor0) through 0x9FU (Vendor31).
 #define ID_DAP_Vendor0                  0x80U
 #define ID_DAP_Vendor1                  0x81U
 #define ID_DAP_Vendor2                  0x82U
 #define ID_DAP_Vendor3                  0x83U
 #define ID_DAP_Vendor4                  0x84U
 #define ID_DAP_Vendor5                  0x85U
 #define ID_DAP_Vendor6                  0x86U
 #define ID_DAP_Vendor7                  0x87U
 #define ID_DAP_Vendor8                  0x88U
 #define ID_DAP_Vendor9                  0x89U
 #define ID_DAP_Vendor10                 0x8AU
 #define ID_DAP_Vendor11                 0x8BU
 #define ID_DAP_Vendor12                 0x8CU
 #define ID_DAP_Vendor13                 0x8DU
 #define ID_DAP_Vendor14                 0x8EU
 #define ID_DAP_Vendor15                 0x8FU
 #define ID_DAP_Vendor16                 0x90U
 #define ID_DAP_Vendor17                 0x91U
 #define ID_DAP_Vendor18                 0x92U
 #define ID_DAP_Vendor19                 0x93U
 #define ID_DAP_Vendor20                 0x94U
 #define ID_DAP_Vendor21                 0x95U
 #define ID_DAP_Vendor22                 0x96U
 #define ID_DAP_Vendor23                 0x97U
 #define ID_DAP_Vendor24                 0x98U
 #define ID_DAP_Vendor25                 0x99U
 #define ID_DAP_Vendor26                 0x9AU
 #define ID_DAP_Vendor27                 0x9BU
 #define ID_DAP_Vendor28                 0x9CU
 #define ID_DAP_Vendor29                 0x9DU
 #define ID_DAP_Vendor30                 0x9EU
 #define ID_DAP_Vendor31                 0x9FU
 // Marker value for an invalid/unknown command ID
 #define ID_DAP_Invalid                  0xFFU
 // DAP Status Code
 #define DAP_OK                          0U
 #define DAP_ERROR                       0xFFU
 // DAP ID
 // Identifier values 1U..9U name string items; 0xF0U and above name the
 // capability, clock, buffer-size and packet items listed below.
 #define DAP_ID_VENDOR                   1U
 #define DAP_ID_PRODUCT                  2U
 #define DAP_ID_SER_NUM                  3U
 #define DAP_ID_DAP_FW_VER               4U
 #define DAP_ID_DEVICE_VENDOR            5U
 #define DAP_ID_DEVICE_NAME              6U
 #define DAP_ID_BOARD_VENDOR             7U
 #define DAP_ID_BOARD_NAME               8U
 #define DAP_ID_PRODUCT_FW_VER           9U
 #define DAP_ID_CAPABILITIES             0xF0U
 #define DAP_ID_TIMESTAMP_CLOCK          0xF1U
 #define DAP_ID_UART_RX_BUFFER_SIZE      0xFBU
 #define DAP_ID_UART_TX_BUFFER_SIZE      0xFCU
 #define DAP_ID_SWO_BUFFER_SIZE          0xFDU
 #define DAP_ID_PACKET_COUNT             0xFEU
 #define DAP_ID_PACKET_SIZE              0xFFU
 // DAP Host Status
 #define DAP_DEBUGGER_CONNECTED          0U
 #define DAP_TARGET_RUNNING              1U
 // DAP Port
 // Note: DAP_PORT_AUTODETECT and DAP_PORT_DISABLED share the value 0U.
 #define DAP_PORT_AUTODETECT             0U      // Autodetect Port
 #define DAP_PORT_DISABLED               0U      // Port Disabled (I/O pins in High-Z)
 #define DAP_PORT_SWD                    1U      // SWD Port (SWCLK, SWDIO) + nRESET
 #define DAP_PORT_JTAG                   2U      // JTAG Port (TCK, TMS, TDI, TDO, nTRST) + nRESET
 // DAP SWJ Pins
 // NOTE(review): these plain (unsuffixed) integers look like bit positions within
 // a pin-state byte - confirm against the SWJ_Pins command handler.
 #define DAP_SWJ_SWCLK_TCK               0       // SWCLK/TCK
 #define DAP_SWJ_SWDIO_TMS               1       // SWDIO/TMS
 #define DAP_SWJ_TDI                     2       // TDI
 #define DAP_SWJ_TDO                     3       // TDO
 #define DAP_SWJ_nTRST                   5       // nTRST
 #define DAP_SWJ_nRESET                  7       // nRESET
 // DAP Transfer Request
 // Single-bit masks; bit 6 is not assigned here.
 #define DAP_TRANSFER_APnDP              (1U<<0)
 #define DAP_TRANSFER_RnW                (1U<<1)
 #define DAP_TRANSFER_A2                 (1U<<2)
 #define DAP_TRANSFER_A3                 (1U<<3)
 #define DAP_TRANSFER_MATCH_VALUE        (1U<<4)
 #define DAP_TRANSFER_MATCH_MASK         (1U<<5)
 #define DAP_TRANSFER_TIMESTAMP          (1U<<7)
 // DAP Transfer Response
 // Single-bit masks.
 #define DAP_TRANSFER_OK                 (1U<<0)
 #define DAP_TRANSFER_WAIT               (1U<<1)
 #define DAP_TRANSFER_FAULT              (1U<<2)
 #define DAP_TRANSFER_ERROR              (1U<<3)
 #define DAP_TRANSFER_MISMATCH           (1U<<4)
 // DAP SWO Trace Mode
 #define DAP_SWO_OFF                     0U
 #define DAP_SWO_UART                    1U
 #define DAP_SWO_MANCHESTER              2U
 // DAP SWO Trace Status
 // Single-bit masks.
 #define DAP_SWO_CAPTURE_ACTIVE          (1U<<0)
 #define DAP_SWO_CAPTURE_PAUSED          (1U<<1)
 #define DAP_SWO_STREAM_ERROR            (1U<<6)
 #define DAP_SWO_BUFFER_OVERRUN          (1U<<7)
 // DAP UART Transport
 #define DAP_UART_TRANSPORT_NONE         0U
 #define DAP_UART_TRANSPORT_USB_COM_PORT 1U
 #define DAP_UART_TRANSPORT_DAP_COMMAND  2U
 // DAP UART Control
 // Single-bit masks: RX controls occupy bits 0..2, TX controls bits 4..6.
 #define DAP_UART_CONTROL_RX_ENABLE      (1U<<0)
 #define DAP_UART_CONTROL_RX_DISABLE     (1U<<1)
 #define DAP_UART_CONTROL_RX_BUF_FLUSH   (1U<<2)
 #define DAP_UART_CONTROL_TX_ENABLE      (1U<<4)
 #define DAP_UART_CONTROL_TX_DISABLE     (1U<<5)
 #define DAP_UART_CONTROL_TX_BUF_FLUSH   (1U<<6)
 // DAP UART Status
 // Single-bit masks.
 #define DAP_UART_STATUS_RX_ENABLED      (1U<<0)
 #define DAP_UART_STATUS_RX_DATA_LOST    (1U<<1)
 #define DAP_UART_STATUS_FRAMING_ERROR   (1U<<2)
 #define DAP_UART_STATUS_PARITY_ERROR    (1U<<3)
 #define DAP_UART_STATUS_TX_ENABLED      (1U<<4)
 // DAP UART Configure Error
 // Single-bit masks.
 #define DAP_UART_CFG_ERROR_DATA_BITS    (1U<<0)
 #define DAP_UART_CFG_ERROR_PARITY       (1U<<1)
 #define DAP_UART_CFG_ERROR_STOP_BITS    (1U<<2)
 // Debug Port Register Addresses
 // Several registers share an address; the per-line notes say which access
 // direction/interface each name applies to.
 #define DP_IDCODE                       0x00U   // IDCODE Register (SW Read only)
 #define DP_ABORT                        0x00U   // Abort Register (SW Write only)
 #define DP_CTRL_STAT                    0x04U   // Control & Status
 #define DP_WCR                          0x04U   // Wire Control Register (SW Only)
 #define DP_SELECT                       0x08U   // Select Register (JTAG R/W & SW W)
 #define DP_RESEND                       0x08U   // Resend (SW Read Only)
 #define DP_RDBUFF                       0x0CU   // Read Buffer (Read Only)
 // JTAG IR Codes
 #define JTAG_ABORT                      0x08U
 #define JTAG_DPACC                      0x0AU
 #define JTAG_APACC                      0x0BU
 #define JTAG_IDCODE                     0x0EU
 #define JTAG_BYPASS                     0x0FU
 // JTAG Sequence Info
 // Masks for the sequence info value: low 6 bits, bit 6 and bit 7.
 #define JTAG_SEQUENCE_TCK               0x3FU   // TCK count
 #define JTAG_SEQUENCE_TMS               0x40U   // TMS value
 #define JTAG_SEQUENCE_TDO               0x80U   // TDO capture
 // SWD Sequence Info
 // Masks for the sequence info value: low 6 bits and bit 7.
 #define SWD_SEQUENCE_CLK                0x3FU   // SWCLK count
 #define SWD_SEQUENCE_DIN                0x80U   // SWDIO capture
 #include <stddef.h>
 #include <stdint.h>
 #include "cmsis_compiler.h"
 // DAP Data structure
 // Aggregates the debug-port selection, clock/timestamp values and the transfer
 // configuration. The swd_conf and jtag_dev members exist only when DAP_SWD and
 // DAP_JTAG respectively are non-zero, so the layout depends on the build
 // configuration.
 typedef struct {
  uint8_t     debug_port;                       // Debug Port
  uint8_t     fast_clock;                       // Fast Clock Flag
  uint8_t     padding[2];                       // explicit padding bytes
  uint32_t   clock_delay;                       // Clock Delay
  uint32_t     timestamp;                       // Last captured Timestamp
  struct {                                      // Transfer Configuration
    uint8_t   idle_cycles;                      // Idle cycles after transfer
    uint8_t    padding[3];                      // explicit padding bytes
    uint16_t  retry_count;                      // Number of retries after WAIT response
    uint16_t  match_retry;                      // Number of retries if read value does not match
    uint32_t  match_mask;                       // Match Mask
  } transfer;
 #if (DAP_SWD != 0)
  struct {                                      // SWD Configuration
    uint8_t    turnaround;                      // Turnaround period
    uint8_t    data_phase;                      // Always generate Data Phase
  } swd_conf;
 #endif
 #if (DAP_JTAG != 0)
  struct {                                      // JTAG Device Chain
    uint8_t   count;                            // Number of devices
    uint8_t   index;                            // Device index (device at TDO has index 0)
 // Per-device arrays are present only when DAP_JTAG_DEV_CNT is non-zero.
 #if (DAP_JTAG_DEV_CNT != 0)
    uint8_t   ir_length[DAP_JTAG_DEV_CNT];      // IR Length in bits
    uint16_t  ir_before[DAP_JTAG_DEV_CNT];      // Bits before IR
    uint16_t  ir_after [DAP_JTAG_DEV_CNT];      // Bits after IR
 #endif
  } jtag_dev;
 #endif
 } DAP_Data_t;
 // Global state, declared here and defined in another translation unit.
 extern          DAP_Data_t DAP_Data;            // DAP Data
 extern volatile uint8_t    DAP_TransferAbort;   // Transfer Abort Flag
 #ifdef  __cplusplus
 extern "C"
 {
 #endif
 // Functions
 // Declarations only; the definitions are not part of this header.
 // SWJ/SWD/JTAG sequence and transfer primitives
 extern void     SWJ_Sequence    (uint32_t count, const uint8_t *data);
 extern void     SWD_Sequence    (uint32_t info,  const uint8_t *swdo, uint8_t *swdi);
 extern void     JTAG_Sequence   (uint32_t info,  const uint8_t *tdi,  uint8_t *tdo);
 extern void     JTAG_IR         (uint32_t ir);
 extern uint32_t JTAG_ReadIDCode (void);
 extern void     JTAG_WriteAbort (uint32_t data);
 extern uint8_t  JTAG_Transfer   (uint32_t request, uint32_t *data);
 extern uint8_t  SWD_Transfer    (uint32_t request, uint32_t *data);
 extern void     Delayms         (uint32_t delay);
 // SWO_* entry points operating on request/response byte buffers
 extern uint32_t SWO_Transport      (const uint8_t *request, uint8_t *response);
 extern uint32_t SWO_Mode           (const uint8_t *request, uint8_t *response);
 extern uint32_t SWO_Baudrate       (const uint8_t *request, uint8_t *response);
 extern uint32_t SWO_Control        (const uint8_t *request, uint8_t *response);
 extern uint32_t SWO_Status                                 (uint8_t *response);
 extern uint32_t SWO_ExtendedStatus (const uint8_t *request, uint8_t *response);
 extern uint32_t SWO_Data           (const uint8_t *request, uint8_t *response);
 // SWO transfer queue hooks
 extern void     SWO_QueueTransfer    (uint8_t *buf, uint32_t num);
 extern void     SWO_AbortTransfer    (void);
 extern void     SWO_TransferComplete (void);
 // SWO_*_UART variants
 extern uint32_t SWO_Mode_UART     (uint32_t enable);
 extern uint32_t SWO_Baudrate_UART (uint32_t baudrate);
 extern uint32_t SWO_Control_UART  (uint32_t active);
 extern void     SWO_Capture_UART  (uint8_t *buf, uint32_t num);
 extern uint32_t SWO_GetCount_UART (void);
 // SWO_*_Manchester variants (same signatures as the UART variants)
 extern uint32_t SWO_Mode_Manchester     (uint32_t enable);
 extern uint32_t SWO_Baudrate_Manchester (uint32_t baudrate);
 extern uint32_t SWO_Control_Manchester  (uint32_t active);
 extern void     SWO_Capture_Manchester  (uint8_t *buf, uint32_t num);
 extern uint32_t SWO_GetCount_Manchester (void);
 // UART_* entry points operating on request/response byte buffers
 extern uint32_t UART_Transport (const uint8_t *request, uint8_t *response);
 extern uint32_t UART_Configure (const uint8_t *request, uint8_t *response);
 extern uint32_t UART_Control   (const uint8_t *request, uint8_t *response);
 extern uint32_t UART_Status                            (uint8_t *response);
 extern uint32_t UART_Transfer  (const uint8_t *request, uint8_t *response);
 extern uint8_t  USB_COM_PORT_Activate (uint32_t cmd);
 // Command processing entry points operating on request/response byte buffers
 extern uint32_t DAP_ProcessVendorCommand (const uint8_t *request, uint8_t *response);
 extern uint32_t DAP_ProcessCommand       (const uint8_t *request, uint8_t *response);
 extern uint32_t DAP_ExecuteCommand       (const uint8_t *request, uint8_t *response);
 extern void     DAP_Setup (void);
 // Configurable delay for clock generation
 // DELAY_SLOW_CYCLES may be overridden before this point; the default is 3U.
 #ifndef DELAY_SLOW_CYCLES
 #define DELAY_SLOW_CYCLES       3U      // Number of cycles for one iteration
 #endif
 // PIN_DELAY_SLOW: busy-wait that counts 'delay' down to zero.
 // Both variants decrement BEFORE testing for zero, so delay == 0 wraps the
 // 32-bit counter and runs the loop 2^32 times; callers must pass delay >= 1.
 #if defined(__CC_ARM)
 // Variant used when __CC_ARM is defined: plain C count-down loop.
 __STATIC_FORCEINLINE void PIN_DELAY_SLOW (uint32_t delay) {
  uint32_t count = delay;
  while (--count);               // empty body: the decrement is the delay
 }
 #else
 // All other compilers: the loop is written in inline assembly.
 __STATIC_FORCEINLINE void PIN_DELAY_SLOW (uint32_t delay) {
  __ASM volatile (
  ".syntax unified\n"
  "0:\n\t"
    "subs %0,%0,#1\n\t"          // delay -= 1, updating the condition flags
    "bne  0b\n"                  // loop back to local label 0 until the result is zero
  : "+l" (delay) : : "cc"        // delay is read and written; flags are clobbered
  );
 }
 #endif
 // Fixed delay for fast clock generation
 #ifndef DELAY_FAST_CYCLES
 #define DELAY_FAST_CYCLES       0U      // Number of cycles: 0..3
 #endif
 // Emits one __NOP() per configured cycle, capped at three; with the default
 // of 0U the function body is empty.
 __STATIC_FORCEINLINE void PIN_DELAY_FAST (void) {
 #if (DELAY_FAST_CYCLES >= 1U)
  __NOP();
 #if (DELAY_FAST_CYCLES >= 2U)
  __NOP();
 #if (DELAY_FAST_CYCLES >= 3U)
  __NOP();
 #endif
 #endif
 #endif
 }
 #ifdef  __cplusplus
 }
 #endif
 #endif  /* __DAP_H__ */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/DAP.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/DAP.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/DAP_vendor.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/DAP_vendor.c
@@ -1,100 +0,0 @@
 /*
 * Copyright (c) 2013-2017 ARM Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * ----------------------------------------------------------------------
 *
 * $Date:        1. December 2017
 * $Revision:    V2.0.0
 *
 * Project:      CMSIS-DAP Source
 * Title:        DAP_vendor.c CMSIS-DAP Vendor Commands
 *
 *---------------------------------------------------------------------------*/
 #include "DAP_config.h"
 #include "DAP.h"
 //**************************************************************************************************
 /**
 \defgroup DAP_Vendor_Adapt_gr Adapt Vendor Commands
 \ingroup DAP_Vendor_gr
@{
 The file DAP_vendor.c provides template source code for extension of a Debug Unit with
 Vendor Commands. Copy this file to the project folder of the Debug Unit and add the
 file to the MDK-ARM project under the file group Configuration.
 */
 /** Process DAP Vendor Command and prepare Response Data
 The Command ID (first request byte) is echoed as the first response byte. No
 vendor command is implemented: every case leaves the response at that single
 byte, so the return value stays at one request byte and one response byte.
 \param request   pointer to request data
 \param response  pointer to response data
 \return          number of bytes in response (lower 16 bits)
                 number of bytes in request (upper 16 bits)
 */
 uint32_t DAP_ProcessVendorCommand(const uint8_t *request, uint8_t *response) {
  uint32_t num = (1U << 16) | 1U;  // 1 request byte consumed, 1 response byte produced
  *response++ = *request;        // copy Command ID
  switch (*request++) {          // first byte in request is Command ID
    case ID_DAP_Vendor0:
 #if 0                            // example user command
      num += 1U << 16;           // increment request count
      if (*request == 1U) {      // when first command data byte is 1
        *response++ = 'X';       // send 'X' as response
        num++;                   // increment response count
      }
 #endif
      break;
    case ID_DAP_Vendor1:  break;
    case ID_DAP_Vendor2:  break;
    case ID_DAP_Vendor3:  break;
    case ID_DAP_Vendor4:  break;
    case ID_DAP_Vendor5:  break;
    case ID_DAP_Vendor6:  break;
    case ID_DAP_Vendor7:  break;
    case ID_DAP_Vendor8:  break;
    case ID_DAP_Vendor9:  break;
    case ID_DAP_Vendor10: break;
    case ID_DAP_Vendor11: break;
    case ID_DAP_Vendor12: break;
    case ID_DAP_Vendor13: break;
    case ID_DAP_Vendor14: break;
    case ID_DAP_Vendor15: break;
    case ID_DAP_Vendor16: break;
    case ID_DAP_Vendor17: break;
    case ID_DAP_Vendor18: break;
    case ID_DAP_Vendor19: break;
    case ID_DAP_Vendor20: break;
    case ID_DAP_Vendor21: break;
    case ID_DAP_Vendor22: break;
    case ID_DAP_Vendor23: break;
    case ID_DAP_Vendor24: break;
    case ID_DAP_Vendor25: break;
    case ID_DAP_Vendor26: break;
    case ID_DAP_Vendor27: break;
    case ID_DAP_Vendor28: break;
    case ID_DAP_Vendor29: break;
    case ID_DAP_Vendor30: break;
    case ID_DAP_Vendor31: break;
    default:              break; // not a vendor ID: keep the echoed ID and default counts
  }
  return (num);
 }
 ///@}
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/JTAG_DP.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/JTAG_DP.c
@@ -1,370 +0,0 @@
 /*
 * Copyright (c) 2013-2017 ARM Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * ----------------------------------------------------------------------
 *
 * $Date:        1. December 2017
 * $Revision:    V2.0.0
 *
 * Project:      CMSIS-DAP Source
 * Title:        JTAG_DP.c CMSIS-DAP JTAG DP I/O
 *
 *---------------------------------------------------------------------------*/
 #include "DAP_config.h"
 #include "DAP.h"
 // JTAG Macros
 //   The TCK and TMS pin operations are aliases of the shared SWCLK/TCK and
 //   SWDIO/TMS pin operations.
 #define PIN_TCK_SET PIN_SWCLK_TCK_SET
 #define PIN_TCK_CLR PIN_SWCLK_TCK_CLR
 #define PIN_TMS_SET PIN_SWDIO_TMS_SET
 #define PIN_TMS_CLR PIN_SWDIO_TMS_CLR
 // NOTE: the JTAG_CYCLE_* macros below expand to several statements and are
 // not wrapped in do { } while (0); the last statement has no semicolon, so
 // each use must be a complete statement inside a braced block.
 // One TCK cycle: TCK low, delay, TCK high, delay.
 #define JTAG_CYCLE_TCK()                \
  PIN_TCK_CLR();                        \
  PIN_DELAY();                          \
  PIN_TCK_SET();                        \
  PIN_DELAY()
 // One TCK cycle that first outputs 'tdi' on the TDI pin.
 #define JTAG_CYCLE_TDI(tdi)             \
  PIN_TDI_OUT(tdi);                     \
  PIN_TCK_CLR();                        \
  PIN_DELAY();                          \
  PIN_TCK_SET();                        \
  PIN_DELAY()
 // One TCK cycle that reads the TDO pin into 'tdo' while TCK is low
 // ('tdo' must be an assignable lvalue).
 #define JTAG_CYCLE_TDO(tdo)             \
  PIN_TCK_CLR();                        \
  PIN_DELAY();                          \
  tdo = PIN_TDO_IN();                   \
  PIN_TCK_SET();                        \
  PIN_DELAY()
 // One TCK cycle that outputs 'tdi' and reads TDO into 'tdo'.
 #define JTAG_CYCLE_TDIO(tdi,tdo)        \
  PIN_TDI_OUT(tdi);                     \
  PIN_TCK_CLR();                        \
  PIN_DELAY();                          \
  tdo = PIN_TDO_IN();                   \
  PIN_TCK_SET();                        \
  PIN_DELAY()
 // Default delay: the slow delay parameterised by the configured clock delay.
 // PIN_DELAY() is redefined further down before the Fast/Slow instantiations.
 #define PIN_DELAY() PIN_DELAY_SLOW(DAP_Data.clock_delay)
 #if (DAP_JTAG != 0)
 // Generate JTAG Sequence
 //   Clocks out a run of TCK cycles with a fixed TMS level. TDI data is
 //   shifted out and TDO data is shifted in least significant bit first.
 //   info:   sequence information
 //             JTAG_SEQUENCE_TCK bits: number of TCK cycles (0 encodes 64)
 //             JTAG_SEQUENCE_TMS bit:  TMS level held for the whole sequence
 //             JTAG_SEQUENCE_TDO bit:  when set, captured bytes are stored via tdo
 //   tdi:    pointer to TDI generated data
 //   tdo:    pointer to TDO captured data
 //   return: none
 void JTAG_Sequence (uint32_t info, const uint8_t *tdi, uint8_t *tdo) {
  uint32_t i_val;
  uint32_t o_val;
  uint32_t bit;
  uint32_t n, k;
  n = info & JTAG_SEQUENCE_TCK;
  if (n == 0U) {
    n = 64U;                              // a cycle count of 0 encodes 64 cycles
  }
  if (info & JTAG_SEQUENCE_TMS) {
    PIN_TMS_SET();
  } else {
    PIN_TMS_CLR();
  }
  // One TDI byte is consumed per group of up to 8 cycles
  while (n) {
    i_val = *tdi++;
    o_val = 0U;
    for (k = 8U; k && n; k--, n--) {
      JTAG_CYCLE_TDIO(i_val, bit);
      i_val >>= 1;
      // Insert the captured bit at bit 7 so that the first bit ends up at bit 0
      o_val >>= 1;
      o_val  |= bit << 7;
    }
    // k is the number of unused bit slots in a partial last byte: right-align
    o_val >>= k;
    if (info & JTAG_SEQUENCE_TDO) {
      *tdo++ = (uint8_t)o_val;
    }
  }
 }
 // JTAG Set IR (function template)
 //   Expands to a static function JTAG_IR_<speed>(). The generated function
 //   steps through Select-DR-Scan, Select-IR-Scan, Capture-IR and Shift-IR,
 //   clocks ir_before[index] cycles with TDI = 1, shifts ir_length[index] bits
 //   of 'ir' least significant bit first, clocks ir_after[index] cycles with
 //   TDI = 1, then goes through Exit1-IR and Update-IR to Idle and leaves
 //   TDI at 1. The timing comes from whatever PIN_DELAY() is defined as at
 //   the point where the macro is instantiated.
 //   NOTE(review): the "except last" loop starts at ir_length[index] - 1U,
 //   so an ir_length of 0 would wrap around - presumably callers guarantee
 //   a non-zero length; confirm.
 //   ir:     IR value
 //   return: none
 #define JTAG_IR_Function(speed) /**/                                            \
 static void JTAG_IR_##speed (uint32_t ir) {                                     \
  uint32_t n;                                                                   \
                                                                                \
  PIN_TMS_SET();                                                                \
  JTAG_CYCLE_TCK();                         /* Select-DR-Scan */                \
  JTAG_CYCLE_TCK();                         /* Select-IR-Scan */                \
  PIN_TMS_CLR();                                                                \
  JTAG_CYCLE_TCK();                         /* Capture-IR */                    \
  JTAG_CYCLE_TCK();                         /* Shift-IR */                      \
                                                                                \
  PIN_TDI_OUT(1U);                                                              \
  for (n = DAP_Data.jtag_dev.ir_before[DAP_Data.jtag_dev.index]; n; n--) {      \
    JTAG_CYCLE_TCK();                       /* Bypass before data */            \
  }                                                                             \
  for (n = DAP_Data.jtag_dev.ir_length[DAP_Data.jtag_dev.index] - 1U; n; n--) { \
    JTAG_CYCLE_TDI(ir);                     /* Set IR bits (except last) */     \
    ir >>= 1;                                                                   \
  }                                                                             \
  n = DAP_Data.jtag_dev.ir_after[DAP_Data.jtag_dev.index];                      \
  if (n) {                                                                      \
    JTAG_CYCLE_TDI(ir);                     /* Set last IR bit */               \
    PIN_TDI_OUT(1U);                                                            \
    for (--n; n; n--) {                                                         \
      JTAG_CYCLE_TCK();                     /* Bypass after data */             \
    }                                                                           \
    PIN_TMS_SET();                                                              \
    JTAG_CYCLE_TCK();                       /* Bypass & Exit1-IR */             \
  } else {                                                                      \
    PIN_TMS_SET();                                                              \
    JTAG_CYCLE_TDI(ir);                     /* Set last IR bit & Exit1-IR */    \
  }                                                                             \
                                                                                \
  JTAG_CYCLE_TCK();                         /* Update-IR */                     \
  PIN_TMS_CLR();                                                                \
  JTAG_CYCLE_TCK();                         /* Idle */                          \
  PIN_TDI_OUT(1U);                                                              \
 }
 // JTAG Transfer I/O (function template)
 //   Expands to a static function JTAG_Transfer<speed>() that performs one
 //   DR scan: jtag_dev.index bypass cycles, three request bits (RnW, A2, A3)
 //   shifted out while the three ACK bits are captured, then 32 data bits
 //   least significant bit first, followed by bypass cycles for the
 //   remaining (jtag_dev.count - jtag_dev.index - 1) devices.
 //   The captured bits are placed into 'ack' as: first bit -> bit 1,
 //   second bit -> bit 0, third bit -> bit 2.
 //   When ack differs from DAP_TRANSFER_OK the data phase is skipped and
 //   *data is neither read nor written.
 //   Read transfers store the value only if 'data' is non-NULL; write
 //   transfers dereference 'data' unconditionally, so it must be valid.
 //   After the scan, DAP_Data.timestamp is updated when the request has
 //   DAP_TRANSFER_TIMESTAMP set, and DAP_Data.transfer.idle_cycles extra
 //   TCK cycles are generated.
 //   request: A[3:2] RnW APnDP
 //   data:    DATA[31:0]
 //   return:  ACK[2:0]
 #define JTAG_TransferFunction(speed)        /**/                                \
 static uint8_t JTAG_Transfer##speed (uint32_t request, uint32_t *data) {        \
  uint32_t ack;                                                                 \
  uint32_t bit;                                                                 \
  uint32_t val;                                                                 \
  uint32_t n;                                                                   \
                                                                                \
  PIN_TMS_SET();                                                                \
  JTAG_CYCLE_TCK();                         /* Select-DR-Scan */                \
  PIN_TMS_CLR();                                                                \
  JTAG_CYCLE_TCK();                         /* Capture-DR */                    \
  JTAG_CYCLE_TCK();                         /* Shift-DR */                      \
                                                                                \
  for (n = DAP_Data.jtag_dev.index; n; n--) {                                   \
    JTAG_CYCLE_TCK();                       /* Bypass before data */            \
  }                                                                             \
                                                                                \
  JTAG_CYCLE_TDIO(request >> 1, bit);       /* Set RnW, Get ACK.0 */            \
  ack  = bit << 1;                                                              \
  JTAG_CYCLE_TDIO(request >> 2, bit);       /* Set A2,  Get ACK.1 */            \
  ack |= bit << 0;                                                              \
  JTAG_CYCLE_TDIO(request >> 3, bit);       /* Set A3,  Get ACK.2 */            \
  ack |= bit << 2;                                                              \
                                                                                \
  if (ack != DAP_TRANSFER_OK) {                                                 \
    /* Exit on error */                                                         \
    PIN_TMS_SET();                                                              \
    JTAG_CYCLE_TCK();                       /* Exit1-DR */                      \
    goto exit;                                                                  \
  }                                                                             \
                                                                                \
  if (request & DAP_TRANSFER_RnW) {                                             \
    /* Read Transfer */                                                         \
    val = 0U;                                                                   \
    for (n = 31U; n; n--) {                                                     \
      JTAG_CYCLE_TDO(bit);                  /* Get D0..D30 */                   \
      val  |= bit << 31;                                                        \
      val >>= 1;                                                                \
    }                                                                           \
    n = DAP_Data.jtag_dev.count - DAP_Data.jtag_dev.index - 1U;                 \
    if (n) {                                                                    \
      JTAG_CYCLE_TDO(bit);                  /* Get D31 */                       \
      for (--n; n; n--) {                                                       \
        JTAG_CYCLE_TCK();                   /* Bypass after data */             \
      }                                                                         \
      PIN_TMS_SET();                                                            \
      JTAG_CYCLE_TCK();                     /* Bypass & Exit1-DR */             \
    } else {                                                                    \
      PIN_TMS_SET();                                                            \
      JTAG_CYCLE_TDO(bit);                  /* Get D31 & Exit1-DR */            \
    }                                                                           \
    val |= bit << 31;                                                           \
    if (data) { *data = val; }                                                  \
  } else {                                                                      \
    /* Write Transfer */                                                        \
    val = *data;                                                                \
    for (n = 31U; n; n--) {                                                     \
      JTAG_CYCLE_TDI(val);                  /* Set D0..D30 */                   \
      val >>= 1;                                                                \
    }                                                                           \
    n = DAP_Data.jtag_dev.count - DAP_Data.jtag_dev.index - 1U;                 \
    if (n) {                                                                    \
      JTAG_CYCLE_TDI(val);                  /* Set D31 */                       \
      for (--n; n; n--) {                                                       \
        JTAG_CYCLE_TCK();                   /* Bypass after data */             \
      }                                                                         \
      PIN_TMS_SET();                                                            \
      JTAG_CYCLE_TCK();                     /* Bypass & Exit1-DR */             \
    } else {                                                                    \
      PIN_TMS_SET();                                                            \
      JTAG_CYCLE_TDI(val);                  /* Set D31 & Exit1-DR */            \
    }                                                                           \
  }                                                                             \
                                                                                \
 exit:                                                                           \
  JTAG_CYCLE_TCK();                         /* Update-DR */                     \
  PIN_TMS_CLR();                                                                \
  JTAG_CYCLE_TCK();                         /* Idle */                          \
  PIN_TDI_OUT(1U);                                                              \
                                                                                \
  /* Capture Timestamp */                                                       \
  if (request & DAP_TRANSFER_TIMESTAMP) {                                       \
    DAP_Data.timestamp = TIMESTAMP_GET();                                       \
  }                                                                             \
                                                                                \
  /* Idle cycles */                                                             \
  n = DAP_Data.transfer.idle_cycles;                                            \
  while (n--) {                                                                 \
    JTAG_CYCLE_TCK();                       /* Idle */                          \
  }                                                                             \
                                                                                \
  return ((uint8_t)ack);                                                        \
 }
 // Generate JTAG_IR_Fast() and JTAG_TransferFast(): while these templates are
 // expanded PIN_DELAY() maps to PIN_DELAY_FAST()
 #undef  PIN_DELAY
 #define PIN_DELAY() PIN_DELAY_FAST()
 JTAG_IR_Function(Fast)
 JTAG_TransferFunction(Fast)
 // Generate JTAG_IR_Slow() and JTAG_TransferSlow(): PIN_DELAY() maps back to
 // PIN_DELAY_SLOW(DAP_Data.clock_delay), which also applies to the functions
 // defined after this point
 #undef  PIN_DELAY
 #define PIN_DELAY() PIN_DELAY_SLOW(DAP_Data.clock_delay)
 JTAG_IR_Function(Slow)
 JTAG_TransferFunction(Slow)
 // JTAG Read IDCODE register
 //   Performs one DR scan and captures 32 bits from TDO, least significant
 //   bit first, after skipping DAP_Data.jtag_dev.index bypass cycles.
 //   The scan is terminated directly after the 32nd bit; no cycles are
 //   generated for devices located after the selected one.
 //   NOTE(review): presumably IDCODE is already the selected data register
 //   when this is called; no IR is set here - confirm against the caller.
 //   return: value read
 uint32_t JTAG_ReadIDCode (void) {
  uint32_t bit;
  uint32_t val;
  uint32_t n;
  PIN_TMS_SET();
  JTAG_CYCLE_TCK();                         /* Select-DR-Scan */
  PIN_TMS_CLR();
  JTAG_CYCLE_TCK();                         /* Capture-DR */
  JTAG_CYCLE_TCK();                         /* Shift-DR */
  for (n = DAP_Data.jtag_dev.index; n; n--) {
    JTAG_CYCLE_TCK();                       /* Bypass before data */
  }
  val = 0U;
  /* Each captured bit enters at bit 31 and is shifted down, so D0 ends at bit 0 */
  for (n = 31U; n; n--) {
    JTAG_CYCLE_TDO(bit);                    /* Get D0..D30 */
    val  |= bit << 31;
    val >>= 1;
  }
  PIN_TMS_SET();
  JTAG_CYCLE_TDO(bit);                      /* Get D31 & Exit1-DR */
  val |= bit << 31;
  JTAG_CYCLE_TCK();                         /* Update-DR */
  PIN_TMS_CLR();
  JTAG_CYCLE_TCK();                         /* Idle */
  return (val);
 }
 // JTAG Write ABORT register
 //   Performs one DR scan that shifts out three zero bits (RnW, A2, A3)
 //   followed by the 32 data bits, least significant bit first. The bits
 //   captured on TDO during the scan are not read.
 //   Bypass cycles are generated for DAP_Data.jtag_dev.index devices before
 //   and (jtag_dev.count - jtag_dev.index - 1) devices after the data.
 //   NOTE(review): presumably ABORT is already the selected data register
 //   when this is called; no IR is set here - confirm against the caller.
 //   data:   value to write
 //   return: none
 void JTAG_WriteAbort (uint32_t data) {
  uint32_t n;
  PIN_TMS_SET();
  JTAG_CYCLE_TCK();                         /* Select-DR-Scan */
  PIN_TMS_CLR();
  JTAG_CYCLE_TCK();                         /* Capture-DR */
  JTAG_CYCLE_TCK();                         /* Shift-DR */
  for (n = DAP_Data.jtag_dev.index; n; n--) {
    JTAG_CYCLE_TCK();                       /* Bypass before data */
  }
  PIN_TDI_OUT(0U);
  JTAG_CYCLE_TCK();                         /* Set RnW=0 (Write) */
  JTAG_CYCLE_TCK();                         /* Set A2=0 */
  JTAG_CYCLE_TCK();                         /* Set A3=0 */
  for (n = 31U; n; n--) {
    JTAG_CYCLE_TDI(data);                   /* Set D0..D30 */
    data >>= 1;
  }
  n = DAP_Data.jtag_dev.count - DAP_Data.jtag_dev.index - 1U;
  if (n) {
    JTAG_CYCLE_TDI(data);                   /* Set D31 */
    for (--n; n; n--) {
      JTAG_CYCLE_TCK();                     /* Bypass after data */
    }
    PIN_TMS_SET();
    JTAG_CYCLE_TCK();                       /* Bypass & Exit1-DR */
  } else {
    PIN_TMS_SET();
    JTAG_CYCLE_TDI(data);                   /* Set D31 & Exit1-DR */
  }
  JTAG_CYCLE_TCK();                         /* Update-DR */
  PIN_TMS_CLR();
  JTAG_CYCLE_TCK();                         /* Idle */
  PIN_TDI_OUT(1U);                          /* Leave TDI high after the scan */
 }
 // JTAG Set IR
 //   Dispatches to the fast or slow IR implementation depending on
 //   DAP_Data.fast_clock.
 //   ir:     IR value
 //   return: none
 void JTAG_IR (uint32_t ir) {
  if (!DAP_Data.fast_clock) {
    JTAG_IR_Slow(ir);
    return;
  }
  JTAG_IR_Fast(ir);
 }
 // JTAG Transfer I/O
 //   Dispatches to the fast or slow transfer implementation depending on
 //   DAP_Data.fast_clock.
 //   request: A[3:2] RnW APnDP
 //   data:    DATA[31:0]
 //   return:  ACK[2:0]
 uint8_t  JTAG_Transfer(uint32_t request, uint32_t *data) {
  return (DAP_Data.fast_clock) ? JTAG_TransferFast(request, data)
                               : JTAG_TransferSlow(request, data);
 }
 #endif  /* (DAP_JTAG != 0) */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/SWO.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/SWO.c
@@ -1,798 +0,0 @@
 /*
 * Copyright (c) 2013-2021 ARM Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * ----------------------------------------------------------------------
 *
 * $Date:        29. March 2021
 * $Revision:    V2.0.1
 *
 * Project:      CMSIS-DAP Source
 * Title:        SWO.c CMSIS-DAP SWO I/O
 *
 *---------------------------------------------------------------------------*/
 #include "DAP_config.h"
 #include "DAP.h"
 #if (SWO_UART != 0)
 #include "Driver_USART.h"
 #endif
 #if (SWO_STREAM != 0)
 #include "cmsis_os2.h"
 #define   osObjectsExternal
 #include "osObjects.h"
 #endif
 #if (SWO_STREAM != 0)
 #ifdef DAP_FW_V1
 #error "SWO Streaming Trace not supported in DAP V1!"
 #endif
 #endif
 #if (SWO_UART != 0)
 // USART Driver
 //   The two-level macro lets SWO_UART_DRIVER be expanded before it is
 //   pasted, so the driver instance name is Driver_USART<SWO_UART_DRIVER>.
 #define _USART_Driver_(n)  Driver_USART##n
 #define  USART_Driver_(n) _USART_Driver_(n)
 extern ARM_DRIVER_USART    USART_Driver_(SWO_UART_DRIVER);
 // Pointer to the selected USART driver access structure
 #define pUSART           (&USART_Driver_(SWO_UART_DRIVER))
 // Set to 1 once a baudrate has been configured successfully; cleared when
 // the mode is changed or the configuration fails
 static uint8_t USART_Ready = 0U;
 #endif  /* (SWO_UART != 0) */
 #if ((SWO_UART != 0) || (SWO_MANCHESTER != 0))
 #define SWO_STREAM_TIMEOUT      50U     /* Stream timeout in ms */
 #define USB_BLOCK_SIZE          512U    /* USB Block Size */
 #define TRACE_BLOCK_SIZE        64U     /* Trace Block Size (2^n: 32...512) */
 // Trace State
 static uint8_t  TraceTransport =  0U;       /* Trace Transport */
 static uint8_t  TraceMode      =  0U;       /* Trace Mode */
 static uint8_t  TraceStatus    =  0U;       /* Trace Status without Errors */
 // Error flags are kept in two banks: new errors are recorded in the active
 // bank, and a status read switches banks before it reads and clears the
 // bank that was active
 static uint8_t  TraceError[2]  = {0U, 0U};  /* Trace Error flags (banked) */
 static uint8_t  TraceError_n   =  0U;       /* Active Trace Error bank */
 // Trace Buffer
 //   TraceIndexI and TraceIndexO are free-running counters; they are masked
 //   with (SWO_BUFFER_SIZE - 1U) whenever they are used to index TraceBuf
 static uint8_t  TraceBuf[SWO_BUFFER_SIZE];  /* Trace Buffer (must be 2^n) */
 static volatile uint32_t TraceIndexI  = 0U; /* Incoming Trace Index */
 static volatile uint32_t TraceIndexO  = 0U; /* Outgoing Trace Index */
 // Set to 1 by the receive callback; readers clear it before sampling the
 // trace state and sample again if it was set in the meantime
 static volatile uint8_t  TraceUpdate;       /* Trace Update Flag */
 static          uint32_t TraceBlockSize;    /* Current Trace Block Size */
 #if (TIMESTAMP_CLOCK != 0U)
 // Trace Timestamp
 //   index: value of TraceIndexI after the most recent completed receive
 //   tick:  TIMESTAMP_GET() value taken at that receive completion
 static volatile struct {
  uint32_t index;
  uint32_t tick;
 } TraceTimestamp;
 #endif
 // Trace Helper functions
 static void     ClearTrace     (void);
 static void     ResumeTrace    (void);
 static uint32_t GetTraceCount  (void);
 static uint8_t  GetTraceStatus (void);
 static void     SetTraceError  (uint8_t flag);
 #if (SWO_STREAM != 0)
 // Streaming state (transport 2)
 extern osThreadId_t      SWO_ThreadId;
 static volatile uint8_t  TransferBusy = 0U; /* Transfer Busy Flag */
 static          uint32_t TransferSize;      /* Current Transfer Size */
 #endif
 #if (SWO_UART != 0)
 // USART Driver Callback function
 //   On receive completion: advances the incoming trace index by the block
 //   just received, then either starts the next receive or pauses capture
 //   when the trace buffer has no room for it.
 //   On overflow or line errors: records the matching trace error flag.
 //   event: event mask
 static void USART_Callback (uint32_t event) {
  uint32_t index_i;
  uint32_t index_o;
  uint32_t count;
  uint32_t num;
  if (event &  ARM_USART_EVENT_RECEIVE_COMPLETE) {
 #if (TIMESTAMP_CLOCK != 0U)
    TraceTimestamp.tick = TIMESTAMP_GET();
 #endif
    index_o  = TraceIndexO;
    index_i  = TraceIndexI;
    // Account for the block that has just been received
    index_i += TraceBlockSize;
    TraceIndexI = index_i;
 #if (TIMESTAMP_CLOCK != 0U)
    TraceTimestamp.index = index_i;
 #endif
    // Size of the next receive: up to the next TRACE_BLOCK_SIZE boundary
    num   = TRACE_BLOCK_SIZE - (index_i & (TRACE_BLOCK_SIZE - 1U));
    // Number of bytes currently held in the trace buffer
    count = index_i - index_o;
    if (count <= (SWO_BUFFER_SIZE - num)) {
      // Enough room: continue receiving at the wrapped buffer position
      index_i &= SWO_BUFFER_SIZE - 1U;
      TraceBlockSize = num;
      pUSART->Receive(&TraceBuf[index_i], num);
    } else {
      // No room for the next block: mark capture as paused
      TraceStatus = DAP_SWO_CAPTURE_ACTIVE | DAP_SWO_CAPTURE_PAUSED;
    }
    // Tell concurrent readers that the trace state changed under them
    TraceUpdate = 1U;
 #if (SWO_STREAM != 0)
    if (TraceTransport == 2U) {
      // Signal the SWO thread once the data reaches the end of the current
      // USB_BLOCK_SIZE block, measured from the outgoing index
      if (count >= (USB_BLOCK_SIZE - (index_o & (USB_BLOCK_SIZE - 1U)))) {
        osThreadFlagsSet(SWO_ThreadId, 1U);
      }
    }
 #endif
  }
  if (event &  ARM_USART_EVENT_RX_OVERFLOW) {
    SetTraceError(DAP_SWO_BUFFER_OVERRUN);
  }
  if (event & (ARM_USART_EVENT_RX_BREAK         |
               ARM_USART_EVENT_RX_FRAMING_ERROR |
               ARM_USART_EVENT_RX_PARITY_ERROR)) {
    SetTraceError(DAP_SWO_STREAM_ERROR);
  }
 }
 // Enable or disable SWO Mode (UART)
 //   Enabling initializes and powers the USART driver; disabling stops the
 //   receiver, aborts a pending receive, powers the driver off and
 //   uninitializes it. USART_Ready is cleared in both cases.
 //   enable: enable flag
 //   return: 1 - Success, 0 - Error
 __WEAK uint32_t SWO_Mode_UART (uint32_t enable) {
  USART_Ready = 0U;
  if (enable == 0U) {
    // Shutdown path: results of the driver calls are not evaluated
    pUSART->Control(ARM_USART_CONTROL_RX, 0U);
    pUSART->Control(ARM_USART_ABORT_RECEIVE, 0U);
    pUSART->PowerControl(ARM_POWER_OFF);
    pUSART->Uninitialize();
    return (1U);
  }
  if (pUSART->Initialize(USART_Callback) != ARM_DRIVER_OK) {
    return (0U);
  }
  if (pUSART->PowerControl(ARM_POWER_FULL) != ARM_DRIVER_OK) {
    // Undo the initialization when the driver cannot be powered
    pUSART->Uninitialize();
    return (0U);
  }
  return (1U);
 }
 // Configure SWO Baudrate (UART)
 //   The request is limited to SWO_UART_MAX_BAUDRATE. When capture is active
 //   the receiver is stopped for the reconfiguration and restarted afterwards.
 //   baudrate: requested baudrate
 //   return:   actual baudrate or 0 when not configured
 __WEAK uint32_t SWO_Baudrate_UART (uint32_t baudrate) {
  uint32_t offset;
  uint32_t chunk;
  if (baudrate > SWO_UART_MAX_BAUDRATE) {
    baudrate = SWO_UART_MAX_BAUDRATE;
  }
  if ((TraceStatus & DAP_SWO_CAPTURE_ACTIVE) != 0U) {
    // Stop the receiver and account for bytes of an unfinished block
    pUSART->Control(ARM_USART_CONTROL_RX, 0U);
    if (pUSART->GetStatus().rx_busy) {
      TraceIndexI += pUSART->GetRxCount();
      pUSART->Control(ARM_USART_ABORT_RECEIVE, 0U);
    }
  }
  if (pUSART->Control(ARM_USART_MODE_ASYNCHRONOUS |
                      ARM_USART_DATA_BITS_8       |
                      ARM_USART_PARITY_NONE       |
                      ARM_USART_STOP_BITS_1,
                      baudrate) != ARM_DRIVER_OK) {
    USART_Ready = 0U;
    return (0U);
  }
  USART_Ready = 1U;
  if ((TraceStatus & DAP_SWO_CAPTURE_ACTIVE) != 0U) {
    if ((TraceStatus & DAP_SWO_CAPTURE_PAUSED) == 0U) {
      // Resume receiving up to the next TRACE_BLOCK_SIZE boundary
      offset = TraceIndexI & (SWO_BUFFER_SIZE - 1U);
      chunk  = TRACE_BLOCK_SIZE - (offset & (TRACE_BLOCK_SIZE - 1U));
      TraceBlockSize = chunk;
      pUSART->Receive(&TraceBuf[offset], chunk);
    }
    pUSART->Control(ARM_USART_CONTROL_RX, 1U);
  }
  return (baudrate);
 }
 // Control SWO Capture (UART)
 //   Starting requires a previously configured baudrate (USART_Ready) and
 //   begins with a one-byte receive at the start of the trace buffer.
 //   Stopping disables the receiver and adds the bytes of an unfinished
 //   block to the incoming trace index.
 //   active: active flag
 //   return: 1 - Success, 0 - Error
 __WEAK uint32_t SWO_Control_UART (uint32_t active) {
  if (active == 0U) {
    pUSART->Control(ARM_USART_CONTROL_RX, 0U);
    if (pUSART->GetStatus().rx_busy) {
      TraceIndexI += pUSART->GetRxCount();
      pUSART->Control(ARM_USART_ABORT_RECEIVE, 0U);
    }
    return (1U);
  }
  if (USART_Ready == 0U) {
    return (0U);
  }
  TraceBlockSize = 1U;
  if (pUSART->Receive(&TraceBuf[0], 1U) != ARM_DRIVER_OK) {
    return (0U);
  }
  if (pUSART->Control(ARM_USART_CONTROL_RX, 1U) != ARM_DRIVER_OK) {
    return (0U);
  }
  return (1U);
 }
 // Start SWO Capture (UART)
 //   Records the block size for the receive callback and starts the receive.
 //   buf: pointer to buffer for capturing
 //   num: number of bytes to capture
 __WEAK void SWO_Capture_UART (uint8_t *buf, uint32_t num) {
  TraceBlockSize = num;
  (void)pUSART->Receive(buf, num);   // driver status is not evaluated here
 }
 // Get SWO Pending Trace Count (UART)
 //   Reports the bytes received so far by a receive that is still running;
 //   0 when the receiver is not busy.
 //   return: number of pending trace data bytes
 __WEAK uint32_t SWO_GetCount_UART (void) {
  return (pUSART->GetStatus().rx_busy) ? pUSART->GetRxCount() : 0U;
 }
 #endif  /* (SWO_UART != 0) */
 #if (SWO_MANCHESTER != 0)
 // Enable or disable SWO Mode (Manchester)
 //   Weak default implementation: always reports an error.
 //   enable: enable flag
 //   return: 1 - Success, 0 - Error
 __WEAK uint32_t SWO_Mode_Manchester (uint32_t enable) {
  (void)enable;                  // unused in the default implementation
  return (0U);
 }
 // Configure SWO Baudrate (Manchester)
 //   Weak default implementation: always reports "not configured".
 //   baudrate: requested baudrate
 //   return:   actual baudrate or 0 when not configured
 __WEAK uint32_t SWO_Baudrate_Manchester (uint32_t baudrate) {
  (void)baudrate;                // unused in the default implementation
  return (0U);
 }
 // Control SWO Capture (Manchester)
 //   Weak default implementation: always reports an error.
 //   active: active flag
 //   return: 1 - Success, 0 - Error
 __WEAK uint32_t SWO_Control_Manchester (uint32_t active) {
  (void)active;                  // unused in the default implementation
  return (0U);
 }
 // Start SWO Capture (Manchester)
 //   Weak default implementation: does nothing.
 //   buf: pointer to buffer for capturing
 //   num: number of bytes to capture
 __WEAK void SWO_Capture_Manchester (uint8_t *buf, uint32_t num) {
  (void)buf;                     // unused in the default implementation
  (void)num;
 }
 // Get SWO Pending Trace Count (Manchester)
 //   Weak default implementation: no Manchester capture exists, so there is
 //   never any pending data. The explicit return is required because the
 //   result is added to the trace count by the caller; falling off the end
 //   of a non-void function whose value is used is undefined behavior.
 //   return: number of pending trace data bytes
 __WEAK uint32_t SWO_GetCount_Manchester (void) {
  return (0U);
 }
 #endif  /* (SWO_MANCHESTER != 0) */
 // Clear Trace Errors and Data
 //   Aborts a streaming transfer that is still busy (transport 2 only) and
 //   resets the error banks, both trace indexes and the trace timestamp.
 static void ClearTrace (void) {
 #if (SWO_STREAM != 0)
  if ((TraceTransport == 2U) && (TransferBusy != 0U)) {
    SWO_AbortTransfer();
    TransferBusy = 0U;
  }
 #endif
  // Error state
  TraceError[0] = 0U;
  TraceError[1] = 0U;
  TraceError_n  = 0U;
  // Buffer positions
  TraceIndexI   = 0U;
  TraceIndexO   = 0U;
 #if (TIMESTAMP_CLOCK != 0U)
  TraceTimestamp.index = 0U;
  TraceTimestamp.tick  = 0U;
 #endif
 }
 // Resume Trace Capture
 //   Restarts a paused capture with a one-byte receive at the current
 //   incoming position, provided the trace buffer is no longer full.
 static void ResumeTrace (void) {
  uint32_t wr_index;
  uint32_t rd_index;
  // Nothing to do unless capture is active and paused
  if (TraceStatus != (DAP_SWO_CAPTURE_ACTIVE | DAP_SWO_CAPTURE_PAUSED)) {
    return;
  }
  wr_index = TraceIndexI;
  rd_index = TraceIndexO;
  // Buffer still full: stay paused
  if ((wr_index - rd_index) >= SWO_BUFFER_SIZE) {
    return;
  }
  wr_index &= SWO_BUFFER_SIZE - 1U;
  switch (TraceMode) {
 #if (SWO_UART != 0)
    case DAP_SWO_UART:
      TraceStatus = DAP_SWO_CAPTURE_ACTIVE;
      SWO_Capture_UART(&TraceBuf[wr_index], 1U);
      break;
 #endif
 #if (SWO_MANCHESTER != 0)
    case DAP_SWO_MANCHESTER:
      TraceStatus = DAP_SWO_CAPTURE_ACTIVE;
      SWO_Capture_Manchester(&TraceBuf[wr_index], 1U);
      break;
 #endif
    default:
      break;
  }
 }
 // Get Trace Count
 //   While capture is active (and not paused) the bytes of the receive in
 //   progress are included; the sampling is repeated whenever the receive
 //   callback signals an update in between (TraceUpdate).
 //   return: number of available data bytes in trace buffer
 static uint32_t GetTraceCount (void) {
  uint32_t avail;
  if (TraceStatus != DAP_SWO_CAPTURE_ACTIVE) {
    return (TraceIndexI - TraceIndexO);
  }
  do {
    TraceUpdate = 0U;
    avail = TraceIndexI - TraceIndexO;
    switch (TraceMode) {
 #if (SWO_UART != 0)
      case DAP_SWO_UART:
        avail += SWO_GetCount_UART();
        break;
 #endif
 #if (SWO_MANCHESTER != 0)
      case DAP_SWO_MANCHESTER:
        avail += SWO_GetCount_Manchester();
        break;
 #endif
      default:
        break;
    }
  } while (TraceUpdate != 0U);
  return (avail);
 }
 // Get Trace Status (clear Error flags)
 //   Switches the active error bank first, then reads and clears the bank
 //   that was active before, so new errors go to the other bank.
 //   return: Trace Status (Active flag and Error flags)
 static uint8_t GetTraceStatus (void) {
  const uint32_t bank = TraceError_n;
  uint8_t        result;
  TraceError_n ^= 1U;
  result = TraceStatus | TraceError[bank];
  TraceError[bank] = 0U;
  return (result);
 }
 // Set Trace Error flag(s)
 //   The flags are OR-ed into the currently active error bank.
 //   flag:  error flag(s) to set
 static void SetTraceError (uint8_t flag) {
  uint8_t *active_bank = &TraceError[TraceError_n];
  *active_bank |= flag;
 }
 // Process SWO Transport command and prepare response
 //   The transport can only be changed while capture is not active.
 //   Accepted values are 0 and 1, plus 2 when SWO_STREAM is enabled.
 //   request:  pointer to request data
 //   response: pointer to response data
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 uint32_t SWO_Transport (const uint8_t *request, uint8_t *response) {
  uint32_t accepted = 0U;
  if ((TraceStatus & DAP_SWO_CAPTURE_ACTIVE) == 0U) {
    const uint8_t transport = *request;
    switch (transport) {
      case 0U:
      case 1U:
 #if (SWO_STREAM != 0)
      case 2U:
 #endif
        TraceTransport = transport;
        accepted = 1U;
        break;
      default:
        break;
    }
  }
  *response = (uint8_t)((accepted != 0U) ? DAP_OK : DAP_ERROR);
  return ((1U << 16) | 1U);
 }
 // Process SWO Mode command and prepare response
 //   The currently selected mode is disabled first, then the requested mode
 //   is enabled. On failure the mode falls back to DAP_SWO_OFF. The trace
 //   status is cleared in every case.
 //   request:  pointer to request data
 //   response: pointer to response data
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 uint32_t SWO_Mode (const uint8_t *request, uint8_t *response) {
  const uint8_t mode = *request;
  uint32_t      ok;
  // Disable the mode that is currently selected
  switch (TraceMode) {
 #if (SWO_UART != 0)
    case DAP_SWO_UART:
      SWO_Mode_UART(0U);
      break;
 #endif
 #if (SWO_MANCHESTER != 0)
    case DAP_SWO_MANCHESTER:
      SWO_Mode_Manchester(0U);
      break;
 #endif
    default:
      break;
  }
  // Enable the requested mode
  switch (mode) {
    case DAP_SWO_OFF:
      ok = 1U;
      break;
 #if (SWO_UART != 0)
    case DAP_SWO_UART:
      ok = SWO_Mode_UART(1U);
      break;
 #endif
 #if (SWO_MANCHESTER != 0)
    case DAP_SWO_MANCHESTER:
      ok = SWO_Mode_Manchester(1U);
      break;
 #endif
    default:
      ok = 0U;
      break;
  }
  TraceMode   = (uint8_t)((ok != 0U) ? mode : DAP_SWO_OFF);
  TraceStatus = 0U;
  *response   = (uint8_t)((ok != 0U) ? DAP_OK : DAP_ERROR);
  return ((1U << 16) | 1U);
 }
 // Process SWO Baudrate command and prepare response
 //   The requested baudrate is read as a 32-bit little-endian value, passed
 //   to the handler of the current trace mode, and the resulting baudrate
 //   (0 when not configured) is written back little-endian. A result of 0
 //   also clears the trace status.
 //   request:  pointer to request data
 //   response: pointer to response data
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 uint32_t SWO_Baudrate (const uint8_t *request, uint8_t *response) {
  uint32_t baudrate;
  // Widen each byte to uint32_t BEFORE shifting: a uint8_t operand is
  // promoted to (signed) int, and shifting a byte with bit 7 set left by 24
  // would overflow int, which is undefined behavior.
  baudrate = ((uint32_t)request[0] <<  0) |
             ((uint32_t)request[1] <<  8) |
             ((uint32_t)request[2] << 16) |
             ((uint32_t)request[3] << 24);
  switch (TraceMode) {
 #if (SWO_UART != 0)
    case DAP_SWO_UART:
      baudrate = SWO_Baudrate_UART(baudrate);
      break;
 #endif
 #if (SWO_MANCHESTER != 0)
    case DAP_SWO_MANCHESTER:
      baudrate = SWO_Baudrate_Manchester(baudrate);
      break;
 #endif
    default:
      baudrate = 0U;
      break;
  }
  if (baudrate == 0U) {
    TraceStatus = 0U;
  }
  *response++ = (uint8_t)(baudrate >>  0);
  *response++ = (uint8_t)(baudrate >>  8);
  *response++ = (uint8_t)(baudrate >> 16);
  *response   = (uint8_t)(baudrate >> 24);
  return ((4U << 16) | 4U);
 }
 // Process SWO Control command and prepare response
 //   Starts or stops capture when the requested active state differs from
 //   the current one; a request for the current state succeeds without
 //   doing anything. Starting clears the trace errors and data first.
 //   request:  pointer to request data
 //   response: pointer to response data
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 uint32_t SWO_Control (const uint8_t *request, uint8_t *response) {
  const uint8_t active = (uint8_t)(*request & DAP_SWO_CAPTURE_ACTIVE);
  uint32_t      result = 1U;
  if (active != (TraceStatus & DAP_SWO_CAPTURE_ACTIVE)) {
    if (active != 0U) {
      ClearTrace();
    }
    switch (TraceMode) {
 #if (SWO_UART != 0)
      case DAP_SWO_UART:
        result = SWO_Control_UART(active);
        break;
 #endif
 #if (SWO_MANCHESTER != 0)
      case DAP_SWO_MANCHESTER:
        result = SWO_Control_Manchester(active);
        break;
 #endif
      default:
        result = 0U;
        break;
    }
    if (result != 0U) {
      TraceStatus = active;
 #if (SWO_STREAM != 0)
      if (TraceTransport == 2U) {
        // Notify the streaming thread about the state change
        osThreadFlagsSet(SWO_ThreadId, 1U);
      }
 #endif
    }
  }
  *response = (uint8_t)((result != 0U) ? DAP_OK : DAP_ERROR);
  return ((1U << 16) | 1U);
 }
 // Handle the SWO Status command and build its response
 //   response: pointer to response data; receives the trace status byte
 //             followed by the 32-bit trace count (little-endian)
 //   return:   number of bytes in response
 uint32_t SWO_Status (uint8_t *response) {
  uint8_t  trace_status;
  uint32_t trace_count;
  trace_status = GetTraceStatus();
  trace_count  = GetTraceCount();
  response[0] = trace_status;
  response[1] = (uint8_t)(trace_count >>  0);
  response[2] = (uint8_t)(trace_count >>  8);
  response[3] = (uint8_t)(trace_count >> 16);
  response[4] = (uint8_t)(trace_count >> 24);
  return (5U);
 }
 // Process SWO Extended Status command and prepare response
 //   request:  pointer to request data; in the first byte, bit 0 selects the
 //             trace status, bit 1 the trace count and (only when
 //             TIMESTAMP_CLOCK != 0) bit 2 the index/tick pair
 //   response: pointer to response data; the selected fields are appended in
 //             the order status (1 byte), count (4 bytes), index (4 bytes),
 //             tick (4 bytes); multi-byte fields are little-endian
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 uint32_t SWO_ExtendedStatus (const uint8_t *request, uint8_t *response) {
  uint8_t  cmd;
  uint8_t  status;
  uint32_t count;
 #if (TIMESTAMP_CLOCK != 0U)
  uint32_t index;
  uint32_t tick;
 #endif
  uint32_t num;
  num = 0U;
  cmd = *request;
  if (cmd & 0x01U) {
    status = GetTraceStatus();
    *response++ = status;
    num += 1U;
  }
  if (cmd & 0x02U) {
    count = GetTraceCount();
    *response++ = (uint8_t)(count >>  0);
    *response++ = (uint8_t)(count >>  8);
    *response++ = (uint8_t)(count >> 16);
    *response++ = (uint8_t)(count >> 24);
    num += 4U;
  }
 #if (TIMESTAMP_CLOCK != 0U)
  if (cmd & 0x04U) {
    // Sample index and tick as a pair: TraceUpdate is cleared before the
    // reads and the reads are repeated if it is non-zero afterwards
    // (presumably set by the capture side when it updates TraceTimestamp).
    do {
      TraceUpdate = 0U;
      index = TraceTimestamp.index;
      tick  = TraceTimestamp.tick;
    } while (TraceUpdate != 0U);
    *response++ = (uint8_t)(index >>  0);
    *response++ = (uint8_t)(index >>  8);
    *response++ = (uint8_t)(index >> 16);
    *response++ = (uint8_t)(index >> 24);
    *response++ = (uint8_t)(tick  >>  0);
    *response++ = (uint8_t)(tick  >>  8);
    *response++ = (uint8_t)(tick  >> 16);
    *response++ = (uint8_t)(tick  >> 24);
    // Index and tick are 4 bytes each: account for all 8 bytes written,
    // otherwise the reported response length cuts off the tick value
    num += 8U;
  }
 #endif
  return ((1U << 16) | num);
 }
 // Handle the SWO Data command and build its response
 //   request:  pointer to request data (16-bit little-endian maximum count)
 //   response: pointer to response data; receives the trace status, the
 //             16-bit count of returned bytes and then the trace bytes
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 uint32_t SWO_Data (const uint8_t *request, uint8_t *response) {
  uint8_t  status;
  uint32_t count;
  uint32_t start;
  uint32_t limit, pos, left;
  status = GetTraceStatus();
  count  = GetTraceCount();
  if (TraceTransport != 1U) {
    // Data is only returned through this command for transport 1
    count = 0U;
  } else {
    // Clamp to the requested maximum and to what fits into one packet
    limit = (uint32_t)request[0] | ((uint32_t)request[1] << 8);
    if (limit > (DAP_PACKET_SIZE - 4U)) {
      limit = DAP_PACKET_SIZE - 4U;
    }
    if (count > limit) {
      count = limit;
    }
  }
  response[0] = status;
  response[1] = (uint8_t)(count >> 0);
  response[2] = (uint8_t)(count >> 8);
  response += 3;
  if (TraceTransport == 1U) {
    // Copy 'count' bytes out of the trace buffer, wrapping the position
    // with the buffer-size mask before every access
    start = TraceIndexO;
    pos   = start;
    for (left = count; left != 0U; left--) {
      pos &= SWO_BUFFER_SIZE - 1U;
      *response++ = TraceBuf[pos++];
    }
    TraceIndexO = start + count;
    ResumeTrace();
  }
  return ((2U << 16) | (3U + count));
 }
 #if (SWO_STREAM != 0)
 // SWO Data Transfer complete callback
 //   Advances the trace buffer output index by the size of the transfer
 //   that was queued (TransferSize), marks the transfer channel idle,
 //   calls ResumeTrace() and signals flag 1 to the SWO thread.
 void SWO_TransferComplete (void) {
  TraceIndexO += TransferSize;
  TransferBusy = 0U;
  ResumeTrace();
  osThreadFlagsSet(SWO_ThreadId, 1U);
 }
 // SWO Thread
 //   Waits for thread flag 1 (or a timeout) and, while no transfer is in
 //   flight, queues the next contiguous piece of the trace buffer through
 //   SWO_QueueTransfer(). Never returns.
 //   argument: unused
 __NO_RETURN void SWO_Thread (void *argument) {
  uint32_t timeout;
  uint32_t flags;
  uint32_t count;
  uint32_t index;
  uint32_t i, n;
  (void)   argument;
  timeout = osWaitForever;
  for (;;) {
    flags = osThreadFlagsWait(1U, osFlagsWaitAny, timeout);
    if (TraceStatus & DAP_SWO_CAPTURE_ACTIVE) {
      // Capture running: wake up at least every SWO_STREAM_TIMEOUT
      timeout = SWO_STREAM_TIMEOUT;
    } else {
      // Capture stopped: wait without timeout next time and handle this
      // pass like a timeout, which skips the block alignment below
      timeout = osWaitForever;
      flags   = osFlagsErrorTimeout;
    }
    if (TransferBusy == 0U) {
      count = GetTraceCount();
      if (count != 0U) {
        // Limit the transfer to the contiguous part up to the buffer end
        index = TraceIndexO & (SWO_BUFFER_SIZE - 1U);
        n = SWO_BUFFER_SIZE - index;
        if (count > n) {
          count = n;
        }
        if (flags != osFlagsErrorTimeout) {
          // Not a timeout: only send data ending on a USB_BLOCK_SIZE boundary
          i = index & (USB_BLOCK_SIZE - 1U);
          if (i == 0U) {
            // Aligned start: round down to whole blocks (may become 0)
            count &= ~(USB_BLOCK_SIZE - 1U);
          } else {
            // Unaligned start: send up to the next boundary, or nothing
            // if that much data is not available yet
            n = USB_BLOCK_SIZE - i;
            if (count >= n) {
              count = n;
            } else {
              count = 0U;
            }
          }
        }
        if (count != 0U) {
          // Record size and busy state before queueing; the completion
          // callback uses TransferSize and clears TransferBusy
          TransferSize = count;
          TransferBusy = 1U;
          SWO_QueueTransfer(&TraceBuf[index], count);
        }
      }
    }
  }
 }
 #endif  /* (SWO_STREAM != 0) */
 #endif  /* ((SWO_UART != 0) || (SWO_MANCHESTER != 0)) */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/SW_DP.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/SW_DP.c
@@ -1,286 +0,0 @@
 /*
 * Copyright (c) 2013-2017 ARM Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * ----------------------------------------------------------------------
 *
 * $Date:        1. December 2017
 * $Revision:    V2.0.0
 *
 * Project:      CMSIS-DAP Source
 * Title:        SW_DP.c CMSIS-DAP SW DP I/O
 *
 *---------------------------------------------------------------------------*/
 #include "DAP_config.h"
 #include "DAP.h"
 // SW Macros
 // NOTE: SW_CLOCK_CYCLE, SW_WRITE_BIT and SW_READ_BIT expand to several
 // statements that are not wrapped in do { } while (0); use them only as a
 // complete statement inside a braced body.
 // SWCLK set/clear are aliases of the SWCLK_TCK pin helpers
 #define PIN_SWCLK_SET PIN_SWCLK_TCK_SET
 #define PIN_SWCLK_CLR PIN_SWCLK_TCK_CLR
 // One clock period: SWCLK low, delay, SWCLK high, delay
 #define SW_CLOCK_CYCLE()                \
  PIN_SWCLK_CLR();                      \
  PIN_DELAY();                          \
  PIN_SWCLK_SET();                      \
  PIN_DELAY()
 // Drive 'bit' on SWDIO, then generate one clock period
 #define SW_WRITE_BIT(bit)               \
  PIN_SWDIO_OUT(bit);                   \
  PIN_SWCLK_CLR();                      \
  PIN_DELAY();                          \
  PIN_SWCLK_SET();                      \
  PIN_DELAY()
 // Generate one clock period and sample SWDIO into 'bit' after the low
 // phase delay, before SWCLK is set high again
 #define SW_READ_BIT(bit)                \
  PIN_SWCLK_CLR();                      \
  PIN_DELAY();                          \
  bit = PIN_SWDIO_IN();                 \
  PIN_SWCLK_SET();                      \
  PIN_DELAY()
 // Delay used by the macros above; it is resolved where a macro is
 // expanded, so the PIN_DELAY definition in effect at that point applies
 #define PIN_DELAY() PIN_DELAY_SLOW(DAP_Data.clock_delay)
 // Clock out an SWJ bit sequence on SWDIO/TMS
 //   count:  number of bits to generate
 //   data:   pointer to the sequence bits (least significant bit of each
 //           byte is sent first)
 //   return: none
 #if ((DAP_SWD != 0) || (DAP_JTAG != 0))
 void SWJ_Sequence (uint32_t count, const uint8_t *data) {
  uint32_t bits = 0U;   // remaining bits of the current data byte
  uint32_t left = 0U;   // how many bits of 'bits' are still unsent
  for (; count != 0U; count--) {
    // Fetch the next byte once the previous one is used up
    if (left == 0U) {
      bits = *data;
      data++;
      left = 8U;
    }
    if ((bits & 1U) != 0U) {
      PIN_SWDIO_TMS_SET();
    } else {
      PIN_SWDIO_TMS_CLR();
    }
    SW_CLOCK_CYCLE();
    bits >>= 1;
    left--;
  }
 }
 #endif
 // Run an SWD sequence: either capture bits from SWDIO or drive them
 //   info:   sequence information (clock count in SWD_SEQUENCE_CLK, where 0
 //           means 64 clocks; SWD_SEQUENCE_DIN selects capture)
 //   swdo:   pointer to SWDIO generated data (used for output sequences)
 //   swdi:   pointer to SWDIO captured data (used for input sequences)
 //   return: none
 #if (DAP_SWD != 0)
 void SWD_Sequence (uint32_t info, const uint8_t *swdo, uint8_t *swdi) {
  uint32_t byte;
  uint32_t sample;
  uint32_t clocks, room;
  clocks = info & SWD_SEQUENCE_CLK;
  if (clocks == 0U) {
    clocks = 64U;
  }
  if ((info & SWD_SEQUENCE_DIN) != 0U) {
    // Input: collect up to 8 bits per byte, first bit ends up in bit 0
    while (clocks != 0U) {
      byte = 0U;
      room = 8U;
      while ((room != 0U) && (clocks != 0U)) {
        SW_READ_BIT(sample);
        byte = (byte >> 1) | (sample << 7);
        room--;
        clocks--;
      }
      // Right-align a final, partially filled byte
      *swdi++ = (uint8_t)(byte >> room);
    }
  } else {
    // Output: shift each byte out starting with bit 0
    while (clocks != 0U) {
      byte = *swdo++;
      room = 8U;
      while ((room != 0U) && (clocks != 0U)) {
        SW_WRITE_BIT(byte);
        byte >>= 1;
        room--;
        clocks--;
      }
    }
  }
 }
 #endif
 #if (DAP_SWD != 0)
 // SWD Transfer I/O
 //   request: A[3:2] RnW APnDP
 //   data:    DATA[31:0]
 //   return:  ACK[2:0]
 //
 // SWD_TransferFunction(speed) defines 'static uint8_t SWD_Transfer<speed>'.
 // The bit macros inside use whichever PIN_DELAY is defined where the
 // generator is instantiated; it is instantiated twice below, as Fast with
 // PIN_DELAY_FAST() and as Slow with PIN_DELAY_SLOW(DAP_Data.clock_delay).
 //
 // Generated sequence:
 //   - request packet: start, APnDP, RnW, A2, A3, parity, stop, park
 //   - turnaround cycles, then the 3-bit acknowledge
 //   - ACK == DAP_TRANSFER_OK: 32 data bits plus parity are read or written;
 //     a read parity mismatch changes the result to DAP_TRANSFER_ERROR.
 //     Read data is stored only if 'data' is non-NULL, whereas a write
 //     dereferences 'data' unconditionally. Afterwards the timestamp is
 //     captured when requested and the configured idle cycles are clocked.
 //   - ACK == WAIT or FAULT: a dummy data phase (32 + 1 cycles) is clocked
 //     only when DAP_Data.swd_conf.data_phase is set
 //   - any other ACK (protocol error): turnaround + 32 + 1 cycles are clocked
 // SWDIO output is enabled and driven high before every return.
 #define SWD_TransferFunction(speed)     /**/                                    \
 static uint8_t SWD_Transfer##speed (uint32_t request, uint32_t *data) {         \
  uint32_t ack;                                                                 \
  uint32_t bit;                                                                 \
  uint32_t val;                                                                 \
  uint32_t parity;                                                              \
                                                                                \
  uint32_t n;                                                                   \
                                                                                \
  /* Packet Request */                                                          \
  parity = 0U;                                                                  \
  SW_WRITE_BIT(1U);                     /* Start Bit */                         \
  bit = request >> 0;                                                           \
  SW_WRITE_BIT(bit);                    /* APnDP Bit */                         \
  parity += bit;                                                                \
  bit = request >> 1;                                                           \
  SW_WRITE_BIT(bit);                    /* RnW Bit */                           \
  parity += bit;                                                                \
  bit = request >> 2;                                                           \
  SW_WRITE_BIT(bit);                    /* A2 Bit */                            \
  parity += bit;                                                                \
  bit = request >> 3;                                                           \
  SW_WRITE_BIT(bit);                    /* A3 Bit */                            \
  parity += bit;                                                                \
  SW_WRITE_BIT(parity);                 /* Parity Bit */                        \
  SW_WRITE_BIT(0U);                     /* Stop Bit */                          \
  SW_WRITE_BIT(1U);                     /* Park Bit */                          \
                                                                                \
  /* Turnaround */                                                              \
  PIN_SWDIO_OUT_DISABLE();                                                      \
  for (n = DAP_Data.swd_conf.turnaround; n; n--) {                              \
    SW_CLOCK_CYCLE();                                                           \
  }                                                                             \
                                                                                \
  /* Acknowledge response */                                                    \
  SW_READ_BIT(bit);                                                             \
  ack  = bit << 0;                                                              \
  SW_READ_BIT(bit);                                                             \
  ack |= bit << 1;                                                              \
  SW_READ_BIT(bit);                                                             \
  ack |= bit << 2;                                                              \
                                                                                \
  if (ack == DAP_TRANSFER_OK) {         /* OK response */                       \
    /* Data transfer */                                                         \
    if (request & DAP_TRANSFER_RnW) {                                           \
      /* Read data */                                                           \
      val = 0U;                                                                 \
      parity = 0U;                                                              \
      for (n = 32U; n; n--) {                                                   \
        SW_READ_BIT(bit);               /* Read RDATA[0:31] */                  \
        parity += bit;                                                          \
        val >>= 1;                                                              \
        val  |= bit << 31;                                                      \
      }                                                                         \
      SW_READ_BIT(bit);                 /* Read Parity */                       \
      if ((parity ^ bit) & 1U) {                                                \
        ack = DAP_TRANSFER_ERROR;                                               \
      }                                                                         \
      if (data) { *data = val; }                                                \
      /* Turnaround */                                                          \
      for (n = DAP_Data.swd_conf.turnaround; n; n--) {                          \
        SW_CLOCK_CYCLE();                                                       \
      }                                                                         \
      PIN_SWDIO_OUT_ENABLE();                                                   \
    } else {                                                                    \
      /* Turnaround */                                                          \
      for (n = DAP_Data.swd_conf.turnaround; n; n--) {                          \
        SW_CLOCK_CYCLE();                                                       \
      }                                                                         \
      PIN_SWDIO_OUT_ENABLE();                                                   \
      /* Write data */                                                          \
      val = *data;                                                              \
      parity = 0U;                                                              \
      for (n = 32U; n; n--) {                                                   \
        SW_WRITE_BIT(val);              /* Write WDATA[0:31] */                 \
        parity += val;                                                          \
        val >>= 1;                                                              \
      }                                                                         \
      SW_WRITE_BIT(parity);             /* Write Parity Bit */                  \
    }                                                                           \
    /* Capture Timestamp */                                                     \
    if (request & DAP_TRANSFER_TIMESTAMP) {                                     \
      DAP_Data.timestamp = TIMESTAMP_GET();                                     \
    }                                                                           \
    /* Idle cycles */                                                           \
    n = DAP_Data.transfer.idle_cycles;                                          \
    if (n) {                                                                    \
      PIN_SWDIO_OUT(0U);                                                        \
      for (; n; n--) {                                                          \
        SW_CLOCK_CYCLE();                                                       \
      }                                                                         \
    }                                                                           \
    PIN_SWDIO_OUT(1U);                                                          \
    return ((uint8_t)ack);                                                      \
  }                                                                             \
                                                                                \
  if ((ack == DAP_TRANSFER_WAIT) || (ack == DAP_TRANSFER_FAULT)) {              \
    /* WAIT or FAULT response */                                                \
    if (DAP_Data.swd_conf.data_phase && ((request & DAP_TRANSFER_RnW) != 0U)) { \
      for (n = 32U+1U; n; n--) {                                                \
        SW_CLOCK_CYCLE();               /* Dummy Read RDATA[0:31] + Parity */   \
      }                                                                         \
    }                                                                           \
    /* Turnaround */                                                            \
    for (n = DAP_Data.swd_conf.turnaround; n; n--) {                            \
      SW_CLOCK_CYCLE();                                                         \
    }                                                                           \
    PIN_SWDIO_OUT_ENABLE();                                                     \
    if (DAP_Data.swd_conf.data_phase && ((request & DAP_TRANSFER_RnW) == 0U)) { \
      PIN_SWDIO_OUT(0U);                                                        \
      for (n = 32U+1U; n; n--) {                                                \
        SW_CLOCK_CYCLE();               /* Dummy Write WDATA[0:31] + Parity */  \
      }                                                                         \
    }                                                                           \
    PIN_SWDIO_OUT(1U);                                                          \
    return ((uint8_t)ack);                                                      \
  }                                                                             \
                                                                                \
  /* Protocol error */                                                          \
  for (n = DAP_Data.swd_conf.turnaround + 32U + 1U; n; n--) {                   \
    SW_CLOCK_CYCLE();                   /* Back off data phase */               \
  }                                                                             \
  PIN_SWDIO_OUT_ENABLE();                                                       \
  PIN_SWDIO_OUT(1U);                                                            \
  return ((uint8_t)ack);                                                        \
 }
 // Instantiate the fast variant with the fast pin delay
 #undef  PIN_DELAY
 #define PIN_DELAY() PIN_DELAY_FAST()
 SWD_TransferFunction(Fast)
 // Instantiate the slow variant with the configurable pin delay
 #undef  PIN_DELAY
 #define PIN_DELAY() PIN_DELAY_SLOW(DAP_Data.clock_delay)
 SWD_TransferFunction(Slow)
 // SWD Transfer I/O entry point: forwards to the fast or the slow variant
 // depending on DAP_Data.fast_clock
 //   request: A[3:2] RnW APnDP
 //   data:    DATA[31:0]
 //   return:  ACK[2:0]
 uint8_t  SWD_Transfer(uint32_t request, uint32_t *data) {
  if (DAP_Data.fast_clock == 0U) {
    return SWD_TransferSlow(request, data);
  }
  return SWD_TransferFast(request, data);
 }
 #endif  /* (DAP_SWD != 0) */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/UART.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DAP/Firmware/Source/UART.c
@@ -1,652 +0,0 @@
 /*
 * Copyright (c) 2021 ARM Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * ----------------------------------------------------------------------
 *
 * $Date:        1. March 2021
 * $Revision:    V1.0.0
 *
 * Project:      CMSIS-DAP Source
 * Title:        UART.c CMSIS-DAP UART
 *
 *---------------------------------------------------------------------------*/
 #include "DAP_config.h"
 #include "DAP.h"
 #if (DAP_UART != 0)
 #ifdef DAP_FW_V1
 #error "UART Communication Port not supported in DAP V1!"
 #endif
 #include "Driver_USART.h"
 #include "cmsis_os2.h"
 #include <string.h>
 // Size of one receive request passed to the driver; the RX input index
 // advances by this amount per completed receive
 #define UART_RX_BLOCK_SIZE    32U   /* Uart Rx Block Size (must be 2^n) */
 // USART Driver
 // Two-level macro so that DAP_UART_DRIVER is expanded before it is pasted
 // onto the Driver_USART prefix
 #define _USART_Driver_(n)  Driver_USART##n
 #define  USART_Driver_(n) _USART_Driver_(n)
 extern ARM_DRIVER_USART    USART_Driver_(DAP_UART_DRIVER);
 #define pUSART           (&USART_Driver_(DAP_UART_DRIVER))
 // UART Configuration
 // Currently selected transport; starts as USB COM port when that feature
 // is compiled in, otherwise as none
 #if (DAP_UART_USB_COM_PORT != 0)
 static uint8_t  UartTransport = DAP_UART_TRANSPORT_USB_COM_PORT;
 #else
 static uint8_t  UartTransport = DAP_UART_TRANSPORT_NONE;
 #endif
 // UART Flags
 static uint8_t  UartConfigured = 0U;
 static uint8_t  UartReceiveEnabled = 0U;
 static uint8_t  UartTransmitEnabled = 0U;
 static uint8_t  UartTransmitActive = 0U;
 // UART TX Buffer
 // Indices (I = input, O = output) are free-running counters that are
 // masked with (buffer size - 1) when used to address the buffer
 static uint8_t  UartTxBuf[DAP_UART_TX_BUFFER_SIZE];
 static volatile uint32_t UartTxIndexI = 0U;
 static volatile uint32_t UartTxIndexO = 0U;
 // UART RX Buffer
 static uint8_t  UartRxBuf[DAP_UART_RX_BUFFER_SIZE];
 static volatile uint32_t UartRxIndexI = 0U;
 static volatile uint32_t UartRxIndexO = 0U;
 // Uart Errors
 // Set by the driver callback (and on RX overflow detection), cleared when
 // reported by UART_Get_Status
 static volatile uint8_t  UartErrorRxDataLost = 0U;
 static volatile uint8_t  UartErrorFraming = 0U;
 static volatile uint8_t  UartErrorParity = 0U;
 // UART Transmit
 // Number of bytes handed to the driver in the current send request
 static uint32_t UartTxNum = 0U;
 // Function prototypes
 static uint8_t  UART_Init (void);
 static void     UART_Uninit (void);
 static uint8_t  UART_Get_Status (void);
 static uint8_t  UART_Receive_Enable (void);
 static uint8_t  UART_Transmit_Enable (void);
 static void     UART_Receive_Disable (void);
 static void     UART_Transmit_Disable (void);
 static void     UART_Receive_Flush (void);
 static void     UART_Transmit_Flush (void);
 static void     UART_Receive (void);
 static void     UART_Transmit (void);
 // Event handler registered with the USART driver
 //   event: event mask (several events may be reported at once)
 static void USART_Callback (uint32_t event) {
  // Send finished: release the transmitted bytes and start the next chunk
  if ((event & ARM_USART_EVENT_SEND_COMPLETE) != 0U) {
    UartTxIndexO = UartTxIndexO + UartTxNum;
    UartTransmitActive = 0U;
    UART_Transmit();
  }
  // Receive block filled: account for it and request the next block
  if ((event & ARM_USART_EVENT_RECEIVE_COMPLETE) != 0U) {
    UartRxIndexI = UartRxIndexI + UART_RX_BLOCK_SIZE;
    UART_Receive();
  }
  // Latch error conditions; they are reported by UART_Get_Status
  if ((event & ARM_USART_EVENT_RX_OVERFLOW) != 0U) {
    UartErrorRxDataLost = 1U;
  }
  if ((event & ARM_USART_EVENT_RX_FRAMING_ERROR) != 0U) {
    UartErrorFraming = 1U;
  }
  if ((event & ARM_USART_EVENT_RX_PARITY_ERROR) != 0U) {
    UartErrorParity = 1U;
  }
 }
 // Reset the UART module state, then initialize and power up the driver
 //   return: DAP_OK or DAP_ERROR
 static uint8_t UART_Init (void) {
  // Flags
  UartConfigured = 0U;
  UartReceiveEnabled = 0U;
  UartTransmitEnabled = 0U;
  UartTransmitActive = 0U;
  // Latched errors
  UartErrorRxDataLost = 0U;
  UartErrorFraming = 0U;
  UartErrorParity = 0U;
  // Buffer indices and pending send size
  UartTxIndexI = 0U;
  UartTxIndexO = 0U;
  UartRxIndexI = 0U;
  UartRxIndexO = 0U;
  UartTxNum = 0U;
  // Power is only requested when initialization succeeded
  if (pUSART->Initialize(USART_Callback) != ARM_DRIVER_OK) {
    return (DAP_ERROR);
  }
  if (pUSART->PowerControl(ARM_POWER_FULL) != ARM_DRIVER_OK) {
    return (DAP_ERROR);
  }
  return (DAP_OK);
 }
 // Shut the UART down: mark it unconfigured, power the driver off and
 // uninitialize it (driver return codes are not evaluated)
 static void UART_Uninit (void) {
  UartConfigured = 0U;
  (void)pUSART->PowerControl(ARM_POWER_OFF);
  (void)pUSART->Uninitialize();
 }
 // Build the UART status byte; latched error flags are cleared once they
 // have been reported
 //   return: status
 static uint8_t UART_Get_Status (void) {
  uint8_t flags;
  flags = (UartReceiveEnabled != 0U) ? DAP_UART_STATUS_RX_ENABLED : 0U;
  if (UartErrorRxDataLost != 0U) {
    flags |= DAP_UART_STATUS_RX_DATA_LOST;
    UartErrorRxDataLost = 0U;
  }
  if (UartErrorFraming != 0U) {
    flags |= DAP_UART_STATUS_FRAMING_ERROR;
    UartErrorFraming = 0U;
  }
  if (UartErrorParity != 0U) {
    flags |= DAP_UART_STATUS_PARITY_ERROR;
    UartErrorParity = 0U;
  }
  if (UartTransmitEnabled != 0U) {
    flags |= DAP_UART_STATUS_TX_ENABLED;
  }
  return (flags);
 }
 // Turn the UART receiver on (no-op if it is already enabled)
 //   return: DAP_OK or DAP_ERROR
 static uint8_t UART_Receive_Enable (void) {
  if (UartReceiveEnabled != 0U) {
    return (DAP_OK);
  }
  // Start from an empty buffer and queue the first receive block before
  // the receiver is switched on
  UartRxIndexI = 0U;
  UartRxIndexO = 0U;
  UART_Receive();
  if (pUSART->Control(ARM_USART_CONTROL_RX, 1U) != ARM_DRIVER_OK) {
    return (DAP_ERROR);
  }
  UartReceiveEnabled = 1U;
  return (DAP_OK);
 }
 // Turn the UART transmitter on (no-op if it is already enabled)
 //   return: DAP_OK or DAP_ERROR
 static uint8_t UART_Transmit_Enable (void) {
  if (UartTransmitEnabled != 0U) {
    return (DAP_OK);
  }
  // Start from an empty, idle transmit buffer
  UartTransmitActive = 0U;
  UartTxIndexI = 0U;
  UartTxIndexO = 0U;
  UartTxNum = 0U;
  if (pUSART->Control(ARM_USART_CONTROL_TX, 1U) != ARM_DRIVER_OK) {
    return (DAP_ERROR);
  }
  UartTransmitEnabled = 1U;
  return (DAP_OK);
 }
 // Turn the UART receiver off and abort the pending receive request
 // (no-op if the receiver is not enabled)
 static void UART_Receive_Disable (void) {
  if (UartReceiveEnabled == 0U) {
    return;
  }
  (void)pUSART->Control(ARM_USART_CONTROL_RX, 0U);
  (void)pUSART->Control(ARM_USART_ABORT_RECEIVE, 0U);
  UartReceiveEnabled = 0U;
 }
 // Abort the pending send request and turn the UART transmitter off
 // (no-op if the transmitter is not enabled)
 static void UART_Transmit_Disable (void) {
  if (UartTransmitEnabled == 0U) {
    return;
  }
  (void)pUSART->Control(ARM_USART_ABORT_SEND, 0U);
  (void)pUSART->Control(ARM_USART_CONTROL_TX, 0U);
  UartTransmitActive = 0U;
  UartTransmitEnabled = 0U;
 }
 // Discard buffered receive data: abort the pending receive request, reset
 // the buffer indices and, if the receiver is enabled, queue a new block
 static void UART_Receive_Flush (void) {
  (void)pUSART->Control(ARM_USART_ABORT_RECEIVE, 0U);
  UartRxIndexI = 0U;
  UartRxIndexO = 0U;
  if (UartReceiveEnabled == 0U) {
    return;
  }
  UART_Receive();
 }
 // Discard buffered transmit data: abort the pending send request and
 // reset the transmit state to empty and idle
 static void UART_Transmit_Flush (void) {
  (void)pUSART->Control(ARM_USART_ABORT_SEND, 0U);
  UartTransmitActive = 0U;
  UartTxIndexI = 0U;
  UartTxIndexO = 0U;
  UartTxNum = 0U;
 }
 // Queue the next receive block: ask the driver to store
 // UART_RX_BLOCK_SIZE bytes at the current (masked) RX input position
 static void UART_Receive (void) {
  uint32_t offset = UartRxIndexI & (DAP_UART_RX_BUFFER_SIZE - 1U);
  (void)pUSART->Receive(UartRxBuf + offset, UART_RX_BLOCK_SIZE);
 }
 // Start sending buffered data to the target via UART. One request covers
 // at most the contiguous part up to the end of the buffer.
 static void UART_Transmit (void) {
  uint32_t pending;
  uint32_t offset;
  pending = UartTxIndexI - UartTxIndexO;
  offset  = UartTxIndexO & (DAP_UART_TX_BUFFER_SIZE - 1U);
  if (pending == 0U) {
    return;
  }
  // Send everything if it does not wrap, otherwise only up to the end
  UartTxNum = ((offset + pending) <= DAP_UART_TX_BUFFER_SIZE)
            ? pending
            : (DAP_UART_TX_BUFFER_SIZE - offset);
  UartTransmitActive = 1U;
  (void)pUSART->Send(UartTxBuf + offset, UartTxNum);
 }
 // Process UART Transport command and prepare response
 //   Switches the UART transport between NONE, USB COM port and DAP command.
 //   The outer switch selects on the requested transport, the inner switch
 //   on the transport currently in use. An unknown requested transport
 //   yields DAP_ERROR.
 //   request:  pointer to request data
 //   response: pointer to response data
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 uint32_t UART_Transport (const uint8_t *request, uint8_t *response) {
  uint8_t  transport;
  uint8_t  ret = DAP_ERROR;
  transport = *request;
  switch (transport) {
    // Requested: no transport
    case DAP_UART_TRANSPORT_NONE:
      switch (UartTransport) {
        case DAP_UART_TRANSPORT_NONE:
          ret = DAP_OK;
          break;
        case DAP_UART_TRANSPORT_USB_COM_PORT:
          // Without USB COM port support this case leaves ret at DAP_ERROR
 #if (DAP_UART_USB_COM_PORT != 0)
          USB_COM_PORT_Activate(0U);
          UartTransport = DAP_UART_TRANSPORT_NONE;
          ret = DAP_OK;
 #endif
          break;
        case DAP_UART_TRANSPORT_DAP_COMMAND:
          UART_Receive_Disable();
          UART_Transmit_Disable();
          UART_Uninit();
          UartTransport = DAP_UART_TRANSPORT_NONE;
          ret= DAP_OK;
          break;
      }
      break;
    // Requested: USB COM port (fails when support is not compiled in)
    case DAP_UART_TRANSPORT_USB_COM_PORT:
      switch (UartTransport) {
        case DAP_UART_TRANSPORT_NONE:
 #if (DAP_UART_USB_COM_PORT != 0)
          if (USB_COM_PORT_Activate(1U) == 0U) {
            UartTransport = DAP_UART_TRANSPORT_USB_COM_PORT;
            ret = DAP_OK;
          }
 #endif
          break;
        case DAP_UART_TRANSPORT_USB_COM_PORT:
          ret = DAP_OK;
          break;
        case DAP_UART_TRANSPORT_DAP_COMMAND:
          // The DAP command transport is shut down first; if the USB COM
          // port cannot be activated the transport stays NONE
          UART_Receive_Disable();
          UART_Transmit_Disable();
          UART_Uninit();
          UartTransport = DAP_UART_TRANSPORT_NONE;
 #if (DAP_UART_USB_COM_PORT != 0)
          if (USB_COM_PORT_Activate(1U) == 0U) {
            UartTransport = DAP_UART_TRANSPORT_USB_COM_PORT;
            ret = DAP_OK;
          }
 #endif
          break;
      }
      break;
    // Requested: DAP command transport (result of UART_Init is returned)
    case DAP_UART_TRANSPORT_DAP_COMMAND:
      switch (UartTransport) {
        case DAP_UART_TRANSPORT_NONE:
          ret = UART_Init();
          if (ret == DAP_OK) {
            UartTransport = DAP_UART_TRANSPORT_DAP_COMMAND;
          }
          break;
        case DAP_UART_TRANSPORT_USB_COM_PORT:
 #if (DAP_UART_USB_COM_PORT != 0)
          USB_COM_PORT_Activate(0U);
          UartTransport = DAP_UART_TRANSPORT_NONE;
 #endif
          ret = UART_Init();
          if (ret == DAP_OK) {
            UartTransport = DAP_UART_TRANSPORT_DAP_COMMAND;
          }
          break;
        case DAP_UART_TRANSPORT_DAP_COMMAND:
          ret = DAP_OK;
          break;
      }
      break;
    default:
      break;
  }
  *response = ret;
  return ((1U << 16) | 1U);
 }
 // Process UART Configure command and prepare response
 //   request:  pointer to request data: one control byte followed by the
 //             requested baudrate (32-bit, little-endian)
 //   response: pointer to response data: one status byte (configuration
 //             error flags, 0 on success) followed by the baudrate (32-bit,
 //             little-endian); a baudrate of 0 reports a baudrate error
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 uint32_t UART_Configure (const uint8_t *request, uint8_t *response) {
  uint8_t  control, status;
  uint32_t baudrate;
  int32_t  result;
  if (UartTransport != DAP_UART_TRANSPORT_DAP_COMMAND) {
    // Configuration is only possible with the DAP command transport:
    // report every error flag and a baudrate error
    status = DAP_UART_CFG_ERROR_DATA_BITS |
             DAP_UART_CFG_ERROR_PARITY    |
             DAP_UART_CFG_ERROR_STOP_BITS;
    baudrate = 0U;  // baudrate error
  } else {
    status   = 0U;
    control  = *request;
    // Widen each byte to uint32_t BEFORE shifting: shifting the
    // int-promoted byte left by 24 is undefined when its top bit is set
    baudrate = ((uint32_t)request[1] <<  0) |
               ((uint32_t)request[2] <<  8) |
               ((uint32_t)request[3] << 16) |
               ((uint32_t)request[4] << 24);
    result = pUSART->Control(control |
                             ARM_USART_MODE_ASYNCHRONOUS |
                             ARM_USART_FLOW_CONTROL_NONE,
                             baudrate);
    if (result == ARM_DRIVER_OK) {
      UartConfigured = 1U;
    } else {
      UartConfigured = 0U;
      // Translate the driver error into the response fields
      switch (result) {
        case ARM_USART_ERROR_BAUDRATE:
          status = 0U;
          baudrate = 0U;
          break;
        case ARM_USART_ERROR_DATA_BITS:
          status = DAP_UART_CFG_ERROR_DATA_BITS;
          break;
        case ARM_USART_ERROR_PARITY:
          status = DAP_UART_CFG_ERROR_PARITY;
          break;
        case ARM_USART_ERROR_STOP_BITS:
          status = DAP_UART_CFG_ERROR_STOP_BITS;
          break;
        default:
          // Any other driver error: report everything as failed
          status = DAP_UART_CFG_ERROR_DATA_BITS |
                   DAP_UART_CFG_ERROR_PARITY    |
                   DAP_UART_CFG_ERROR_STOP_BITS;
          baudrate = 0U;
          break;
      }
    }
  }
  *response++ = status;
  *response++ = (uint8_t)(baudrate >>  0);
  *response++ = (uint8_t)(baudrate >>  8);
  *response++ = (uint8_t)(baudrate >> 16);
  *response   = (uint8_t)(baudrate >> 24);
  return ((5U << 16) | 5U);
 }
 // Handle the UART Control command and build its response
 //   request:  pointer to request data (first byte holds the control bits)
 //   response: pointer to response data (one byte: DAP_OK or DAP_ERROR)
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 uint32_t UART_Control (const uint8_t *request, uint8_t *response) {
  uint8_t bits;
  uint8_t outcome = DAP_OK;
  if (UartTransport == DAP_UART_TRANSPORT_DAP_COMMAND) {
    bits = *request;
    // Receiver: disable takes precedence over enable; enabling needs a
    // configured UART
    if ((bits & DAP_UART_CONTROL_RX_DISABLE) != 0U) {
      UART_Receive_Disable();
    } else if ((bits & DAP_UART_CONTROL_RX_ENABLE) != 0U) {
      if ((UartConfigured == 0U) || (UART_Receive_Enable() != DAP_OK)) {
        outcome = DAP_ERROR;
      }
    }
    if ((bits & DAP_UART_CONTROL_RX_BUF_FLUSH) != 0U) {
      UART_Receive_Flush();
    }
    // Transmitter: same rules as for the receiver
    if ((bits & DAP_UART_CONTROL_TX_DISABLE) != 0U) {
      UART_Transmit_Disable();
    } else if ((bits & DAP_UART_CONTROL_TX_ENABLE) != 0U) {
      if ((UartConfigured == 0U) || (UART_Transmit_Enable() != DAP_OK)) {
        outcome = DAP_ERROR;
      }
    }
    if ((bits & DAP_UART_CONTROL_TX_BUF_FLUSH) != 0U) {
      UART_Transmit_Flush();
    }
  } else {
    // Control is only available with the DAP command transport
    outcome = DAP_ERROR;
  }
  *response = outcome;
  return ((1U << 16) | 1U);
 }
 // Handle the UART Status command: report the status byte together with the
 // number of buffered receive bytes and the number of pending transmit bytes.
 //   response: pointer to response data (9 bytes are written:
 //             status, 32-bit RX count, 32-bit TX count, both little-endian)
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 uint32_t UART_Status (uint8_t *response) {
  uint32_t rx_pending = 0U;
  uint32_t tx_pending = 0U;
  uint32_t tx_sent;
  uint32_t byte_no;
  uint8_t  flags = 0U;
  // Counts and status stay zero unless the DAP command transport is
  // selected and the UART has been configured
  if ((UartTransport == DAP_UART_TRANSPORT_DAP_COMMAND) &&
      (UartConfigured != 0U)) {
    const uint32_t rx_limit = (DAP_UART_RX_BUFFER_SIZE - (UART_RX_BLOCK_SIZE*2));
    rx_pending  = UartRxIndexI - UartRxIndexO;
    rx_pending += pUSART->GetRxCount();
    if (rx_pending > rx_limit) {
      // Overflow: flag the loss and drop the oldest data so that exactly
      // rx_limit bytes remain readable
      UartErrorRxDataLost = 1U;
      rx_pending = rx_limit;
      UartRxIndexO = UartRxIndexI - rx_pending;
    }
    tx_pending = UartTxIndexI - UartTxIndexO;
    tx_sent = pUSART->GetTxCount();
    if (UartTransmitActive != 0U) {
      // Bytes already sent by the running transfer are no longer pending
      tx_pending -= tx_sent;
    }
    flags = UART_Get_Status();
  }
  response[0] = flags;
  // Serialise both counters least significant byte first
  for (byte_no = 0U; byte_no < 4U; byte_no++) {
    response[1U + byte_no] = (uint8_t)(rx_pending >> (8U * byte_no));
    response[5U + byte_no] = (uint8_t)(tx_pending >> (8U * byte_no));
  }
  return ((0U << 16) | 9U);
 }
 // Process UART Transfer command and prepare response
 //   request:  pointer to request data
 //             bytes 0..1: number of bytes to read (little-endian)
 //             bytes 2..3: number of bytes to write (little-endian)
 //             bytes 4.. : data to write
 //   response: pointer to response data
 //             byte  0   : status
 //             bytes 1..2: number of bytes accepted for transmit (little-endian)
 //             bytes 3..4: number of received bytes returned (little-endian)
 //             bytes 5.. : received data
 //   return:   number of bytes in response (lower 16 bits)
 //             number of bytes in request (upper 16 bits)
 //   NOTE(review): the request length returned is computed from tx_cnt after
 //   clamping (and is 4 when the transport is not DAP command), not from the
 //   count the host sent - confirm callers expect this.
 uint32_t UART_Transfer (const uint8_t *request, uint8_t *response) {
  uint32_t rx_cnt, tx_cnt;
  uint32_t rx_num, tx_num;
  uint8_t *rx_data;
  const
  uint8_t *tx_data;
  uint32_t num;
  uint32_t index;
  uint8_t  status;
  if (UartTransport != DAP_UART_TRANSPORT_DAP_COMMAND) {
    // UART is not routed through DAP commands: report zero status and counts
    status = 0U;
    rx_cnt = 0U;
    tx_cnt = 0U;
  } else {
    // RX Data
    // Requested read length, limited to what fits in one response packet
    rx_cnt = ((uint32_t)(*(request+0) << 0)  |
              (uint32_t)(*(request+1) << 8));
    if (rx_cnt > (DAP_PACKET_SIZE - 6U)) {
      rx_cnt = (DAP_PACKET_SIZE - 6U);
    }
    // Bytes available: buffered bytes plus the driver's current RX count
    rx_num  = UartRxIndexI - UartRxIndexO;
    rx_num += pUSART->GetRxCount();
    if (rx_num > (DAP_UART_RX_BUFFER_SIZE - (UART_RX_BLOCK_SIZE*2))) {
      // Overflow
      // Flag the loss and move the read index forward so that only the
      // newest (buffer size - 2 blocks) bytes remain readable
      UartErrorRxDataLost = 1U;
      rx_num = (DAP_UART_RX_BUFFER_SIZE - (UART_RX_BLOCK_SIZE*2));
      UartRxIndexO = UartRxIndexI - rx_num;
    }
    if (rx_cnt > rx_num) {
      rx_cnt = rx_num;
    }
    // Received data is placed after the 5-byte response header
    rx_data = (response+5);
    // Masking with (size - 1) implies the buffer size is expected to be a
    // power of two - TODO confirm against the configuration
    index = UartRxIndexO & (DAP_UART_RX_BUFFER_SIZE - 1U);
    if ((index + rx_cnt) <= DAP_UART_RX_BUFFER_SIZE) {
      memcpy( rx_data,      &UartRxBuf[index], rx_cnt);
    } else {
      // Data wraps around the end of the buffer: copy in two pieces
      num = DAP_UART_RX_BUFFER_SIZE - index;
      memcpy( rx_data,      &UartRxBuf[index], num);
      memcpy(&rx_data[num], &UartRxBuf[0],     rx_cnt - num);
    }
    UartRxIndexO += rx_cnt;
    // TX Data
    // Requested write length, limited to what fits in one request packet
    tx_cnt  = ((uint32_t)(*(request+2) << 0) |
               (uint32_t)(*(request+3) << 8));
    tx_data =              (request+4);
    if (tx_cnt > (DAP_PACKET_SIZE - 5U)) {
      tx_cnt = (DAP_PACKET_SIZE - 5U);
    }
    // Bytes still queued; while a transmission is active the driver's TX
    // count is subtracted from the queued amount
    tx_num = UartTxIndexI - UartTxIndexO;
    num = pUSART->GetTxCount();
    if (UartTransmitActive != 0U) {
      tx_num -= num;
    }
    // Accept no more than the free space left in the transmit buffer
    if (tx_cnt > (DAP_UART_TX_BUFFER_SIZE - tx_num)) {
      tx_cnt = (DAP_UART_TX_BUFFER_SIZE - tx_num);
    }
    index = UartTxIndexI & (DAP_UART_TX_BUFFER_SIZE - 1U);
    if ((index + tx_cnt) <= DAP_UART_TX_BUFFER_SIZE) {
      memcpy(&UartTxBuf[index],  tx_data,      tx_cnt);
    } else {
      // Data wraps around the end of the buffer: copy in two pieces
      num = DAP_UART_TX_BUFFER_SIZE - index;
      memcpy(&UartTxBuf[index],  tx_data,      num);
      memcpy(&UartTxBuf[0],     &tx_data[num], tx_cnt - num);
    }
    UartTxIndexI += tx_cnt;
    // Start transmitting if no transmission is currently active
    if (UartTransmitActive == 0U) {
      UART_Transmit();
    }
    status = UART_Get_Status();
  }
  // Response header: status, accepted TX count, returned RX count
  *response++ = status;
  *response++ = (uint8_t)(tx_cnt >> 0);
  *response++ = (uint8_t)(tx_cnt >> 8);
  *response++ = (uint8_t)(rx_cnt >> 0);
  *response   = (uint8_t)(rx_cnt >> 8);
  return (((4U + tx_cnt) << 16) | (5U + rx_cnt));
 }
 #endif /* DAP_UART */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/BasicMathFunctions/BasicMathFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/BasicMathFunctions/BasicMathFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/BasicMathFunctions/BasicMathFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/BasicMathFunctions/BasicMathFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/BayesFunctions/BayesFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/BayesFunctions/BayesFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/BayesFunctions/BayesFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/BayesFunctions/BayesFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/CommonTables/CommonTables.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/CommonTables/CommonTables.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/CommonTables/CommonTablesF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/CommonTables/CommonTablesF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/ComplexMathFunctions/ComplexMathFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/ComplexMathFunctions/ComplexMathFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/ComplexMathFunctions/ComplexMathFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/ComplexMathFunctions/ComplexMathFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/ControllerFunctions/ControllerFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/ControllerFunctions/ControllerFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/DistanceFunctions/DistanceFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/DistanceFunctions/DistanceFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/DistanceFunctions/DistanceFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/DistanceFunctions/DistanceFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/FastMathFunctions/FastMathFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/FastMathFunctions/FastMathFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/FastMathFunctions/FastMathFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/FastMathFunctions/FastMathFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/FilteringFunctions/FilteringFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/FilteringFunctions/FilteringFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/FilteringFunctions/FilteringFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/FilteringFunctions/FilteringFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/InterpolationFunctions/InterpolationFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/InterpolationFunctions/InterpolationFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/InterpolationFunctions/InterpolationFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/InterpolationFunctions/InterpolationFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/MatrixFunctions/MatrixFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/MatrixFunctions/MatrixFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/MatrixFunctions/MatrixFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/MatrixFunctions/MatrixFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/QuaternionMathFunctions/QuaternionMathFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/QuaternionMathFunctions/QuaternionMathFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/StatisticsFunctions/StatisticsFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/StatisticsFunctions/StatisticsFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/StatisticsFunctions/StatisticsFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/StatisticsFunctions/StatisticsFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctionsF16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctionsF16.c
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/CMakeLists.txt
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/CMakeLists.txt
@@ -1,29 +0,0 @@
 #
 # Copyright (c) 2019-2021 Arm Limited.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Top-level build script of the CMSIS-NN library.
 cmake_minimum_required(VERSION 3.15.6)
 project(CMSISNN)
 # CMSIS_PATH points two directory levels above this CMakeLists.txt.
 set(CMSIS_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..")
 # Cache option, ON by default; the Source subdirectory is added to the build
 # only while it is enabled.
 option(BUILD_CMSIS_NN_FUNCTIONS "Build CMSIS-NN Source." ON)
 if(BUILD_CMSIS_NN_FUNCTIONS)
    add_subdirectory(Source)
 endif()
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Include/arm_nn_math_types.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Include/arm_nn_math_types.h
@@ -1,169 +0,0 @@
 /******************************************************************************
 * @file     arm_nn_math_types.h
 * @brief    Compiler include and basic types
 * @version  V1.1.0
 * @date     09 March 2022
 * Target Processor: Cortex-M
 ******************************************************************************/
 /*
 * Copyright (c) 2010-2022 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /**
   Copied from CMSIS/DSP/arm_math_types.h and modified
 */
 #ifndef _ARM_NN_MATH_TYPES_H_
 #define _ARM_NN_MATH_TYPES_H_
 /* DSP include for enum arm_status. */
 #include "arm_math_types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Compiler specific diagnostic adjustment.
  * Every recognised compiler branch below is empty, so the only effect of
  * this chain is to stop the build with #error when none of the listed
  * compiler identification macros is defined. */
 #if defined(__CC_ARM)
 #elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
 #elif defined(__GNUC__)
 #elif defined(__ICCARM__)
 #elif defined(__TI_ARM__)
 #elif defined(__CSMC__)
 #elif defined(__TASKING__)
 #elif defined(_MSC_VER)
 #else
 #error Unknown compiler
 #endif
 /* Included for intrinsics definitions.
  * When _MSC_VER or __GNUC_PYTHON__ is defined, <stdint.h> is included and
  * fallback definitions of __STATIC_FORCEINLINE, __STATIC_INLINE and
  * __ALIGNED are provided unless already defined; every other build
  * includes cmsis_compiler.h instead. */
 #if defined(_MSC_VER)
 #include <stdint.h>
 #ifndef __STATIC_FORCEINLINE
 #define __STATIC_FORCEINLINE static __forceinline
 #endif
 #ifndef __STATIC_INLINE
 #define __STATIC_INLINE static __inline
 #endif
 #ifndef __ALIGNED
 #define __ALIGNED(x) __declspec(align(x))
 #endif
 #elif defined(__GNUC_PYTHON__)
 #include <stdint.h>
 #ifndef __ALIGNED
 #define __ALIGNED(x) __attribute__((aligned(x)))
 #endif
 #ifndef __STATIC_FORCEINLINE
 #define __STATIC_FORCEINLINE static inline __attribute__((always_inline))
 #endif
 #ifndef __STATIC_INLINE
 #define __STATIC_INLINE static inline
 #endif
 #else
 #include "cmsis_compiler.h"
 #endif
 #include <float.h>
 #include <limits.h>
 #include <math.h>
 #include <string.h>
 /* Evaluate ARM DSP feature: define ARM_MATH_DSP as 1 when the compiler
  * reports __ARM_FEATURE_DSP == 1 and the macro is not already defined. */
 #if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
 #ifndef ARM_MATH_DSP
 #define ARM_MATH_DSP 1
 #endif
 #endif
 /* Define ARM_MATH_MVEI (without a value) when __ARM_FEATURE_MVE evaluates
  * to non-zero and the macro is not already defined. */
 #if __ARM_FEATURE_MVE
 #ifndef ARM_MATH_MVEI
 #define ARM_MATH_MVEI
 #endif
 #endif
 /* Compiler specific diagnostic adjustment.
  * Every branch is empty (the GCC branch only holds a commented-out
  * diagnostic pop), so this chain merely rejects unknown compilers. */
 #if defined(__CC_ARM)
 #elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
 #elif defined(__GNUC__)
 // #pragma GCC diagnostic pop
 #elif defined(__ICCARM__)
 #elif defined(__TI_ARM__)
 #elif defined(__CSMC__)
 #elif defined(__TASKING__)
 #elif defined(_MSC_VER)
 #else
 #error Unknown compiler
 #endif
 #ifdef __cplusplus
 }
 #endif
 #if __ARM_FEATURE_MVE
 #include <arm_mve.h>
 #endif
 #ifdef __cplusplus
 extern "C" {
 #endif
 /**
 * @brief Maximum and minimum constants for q31_t, q15_t and q7_t values.
 *
 * Each MAX constant has every bit set except the top bit of its width; each
 * MIN constant has only the top bit set and is cast to the target type.
 * NOTE(review): this assumes q31_t/q15_t/q7_t are the signed 32/16/8-bit
 * typedefs expected from arm_math_types.h - confirm.
 */
 #define NN_Q31_MAX ((q31_t)(0x7FFFFFFFL))
 #define NN_Q15_MAX ((q15_t)(0x7FFF))
 #define NN_Q7_MAX ((q7_t)(0x7F))
 #define NN_Q31_MIN ((q31_t)(0x80000000L))
 #define NN_Q15_MIN ((q15_t)(0x8000))
 #define NN_Q7_MIN ((q7_t)(0x80))
 /**
 * @brief Error status returned by some functions in the library.
 *
 * Success is zero; every error code is negative.
 */
 typedef enum
 {
    ARM_CMSIS_NN_SUCCESS = 0,        /**< No error */
    ARM_CMSIS_NN_ARG_ERROR = -1,     /**< One or more arguments are incorrect */
    ARM_CMSIS_NN_NO_IMPL_ERROR = -2, /**< No implementation available */
 } arm_cmsis_nn_status;
 #ifdef __cplusplus
 }
 #endif
 #endif /*ifndef _ARM_NN_MATH_TYPES_H_ */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Include/arm_nn_tables.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Include/arm_nn_tables.h
@@ -1,56 +0,0 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_tables.h
 * Description:  Extern declaration for NN tables
 *
 * $Date:        17. August 2021
 * $Revision:    V.1.0.2
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef _ARM_NN_TABLES_H
 #define _ARM_NN_TABLES_H
 #include "arm_nn_math_types.h"
 /**
 * @brief Tables for various activation functions
 *
 * Sigmoid and tanh look-up tables in q15_t and q7_t precision. Every table
 * is declared with 256 entries; the definitions live outside this header.
 */
 extern const q15_t sigmoidTable_q15[256];
 extern const q7_t sigmoidTable_q7[256];
 extern const q7_t tanhTable_q7[256];
 extern const q15_t tanhTable_q15[256];
 /**
 * @brief 2-way tables for various activation functions
 *
 * 2-way table, H table for value larger than 1/4
 * L table for value smaller than 1/4, H table for remaining
 * We have this only for the q15_t version. It does not make
 * sense to have it for q7_t type
 *
 * The H table is declared with 192 entries and the L table with 128.
 * NOTE(review): the range description above is ambiguous about which inputs
 * each table covers - confirm against the table definitions.
 */
 extern const q15_t sigmoidHTable_q15[192];
 extern const q15_t sigmoidLTable_q15[128];
 #endif /*  ARM_NN_TABLES_H */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Include/arm_nn_types.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Include/arm_nn_types.h
@@ -1,137 +0,0 @@
 /*
 * Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_types.h
 * Description:  Public header file to contain the CMSIS-NN structs for the
 *               TensorFlowLite micro compliant functions
 *
 * $Date:        22. February 2022
 * $Revision:    V.2.1.0
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */
 #ifndef _ARM_NN_TYPES_H
 #define _ARM_NN_TYPES_H
 #include <stdint.h>
 /** CMSIS-NN object to contain the width and height of a tile.
  *  Used in this header as the type of the stride, padding and dilation
  *  members of the layer parameter structs. */
 typedef struct
 {
    int32_t w; /**< Width */
    int32_t h; /**< Height */
 } cmsis_nn_tile;
 /** CMSIS-NN object used for the function context: a scratch buffer pointer
  *  together with its size. */
 typedef struct
 {
    void *buf;    /**< Pointer to a buffer needed for the optimization */
    int32_t size; /**< Buffer size (presumably in bytes - TODO confirm) */
 } cmsis_nn_context;
 /** CMSIS-NN object to contain the dimensions of the tensors.
  *  The meaning of n depends on the function that receives the struct. */
 typedef struct
 {
    int32_t n; /**< Generic dimension to contain either the batch size or output channels.
                     Please refer to the function documentation for more information */
    int32_t h; /**< Height */
    int32_t w; /**< Width */
    int32_t c; /**< Input channels */
 } cmsis_nn_dims;
 /** CMSIS-NN object for the per-channel quantization parameters.
  *  Both members are pointers (presumably to one value per channel -
  *  TODO confirm); the struct does not store the number of elements. */
 typedef struct
 {
    int32_t *multiplier; /**< Multiplier values */
    int32_t *shift;      /**< Shift values */
 } cmsis_nn_per_channel_quant_params;
 /** CMSIS-NN object for the per-tensor quantization parameters:
  *  a single multiplier/shift pair held by value. */
 typedef struct
 {
    int32_t multiplier; /**< Multiplier value */
    int32_t shift;      /**< Shift value */
 } cmsis_nn_per_tensor_quant_params;
 /** CMSIS-NN object for the quantized Relu activation: the range that a
  *  result is clamped to. */
 typedef struct
 {
    int32_t min; /**< Min value used to clamp the result */
    int32_t max; /**< Max value used to clamp the result */
 } cmsis_nn_activation;
 /** CMSIS-NN object for the convolution layer parameters */
 typedef struct
 {
    int32_t input_offset;  /**< Zero value for the input tensor */
    int32_t output_offset; /**< Zero value for the output tensor */
    cmsis_nn_tile stride;           /**< Stride as a width/height pair */
    cmsis_nn_tile padding;          /**< Padding as a width/height pair */
    cmsis_nn_tile dilation;         /**< Dilation as a width/height pair */
    cmsis_nn_activation activation; /**< Min/max range used to clamp the result */
 } cmsis_nn_conv_params;
 /** CMSIS-NN object for Depthwise convolution layer parameters */
 typedef struct
 {
    int32_t input_offset;  /**< Zero value for the input tensor */
    int32_t output_offset; /**< Zero value for the output tensor */
    int32_t ch_mult;       /**< Channel Multiplier. ch_mult * in_ch = out_ch */
    cmsis_nn_tile stride;           /**< Stride as a width/height pair */
    cmsis_nn_tile padding;          /**< Padding as a width/height pair */
    cmsis_nn_tile dilation;         /**< Dilation as a width/height pair */
    cmsis_nn_activation activation; /**< Min/max range used to clamp the result */
 } cmsis_nn_dw_conv_params;
 /** CMSIS-NN object for pooling layer parameters */
 typedef struct
 {
    cmsis_nn_tile stride;           /**< Stride as a width/height pair */
    cmsis_nn_tile padding;          /**< Padding as a width/height pair */
    cmsis_nn_activation activation; /**< Min/max range used to clamp the result */
 } cmsis_nn_pool_params;
 /** CMSIS-NN object for Fully Connected layer parameters */
 typedef struct
 {
    int32_t input_offset;  /**< Zero value for the input tensor */
    int32_t filter_offset; /**< Zero value for the filter tensor. Not used */
    int32_t output_offset; /**< Zero value for the output tensor */
    cmsis_nn_activation activation; /**< Min/max range used to clamp the result */
 } cmsis_nn_fc_params;
 /** CMSIS-NN object for SVDF layer parameters */
 typedef struct
 {
    int32_t rank;          /**< SVDF rank parameter (see the SVDF function documentation) */
    int32_t input_offset;  /**< Zero value for the input tensor */
    int32_t output_offset; /**< Zero value for the output tensor */
    cmsis_nn_activation input_activation;  /**< Min/max range for the input side (presumably a clamp - TODO confirm) */
    cmsis_nn_activation output_activation; /**< Min/max range for the output side (presumably a clamp - TODO confirm) */
 } cmsis_nn_svdf_params;
 /** CMSIS-NN object for Softmax s16 layer parameters.
  *  Holds two pointers to read-only int16_t look-up tables; the struct does
  *  not store their lengths. */
 typedef struct
 {
    const int16_t *exp_lut;        /**< Look-up table (presumably for exp(x) - TODO confirm) */
    const int16_t *one_by_one_lut; /**< Look-up table (presumably for 1/(1+x) - TODO confirm) */
 } cmsis_nn_softmax_lut_s16;
 #endif // _ARM_NN_TYPES_H
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Include/arm_nnfunctions.h
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Include/arm_nnsupportfunctions.h
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/CMakeLists.txt
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/CMakeLists.txt
@@ -1,30 +0,0 @@
 #
 # Copyright (c) 2019-2021 Arm Limited. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Static library holding the activation functions.
 project(CMSISNNActivation)
 # Only files in this directory whose name ends in _s8.c are collected;
 # sources that do not match the pattern are not compiled into this target.
 file(GLOB SRC "./*_s8.c")
 add_library(CMSISNNActivation STATIC ${SRC})
 ### Includes
 # NN and ROOT are not set in this file; they are expected to be provided by
 # the including project.
 target_include_directories(CMSISNNActivation PUBLIC "${NN}/Include")
 target_include_directories(CMSISNNActivation PUBLIC "${ROOT}/CMSIS/Core/Include")
 target_include_directories(CMSISNNActivation PUBLIC "${ROOT}/CMSIS/DSP/Include")
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q15.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q15.c
@@ -1,96 +0,0 @@
 /*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_activations_q15.c
 * Description:  Q15 neural network activation function using direct table look-up
 *
 * $Date:        09. October 2020
 * $Revision:    V.1.0.1
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nn_tables.h"
 #include "arm_nnfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup Acti
 * @{
 */
 /**
 * @brief Q15 activation (sigmoid or tanh) by table look-up with linear
 *        interpolation, applied in place.
 * @param[in,out]   data        buffer that is read and overwritten
 * @param[in]       size        number of elements
 * @param[in]       int_width   bit-width of the integer part; determines the
 *                              shift (8 + 3 - int_width) and the fraction mask
 * @param[in]       type        ARM_SIGMOID selects the sigmoid table, any
 *                              other value selects the tanh table
 */
 void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
 {
    const uint16_t shift_size = 8 + 3 - int_width;
    const uint32_t bit_mask = 0x7FF >> int_width;
    const uint32_t full_frac = bit_mask + 1;
    const q15_t *table = (type == ARM_SIGMOID) ? sigmoidTable_q15 : tanhTable_q15;
    uint16_t n;
    for (n = 0; n < size; n++)
    {
        const q15_t in = data[n];
        /* low bits give the position between two neighbouring table entries */
        const q15_t frac = (uint32_t)in & bit_mask;
        /* high bits select the table entry */
        const int coarse = in >> shift_size;
        const q15_t value = table[(uint8_t)coarse];
        /* the largest positive index has no right-hand neighbour, so the
           table entry is used unchanged in that case */
        q15_t out = value;
        if (coarse != 0x7f)
        {
            const q15_t value2 = table[(uint8_t)(1 + ((uint8_t)coarse))];
            /* linear interpolation between the two entries */
            out = ((q31_t)(full_frac - frac) * value + (q31_t)value2 * frac) >> shift_size;
        }
        data[n] = out;
    }
 }
 /**
 * @} end of Acti group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q7.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q7.c
@@ -1,89 +0,0 @@
 /*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_activations_q7.c
 * Description:  Q7 neural network activation function using direct table look-up
 *
 * $Date:        09. October 2020
 * $Revision:    V.1.0.1
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nn_tables.h"
 #include "arm_nnfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup Acti
 * @{
 */
 /**
 * @brief Q7 activation (sigmoid or tanh) by direct table look-up, applied
 *        in place.
 * @param[in,out]   data        buffer that is read and overwritten
 * @param[in]       size        number of elements
 * @param[in]       int_width   bit-width of the integer part; the input is
 *                              shifted right by (3 - int_width), so values
 *                              up to 3 are expected
 * @param[in]       type        ARM_SIGMOID selects the sigmoid table, any
 *                              other value selects the tanh table
 *
 * @details
 *
 * Each element is shifted, reinterpreted as an unsigned 8-bit index and
 * replaced by the table entry at that index. No interpolation is done.
 */
 void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
 {
    const uint16_t shift_size = 3 - int_width;
    const q7_t *table = (type == ARM_SIGMOID) ? sigmoidTable_q7 : tanhTable_q7;
    uint16_t n;
    for (n = 0; n < size; n++)
    {
        const q7_t in = data[n];
        data[n] = table[(uint8_t)(in >> shift_size)];
    }
 }
 /**
 * @} end of Acti group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu6_s8.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu6_s8.c
@@ -1,65 +0,0 @@
 /*
 * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_relu6_s8.c
 * Description:  Basic s8 version of ReLU6
 *
 * $Date:        09. October 2020
 * $Revision:    V.1.0.1
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup Acti
 * @{
 */
 /*
 *  Basic ReLU6 function: clamps every element of the buffer to the
 *  range [0, 6] in place.
 *
 *  data: buffer that is read and overwritten
 *  size: number of elements
 */
 void arm_relu6_s8(q7_t *data, uint16_t size)
 {
    uint16_t n;
    for (n = 0; n < size; n++)
    {
        int32_t v = data[n];
        if (v < 0)
        {
            v = 0;
        }
        else if (v > 6)
        {
            v = 6;
        }
        data[n] = (q7_t)v;
    }
 }
 /**
 * @} end of Acti group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
@@ -1,104 +0,0 @@
 /*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_relu_q15.c
 * Description:  Q15 version of ReLU
 *
 * $Date:        09. October 2020
 * $Revision:    V.1.0.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup Acti
 * @{
 */
 /**
 * @brief Q15 RELU function: replaces every negative element with zero,
 *        in place.
 * @param[in,out]   data        buffer that is read and overwritten
 * @param[in]       size        number of elements
 *
 * @details
 *
 * With ARM_MATH_DSP defined, two elements are processed per iteration using
 * the ROR and QSUB16 intrinsics; otherwise a plain element loop is used.
 *
 */
 void arm_relu_q15(q15_t *data, uint16_t size)
 {
 #if defined(ARM_MATH_DSP)
    /* DSP extension available: handle the buffer two elements at a time */
    q15_t *src = data;
    q15_t *dst = data;
    uint16_t pairs;
    for (pairs = size >> 1; pairs != 0; pairs--)
    {
        const q31_t packed = read_q15x2_ia(&src);
        /* move the sign bit of each 16-bit lane down to the lane's bit 0 */
        const q31_t sign_bits = __ROR(packed & 0x80008000, 15);
        /* 0 - sign per lane: 0xFFFF for a negative lane, 0x0000 otherwise */
        const q31_t neg_mask = __QSUB16(0x00000000, sign_bits);
        arm_nn_write_q15x2_ia(&dst, packed & (~neg_mask));
    }
    /* odd element count: handle the last element on its own */
    if ((size & 0x1) && (*src < 0))
    {
        *src = 0;
    }
 #else
    /* Reference implementation for cores without the DSP extension */
    q15_t *cursor = data;
    uint16_t remaining;
    for (remaining = size; remaining != 0; remaining--, cursor++)
    {
        if (*cursor < 0)
        {
            *cursor = 0;
        }
    }
 #endif /* ARM_MATH_DSP */
 }
 /**
 * @} end of Acti group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
@@ -1,109 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_relu_q7.c
 * Description:  Q7 version of ReLU
 *
 * $Date:        20. July 2021
 * $Revision:    V.1.1.3
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup Acti
 * @{
 */
 /**
 * @brief In-place ReLU for a Q7 buffer
 * @param[in,out]   data        buffer to rectify; negative samples are overwritten with 0
 * @param[in]       size        number of elements in the buffer
 *
 * @details
 *
 * With ARM_MATH_DSP defined (and ARM_MATH_MVEI not defined), four samples are
 * handled per iteration: the sign bits are isolated with __ROR and turned into
 * a per-lane mask with __QSUB8. Otherwise a plain element-by-element loop is used.
 *
 */
 void arm_relu_q7(q7_t *data, uint16_t size)
 {
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* M cores with DSP extension: four packed q7 lanes per iteration */
    q7_t *src = data;
    q7_t *dst = data;
    uint16_t remaining;

    for (remaining = size >> 2; remaining != 0; remaining--)
    {
        const q31_t packed = arm_nn_read_q7x4_ia((const q7_t **)&src);
        /* rotate each lane's sign bit (bit 7 of every byte) down to that lane's bit 0 */
        const q31_t sign_bits = (int32_t)__ROR((uint32_t)packed & 0x80808080, 7);
        /* 0 - sign bit, per 8-bit lane: 0xFF where the sample was negative, 0x00 otherwise */
        const q31_t negative_lanes = __QSUB8(0x00000000, sign_bits);
        arm_nn_write_q7x4_ia(&dst, packed & (~negative_lanes));
    }

    /* the 0-3 samples that did not fill a group of four */
    for (remaining = size & 0x3; remaining != 0; remaining--)
    {
        if (*src < 0)
        {
            *src = 0;
        }
        src++;
    }
 #else
    /* Reference implementation for cores without DSP extension */
    uint16_t idx = 0;

    while (idx < size)
    {
        if (data[idx] < 0)
        {
            data[idx] = 0;
        }
        idx++;
    }
 #endif
 }
 /**
 * @} end of Acti group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/BasicMathFunctions/CMakeLists.txt
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/BasicMathFunctions/CMakeLists.txt
@@ -1,31 +0,0 @@
 #
 # Copyright (c) 2019-2021 Arm Limited. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Static library built from the basic-math kernels in this directory.
 # NOTE(review): ${NN} and ${ROOT} are not set in this file; presumably they are
 # inherited from the including scope - confirm.
 # NOTE(review): this directory defines its own target (CMSISNNBasicMaths) rather
 # than adding its sources to a shared library target - confirm that is intended.
 project(CMSISNNBasicMaths)
 # Every C file in this directory whose name contains an underscore
 file(GLOB BASIC_MATH_SOURCES "./*_*.c")
 add_library(CMSISNNBasicMaths STATIC ${BASIC_MATH_SOURCES})
 ### Includes
 target_include_directories(CMSISNNBasicMaths PUBLIC
     "${NN}/Include"
     "${ROOT}/CMSIS/Core/Include"
     "${ROOT}/CMSIS/DSP/Include")
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s16.c
@@ -1,105 +0,0 @@
 /*
 * Copyright (C) 2022 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_elementwise_add_s16
 * Description:  Elementwise add
 *
 * $Date:        14 February 2022
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M CPUs
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup BasicMath
 * @{
 */
 /**
 * @brief s16 element wise add of two vectors
 *
 * For each of the block_size elements: both inputs are shifted left by
 * left_shift, passed through arm_nn_requantize with their own multiplier and
 * shift, summed, requantized with out_mult/out_shift, clamped to
 * [out_activation_min, out_activation_max] and stored as int16_t.
 *
 * @note input_1_offset, input_2_offset and out_offset are accepted but not used
 *       by this implementation.
 * @note The original remark here read "__SHIFT is expected to be <=0". There is
 *       no __SHIFT in this function; presumably it refers to the *_shift
 *       arguments - TODO confirm against the header documentation.
 * @note left_shift is assumed to lie in [0, 31] - TODO confirm against callers.
 *
 * @return ARM_MATH_SUCCESS (always)
 *
 * Refer header file for details.
 *
 */
 arm_status arm_elementwise_add_s16(const int16_t *input_1_vect,
                                    const int16_t *input_2_vect,
                                    const int32_t input_1_offset,
                                    const int32_t input_1_mult,
                                    const int32_t input_1_shift,
                                    const int32_t input_2_offset,
                                    const int32_t input_2_mult,
                                    const int32_t input_2_shift,
                                    const int32_t left_shift,
                                    int16_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t out_activation_min,
                                    const int32_t out_activation_max,
                                    const int32_t block_size)
 {
    (void)input_1_offset;
    (void)input_2_offset;
    (void)out_offset;
    int32_t loop_count;
    int32_t input_1;
    int32_t input_2;
    int32_t sum;
    loop_count = block_size;
    while (loop_count > 0)
    {
        /* C = A + B */
        /* Shift in the unsigned domain: applying << to a negative signed value is
           undefined behaviour in C. Converting back yields the same two's-complement
           bit pattern the signed shift was meant to produce. */
        input_1 = (int32_t)((uint32_t)(*input_1_vect++) << left_shift);
        input_2 = (int32_t)((uint32_t)(*input_2_vect++) << left_shift);
        input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
        input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
        sum = input_1 + input_2;
        sum = arm_nn_requantize(sum, out_mult, out_shift);
        /* Clamp to the activation range before narrowing to int16_t */
        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);
        *output++ = (int16_t)sum;
        /* Decrement loop counter */
        loop_count--;
    }
    return (ARM_MATH_SUCCESS);
 }
 /**
 * @} end of BasicMath group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
@@ -1,255 +0,0 @@
 /*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_elementwise_add_s8
 * Description:  Element wise add
 *
 * $Date:        01. March 2021
 * $Revision:    V.2.5.3
 *
 * Target Processor:  Cortex-M CPUs
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 #endif
 #if defined(ARM_MATH_MVEI)
 /**
 * @brief Scale an MVE vector in place: arm_doubling_high_mult_mve by __MULT,
 *        then arm_divide_by_power_of_two_mve by -__SHIFT.
 * @note  Wrapped in do { } while (0) so that it acts as a single statement
 *        (safe under an unbraced if/else). Invoke it with a trailing semicolon.
 */
 #define SAT_INPUT_VECT(__INPUT_V, __MULT, __SHIFT)                                                                     \
    do                                                                                                                 \
    {                                                                                                                  \
        (__INPUT_V) = arm_doubling_high_mult_mve((__INPUT_V), (__MULT));                                               \
        (__INPUT_V) = arm_divide_by_power_of_two_mve((__INPUT_V), -(__SHIFT));                                         \
    } while (0)
 #endif
 /**
 * @brief Scale a scalar in place: arm_nn_doubling_high_mult_no_sat by __MULT,
 *        then arm_nn_divide_by_power_of_two by -__SHIFT.
 * @note  The *_no_sat API name does not mean that the input is not saturated.
 *        Since __MULT is a positive integer, it is saturated. The API definition
 *        has more info about it.
 * @note  Wrapped in do { } while (0) so that it acts as a single statement
 *        (safe under an unbraced if/else). Invoke it with a trailing semicolon.
 */
 #define SAT_INPUT(__INPUT, __MULT, __SHIFT)                                                                            \
    do                                                                                                                 \
    {                                                                                                                  \
        (__INPUT) = arm_nn_doubling_high_mult_no_sat((__INPUT), (__MULT));                                             \
        (__INPUT) = arm_nn_divide_by_power_of_two((__INPUT), -(__SHIFT));                                              \
    } while (0)
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup BasicMath
 * @{
 */
 /**
 * @brief s8 element wise add of two vectors
 *
 * Per element: add the input offset, shift left by left_shift, scale each
 * input with its own multiplier/shift, add the two, scale the sum with
 * out_mult/out_shift, add out_offset, clamp to
 * [out_activation_min, out_activation_max] and store as int8.
 *
 * One of three implementations is selected at compile time:
 *  - ARM_MATH_MVEI: predicated vector loop, four elements per iteration.
 *  - ARM_MATH_DSP:  four elements per iteration using packed 16-bit lanes,
 *                   followed by a scalar loop for the remaining 0-3 elements.
 *  - otherwise:     scalar loop over all elements.
 *
 * @return ARM_MATH_SUCCESS (always)
 *
 * Refer header file for details.
 *
 */
 /* Note: __SHIFT (the shift argument handed to SAT_INPUT / SAT_INPUT_VECT) is expected to be <=0 */
 arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
                                   const int8_t *input_2_vect,
                                   const int32_t input_1_offset,
                                   const int32_t input_1_mult,
                                   const int32_t input_1_shift,
                                   const int32_t input_2_offset,
                                   const int32_t input_2_mult,
                                   const int32_t input_2_shift,
                                   const int32_t left_shift,
                                   int8_t *output,
                                   const int32_t out_offset,
                                   const int32_t out_mult,
                                   const int32_t out_shift,
                                   const int32_t out_activation_min,
                                   const int32_t out_activation_max,
                                   const uint32_t block_size)
 {
 #if defined(ARM_MATH_MVEI)
    /* Signed counter so that the final "count -= 4" may go negative and end the loop */
    int32_t count = (int32_t)block_size;
    while (count > 0)
    {
        int32x4_t vect_1;
        int32x4_t vect_2;
        /* Predicate built from the number of elements still to process */
        mve_pred16_t p = vctp32q((uint32_t)count);
        vect_1 = vldrbq_z_s32(input_1_vect, p);
        vect_2 = vldrbq_z_s32(input_2_vect, p);
        vect_1 = vaddq_s32(vect_1, vdupq_n_s32(input_1_offset));
        vect_2 = vaddq_s32(vect_2, vdupq_n_s32(input_2_offset));
        vect_1 = vshlq_r_s32(vect_1, left_shift);
        vect_2 = vshlq_r_s32(vect_2, left_shift);
        SAT_INPUT_VECT(vect_1, input_1_mult, input_1_shift);
        SAT_INPUT_VECT(vect_2, input_2_mult, input_2_shift);
        vect_1 = vaddq_s32(vect_1, vect_2);
        SAT_INPUT_VECT(vect_1, out_mult, out_shift);
        vect_1 = vaddq_n_s32(vect_1, out_offset);
        /* Clamp to the activation range */
        vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min));
        vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max));
        input_1_vect += 4;
        input_2_vect += 4;
        /* Predicated store using the same predicate as the loads */
        vstrbq_p_s32(output, vect_1, p);
        output += 4;
        count -= 4;
    }
 #else
    uint32_t loop_count;
    int32_t input_1;
    int32_t input_2;
    int32_t sum;
 #if defined(ARM_MATH_DSP)
    int32_t a_1, b_1, a_2, b_2;
    int32_t offset_1_packed, offset_2_packed;
    int8_t r1, r2, r3, r4;
    /* Replicate the low 16 bits of each offset into both halfwords so that one __SADD16 offsets two lanes */
    /* NOTE(review): "<< 16U" is applied to a signed value that may be negative - confirm this is acceptable
       for the supported toolchains */
    offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
    offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
    loop_count = block_size >> 2;
    while (loop_count > 0U)
    {
        /* 4 outputs are calculated per iteration. The order of calculation follows the lane order produced by
           the sign extension intrinsic: b_x low -> r1, b_x high -> r3, a_x low -> r2, a_x high -> r4 */
        input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
        input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
        a_1 = __SADD16(a_1, offset_1_packed);
        b_1 = __SADD16(b_1, offset_1_packed);
        a_2 = __SADD16(a_2, offset_2_packed);
        b_2 = __SADD16(b_2, offset_2_packed);
        /* NOTE(review): in the four sums below each 16-bit lane is masked but not sign-extended before the
           shift. For a negative lane this gives the same 32-bit pattern as the scalar loop only when
           left_shift >= 16 - confirm callers guarantee that. */
        /* Sum 1 */
        input_1 = (b_1 & 0x0FFFF) << left_shift;
        SAT_INPUT(input_1, input_1_mult, input_1_shift);
        input_2 = (b_2 & 0x0FFFF) << left_shift;
        SAT_INPUT(input_2, input_2_mult, input_2_shift);
        sum = input_1 + input_2;
        SAT_INPUT(sum, out_mult, out_shift);
        sum += out_offset;
        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);
        r1 = (q7_t)sum;
        /* Sum 3 */
        input_1 = ((b_1 >> 16) & 0x0FFFF) << left_shift;
        SAT_INPUT(input_1, input_1_mult, input_1_shift);
        input_2 = ((b_2 >> 16) & 0x0FFFF) << left_shift;
        SAT_INPUT(input_2, input_2_mult, input_2_shift);
        sum = input_1 + input_2;
        SAT_INPUT(sum, out_mult, out_shift);
        sum += out_offset;
        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);
        r3 = (q7_t)sum;
        /* Sum 2 */
        input_1 = (a_1 & 0x0FFFF) << left_shift;
        SAT_INPUT(input_1, input_1_mult, input_1_shift);
        input_2 = (a_2 & 0x0FFFF) << left_shift;
        SAT_INPUT(input_2, input_2_mult, input_2_shift);
        sum = input_1 + input_2;
        SAT_INPUT(sum, out_mult, out_shift);
        sum += out_offset;
        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);
        r2 = (q7_t)sum;
        /* Sum 4 */
        input_1 = ((a_1 >> 16) & 0x0FFFF) << left_shift;
        SAT_INPUT(input_1, input_1_mult, input_1_shift);
        input_2 = ((a_2 >> 16) & 0x0FFFF) << left_shift;
        SAT_INPUT(input_2, input_2_mult, input_2_shift);
        sum = input_1 + input_2;
        SAT_INPUT(sum, out_mult, out_shift);
        sum += out_offset;
        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);
        r4 = (q7_t)sum;
        /* Pack the four results and store them with a single write */
        write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
        loop_count--;
    }
    /* Remaining 0-3 elements are handled by the scalar loop below */
    loop_count = block_size & 0x3;
 #else
    loop_count = block_size;
 #endif
    while (loop_count > 0U)
    {
        /* C = A + B */
        /* NOTE(review): the shifted value may be negative here - confirm left-shifting it is acceptable
           for the supported toolchains */
        input_1 = (*input_1_vect++ + input_1_offset) << left_shift;
        input_2 = (*input_2_vect++ + input_2_offset) << left_shift;
        /* The inputs are scaled with arm_nn_doubling_high_mult here, whereas the DSP loop above uses
           SAT_INPUT (arm_nn_doubling_high_mult_no_sat) */
        input_1 = arm_nn_doubling_high_mult(input_1, input_1_mult);
        input_1 = arm_nn_divide_by_power_of_two(input_1, -input_1_shift);
        input_2 = arm_nn_doubling_high_mult(input_2, input_2_mult);
        input_2 = arm_nn_divide_by_power_of_two(input_2, -input_2_shift);
        sum = input_1 + input_2;
        SAT_INPUT(sum, out_mult, out_shift);
        sum += out_offset;
        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);
        *output++ = (q7_t)sum;
        /* Decrement loop counter */
        loop_count--;
    }
 #endif /* ARM_MATH_MVEI */
    return (ARM_MATH_SUCCESS);
 }
 /**
 * @} end of BasicMath group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s16.c
@@ -1,95 +0,0 @@
 /*
 * Copyright (C) 2022 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_elementwise_mul_s16
 * Description:  Element wise multiplication
 *
 * $Date:        14 February 2022
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup BasicMath
 * @{
 */
 /**
 * @brief s16 element wise multiplication of two vectors
 *
 * For each of the block_size elements: multiply the two inputs, requantize
 * the product with out_mult/out_shift via arm_nn_requantize, clamp it to
 * [out_activation_min, out_activation_max] and store it as int16_t.
 *
 * @note   input_1_offset, input_2_offset and out_offset are accepted but not
 *         used by this implementation.
 * @note   Refer header file for details.
 *
 * @return ARM_MATH_SUCCESS (always)
 *
 */
 arm_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
                                    const int16_t *input_2_vect,
                                    const int32_t input_1_offset,
                                    const int32_t input_2_offset,
                                    int16_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t out_activation_min,
                                    const int32_t out_activation_max,
                                    const int32_t block_size)
 {
    (void)input_1_offset;
    (void)input_2_offset;
    (void)out_offset;

    int32_t remaining;

    for (remaining = block_size; remaining > 0; remaining--)
    {
        /* C = A * B, widened to 32 bit before multiplying */
        const int32_t lhs = *input_1_vect++;
        const int32_t rhs = *input_2_vect++;
        int32_t product = arm_nn_requantize(lhs * rhs, out_mult, out_shift);

        /* Clamp to the activation range before narrowing */
        product = MAX(product, out_activation_min);
        product = MIN(product, out_activation_max);
        *output++ = (int16_t)product;
    }

    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of BasicMath group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
@@ -1,200 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_elementwise_mul_s8
 * Description:  Element wise multiplication
 *
 * $Date:        January 26, 2021
 * $Revision:    V.1.0.5
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup BasicMath
 * @{
 */
 /**
 * @brief s8 element wise multiplication of two vectors
 *
 * Per element: add the input offsets, multiply, requantize the product with
 * out_mult/out_shift, add out_offset, clamp to
 * [out_activation_min, out_activation_max] and store as int8.
 *
 * One of three implementations is selected at compile time:
 *  - ARM_MATH_MVEI: predicated vector loop, four elements per iteration.
 *  - ARM_MATH_DSP:  four elements per iteration using packed 16-bit lanes,
 *                   followed by a scalar loop for the remaining 0-3 elements.
 *  - otherwise:     scalar loop over all elements.
 *
 * @return ARM_MATH_SUCCESS (always)
 *
 * @note   Refer header file for details.
 *
 */
 arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
                                   const int8_t *input_2_vect,
                                   const int32_t input_1_offset,
                                   const int32_t input_2_offset,
                                   int8_t *output,
                                   const int32_t out_offset,
                                   const int32_t out_mult,
                                   const int32_t out_shift,
                                   const int32_t out_activation_min,
                                   const int32_t out_activation_max,
                                   const uint32_t block_size)
 {
    int32_t loop_count;
 #if defined(ARM_MATH_MVEI)
    /* Number of 4-element vector iterations, rounded up */
    loop_count = (block_size + 3) / 4;
    uint32_t num_elements = block_size;
    for (int i = 0; i < loop_count; i++)
    {
        /* Predicate built from the number of elements still to process */
        mve_pred16_t p = vctp32q(num_elements);
        int32x4_t input_1 = vldrbq_z_s32(input_1_vect, p);
        input_1 = vaddq_n_s32(input_1, input_1_offset);
        int32x4_t input_2 = vldrbq_z_s32(input_2_vect, p);
        input_2 = vaddq_n_s32(input_2, input_2_offset);
        int32x4_t res_0 = vmulq_s32(input_1, input_2);
        res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift));
        res_0 += vdupq_n_s32(out_offset);
        /* Clamp to the activation range */
        res_0 = vmaxq_s32(res_0, vdupq_n_s32(out_activation_min));
        res_0 = vminq_s32(res_0, vdupq_n_s32(out_activation_max));
        vstrbq_p_s32(output, res_0, p);
        input_1_vect += 4;
        input_2_vect += 4;
        output += 4;
        /* Unsigned: wraps on the last iteration when block_size is not a multiple of 4, but the loop
           ends before the wrapped value is used */
        num_elements -= 4;
    }
 #else
    int32_t input_1;
    int32_t input_2;
    int32_t mul_res;
 #if defined(ARM_MATH_DSP)
    int32_t a_1, b_1, a_2, b_2;
    int32_t offset_1_packed, offset_2_packed;
    int8_t r1, r2, r3, r4;
    /* Replicate the low 16 bits of each offset into both halfwords so that one __SADD16 offsets two lanes */
    /* NOTE(review): "<< 16U" is applied to a signed value that may be negative - confirm this is acceptable
       for the supported toolchains */
    offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
    offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
    loop_count = block_size >> 2;
    while (loop_count > 0)
    {
        /* 4 outputs are calculated per iteration. The order of calculation follows the lane order produced by
           the sign extension intrinsic: b_x low -> r1, b_x high -> r3, a_x low -> r2, a_x high -> r4 */
        input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
        input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
        a_1 = __SADD16(a_1, offset_1_packed);
        b_1 = __SADD16(b_1, offset_1_packed);
        a_2 = __SADD16(a_2, offset_2_packed);
        b_2 = __SADD16(b_2, offset_2_packed);
        /* Each lane is extracted through an int16_t cast, which sign-extends it into the 32-bit operand */
        /* Mul 1 */
        input_1 = (int16_t)(b_1 & 0x0FFFFL);
        input_2 = (int16_t)(b_2 & 0x0FFFFL);
        mul_res = input_1 * input_2;
        mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
        mul_res = MAX(mul_res, out_activation_min);
        mul_res = MIN(mul_res, out_activation_max);
        r1 = (q7_t)mul_res;
        /* Mul 3 */
        input_1 = (int16_t)((b_1 >> 16U) & 0x0FFFFL);
        input_2 = (int16_t)((b_2 >> 16U) & 0x0FFFFL);
        mul_res = input_1 * input_2;
        mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
        mul_res = MAX(mul_res, out_activation_min);
        mul_res = MIN(mul_res, out_activation_max);
        r3 = (q7_t)mul_res;
        /* Mul 2 */
        input_1 = (int16_t)(a_1 & 0x0FFFFL);
        input_2 = (int16_t)(a_2 & 0x0FFFFL);
        mul_res = input_1 * input_2;
        mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
        mul_res = MAX(mul_res, out_activation_min);
        mul_res = MIN(mul_res, out_activation_max);
        r2 = (q7_t)mul_res;
        /* Mul 4 */
        input_1 = (int16_t)((a_1 >> 16U) & 0x0FFFFL);
        input_2 = (int16_t)((a_2 >> 16U) & 0x0FFFFL);
        mul_res = input_1 * input_2;
        mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
        mul_res = MAX(mul_res, out_activation_min);
        mul_res = MIN(mul_res, out_activation_max);
        r4 = (q7_t)mul_res;
        /* Pack the four results and store them with a single write */
        write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
        loop_count--;
    }
    /* Remaining 0-3 elements are handled by the scalar loop below */
    loop_count = block_size & 0x3;
 #else
    loop_count = block_size;
 #endif
    while (loop_count > 0)
    {
        /* C = A * B */
        input_1 = *input_1_vect++ + input_1_offset;
        input_2 = *input_2_vect++ + input_2_offset;
        mul_res = input_1 * input_2;
        mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
        /* Clamp to the activation range before narrowing to int8 */
        mul_res = MAX(mul_res, out_activation_min);
        mul_res = MIN(mul_res, out_activation_max);
        *output++ = (q7_t)mul_res;
        /* Decrement loop counter */
        loop_count--;
    }
 #endif
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of BasicMath group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/CMakeLists.txt
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/CMakeLists.txt
@@ -1,98 +0,0 @@
 #
 # Copyright (c) 2019-2021 Arm Limited.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Top-level build script for the cmsis-nn static library.
 # CMSIS_PATH must be provided by the caller; it is used as the root for all include paths below.
 SET(ROOT ${CMSIS_PATH})
 # Select which parts of CMSIS-NN must be compiled.
 # There are some dependencies between the parts but they are not tracked
 # by this cmake. So, enabling some functions may require to enable some
 # other ones.
 option(CONCATENATION        "Concatenation"         ON)
 option(FULLYCONNECTED       "Fully Connected"       ON)
 option(CONVOLUTION          "Convolutions"          ON)
 option(ACTIVATION           "Activations"           ON)
 option(POOLING              "Pooling"               ON)
 option(SOFTMAX              "Softmax"               ON)
 option(BASICMATHSNN         "Basic Maths for NN"    ON)
 option(RESHAPE              "Reshape"               ON)
 option(SVDF                 "SVDF"                  ON)
 # When OFF it is the default behavior : all tables are included.
 # NOTE(review): the line above does not obviously describe NNSUPPORT (no tables are referenced in this
 # script) - confirm whether it is a leftover.
 option(NNSUPPORT            "NN Support"            ON)
 ###########################
 #
 # CMSIS NN
 #
 ###########################
 # NN Sources
 SET(NN ${ROOT}/CMSIS/NN)
 list(APPEND CMAKE_MODULE_PATH ${NN}/Source)
 # Library target; no sources are listed here, the subdirectories added below are expected to supply them
 add_library(cmsis-nn STATIC)
 target_compile_options(cmsis-nn PRIVATE -Ofast)
 ### Includes
 target_include_directories(cmsis-nn PUBLIC "${NN}/Include")
 target_include_directories(cmsis-nn PUBLIC "${ROOT}/CMSIS/Core/Include")
 target_include_directories(cmsis-nn PUBLIC "${ROOT}/CMSIS/DSP/Include")
 # One subdirectory per enabled function group
 if (BASICMATHSNN)
  add_subdirectory(BasicMathFunctions)
 endif()
 if (CONCATENATION)
  add_subdirectory(ConcatenationFunctions)
 endif()
 if (FULLYCONNECTED)
  add_subdirectory(FullyConnectedFunctions)
 endif()
 if (CONVOLUTION)
  add_subdirectory(ConvolutionFunctions)
 endif()
 if (ACTIVATION)
  add_subdirectory(ActivationFunctions)
 endif()
 if (POOLING)
  add_subdirectory(PoolingFunctions)
 endif()
 if (SOFTMAX)
  add_subdirectory(SoftmaxFunctions)
 endif()
 if (SVDF)
  add_subdirectory(SVDFunctions)
 endif()
 if (RESHAPE)
  add_subdirectory(ReshapeFunctions)
 endif()
 # Keep NNSUPPORT at the end
 if (NNSUPPORT)
  add_subdirectory(NNSupportFunctions)
 endif()
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConcatenationFunctions/CMakeLists.txt
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConcatenationFunctions/CMakeLists.txt
@@ -1,20 +0,0 @@
 #
 # Copyright (c) 2019-2021 Arm Limited.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Add every C file in this directory whose name contains an underscore to the cmsis-nn target.
 # The cmsis-nn target itself is not created here; it must already exist when this script runs.
 file(GLOB CONCATENATION_SOURCES "./*_*.c")
 target_sources(cmsis-nn PRIVATE ${CONCATENATION_SOURCES})
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_w.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_w.c
@@ -1,66 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_concatenation_s8_w.c
 * Description:  s8 version of concatenation along the W axis
 *
 * $Date:        October 2019
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup Concatenation
 * @{
 */
 /*
 *  s8 version of concatenation along the W axis
 *
 * Copies input_x * input_y * input_z * input_w elements from input to
 * output, starting offset_w * (input_x * input_y * input_z) elements into
 * the output buffer. The copy is delegated to arm_memcpy_q7.
 *
 * Refer to header file for details.
 *
 */
 void arm_concatenation_s8_w(const int8_t *input,
                             const uint16_t input_x,
                             const uint16_t input_y,
                             const uint16_t input_z,
                             const uint16_t input_w,
                             int8_t *output,
                             const uint32_t offset_w)
 {
    /* Widen before multiplying: uint16_t operands are otherwise promoted to signed int,
       and a large product would overflow it (undefined behaviour). */
    const uint32_t input_slice_size = (uint32_t)input_x * input_y * input_z;
    const uint32_t input_copy_size = input_slice_size * input_w;
    output += offset_w * input_slice_size;
    arm_memcpy_q7(output, input, input_copy_size);
 }
 /**
 * @} end of Concatenation group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_x.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_x.c
@@ -1,75 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_concatenation_s8_x.c
 * Description:  s8 version of concatenation along the X axis
 *
 * $Date:        October 2019
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup Concatenation
 * @{
 */
 /*
 *  s8 version of concatenation along the X axis
 *
 * Copies input_y * input_z * input_w rows of input_x elements each. Rows are
 * read back to back from input and written to output starting at offset_x,
 * advancing output_x elements per row. Each row copy is delegated to
 * arm_memcpy_q7.
 *
 * Refer to header file for details.
 *
 */
 void arm_concatenation_s8_x(const int8_t *input,
                             const uint16_t input_x,
                             const uint16_t input_y,
                             const uint16_t input_z,
                             const uint16_t input_w,
                             int8_t *output,
                             const uint16_t output_x,
                             const uint32_t offset_x)
 {
    /* Widen before multiplying: uint16_t operands are otherwise promoted to signed int,
       and a large product would overflow it (undefined behaviour). */
    const uint32_t num_iterations = (uint32_t)input_y * input_z * input_w;
    output += offset_x;
    uint32_t i;
    // Copy per row
    for (i = 0; i < num_iterations; ++i)
    {
        arm_memcpy_q7(output, input, input_x);
        input += input_x;
        output += output_x;
    }
 }
 /**
 * @} end of Concatenation group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c
@@ -1,76 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_concatenation_s8_y.c
 * Description:  s8 version of concatenation along the Y axis
 *
 * $Date:        October 2019
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup Concatenation
 * @{
 */
 /*
 *  s8 version of concatenation along the Y axis
 *
 * Copies (input_z * input_w) tiles of (input_x * input_y) bytes each. The
 * first tile is written at output + offset_y * input_x; after every tile the
 * input advances by one tile and the output advances by input_x * output_y.
 *
 * Refer to header file for details.
 *
 */
 void arm_concatenation_s8_y(const int8_t *input,
                             const uint16_t input_x,
                             const uint16_t input_y,
                             const uint16_t input_z,
                             const uint16_t input_w,
                             int8_t *output,
                             const uint16_t output_y,
                             const uint32_t offset_y)
 {
    /* uint16_t operands promote to signed int; two large dimensions would
     * overflow int (undefined behaviour). Widen to uint32_t before multiplying. */
    const uint32_t num_iterations = (uint32_t)input_z * input_w;
    const uint32_t input_copy_size = (uint32_t)input_x * input_y;
    const uint32_t output_stride = (uint32_t)input_x * output_y;
    uint32_t i;
    /* Skip offset_y full rows of input_x bytes inside each output tile. */
    output += offset_y * input_x;
    // Copy per tile
    for (i = 0; i < num_iterations; ++i)
    {
        arm_memcpy_q7(output, input, input_copy_size);
        input += input_copy_size;
        output += output_stride;
    }
 }
 /**
 * @} end of Concatenation group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c
@@ -1,75 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_concatenation_s8_z.c
 * Description:  s8 version of concatenation along the Z axis
 *
 * $Date:        October 2019
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup Concatenation
 * @{
 */
 /*
 *  s8 version of concatenation along the Z axis
 *
 * Copies input_w blocks of (input_x * input_y * input_z) bytes each. The
 * first block is written at output + offset_z * (input_x * input_y); after
 * every block the input advances by one block and the output advances by
 * input_x * input_y * output_z.
 *
 * Refer to header file for details.
 *
 */
 void arm_concatenation_s8_z(const int8_t *input,
                             const uint16_t input_x,
                             const uint16_t input_y,
                             const uint16_t input_z,
                             const uint16_t input_w,
                             int8_t *output,
                             const uint16_t output_z,
                             const uint32_t offset_z)
 {
    /* uint16_t operands promote to signed int, so the plain products could
     * overflow int (undefined behaviour). Compute the X*Y plane size once in
     * uint32_t and derive everything else from it. */
    const uint32_t plane_size = (uint32_t)input_x * input_y;
    const uint32_t input_copy_size = plane_size * input_z;
    const uint32_t output_stride = plane_size * output_z;
    uint32_t i;
    /* Skip offset_z full X*Y planes inside each output block. */
    output += offset_z * plane_size;
    for (i = 0; i < input_w; ++i)
    {
        arm_memcpy_q7(output, input, input_copy_size);
        input += input_copy_size;
        output += output_stride;
    }
 }
 /**
 * @} end of Concatenation group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
@@ -1,24 +0,0 @@
 #
 # Copyright (c) 2019-2022 Arm Limited.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Gather the convolution sources by name pattern: files matching ./*_s8*.c go
 # into SRC and files matching ./*_s16*.c go into SRC_S16.
 file(GLOB SRC "./*_s8*.c")
 file(GLOB SRC_S16 "./*_s16*.c")
 # Attach both lists to the cmsis-nn target as PRIVATE sources.
 target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16})
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c
@@ -1,205 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_1_x_n_s8.c
 * Description:  s8 version of 1xN convolution using symmetric quantization.
 *
 * $Date:        December 14, 2021
 * $Revision:    V.2.1.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 * 1xN s8 convolution function.
 *
 * Refer to the header file for the full contract. What this implementation does:
 *  - Returns ARM_MATH_SIZE_MISMATCH, before any other work, when
 *    output_dims->w is not a multiple of 4.
 *  - With ARM_MATH_MVEI defined, produces four output columns per loop
 *    iteration; ctx is unused and only the w and c members of the dims
 *    structures are read.
 *  - Otherwise forwards every argument unchanged to arm_convolve_s8() and
 *    returns its status.
 *
 * bias_dims is never read in either configuration.
 *
 */
 arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
                                  const cmsis_nn_conv_params *conv_params,
                                  const cmsis_nn_per_channel_quant_params *quant_params,
                                  const cmsis_nn_dims *input_dims,
                                  const q7_t *input_data,
                                  const cmsis_nn_dims *filter_dims,
                                  const q7_t *filter_data,
                                  const cmsis_nn_dims *bias_dims,
                                  const int32_t *bias_data,
                                  const cmsis_nn_dims *output_dims,
                                  q7_t *output_data)
 {
    (void)bias_dims;
    arm_status status = ARM_MATH_SUCCESS;
    /* The output width must be a multiple of 4; the MVE loop below steps
     * through the output columns four at a time. */
    if (output_dims->w % 4 != 0)
    {
        status = ARM_MATH_SIZE_MISMATCH;
        goto out;
    }
 #if defined(ARM_MATH_MVEI)
    (void)ctx;
    const uint16_t input_x = input_dims->w;
    const uint16_t kernel_x = filter_dims->w;
    const uint16_t output_x = output_dims->w;
    const uint16_t output_ch = output_dims->c;
    const uint16_t input_ch = input_dims->c;
    const uint16_t pad_x = conv_params->padding.w;
    const uint16_t stride_x = conv_params->stride.w;
    const int32_t input_offset = conv_params->input_offset;
    const int32_t out_offset = conv_params->output_offset;
    const int32_t out_activation_min = conv_params->activation.min;
    const int32_t out_activation_max = conv_params->activation.max;
    int32_t *output_mult = quant_params->multiplier;
    int32_t *output_shift = quant_params->shift;
    /* Each iteration handles output columns i_out_x .. i_out_x + 3. */
    for (int i_out_x = 0; i_out_x <= (output_x - 4); i_out_x += 4)
    {
        int32_t input_begin_idx[4];
        int32_t ker_begin_idx[4];
        int32_t ker_end_idx[4];
        /* For each of the four columns, clip the kernel window to the input:
         * est_input_x_idx is the unclipped input column of the first tap and
         * is negative when the window starts inside the left padding. */
        for (int i = 0; i < 4; i++)
        {
            const int32_t est_input_x_idx = stride_x * (i_out_x + i) - pad_x;
            input_begin_idx[i] = MAX(0, est_input_x_idx);
            ker_begin_idx[i] = MAX(0, -est_input_x_idx);
            ker_end_idx[i] = MIN(kernel_x, input_x - est_input_x_idx);
        }
        /* Edge case: the first column's window is clipped at its start or the
         * fourth column's window is clipped at its end. Each column is then
         * computed separately over its own clipped tap range. */
        if ((ker_begin_idx[0] != 0) || (ker_end_idx[3] != kernel_x))
        {
            for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
            {
                int32x4_t s_offset;
                int32_t acc[4];
                {
                    /* One arm_nn_mat_mul_core_1x_s8() call per column; each
                     * fills sum_row[k] and acc[k] for column k. */
                    int32_t sum_row[4];
                    (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[0] - ker_begin_idx[0]) * input_ch,
                                                    input_data + input_begin_idx[0] * input_ch,
                                                    filter_data + (input_ch * kernel_x * i_out_ch) +
                                                        (ker_begin_idx[0] * input_ch),
                                                    &sum_row[0],
                                                    &acc[0]);
                    (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[1] - ker_begin_idx[1]) * input_ch,
                                                    input_data + input_begin_idx[1] * input_ch,
                                                    filter_data + (input_ch * kernel_x * i_out_ch) +
                                                        (ker_begin_idx[1] * input_ch),
                                                    &sum_row[1],
                                                    &acc[1]);
                    (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[2] - ker_begin_idx[2]) * input_ch,
                                                    input_data + input_begin_idx[2] * input_ch,
                                                    filter_data + (input_ch * kernel_x * i_out_ch) +
                                                        (ker_begin_idx[2] * input_ch),
                                                    &sum_row[2],
                                                    &acc[2]);
                    (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[3] - ker_begin_idx[3]) * input_ch,
                                                    input_data + input_begin_idx[3] * input_ch,
                                                    filter_data + (input_ch * kernel_x * i_out_ch) +
                                                        (ker_begin_idx[3] * input_ch),
                                                    &sum_row[3],
                                                    &acc[3]);
                    s_offset = vldrwq_s32(sum_row);
                }
                /* res = acc + sum_row * input_offset (+ bias), then requantize,
                 * add the output offset and clamp to the activation range. */
                int32x4_t res = vldrwq_s32(acc);
                s_offset = vmulq_n_s32(s_offset, input_offset);
                res = vaddq_s32(res, s_offset);
                /* bias_data is optional on this branch. */
                if (bias_data)
                {
                    res = vaddq_n_s32(res, bias_data[i_out_ch]);
                }
                res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
                res = vaddq_n_s32(res, out_offset);
                res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
                res = vminq_s32(res, vdupq_n_s32(out_activation_max));
                /* The four lanes belong to four consecutive output columns,
                 * which sit output_ch bytes apart for a given channel. */
                const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
                vstrbq_scatter_offset_s32(output_data, scatter_offset, res);
                output_data++;
            }
            /* The channel loop advanced output_data by output_ch (one column);
             * skip the other three columns that were written via the scatter. */
            output_data += (3 * output_ch);
        }
        else
        {
            /* No clipping on this group of four: hand the whole group to
             * arm_nn_mat_mul_core_4x_s8(), whose return value becomes the
             * new output write pointer. */
            output_data = arm_nn_mat_mul_core_4x_s8(kernel_x * input_ch,
                                                    stride_x * input_ch,
                                                    input_data + input_begin_idx[0] * input_ch,
                                                    filter_data,
                                                    output_ch,
                                                    conv_params,
                                                    quant_params,
                                                    bias_data,
                                                    output_data);
        }
    }
 #else
    /* Without MVE there is no specialised kernel here; delegate to the
     * generic s8 convolution. */
    status = arm_convolve_s8(ctx,
                             conv_params,
                             quant_params,
                             input_dims,
                             input_data,
                             filter_dims,
                             filter_data,
                             bias_dims,
                             bias_data,
                             output_dims,
                             output_data);
 #endif
 out:
    /* Return to application */
    return status;
 }
 /*
 * Size in bytes of the scratch buffer associated with arm_convolve_1_x_n_s8().
 *
 * With ARM_MATH_MVEI defined the result is always 0 and both arguments are
 * ignored. Otherwise it is 2 * input channels * filter width * filter height
 * elements of sizeof(int16_t) bytes each.
 */
 int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
 {
 #if defined(ARM_MATH_MVEI)
    (void)filter_dims;
    (void)input_dims;
    return 0;
 #else
    const int32_t element_count = 2 * input_dims->c * filter_dims->w * filter_dims->h;
    return element_count * sizeof(int16_t);
 #endif
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
@@ -1,235 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_1x1_HWC_q7_fast_nonsquare.c
 * Description:  Fast Q7 version of 1x1 convolution (non-square shape)
 *
 * $Date:        July 20, 2021
 * $Revision:    V.1.1.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x (must be 1)
 * @param[in]       dim_kernel_y filter kernel size y (must be 1)
 * @param[in]       padding_x    padding size x (must be 0)
 * @param[in]       padding_y    padding size y (must be 0)
 * @param[in]       stride_x     convolution stride x (must be 1)
 * @param[in]       stride_y     convolution stride y (must be 1)
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input (only used by the DSP implementation)
 * @param[in,out]   bufferB      pointer to buffer space for output (not used by this implementation)
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1
 * and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise
 * separable convolution.
 *
 * This function is the version with full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is multiple of 4
 *   ch_im_out is multiple of 2
 *   kernel is 1x1, padding is 0 and stride is 1 in both directions
 *
 * Any violation of these constraints makes the function return
 * ARM_MATH_SIZE_MISMATCH before the output is written.
 *
 * [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications
 * https://arxiv.org/abs/1704.04861
 */
 arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                                   const uint16_t dim_im_in_x,
                                                   const uint16_t dim_im_in_y,
                                                   const uint16_t ch_im_in,
                                                   const q7_t *wt,
                                                   const uint16_t ch_im_out,
                                                   const uint16_t dim_kernel_x,
                                                   const uint16_t dim_kernel_y,
                                                   const uint16_t padding_x,
                                                   const uint16_t padding_y,
                                                   const uint16_t stride_x,
                                                   const uint16_t stride_y,
                                                   const q7_t *bias,
                                                   const uint16_t bias_shift,
                                                   const uint16_t out_shift,
                                                   q7_t *Im_out,
                                                   const uint16_t dim_im_out_x,
                                                   const uint16_t dim_im_out_y,
                                                   q15_t *bufferA,
                                                   q7_t *bufferB)
 {
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    (void)dim_im_in_y;
    /* The counters are 32-bit on purpose: the bounds are uint16_t, so a 16-bit
     * signed counter could never reach a bound above 32767 and the loop would
     * not terminate. */
    int32_t i_out_y, i_out_x;
    int32_t i_ch_out;
    /* -----------------------
     *  Here we use bufferA as q15_t internally as computation are done with q15_t level
     *  im2col are done to output in q15_t format from q7_t input
     */
    q15_t *pBuffer = bufferA;
    q7_t *pOut = Im_out;
    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 ||
        padding_y != 0 || stride_x != 1 || stride_y != 1)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* This part implements the im2col function */
            arm_q7_to_q15_reordered_no_shift(
                (q7_t *)Im_in + (i_out_y * dim_im_in_x + i_out_x) * ch_im_in, pBuffer, ch_im_in);
            pBuffer += ch_im_in;
            /* Once two columns are buffered, run the two-column kernel and
             * start refilling bufferA from the beginning. */
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }
    /* check if there is left-over for compute */
    if (pBuffer != bufferA)
    {
        /* A single column is still buffered (odd number of output pixels). */
        const q7_t *pA = wt;
        for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
        {
            q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
            const q15_t *pB = bufferA;
            /* basically each time it process 4 entries */
            uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
            while (colCnt)
            {
                q31_t inA1, inA2;
                q31_t inB1, inB2;
                pA = read_and_pad_reordered(pA, &inA1, &inA2);
                inB1 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA1, inB1, sum);
                inB2 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA2, inB2, sum);
                colCnt--;
            }
            /* Remaining 0..3 entries, one at a time. */
            colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
            while (colCnt)
            {
                q7_t inA1 = *pA++;
                q15_t inB1 = *pB++;
                sum += inA1 * inB1;
                colCnt--;
            }
            *pOut = (q7_t)__SSAT((sum >> out_shift), 8);
            pOut++;
        }
    }
 #else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;
    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 ||
        padding_y != 0 || stride_x != 1 || stride_y != 1)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out_y; j++)
        {
            for (k = 0; k < dim_im_out_x; k++)
            {
                conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel_y; m++)
                {
                    for (n = 0; n < dim_kernel_x; n++)
                    {
                        // if-for implementation
                        in_row = stride_y * j + m - padding_y;
                        in_col = stride_x * k + n - padding_x;
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                /* Kernel row m, column n: the row stride of the
                                 * kernel is its width (dim_kernel_x). */
                                conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in +
                                       l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c
@@ -1,161 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_1x1_s8_fast.c
 * Description:  Fast q7 version of 1x1 convolution (non-square shape)
 *
 * $Date:        12. November 2021
 * $Revision:    V.2.0.4
 *
 * Target Processor:  Cortex-M Processors
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 #include <stdio.h>
 #define DIM_KER_X (1U)
 #define DIM_KER_Y (1U)
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 * Fast s8 version for 1x1 convolution (non-square shape)
 *
 * Accepts only inputs whose channel count is a multiple of 4, with zero
 * padding and unit stride in both directions; anything else yields
 * ARM_MATH_SIZE_MISMATCH. ctx, filter_dims and bias_dims are not read.
 *
 * Refer to the header file for details.
 *
 */
 arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
                                     const cmsis_nn_conv_params *conv_params,
                                     const cmsis_nn_per_channel_quant_params *quant_params,
                                     const cmsis_nn_dims *input_dims,
                                     const q7_t *input_data,
                                     const cmsis_nn_dims *filter_dims,
                                     const q7_t *filter_data,
                                     const cmsis_nn_dims *bias_dims,
                                     const int32_t *bias_data,
                                     const cmsis_nn_dims *output_dims,
                                     q7_t *output_data)
 {
    /* Present in the signature only; never consulted here. */
    (void)bias_dims;
    (void)filter_dims;
    (void)ctx;
    /* Supported shape: channels in multiples of 4, no padding, stride 1x1. */
    const int is_supported = (input_dims->c % 4 == 0) && (conv_params->padding.w == 0) &&
        (conv_params->padding.h == 0) && (conv_params->stride.w == 1) && (conv_params->stride.h == 1);
    if (!is_supported)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
 #if defined(ARM_MATH_MVEI)
    /* Every (w, h, n) position is one independent row of input_ch values. */
    const int32_t num_pixels = input_dims->w * input_dims->h * input_dims->n;
    const int32_t in_channels = input_dims->c;
    const int32_t out_channels = output_dims->c;
    const int32_t in_offset = conv_params->input_offset;
    const int32_t result_offset = conv_params->output_offset;
    const int32_t act_min = conv_params->activation.min;
    const int32_t act_max = conv_params->activation.max;
    int32_t *multipliers = quant_params->multiplier;
    int32_t *shifts = quant_params->shift;
    /* Main part: four pixels per call; the callee returns the next write position. */
    for (int px = 0; px <= (num_pixels - 4); px += 4)
    {
        output_data = arm_nn_mat_mul_core_4x_s8(in_channels,
                                                in_channels,
                                                input_data + px * in_channels,
                                                filter_data,
                                                out_channels,
                                                conv_params,
                                                quant_params,
                                                bias_data,
                                                output_data);
    }
    /* Tail: the pixels past the last multiple of four, one channel at a time. */
    for (int px = (num_pixels & ~0x3); px < num_pixels; px++)
    {
        for (int ch = 0; ch < out_channels; ch++)
        {
            int32_t input_sum = 0;
            int32_t acc;
            (void)arm_nn_mat_mul_core_1x_s8(
                in_channels, input_data + px * in_channels, filter_data + ch * in_channels, &input_sum, &acc);
            if (bias_data)
            {
                acc += bias_data[ch];
            }
            /* Fold in the input offset contribution, then requantize. */
            input_sum = (input_sum * in_offset);
            acc += input_sum;
            acc = arm_nn_requantize(acc, multipliers[ch], shifts[ch]);
            acc += result_offset;
            /* Clamp to [act_min, act_max]: lower bound first, then upper. */
            acc = MAX(acc, act_min);
            acc = MIN(acc, act_max);
            *output_data = acc;
            output_data++;
        }
    }
 #else
    /* Run the following code as reference implementation for Cortex-M processors with or without DSP extension */
    /* The 1x1 convolution is a matrix product: one lhs row per pixel,
     * one rhs row per output channel, one column per input channel. */
    const int32_t pixel_rows = input_dims->w * input_dims->h * input_dims->n;
    const int32_t filter_rows = output_dims->c;
    const int32_t channel_cols = input_dims->c;
    arm_nn_mat_mult_nt_t_s8(input_data,
                            filter_data,
                            bias_data,
                            output_data,
                            quant_params->multiplier,
                            quant_params->shift,
                            pixel_rows,
                            filter_rows,
                            channel_cols,
                            conv_params->input_offset,
                            conv_params->output_offset,
                            conv_params->activation.min,
                            conv_params->activation.max);
 #endif
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /*
 * Scratch buffer size, in bytes, reported for arm_convolve_1x1_s8_fast().
 * Always 0; input_dims is accepted but not examined.
 */
 int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims)
 {
    const int32_t required_bytes = 0;
    (void)input_dims;
    return required_bytes;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c
@@ -1,209 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q15_basic.c
 * Description:  Q15 version of convolution
 *
 * $Date:        July 20, 2021
 * $Revision:    V.1.1.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Basic Q15 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias (read for every output channel, must not be NULL)
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * bufferA is only touched when the DSP implementation is compiled in; the
 * reference implementation ignores it. bufferB is never used.
 *
 * <b>Data layout (as indexed by the code):</b>
 *
 * Im_in and Im_out are addressed as [y][x][channel]; wt is addressed as
 * [output channel][kernel y][kernel x][input channel].
 *
 * This basic version is designed to work for any input tensor and weight
 * dimension.
 */
 arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
                                      const uint16_t dim_im_in,
                                      const uint16_t ch_im_in,
                                      const q15_t *wt,
                                      const uint16_t ch_im_out,
                                      const uint16_t dim_kernel,
                                      const uint16_t padding,
                                      const uint16_t stride,
                                      const q15_t *bias,
                                      const uint16_t bias_shift,
                                      const uint16_t out_shift,
                                      q15_t *Im_out,
                                      const uint16_t dim_im_out,
                                      q15_t *bufferA,
                                      q7_t *bufferB)
 {
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    /* Number of q15 values in one im2col column, which is also the length of one filter.
     * Held in 32 bits: a 16-bit counter would silently truncate large kernels and drop
     * multiply-accumulate terms. */
    const int32_t num_col = ch_im_in * dim_kernel * dim_kernel;
    q15_t *pBuffer = bufferA;
    q15_t *pOut = Im_out;
    q15_t *im_buffer = bufferA;

    /* 32-bit indices: the dimensions are uint16_t, so 16-bit signed indices could overflow */
    for (int32_t i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
    {
        for (int32_t i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            /* Top-left corner of the receptive field; negative inside the padding area */
            const int32_t ker_y_start = i_out_y * stride - padding;
            const int32_t ker_x_start = i_out_x * stride - padding;

            /* This part implements the im2col function */
            for (int32_t i_ker_y = ker_y_start; i_ker_y < ker_y_start + dim_kernel; i_ker_y++)
            {
                for (int32_t i_ker_x = ker_x_start; i_ker_x < ker_x_start + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* Filling 0 for out-of-bound paddings */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        /* Copy all channels of one input pixel */
                        memcpy(pBuffer,
                               Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
                               sizeof(q15_t) * ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            /* Dot product of the gathered column with every filter; pA walks through
             * the filters back to back, so it is only rewound once per output pixel. */
            const q15_t *pA = wt;
            for (int i = 0; i < ch_im_out; i++)
            {
                q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                const q15_t *pB = im_buffer;

                /* Main loop: four q15 products per iteration (two packed reads per operand) */
                int32_t colCnt = num_col >> 2;
                while (colCnt)
                {
                    q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
                    q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
                    q31_t inA2 = arm_nn_read_q15x2_ia(&pA);
                    q31_t inB2 = arm_nn_read_q15x2_ia(&pB);
                    sum = __SMLAD(inA1, inB1, sum);
                    sum = __SMLAD(inA2, inB2, sum);
                    colCnt--;
                }

                /* Tail: the remaining 0..3 products, one at a time */
                colCnt = num_col & 0x3;
                while (colCnt)
                {
                    q15_t inA1 = *pA++;
                    q15_t inB1 = *pB++;
                    sum += inA1 * inB1;
                    colCnt--;
                }

                *pOut = (q15_t)__SSAT((sum >> out_shift), 16);
                pOut++;
            }

            /* counter reset: the next output pixel reuses the same scratch area */
            pBuffer = im_buffer;
        }
    }
 #else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;
    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out; j++)
        {
            for (k = 0; k < dim_im_out; k++)
            {
                /* Start from the shifted bias plus the rounding term for the final right shift */
                conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel; m++)
                {
                    for (n = 0; n < dim_kernel; n++)
                    {
                        in_row = stride * j + m - padding;
                        in_col = stride * k + n - padding;
                        /* Positions that fall into the padding contribute nothing */
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c
@@ -1,259 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q15_fast.c
 * Description:  Fast Q15 version of convolution
 *
 * $Date:        July 20, 2021
 * $Revision:    V.1.1.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Fast Q15 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias (read for every output channel, must not be NULL)
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * bufferA is only touched when the DSP implementation is compiled in; the
 * reference implementation ignores it. bufferB is never used.
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in is multiple of 2
 *
 * ch_im_out is multiple of 2
 *
 * dim_im_out is a multiple of 2 (only checked by the DSP implementation, which
 * computes output pixels in pairs; the reference implementation does not check it)
 *
 */
 arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
                                     const uint16_t dim_im_in,
                                     const uint16_t ch_im_in,
                                     const q15_t *wt,
                                     const uint16_t ch_im_out,
                                     const uint16_t dim_kernel,
                                     const uint16_t padding,
                                     const uint16_t stride,
                                     const q15_t *bias,
                                     const uint16_t bias_shift,
                                     const uint16_t out_shift,
                                     q15_t *Im_out,
                                     const uint16_t dim_im_out,
                                     q15_t *bufferA,
                                     q7_t *bufferB)
 {
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Number of q15 values in one im2col column, which is also the length of one filter.
     * Held in 32 bits: a 16-bit counter would silently truncate large kernels and drop
     * multiply-accumulate terms. */
    const int32_t num_col = ch_im_in * dim_kernel * dim_kernel;
    q15_t *pBuffer = bufferA;
    q15_t *im_buffer = bufferA;
    q15_t *pOut = Im_out;
    if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0 || dim_im_out & 0x1)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    /* 32-bit indices: the dimensions are uint16_t, so 16-bit signed indices could overflow */
    for (int32_t i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
    {
        for (int32_t i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            /* Top-left corner of the receptive field; negative inside the padding area */
            const int32_t ker_y_start = i_out_y * stride - padding;
            const int32_t ker_x_start = i_out_x * stride - padding;

            /* This part implements the im2col function */
            for (int32_t i_ker_y = ker_y_start; i_ker_y < ker_y_start + dim_kernel; i_ker_y++)
            {
                for (int32_t i_ker_x = ker_x_start; i_ker_x < ker_x_start + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* Filling 0 for out-of-bound paddings */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        /* Copy all channels of one input pixel */
                        memcpy(pBuffer,
                               Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
                               sizeof(q15_t) * ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            /* Two columns are gathered back to back in bufferA (even x, then odd x);
             * the multiplication runs once both are present. */
            if (i_out_x & 0x1)
            {
                int i;
                /* initialize the matrix pointers for A */
                const q15_t *pA = wt;
                /* set up the second output pointers */
                q15_t *pOut2 = pOut + ch_im_out;
                /* this loop over rows in A, two filters at a time */
                for (i = 0; i < ch_im_out; i += 2)
                {
                    /* setup pointers for B: first and second gathered column */
                    const q15_t *pB = im_buffer;
                    const q15_t *pB2 = pB + num_col;
                    /* align the second pointer for A to the next filter */
                    const q15_t *pA2 = pA + num_col;
                    /* init the sum with bias: sum/sum2 use filter i, sum3/sum4 use filter i + 1 */
                    q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                    q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                    q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
                    q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
                    int32_t colCnt = num_col >> 1;
                    /* accumulate over the vector, two q15 products per operand pair */
                    while (colCnt)
                    {
                        q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
                        q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
                        q31_t inA2 = arm_nn_read_q15x2_ia(&pA2);
                        q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
                        sum = __SMLAD(inA1, inB1, sum);
                        sum2 = __SMLAD(inA1, inB2, sum2);
                        sum3 = __SMLAD(inA2, inB1, sum3);
                        sum4 = __SMLAD(inA2, inB2, sum4);
                        colCnt--;
                    } /* while over colCnt */
                    /* Tail for an odd column length. With ch_im_in verified even above,
                     * num_col is even and this loop does not execute. */
                    colCnt = num_col & 0x1;
                    while (colCnt)
                    {
                        q15_t inA1 = *pA++;
                        q15_t inB1 = *pB++;
                        q15_t inA2 = *pA2++;
                        q15_t inB2 = *pB2++;
                        sum += inA1 * inB1;
                        sum2 += inA1 * inB2;
                        sum3 += inA2 * inB1;
                        sum4 += inA2 * inB2;
                        colCnt--;
                    } /* while over colCnt */
                    /* first pixel gets channels i and i + 1, second pixel likewise */
                    *pOut++ = (q15_t)__SSAT(sum >> out_shift, 16);
                    *pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16);
                    *pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16);
                    *pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16);
                    /* skip the row computed with A2 */
                    pA += num_col;
                } /* for over ch_im_out */
                /* skip the second pixel, which was written through pOut2 */
                pOut += ch_im_out;
                /* counter reset */
                pBuffer = im_buffer;
            }
        }
    }
 #else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;
    if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out; j++)
        {
            for (k = 0; k < dim_im_out; k++)
            {
                /* Start from the shifted bias plus the rounding term for the final right shift */
                conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel; m++)
                {
                    for (n = 0; n < dim_kernel; n++)
                    {
                        in_row = stride * j + m - padding;
                        in_col = stride * k + n - padding;
                        /* Positions that fall into the padding contribute nothing */
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c
@@ -1,270 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q15_fast_nonsquare.c
 * Description:  Fast Q15 version of convolution (non-square shape)
 *
 * $Date:        July 20, 2021
 * $Revision:    V.1.1.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Fast Q15 convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias (read for every output channel, must not be NULL)
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input
 * @param[in,out]   bufferB      pointer to buffer space for output
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
 *
 * bufferB size: 0
 *
 * bufferA is only touched when the DSP implementation is compiled in; the
 * reference implementation ignores it. bufferB is never used.
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in is multiple of 2
 *
 * ch_im_out is multiple of 2
 *
 * dim_im_out_x is a multiple of 2 (only checked by the DSP implementation, which
 * computes output pixels in pairs along x; the reference implementation does not
 * check it)
 *
 */
 arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
                                               const uint16_t dim_im_in_x,
                                               const uint16_t dim_im_in_y,
                                               const uint16_t ch_im_in,
                                               const q15_t *wt,
                                               const uint16_t ch_im_out,
                                               const uint16_t dim_kernel_x,
                                               const uint16_t dim_kernel_y,
                                               const uint16_t padding_x,
                                               const uint16_t padding_y,
                                               const uint16_t stride_x,
                                               const uint16_t stride_y,
                                               const q15_t *bias,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               q15_t *Im_out,
                                               const uint16_t dim_im_out_x,
                                               const uint16_t dim_im_out_y,
                                               q15_t *bufferA,
                                               q7_t *bufferB)
 {
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Number of q15 values in one im2col column, which is also the length of one filter.
     * Held in 32 bits: a 16-bit counter would silently truncate large kernels and drop
     * multiply-accumulate terms. */
    const int32_t num_col = ch_im_in * dim_kernel_y * dim_kernel_x;
    q15_t *pBuffer = bufferA;
    q15_t *im_buffer = bufferA;
    q15_t *pOut = Im_out;
    /* The width check matters for memory safety: the two-column scratch buffer is only
     * flushed on odd i_out_x, so with an odd output width a column would be left pending
     * at the end of each row and the next row would write past the end of bufferA. */
    if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0 || dim_im_out_x & 0x1)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    /* 32-bit indices: the dimensions are uint16_t, so 16-bit signed indices could overflow */
    for (int32_t i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (int32_t i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* Top-left corner of the receptive field; negative inside the padding area */
            const int32_t ker_y_start = i_out_y * stride_y - padding_y;
            const int32_t ker_x_start = i_out_x * stride_x - padding_x;

            /* This part implements the im2col function */
            for (int32_t i_ker_y = ker_y_start; i_ker_y < ker_y_start + dim_kernel_y; i_ker_y++)
            {
                for (int32_t i_ker_x = ker_x_start; i_ker_x < ker_x_start + dim_kernel_x; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* Filling 0 for out-of-bound paddings */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        /* Copy all channels of one input pixel */
                        memcpy(pBuffer,
                               Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
                               sizeof(q15_t) * ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            /* Two columns are gathered back to back in bufferA (even x, then odd x);
             * the multiplication runs once both are present. */
            if (i_out_x & 0x1)
            {
                int i;
                /* initialize the matrix pointers for A */
                const q15_t *pA = wt;
                /* set up the second output pointers */
                q15_t *pOut2 = pOut + ch_im_out;
                /* this loop over rows in A, two filters at a time */
                for (i = 0; i < ch_im_out; i += 2)
                {
                    /* setup pointers for B: first and second gathered column */
                    const q15_t *pB = im_buffer;
                    const q15_t *pB2 = pB + num_col;
                    /* align the second pointer for A to the next filter */
                    const q15_t *pA2 = pA + num_col;
                    /* init the sum with bias: sum/sum2 use filter i, sum3/sum4 use filter i + 1 */
                    q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                    q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                    q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
                    q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
                    int32_t colCnt = num_col >> 1;
                    /* accumulate over the vector, two q15 products per operand pair */
                    while (colCnt)
                    {
                        q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
                        q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
                        q31_t inA2 = arm_nn_read_q15x2_ia(&pA2);
                        q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
                        sum = __SMLAD(inA1, inB1, sum);
                        sum2 = __SMLAD(inA1, inB2, sum2);
                        sum3 = __SMLAD(inA2, inB1, sum3);
                        sum4 = __SMLAD(inA2, inB2, sum4);
                        colCnt--;
                    } /* while over colCnt */
                    /* Tail for an odd column length. With ch_im_in verified even above,
                     * num_col is even and this loop does not execute. */
                    colCnt = num_col & 0x1;
                    while (colCnt)
                    {
                        q15_t inA1 = *pA++;
                        q15_t inB1 = *pB++;
                        q15_t inA2 = *pA2++;
                        q15_t inB2 = *pB2++;
                        sum += inA1 * inB1;
                        sum2 += inA1 * inB2;
                        sum3 += inA2 * inB1;
                        sum4 += inA2 * inB2;
                        colCnt--;
                    } /* while over colCnt */
                    /* first pixel gets channels i and i + 1, second pixel likewise */
                    *pOut++ = (q15_t)__SSAT(sum >> out_shift, 16);
                    *pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16);
                    *pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16);
                    *pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16);
                    /* skip the row computed with A2 */
                    pA += num_col;
                } /* for over ch_im_out */
                /* skip the second pixel, which was written through pOut2 */
                pOut += ch_im_out;
                /* counter reset */
                pBuffer = im_buffer;
            }
        }
    }
 #else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;
    if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out_y; j++)
        {
            for (k = 0; k < dim_im_out_x; k++)
            {
                /* Start from the shifted bias plus the rounding term for the final right shift */
                conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel_y; m++)
                {
                    for (n = 0; n < dim_kernel_x; n++)
                    {
                        in_row = stride_y * j + m - padding_y;
                        in_col = stride_x * k + n - padding_x;
                        /* Positions that fall into the padding contribute nothing */
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel_x * dim_kernel_y + (m * dim_kernel_x + n) * ch_im_in +
                                       l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
@@ -1,280 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q7_RGB.c
 * Description:  Q7 version of convolution for RGB image
 *
 * $Date:        July 20, 2021
 * $Revision:    V.1.1.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Q7 convolution function for RGB image
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in equals 3
 *
 * This kernel is written exclusively for convolution with ch_im_in
 * equals 3. This applies on the first layer of CNNs which has input
 * image with RGB format.
 */
 arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                                   const uint16_t dim_im_in,
                                   const uint16_t ch_im_in,
                                   const q7_t *wt,
                                   const uint16_t ch_im_out,
                                   const uint16_t dim_kernel,
                                   const uint16_t padding,
                                   const uint16_t stride,
                                   const q7_t *bias,
                                   const uint16_t bias_shift,
                                   const uint16_t out_shift,
                                   q7_t *Im_out,
                                   const uint16_t dim_im_out,
                                   q15_t *bufferA,
                                   q7_t *bufferB)
 {
    /*
     * Q7 convolution for a square, 3-channel input in HWC layout.
     *
     * Returns ARM_MATH_SIZE_MISMATCH when ch_im_in is not 3 and
     * ARM_MATH_SUCCESS otherwise. bufferB is never touched; bufferA is only
     * used by the DSP path, where it holds up to two im2col columns of
     * 3 * dim_kernel * dim_kernel q15_t values each.
     */
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
    /*
     *  Here we use bufferA as q15_t internally as computation are done with q15_t level
     *  im2col are done to output in q15_t format from q7_t input
     */
    q15_t *pBuffer = bufferA;
    q7_t *pOut = Im_out;
    /* This kernel only supports exactly three input channels */
    if (ch_im_in != 3)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
    /* This part implements the im2col function */
    for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* Kernel tap falls into the padding area: emit three zero q15 values */
                        arm_memset_q7((q7_t *)pBuffer, (q7_t)0, 3 * sizeof(q15_t));
                        pBuffer += 3;
                    }
                    else
                    {
                        /*
                         * Sign-extend exactly the three channel bytes of this pixel, in order.
                         * A 32-bit load is deliberately avoided here: a pixel occupies only
                         * 3 bytes, so a 4-byte read at the last pixel of the image would
                         * access one byte past the end of Im_in. The element-wise copy is
                         * also independent of the target endianness.
                         */
                        const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3;
                        *pBuffer++ = (q15_t)pPixel[0];
                        *pBuffer++ = (q15_t)pPixel[1];
                        *pBuffer++ = (q15_t)pPixel[2];
                    }
                }
            }
            /* The matrix-multiply kernel is invoked once two full columns have been gathered */
            if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15(
                    wt, bufferA, ch_im_out, 3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }
    /* left-over because odd number of output pixels */
    if (pBuffer != bufferA)
    {
        /* pA is not rewound per output channel: it keeps walking through wt across iterations */
        const q7_t *pA = wt;
        int i;
        for (i = 0; i < ch_im_out; i++)
        {
            /* Load the accumulator with the shifted bias plus the rounding term */
            q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
            /* Every output channel re-reads the single remaining im2col column */
            const q15_t *pB = bufferA;
            /* basically each time it process 4 entries */
            uint16_t colCnt = 3 * dim_kernel * dim_kernel >> 2;
            while (colCnt)
            {
                q31_t inA1, inA2;
                q31_t inB1, inB2;
                pA = read_and_pad(pA, &inA1, &inA2);
                inB1 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA1, inB1, sum);
                inB2 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA2, inB2, sum);
                colCnt--;
            }
            /* Remaining 0..3 entries are accumulated one at a time */
            colCnt = 3 * dim_kernel * dim_kernel & 0x3;
            while (colCnt)
            {
                q7_t inA1 = *pA++;
                q15_t inB1 = *pB++;
                sum += inA1 * inB1;
                colCnt--;
            }
            *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
        }
    }
 #else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;
    /* This kernel only supports exactly three input channels */
    if (ch_im_in != 3)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out; j++)
        {
            for (k = 0; k < dim_im_out; k++)
            {
                /* Same accumulator seed as the other q7 kernels: shifted bias plus rounding term */
                conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel; m++)
                {
                    for (n = 0; n < dim_kernel; n++)
                    {
                        /* Input coordinates of this kernel tap; taps in the padding area are skipped */
                        in_row = stride * j + m - padding;
                        in_col = stride * k + n - padding;
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
                            }
                        }
                    }
                }
                /* Scale down and saturate to 8 bits before storing in HWC order */
                Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return (ARM_MATH_SUCCESS);
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
@@ -1,227 +0,0 @@
 /*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q7_basic.c
 * Description:	 Q7 version of convolution
 *
 * $Date:        20. July 2021
 * $Revision:    V.1.1.1
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Basic Q7 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * This basic version is designed to work for any input tensor and weight
 * dimension.
 */
 arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
                                     const uint16_t dim_im_in,
                                     const uint16_t ch_im_in,
                                     const q7_t *wt,
                                     const uint16_t ch_im_out,
                                     const uint16_t dim_kernel,
                                     const uint16_t padding,
                                     const uint16_t stride,
                                     const q7_t *bias,
                                     const uint16_t bias_shift,
                                     const uint16_t out_shift,
                                     q7_t *Im_out,
                                     const uint16_t dim_im_out,
                                     q15_t *bufferA,
                                     q7_t *bufferB)
 {
    /*
     * Q7 convolution over a square HWC input with no constraint on the
     * channel counts. Always returns ARM_MATH_SUCCESS. bufferB is unused;
     * bufferA is only used when the DSP path is compiled in.
     */
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* DSP path: gather im2col columns as q15 in bufferA, then multiply-accumulate */

    /* Number of q15 entries in one im2col column */
    const int col_len = ch_im_in * dim_kernel * dim_kernel;
    /* Write cursor inside bufferA */
    q15_t *col_wr = bufferA;
    /* Write cursor inside the output tensor */
    q7_t *out_wr = Im_out;

    for (int16_t oy = 0; oy < dim_im_out; oy++)
    {
        for (int16_t ox = 0; ox < dim_im_out; ox++)
        {
            /* Top-left input coordinate covered by the kernel for this output pixel */
            const int y_first = oy * stride - padding;
            const int x_first = ox * stride - padding;

            for (int16_t iy = (int16_t)y_first; iy < y_first + dim_kernel; iy++)
            {
                for (int16_t ix = (int16_t)x_first; ix < x_first + dim_kernel; ix++)
                {
                    const int in_image = (iy >= 0) && (iy < dim_im_in) && (ix >= 0) && (ix < dim_im_in);

                    if (in_image)
                    {
                        /* Widen the ch_im_in channel values of this pixel into the column */
                        arm_q7_to_q15_no_shift((q7_t *)Im_in + (iy * dim_im_in + ix) * ch_im_in, col_wr, ch_im_in);
                    }
                    else
                    {
                        /* Tap lies in the padding area: contribute zeros */
                        memset(col_wr, 0, sizeof(q15_t) * ch_im_in);
                    }
                    col_wr += ch_im_in;
                }
            }

            /* Once two columns are buffered, hand them to the matrix-multiply kernel */
            if (col_wr == bufferA + 2 * col_len)
            {
                out_wr = arm_nn_mat_mult_kernel_q7_q15(
                    wt, bufferA, ch_im_out, col_len, bias_shift, out_shift, bias, out_wr);
                col_wr = bufferA;
            }
        }
    }

    /* A single column remains when the number of output pixels is odd */
    if (col_wr != bufferA)
    {
        /* Weight cursor; it keeps advancing across output channels */
        const q7_t *wt_rd = wt;

        for (int ch = 0; ch < ch_im_out; ch++)
        {
            /* Each output channel starts again at the beginning of the buffered column */
            const q15_t *col_rd = bufferA;
            /* Accumulator seeded with the shifted bias plus the rounding term */
            q31_t acc = ((q31_t)bias[ch] << bias_shift) + NN_ROUND(out_shift);

            /* Main part: four entries per iteration */
            for (uint16_t quads = col_len >> 2; quads != 0; quads--)
            {
                q31_t wt_pair0, wt_pair1;
                q31_t col_pair0, col_pair1;

                wt_rd = read_and_pad(wt_rd, &wt_pair0, &wt_pair1);
                col_pair0 = arm_nn_read_q15x2_ia(&col_rd);
                acc = __SMLAD(wt_pair0, col_pair0, acc);
                col_pair1 = arm_nn_read_q15x2_ia(&col_rd);
                acc = __SMLAD(wt_pair1, col_pair1, acc);
            }

            /* Tail: the remaining 0..3 entries, one at a time */
            for (uint16_t tail = col_len & 0x3; tail != 0; tail--)
            {
                const q7_t w = *wt_rd++;
                const q15_t x = *col_rd++;
                acc += w * x;
            }

            *out_wr++ = (q7_t)__SSAT((acc >> out_shift), 8);
        }
    }
 #else
    /* Plain C reference path used when the DSP path is not compiled in */
    (void)bufferA;

    for (int oc = 0; oc < ch_im_out; oc++)
    {
        for (int oy = 0; oy < dim_im_out; oy++)
        {
            for (int ox = 0; ox < dim_im_out; ox++)
            {
                /* Accumulator seeded with the shifted bias plus the rounding term */
                int acc = ((q31_t)bias[oc] << bias_shift) + NN_ROUND(out_shift);

                for (int ky = 0; ky < dim_kernel; ky++)
                {
                    const int in_row = stride * oy + ky - padding;
                    if (in_row < 0 || in_row >= dim_im_in)
                    {
                        /* Whole kernel row lies in the padding area */
                        continue;
                    }

                    for (int kx = 0; kx < dim_kernel; kx++)
                    {
                        const int in_col = stride * ox + kx - padding;
                        if (in_col < 0 || in_col >= dim_im_in)
                        {
                            continue;
                        }

                        /* Offsets of the first channel of this pixel and of the matching weights */
                        const int in_base = (in_row * dim_im_in + in_col) * ch_im_in;
                        const int wt_base =
                            oc * ch_im_in * dim_kernel * dim_kernel + (ky * dim_kernel + kx) * ch_im_in;

                        for (int c = 0; c < ch_im_in; c++)
                        {
                            acc += Im_in[in_base + c] * wt[wt_base + c];
                        }
                    }
                }

                /* Scale down, saturate to 8 bits and store in HWC order */
                Im_out[oc + (oy * dim_im_out + ox) * ch_im_out] = (q7_t)__SSAT((acc >> out_shift), 8);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
@@ -1,229 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q7_basic_nonsquare.c
 * Description:	 Q7 version of convolution
 *
 * $Date:        July 20, 2021
 * $Revision:    V.1.1.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Basic Q7 convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input
 * @param[in,out]   bufferB      pointer to buffer space for output
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 */
 arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
                                               const uint16_t dim_im_in_x,
                                               const uint16_t dim_im_in_y,
                                               const uint16_t ch_im_in,
                                               const q7_t *wt,
                                               const uint16_t ch_im_out,
                                               const uint16_t dim_kernel_x,
                                               const uint16_t dim_kernel_y,
                                               const uint16_t padding_x,
                                               const uint16_t padding_y,
                                               const uint16_t stride_x,
                                               const uint16_t stride_y,
                                               const q7_t *bias,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               q7_t *Im_out,
                                               const uint16_t dim_im_out_x,
                                               const uint16_t dim_im_out_y,
                                               q15_t *bufferA,
                                               q7_t *bufferB)
 {
    /*
     * Q7 convolution over an HWC input where image, kernel, padding and
     * stride each have independent x and y sizes. Always returns
     * ARM_MATH_SUCCESS. bufferB is unused; bufferA is only used when the
     * DSP path is compiled in.
     */
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* DSP path: gather im2col columns as q15 in bufferA, then multiply-accumulate */

    /* Number of q15 entries in one im2col column */
    const int col_len = ch_im_in * dim_kernel_y * dim_kernel_x;
    /* Write cursor inside bufferA */
    q15_t *col_wr = bufferA;
    /* Write cursor inside the output tensor */
    q7_t *out_wr = Im_out;

    for (int16_t oy = 0; oy < dim_im_out_y; oy++)
    {
        for (int16_t ox = 0; ox < dim_im_out_x; ox++)
        {
            /* Top-left input coordinate covered by the kernel for this output pixel */
            const int y_first = oy * stride_y - padding_y;
            const int x_first = ox * stride_x - padding_x;

            for (int16_t iy = (int16_t)y_first; iy < y_first + dim_kernel_y; iy++)
            {
                for (int16_t ix = (int16_t)x_first; ix < x_first + dim_kernel_x; ix++)
                {
                    const int in_image = (iy >= 0) && (iy < dim_im_in_y) && (ix >= 0) && (ix < dim_im_in_x);

                    if (in_image)
                    {
                        /* Widen the ch_im_in channel values of this pixel into the column */
                        arm_q7_to_q15_no_shift((q7_t *)Im_in + (iy * dim_im_in_x + ix) * ch_im_in, col_wr, ch_im_in);
                    }
                    else
                    {
                        /* Tap lies in the padding area: contribute zeros */
                        memset(col_wr, 0, sizeof(q15_t) * ch_im_in);
                    }
                    col_wr += ch_im_in;
                }
            }

            /* Once two columns are buffered, hand them to the matrix-multiply kernel */
            if (col_wr == bufferA + 2 * col_len)
            {
                out_wr = arm_nn_mat_mult_kernel_q7_q15(
                    wt, bufferA, ch_im_out, col_len, bias_shift, out_shift, bias, out_wr);
                col_wr = bufferA;
            }
        }
    }

    /* A single column remains when the number of output pixels is odd */
    if (col_wr != bufferA)
    {
        /* Weight cursor; it keeps advancing across output channels */
        const q7_t *wt_rd = wt;

        for (int ch = 0; ch < ch_im_out; ch++)
        {
            /* Each output channel starts again at the beginning of the buffered column */
            const q15_t *col_rd = bufferA;
            /* Accumulator seeded with the shifted bias plus the rounding term */
            q31_t acc = ((q31_t)bias[ch] << bias_shift) + NN_ROUND(out_shift);

            /* Main part: four entries per iteration */
            for (uint16_t quads = col_len >> 2; quads != 0; quads--)
            {
                q31_t wt_pair0, wt_pair1;
                q31_t col_pair0, col_pair1;

                wt_rd = read_and_pad(wt_rd, &wt_pair0, &wt_pair1);
                col_pair0 = arm_nn_read_q15x2_ia(&col_rd);
                acc = __SMLAD(wt_pair0, col_pair0, acc);
                col_pair1 = arm_nn_read_q15x2_ia(&col_rd);
                acc = __SMLAD(wt_pair1, col_pair1, acc);
            }

            /* Tail: the remaining 0..3 entries, one at a time */
            for (uint16_t tail = col_len & 0x3; tail != 0; tail--)
            {
                const q7_t w = *wt_rd++;
                const q15_t x = *col_rd++;
                acc += w * x;
            }

            *out_wr++ = (q7_t)__SSAT((acc >> out_shift), 8);
        }
    }
 #else
    /* Plain C reference path used when the DSP path is not compiled in */
    (void)bufferA;

    for (int oc = 0; oc < ch_im_out; oc++)
    {
        for (int oy = 0; oy < dim_im_out_y; oy++)
        {
            for (int ox = 0; ox < dim_im_out_x; ox++)
            {
                /* Accumulator seeded with the shifted bias plus the rounding term */
                int acc = ((q31_t)bias[oc] << bias_shift) + NN_ROUND(out_shift);

                for (int ky = 0; ky < dim_kernel_y; ky++)
                {
                    const int in_row = stride_y * oy + ky - padding_y;
                    if (in_row < 0 || in_row >= dim_im_in_y)
                    {
                        /* Whole kernel row lies in the padding area */
                        continue;
                    }

                    for (int kx = 0; kx < dim_kernel_x; kx++)
                    {
                        const int in_col = stride_x * ox + kx - padding_x;
                        if (in_col < 0 || in_col >= dim_im_in_x)
                        {
                            continue;
                        }

                        /* Offsets of the first channel of this pixel and of the matching weights */
                        const int in_base = (in_row * dim_im_in_x + in_col) * ch_im_in;
                        const int wt_base =
                            oc * ch_im_in * dim_kernel_y * dim_kernel_x + (ky * dim_kernel_x + kx) * ch_im_in;

                        for (int c = 0; c < ch_im_in; c++)
                        {
                            acc += Im_in[in_base + c] * wt[wt_base + c];
                        }
                    }
                }

                /* Scale down, saturate to 8 bits and store in HWC order */
                Im_out[oc + (oy * dim_im_out_x + ox) * ch_im_out] = (q7_t)__SSAT((acc >> out_shift), 8);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
@@ -1,380 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q7_fast.c
 * Description:  Fast Q7 version of convolution
 *
 * $Date:        July 20, 2021
 * $Revision:    V.1.1.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Fast Q7 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output (unused by this function)
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in is multiple of 4    ( because of the SIMD32 read and swap )
 *
 * ch_im_out is multiple of 2    ( because of the 2x2 mat_mult kernel )
 *
 * The im2col converts the Q7 tensor input into Q15 column, which is stored in
 * bufferA. There is reordering happening during this im2col process with
 * arm_q7_to_q15_reordered_no_shift. For every four elements, the second and
 * third elements are swapped.
 *
 * The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered does the
 * GEMM computation with the reordered columns.
 *
 * To speed-up the determination of the padding condition, we split the
 * computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}.
 * This reduces the total number of boundary condition checks and improves
 * the data copying performance.
 *
 * The top band is limited to dim_im_out rows, so a padding value larger than
 * the output dimension cannot make the function produce more output rows than
 * Im_out holds.
 */
 arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
                                    const uint16_t dim_im_in,
                                    const uint16_t ch_im_in,
                                    const q7_t *wt,
                                    const uint16_t ch_im_out,
                                    const uint16_t dim_kernel,
                                    const uint16_t padding,
                                    const uint16_t stride,
                                    const q7_t *bias,
                                    const uint16_t bias_shift,
                                    const uint16_t out_shift,
                                    q7_t *Im_out,
                                    const uint16_t dim_im_out,
                                    q15_t *bufferA,
                                    q7_t *bufferB)
 {
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
    /*
     *  Here we use bufferA as q15_t internally as computation are done with q15_t level
     *  im2col are done to output in q15_t format from q7_t input
     */
    q15_t *pBuffer = bufferA;
    q7_t *pOut = Im_out;
    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }
    /*
     *  Here we split the entire matrix into three regions depending on the padding situation
     *    Top: i_out_y from 0 to padding - 1
     * Middle: i_out_y from padding to dim_im_out-padding-1
     * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
     */
    /*
     * top part
     * The extra "i_out_y < dim_im_out" bound keeps the loop from emitting more rows than
     * the output tensor has when padding > dim_im_out (possible with large strides).
     */
    for (i_out_y = 0; i_out_y < padding && i_out_y < dim_im_out; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        arm_q7_to_q15_reordered_no_shift(
                            (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }
            /* two im2col columns are buffered: run the 2-column matrix multiply and start over */
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }
    /* middle part, here we also divide the x into left, mid and right */
    for (; i_out_y < dim_im_out - padding; i_out_y++)
    {
        /* left part (only reached when padding < dim_im_out - padding, so it stays inside the row) */
        for (i_out_x = 0; i_out_x < padding; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        arm_q7_to_q15_reordered_no_shift(
                            (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
        /* mid part: no bounds checks, a whole kernel row is converted in one call */
        for (; i_out_x < dim_im_out - padding; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                arm_q7_to_q15_reordered_no_shift((q7_t *)Im_in +
                                                     (i_ker_y * dim_im_in + i_out_x * stride - padding) * ch_im_in,
                                                 pBuffer,
                                                 ch_im_in * dim_kernel);
                pBuffer += ch_im_in * dim_kernel;
            }
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
        /* right part */
        for (; i_out_x < dim_im_out; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        arm_q7_to_q15_reordered_no_shift(
                            (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }
    /* bottom part */
    for (; i_out_y < dim_im_out; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        arm_q7_to_q15_reordered_no_shift(
                            (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }
    /* check if there is left-over for compute (a single buffered column that was never multiplied) */
    if (pBuffer != bufferA)
    {
        const q7_t *pA = wt;
        int i;
        for (i = 0; i < ch_im_out; i++)
        {
            q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
            const q15_t *pB = bufferA;
            /* each time it process 4 entries */
            uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
            while (colCnt)
            {
                q31_t inA1, inA2;
                q31_t inB1, inB2;
                pA = read_and_pad_reordered(pA, &inA1, &inA2);
                inB1 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA1, inB1, sum);
                inB2 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA2, inB2, sum);
                colCnt--;
            }
            /* remaining entries when the column length is not a multiple of 4 */
            colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
            while (colCnt)
            {
                q7_t inA1 = *pA++;
                q15_t inB1 = *pB++;
                sum += inA1 * inB1;
                colCnt--;
            }
            *pOut = (q7_t)__SSAT((sum >> out_shift), 8);
            pOut++;
        }
    }
 #else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;
    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out; j++)
        {
            for (k = 0; k < dim_im_out; k++)
            {
                /* explicit q31_t widening, matching the DSP path and the sibling convolution functions */
                conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel; m++)
                {
                    for (n = 0; n < dim_kernel; n++)
                    {
                        /* if-for implementation: taps that fall into the padding contribute nothing */
                        in_row = stride * j + m - padding;
                        in_col = stride * k + n - padding;
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
@@ -1,378 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q7_fast_nonsquare.c
 * Description:  Fast Q7 version of convolution (non-square shape)
 *
 * $Date:        July 20, 2021
 * $Revision:    V.1.1.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Fast Q7 convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input
 * @param[in,out]   bufferB      pointer to buffer space for output (unused by this function)
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is the version with full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is multiple of 4
 *   ch_im_out is multiple of 2
 *
 * In the DSP path bufferA is filled with up to two im2col columns, i.e.
 * 2*ch_im_in*dim_kernel_x*dim_kernel_y q15_t elements.
 *
 * The top band and the left band are limited to dim_im_out_y rows and
 * dim_im_out_x columns respectively, so a padding value larger than the
 * matching output dimension cannot make the function produce more output
 * elements than Im_out holds.
 */
 arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                              const uint16_t dim_im_in_x,
                                              const uint16_t dim_im_in_y,
                                              const uint16_t ch_im_in,
                                              const q7_t *wt,
                                              const uint16_t ch_im_out,
                                              const uint16_t dim_kernel_x,
                                              const uint16_t dim_kernel_y,
                                              const uint16_t padding_x,
                                              const uint16_t padding_y,
                                              const uint16_t stride_x,
                                              const uint16_t stride_y,
                                              const q7_t *bias,
                                              const uint16_t bias_shift,
                                              const uint16_t out_shift,
                                              q7_t *Im_out,
                                              const uint16_t dim_im_out_x,
                                              const uint16_t dim_im_out_y,
                                              q15_t *bufferA,
                                              q7_t *bufferB)
 {
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
    /* -----------------------
     *  Here we use bufferA as q15_t internally as computation are done with q15_t level
     *  im2col are done to output in q15_t format from q7_t input
     */
    q15_t *pBuffer = bufferA;
    q7_t *pOut = Im_out;
    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }
    /*
     *  Here we split the entire matrix into three regions depending on the padding situation
     *    Top: i_out_y from 0 to padding_y - 1
     * Middle: i_out_y from padding_y to dim_im_out_y-padding_y-1
     * Bottom: i_out_y from dim_im_out_y-padding_y to dim_im_out_y-1
     */
    /*
     * top part
     * The extra "i_out_y < dim_im_out_y" bound keeps the loop from emitting more rows than
     * the output tensor has when padding_y > dim_im_out_y (possible with large strides).
     */
    for (i_out_y = 0; i_out_y < padding_y && i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        arm_q7_to_q15_reordered_no_shift(
                            (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }
            /* two im2col columns are buffered: run the 2-column matrix multiply and start over */
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }
    /* middle part, here we also divide the x into left, mid and right */
    for (; i_out_y < dim_im_out_y - padding_y; i_out_y++)
    {
        /*
         * left part
         * padding_x is independent of padding_y, so it may exceed dim_im_out_x even when
         * middle rows exist; bound the loop by the row width as well.
         */
        for (i_out_x = 0; i_out_x < padding_x && i_out_x < dim_im_out_x; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        arm_q7_to_q15_reordered_no_shift(
                            (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
        /* mid part: no bounds checks, a whole kernel row is converted in one call */
        for (; i_out_x < dim_im_out_x - padding_x; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                arm_q7_to_q15_reordered_no_shift(
                    (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in,
                    pBuffer,
                    ch_im_in * dim_kernel_x);
                pBuffer += ch_im_in * dim_kernel_x;
            }
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
        /* right part */
        for (; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        arm_q7_to_q15_reordered_no_shift(
                            (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }
    /* bottom part */
    for (; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        arm_q7_to_q15_reordered_no_shift(
                            (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }
    /* check if there is left-over for compute (a single buffered column that was never multiplied) */
    if (pBuffer != bufferA)
    {
        const q7_t *pA = wt;
        int i;
        for (i = 0; i < ch_im_out; i++)
        {
            q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
            const q15_t *pB = bufferA;
            /* basically each time it process 4 entries */
            uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
            while (colCnt)
            {
                q31_t inA1, inA2;
                q31_t inB1, inB2;
                pA = read_and_pad_reordered(pA, &inA1, &inA2);
                inB1 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA1, inB1, sum);
                inB2 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA2, inB2, sum);
                colCnt--;
            }
            /* remaining entries when the column length is not a multiple of 4 */
            colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3;
            while (colCnt)
            {
                q7_t inA1 = *pA++;
                q15_t inB1 = *pB++;
                sum += inA1 * inB1;
                colCnt--;
            }
            *pOut = (q7_t)__SSAT((sum >> out_shift), 8);
            pOut++;
        }
    }
 #else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;
    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out_y; j++)
        {
            for (k = 0; k < dim_im_out_x; k++)
            {
                conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel_y; m++)
                {
                    for (n = 0; n < dim_kernel_x; n++)
                    {
                        /* if-for implementation: taps that fall into the padding contribute nothing */
                        in_row = stride_y * j + m - padding_y;
                        in_col = stride_x * k + n - padding_x;
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in +
                                       l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c
@@ -1,241 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_fast_s16.c
 * Description:  Optimized s16 version of convolution.
 *
 * $Date:        12 August 2021
 * $Revision:    V.1.1.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 * Basic s16 convolution function.
 *
 * Refer header file for details. Optimal use case for the DSP/MVE implementation is when input and output channels
 * are multiples of 4 or at least greater than 4.
 *
 */
 /*
  * Optimized s16 convolution built on im2col plus a two-column matrix multiply.
  *
  * ctx          Scratch context; ctx->buf is used as the q15 im2col buffer.
  * conv_params  Padding, stride and activation range (dilation is not read here).
  * quant_params Per-output-channel multiplier and shift arrays.
  * input_dims   Input tensor dimensions (n, h, w, c are read).
  * input_data   Input tensor, q15.
  * filter_dims  Filter dimensions (h, w are read).
  * filter_data  Filter weights, q7.
  * bias_dims    Unused.
  * bias_data    Optional 64-bit bias per output channel; may be NULL.
  * output_dims  Output tensor dimensions (h, w, c are read).
  * output_data  Output tensor, q15.
  *
  * Returns ARM_MATH_SIZE_MISMATCH when filter_w * filter_h * input_c >= 512,
  * ARM_MATH_ARGUMENT_ERROR when a scratch buffer is required but ctx->buf is NULL
  * (or, for a non-empty batch, when built without DSP support or with MVEI),
  * and ARM_MATH_SUCCESS otherwise.
  */
 arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
                                  const cmsis_nn_conv_params *conv_params,
                                  const cmsis_nn_per_channel_quant_params *quant_params,
                                  const cmsis_nn_dims *input_dims,
                                  const q15_t *input_data,
                                  const cmsis_nn_dims *filter_dims,
                                  const q7_t *filter_data,
                                  const cmsis_nn_dims *bias_dims,
                                  const int64_t *bias_data,
                                  const cmsis_nn_dims *output_dims,
                                  q15_t *output_data)
 {
    (void)bias_dims;
    /* The leftover path below keeps its column counter in a uint16_t and this kernel is only meant for short
     * columns, so longer ones are rejected up front. */
    if (filter_dims->w * filter_dims->h * input_dims->c >= 512)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
    /* Validate the scratch buffer against this kernel's own size query (not the s8 kernel's). */
    if (ctx->buf == NULL && arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims) > 0)
    {
        return ARM_MATH_ARGUMENT_ERROR;
    }
    q15_t *buffer_a = (q15_t *)ctx->buf;
    const int32_t input_batches = input_dims->n;
    const int32_t input_x = input_dims->w;
    const int32_t input_y = input_dims->h;
    const int32_t input_ch = input_dims->c;
    const int32_t kernel_x = filter_dims->w;
    const int32_t kernel_y = filter_dims->h;
    const int32_t output_x = output_dims->w;
    const int32_t output_y = output_dims->h;
    const int32_t output_ch = output_dims->c;
    const int32_t pad_x = conv_params->padding.w;
    const int32_t pad_y = conv_params->padding.h;
    const int32_t stride_x = conv_params->stride.w;
    const int32_t stride_y = conv_params->stride.h;
    const int16_t out_activation_min = conv_params->activation.min;
    const int16_t out_activation_max = conv_params->activation.max;
    int32_t *output_mult = quant_params->multiplier;
    int32_t *output_shift = quant_params->shift;
    for (int i_batch = 0; i_batch < input_batches; i_batch++)
    {
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
        /* Generate two columns from the input tensor for a GEMM computation */
        q15_t *two_column_buf = buffer_a;
        q15_t *out = output_data;
        /* This part implements the im2col function */
        for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++)
        {
            for (int32_t i_out_x = 0; i_out_x < output_x; i_out_x++)
            {
                for (int32_t i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y;
                     i_ker_y++)
                {
                    for (int32_t i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x;
                         i_ker_x++)
                    {
                        if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
                        {
                            /* Filling 0 for out-of-bound paddings */
                            arm_memset_q7((q7_t *)two_column_buf, 0, sizeof(q15_t) * input_ch);
                        }
                        else
                        {
                            /* Copy all channels of one input pixel (byte-wise copy of q15 data) */
                            arm_memcpy_q7((q7_t *)two_column_buf,
                                          (const q7_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch),
                                          input_ch * sizeof(q15_t));
                        }
                        two_column_buf += input_ch;
                    }
                }
                /* Computation is performed for every 2 columns */
                if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
                {
                    out = arm_nn_mat_mult_kernel_s16(filter_data,
                                                     buffer_a,
                                                     output_ch,
                                                     output_shift,
                                                     output_mult,
                                                     out_activation_min,
                                                     out_activation_max,
                                                     (input_ch * kernel_y * kernel_x),
                                                     bias_data,
                                                     out);
                    /* Counter reset */
                    two_column_buf = buffer_a;
                }
            }
        }
        /* Left-over because odd number of output pixels */
        if (two_column_buf != buffer_a)
        {
            /* ker_a is not reset per channel: each iteration consumes one filter row of
             * input_ch * kernel_y * kernel_x weights and leaves the pointer at the next row. */
            const q7_t *ker_a = filter_data;
            int i;
            for (i = 0; i < output_ch; i++)
            {
                /* Init the accumulator*/
                q31_t sum = 0;
                /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
                const q15_t *ip_as_col = buffer_a;
                /* 4 multiply and accumulates are done in one loop. */
                uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
                while (col_count)
                {
                    q31_t ker_a1, ker_a2;
                    q31_t ip_b1, ip_b2;
                    ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
                    ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
                    sum = __SMLAD(ker_a1, ip_b1, sum);
                    ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
                    sum = __SMLAD(ker_a2, ip_b2, sum);
                    col_count--;
                }
                /* Handle left over mac */
                col_count = input_ch * kernel_y * kernel_x & 0x3;
                while (col_count)
                {
                    q7_t ker_a1 = *ker_a++;
                    q15_t ip_b1 = *ip_as_col++;
                    sum += ker_a1 * ip_b1;
                    col_count--;
                }
                if (bias_data)
                {
                    /* With a bias the accumulation is widened to 64 bit before requantization */
                    q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]);
                    q63_t acc_64 = sum + bias_data[i];
                    sum = arm_nn_requantize_s64(acc_64, reduced_multiplier, output_shift[i]);
                }
                else
                {
                    sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
                }
                /* Clamp to the activation range */
                sum = MAX(sum, out_activation_min);
                sum = MIN(sum, out_activation_max);
                *out++ = (q15_t)sum;
            }
        }
 #else
        /* No implementation for this build configuration: silence unused warnings and report an error */
        (void)input_data;
        (void)output_data;
        (void)bias_data;
        (void)filter_data;
        (void)buffer_a;
        (void)kernel_x;
        (void)kernel_y;
        (void)pad_x;
        (void)pad_y;
        (void)stride_x;
        (void)stride_y;
        (void)out_activation_min;
        (void)out_activation_max;
        (void)output_mult;
        (void)output_shift;
        return ARM_MATH_ARGUMENT_ERROR;
 #endif
        /* Advance to the next batch */
        input_data += (input_x * input_y * input_ch);
        output_data += (output_x * output_y * output_ch);
    }
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /*
  * Scratch buffer size, in bytes, needed by arm_convolve_fast_s16.
  *
  * With DSP and without MVEI this is room for two im2col columns of int16 values,
  * each column holding input_c * filter_w * filter_h elements. Any other build
  * configuration needs no buffer and reports 0.
  */
 int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
 {
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    const int32_t two_column_elems = 2 * input_dims->c * filter_dims->w * filter_dims->h;
    return two_column_elems * (int32_t)sizeof(int16_t);
 #else
    (void)filter_dims;
    (void)input_dims;
    return 0;
 #endif
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s16.c
@@ -1,156 +0,0 @@
 /*
 * Copyright (C) 2010-2022 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_s16.c
 * Description:  s16 version of convolution using symmetric quantization.
 *
 * $Date:        January 13, 2022
 * $Revision:    V.1.1.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 * Basic s16 convolution function.
 *
 * Refer header file for details. Optimal use case for the DSP/MVE implementation is when input and output channels
 * are multiples of 4 or at least greater than 4.
 *
 */
 /*
  * Reference s16 convolution with per-channel quantization and dilation support.
  *
  * For every batch, output channel and output pixel the q15 input window is
  * multiplied with the q7 filter into a 64-bit accumulator, the optional 64-bit
  * bias is added, the sum is requantized with the channel's reduced multiplier
  * and shift, clamped to the activation range and stored as int16.
  *
  * ctx and bias_dims are unused. Always returns ARM_MATH_SUCCESS.
  */
 arm_status arm_convolve_s16(const cmsis_nn_context *ctx,
                             const cmsis_nn_conv_params *conv_params,
                             const cmsis_nn_per_channel_quant_params *quant_params,
                             const cmsis_nn_dims *input_dims,
                             const q15_t *input_data,
                             const cmsis_nn_dims *filter_dims,
                             const q7_t *filter_data,
                             const cmsis_nn_dims *bias_dims,
                             const int64_t *bias_data,
                             const cmsis_nn_dims *output_dims,
                             q15_t *output_data)
 {
    (void)ctx;
    (void)bias_dims;

    /* Tensor geometry */
    const int32_t batch_cnt = input_dims->n;
    const int32_t in_w = input_dims->w;
    const int32_t in_h = input_dims->h;
    const int32_t in_ch = input_dims->c;
    const int32_t ker_w = filter_dims->w;
    const int32_t ker_h = filter_dims->h;
    const int32_t out_w = output_dims->w;
    const int32_t out_h = output_dims->h;
    const int32_t out_ch = output_dims->c;

    /* Convolution parameters */
    const int32_t pad_w = conv_params->padding.w;
    const int32_t pad_h = conv_params->padding.h;
    const int32_t step_w = conv_params->stride.w;
    const int32_t step_h = conv_params->stride.h;
    const int32_t dil_w = conv_params->dilation.w;
    const int32_t dil_h = conv_params->dilation.h;
    const int32_t act_lo = conv_params->activation.min;
    const int32_t act_hi = conv_params->activation.max;

    /* Per-channel requantization tables */
    int32_t *ch_mult = quant_params->multiplier;
    int32_t *ch_shift = quant_params->shift;

    for (int32_t batches_left = batch_cnt; batches_left > 0; batches_left--)
    {
        for (int32_t oc = 0; oc < out_ch; oc++)
        {
            const q31_t mult_reduced = REDUCE_MULTIPLIER(ch_mult[oc]);

            for (int32_t oy = 0; oy < out_h; oy++)
            {
                /* Top input row covered by the (undilated) kernel origin */
                const int32_t origin_y = oy * step_h - pad_h;

                for (int32_t ox = 0; ox < out_w; ox++)
                {
                    /* Left input column covered by the kernel origin */
                    const int32_t origin_x = ox * step_w - pad_w;

                    /* Restrict the kernel taps to those that land inside the input image */
                    const int32_t first_ky_raw = (-origin_y + dil_h - 1) / dil_h;
                    const int32_t first_kx_raw = (-origin_x + dil_w - 1) / dil_w;
                    const int32_t last_ky_raw = (in_h - origin_y + dil_h - 1) / dil_h;
                    const int32_t last_kx_raw = (in_w - origin_x + dil_w - 1) / dil_w;
                    const int32_t first_ky = MAX(0, first_ky_raw);
                    const int32_t first_kx = MAX(0, first_kx_raw);
                    const int32_t last_ky = MIN(ker_h, last_ky_raw);
                    const int32_t last_kx = MIN(ker_w, last_kx_raw);

                    int64_t acc = 0;
                    for (int32_t ky = first_ky; ky < last_ky; ky++)
                    {
                        const int32_t src_y = origin_y + dil_h * ky;
                        for (int32_t kx = first_kx; kx < last_kx; kx++)
                        {
                            const int32_t src_x = origin_x + dil_w * kx;
                            const int32_t src_base = (src_y * in_w + src_x) * in_ch;
                            const int32_t wt_base = oc * in_ch * ker_h * ker_w + (ky * ker_w + kx) * in_ch;
                            for (int32_t ic = 0; ic < in_ch; ic++)
                            {
                                acc += input_data[src_base + ic] * filter_data[wt_base + ic];
                            }
                        }
                    }

                    if (bias_data)
                    {
                        acc += bias_data[oc];
                    }

                    /* Requantize, then clamp to the activation range */
                    int32_t result = arm_nn_requantize_s64(acc, mult_reduced, ch_shift[oc]);
                    result = MAX(result, act_lo);
                    result = MIN(result, act_hi);
                    output_data[oc + (oy * out_w + ox) * out_ch] = (int16_t)result;
                }
            }
        }

        /* Step both tensors to the next batch */
        input_data += (in_w * in_h * in_ch);
        output_data += (out_w * out_h * out_ch);
    }

    return ARM_MATH_SUCCESS;
 }
 /*
  * Scratch buffer size, in bytes, for arm_convolve_s16.
  *
  * Both arguments are ignored and the result is always 0.
  */
 int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
 {
    const int32_t scratch_bytes = 0;
    (void)filter_dims;
    (void)input_dims;
    return scratch_bytes;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
@@ -1,335 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_s8.c
 * Description:  s8 version of convolution using symmetric quantization.
 *
 * $Date:        December 14, 2021
 * $Revision:    V.2.1.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 * Basic s8 convolution function.
 *
 * Refer header file for details. Optimal use case for the DSP/MVE implementation is when input and output channels
 * are multiples of 4 or at least greater than 4.
 *
 */
 /*
  * s8 convolution implemented as im2col followed by a matrix multiplication.
  *
  * ctx->buf is used as the im2col scratch buffer; bias_dims is unused and bias_data may be NULL
  * (checked in the non-MVEI left-over path; otherwise it is handed to the matrix-multiply helpers).
  *
  * Two implementations are selected at compile time:
  *  - ARM_MATH_MVEI: up to four q7 columns are gathered and multiplied at once.
  *  - otherwise:     two q15 columns are gathered and multiplied at once; a trailing
  *                   single column is handled by an in-place loop.
  *
  * Returns ARM_MATH_ARGUMENT_ERROR when a scratch buffer is required but ctx->buf is NULL,
  * ARM_MATH_SUCCESS otherwise.
  */
 arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
                            const cmsis_nn_conv_params *conv_params,
                            const cmsis_nn_per_channel_quant_params *quant_params,
                            const cmsis_nn_dims *input_dims,
                            const q7_t *input_data,
                            const cmsis_nn_dims *filter_dims,
                            const q7_t *filter_data,
                            const cmsis_nn_dims *bias_dims,
                            const int32_t *bias_data,
                            const cmsis_nn_dims *output_dims,
                            q7_t *output_data)
 {
    (void)bias_dims;
    /* Refuse to run without scratch memory when the size query says some is needed */
    if (ctx->buf == NULL && arm_convolve_s8_get_buffer_size(input_dims, filter_dims) > 0)
    {
        return ARM_MATH_ARGUMENT_ERROR;
    }
    q15_t *buffer_a = (q15_t *)ctx->buf;
    /* Note: all dimensions, paddings and strides below are narrowed to uint16_t */
    const int32_t input_batches = input_dims->n;
    const uint16_t input_x = input_dims->w;
    const uint16_t input_y = input_dims->h;
    const uint16_t input_ch = input_dims->c;
    const uint16_t kernel_x = filter_dims->w;
    const uint16_t kernel_y = filter_dims->h;
    const uint16_t output_x = output_dims->w;
    const uint16_t output_y = output_dims->h;
    const uint16_t output_ch = output_dims->c;
    const uint16_t pad_x = conv_params->padding.w;
    const uint16_t pad_y = conv_params->padding.h;
    const uint16_t stride_x = conv_params->stride.w;
    const uint16_t stride_y = conv_params->stride.h;
    const int32_t input_offset = conv_params->input_offset;
    const int32_t out_offset = conv_params->output_offset;
    const int32_t out_activation_min = conv_params->activation.min;
    const int32_t out_activation_max = conv_params->activation.max;
    int32_t *output_mult = quant_params->multiplier;
    int32_t *output_shift = quant_params->shift;
    int i_batch;
    for (i_batch = 0; i_batch < input_batches; i_batch++)
    {
 #if defined(ARM_MATH_MVEI)
        /* Generate up to four columns from the input tensor for a GEMM computation */
        q7_t *im2col_buf = (q7_t *)buffer_a;
        q7_t *out = output_data;
        /* Number of complete columns currently held in the im2col buffer */
        int32_t buffer_fill_cnt = 0;
        /* Set once any buffered column contains padding; cleared when the buffer is flushed */
        int32_t padded = 0;
        const int32_t num_elem = kernel_x * kernel_y * input_ch;
        const int32_t dilation_x = conv_params->dilation.w;
        const int32_t dilation_y = conv_params->dilation.h;
        /* This part implements the im2col function */
        for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
        {
            for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
            {
                const int32_t base_idx_x = stride_x * i_out_x - pad_x;
                const int32_t base_idx_y = stride_y * i_out_y - pad_y;
                for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
                {
                    for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
                    {
                        const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
                        const int32_t k_x = base_idx_x + dilation_x * i_ker_x;
                        if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
                        {
                            /* Out-of-bound tap: fill with -input_offset and remember that padding was used */
                            memset(im2col_buf, (int8_t)-input_offset, sizeof(q7_t) * input_ch);
                            padded = 1;
                        }
                        else
                        {
                            /* In-bound tap: copy all channels of the input pixel */
                            arm_memcpy_q7(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch);
                        }
                        im2col_buf += input_ch;
                    }
                }
                buffer_fill_cnt++;
                /* Computation is performed for every 4 columns */
                if (buffer_fill_cnt == 4 && (padded == 0))
                {
                    /* No padding in any of the four columns: use the 4x core kernel */
                    buffer_fill_cnt = 0;
                    out = arm_nn_mat_mul_core_4x_s8(num_elem,
                                                    num_elem,
                                                    (q7_t *)buffer_a,
                                                    filter_data,
                                                    output_ch,
                                                    conv_params,
                                                    quant_params,
                                                    bias_data,
                                                    out);
                    im2col_buf = (q7_t *)buffer_a;
                }
                else if (buffer_fill_cnt == 4 && (padded != 0))
                {
                    /* At least one column contains padding: use the generic kernel on all four columns */
                    buffer_fill_cnt = 0;
                    out = arm_nn_mat_mult_s8(filter_data,
                                             (q7_t *)buffer_a,
                                             output_ch,
                                             4,
                                             output_shift,
                                             output_mult,
                                             out_offset,
                                             input_offset,
                                             0,
                                             out_activation_min,
                                             out_activation_max,
                                             num_elem,
                                             bias_data,
                                             out);
                    im2col_buf = (q7_t *)buffer_a;
                    padded = 0;
                }
            }
        }
        /* Handle left over columns */
        if (buffer_fill_cnt != 0)
        {
            out = arm_nn_mat_mult_s8(filter_data,
                                     (q7_t *)buffer_a,
                                     output_ch,
                                     buffer_fill_cnt,
                                     output_shift,
                                     output_mult,
                                     out_offset,
                                     input_offset,
                                     0,
                                     out_activation_min,
                                     out_activation_max,
                                     num_elem,
                                     bias_data,
                                     out);
        }
 #else // #if defined(ARM_MATH_MVEI)
        const uint16_t dilation_x = conv_params->dilation.w;
        const uint16_t dilation_y = conv_params->dilation.h;
        int32_t i_out_y, i_out_x, i_ker_y, i_ker_x;
        /* Generate two columns from the input tensor for a GEMM computation */
        q15_t *two_column_buf = buffer_a;
        q7_t *out = output_data;
        /* This part implements the im2col function */
        for (i_out_y = 0; i_out_y < output_y; i_out_y++)
        {
            for (i_out_x = 0; i_out_x < output_x; i_out_x++)
            {
                const int32_t base_idx_y = stride_y * i_out_y - pad_y;
                const int32_t base_idx_x = stride_x * i_out_x - pad_x;
                for (i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
                {
                    for (i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
                    {
                        const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
                        const int32_t k_x = base_idx_x + dilation_x * i_ker_x;
                        if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
                        {
                            /* Filling 0 for out-of-bound paddings */
                            memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
                        }
                        else
                        {
                            /* Copying the pixel data to column */
                            /* NOTE(review): presumably widens q7 to q15 while adding input_offset - confirm in helper */
                            arm_q7_to_q15_with_offset(
                                input_data + (k_y * input_x + k_x) * input_ch, two_column_buf, input_ch, input_offset);
                        }
                        two_column_buf += input_ch;
                    }
                }
                /* Computation is performed for every 2 columns */
                if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
                {
                    out = arm_nn_mat_mult_kernel_s8_s16(filter_data,
                                                        buffer_a,
                                                        output_ch,
                                                        output_shift,
                                                        output_mult,
                                                        out_offset,
                                                        out_activation_min,
                                                        out_activation_max,
                                                        input_ch * kernel_y * kernel_x,
                                                        bias_data,
                                                        out);
                    /* counter reset */
                    two_column_buf = buffer_a;
                }
            }
        }
        /* left-over because odd number of output pixels */
        if (two_column_buf != buffer_a)
        {
            /* ker_a is not reset per channel: each iteration consumes one filter row of
             * input_ch * kernel_y * kernel_x weights and leaves the pointer at the next row. */
            const q7_t *ker_a = filter_data;
            int i;
            for (i = 0; i < output_ch; i++)
            {
                /* Load the accumulator with bias first */
                q31_t sum = 0;
                if (bias_data)
                {
                    sum = bias_data[i];
                }
                /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
                const q15_t *ip_as_col = buffer_a;
                /* 4 multiply and accumulates are done in one loop. */
 #if defined(ARM_MATH_DSP)
                uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
                while (col_count)
                {
                    q31_t ker_a1, ker_a2;
                    q31_t ip_b1, ip_b2;
                    ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
                    ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
                    sum = __SMLAD(ker_a1, ip_b1, sum);
                    ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
                    sum = __SMLAD(ker_a2, ip_b2, sum);
                    col_count--;
                }
                /* Handle left over mac */
                col_count = input_ch * kernel_y * kernel_x & 0x3;
 #else
                /* Without DSP every element is processed by the scalar loop below */
                uint16_t col_count = input_ch * kernel_y * kernel_x;
 #endif
                while (col_count)
                {
                    q7_t ker_a1 = *ker_a++;
                    q15_t ip_b1 = *ip_as_col++;
                    sum += ker_a1 * ip_b1;
                    col_count--;
                }
                /* Requantize, add the output offset, then clamp to the activation range */
                sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
                sum += out_offset;
                sum = MAX(sum, out_activation_min);
                sum = MIN(sum, out_activation_max);
                *out++ = (q7_t)sum;
            }
        }
 #endif // #if defined(ARM_MATH_MVEI)
        /* Advance to the next batch */
        input_data += (input_x * input_y * input_ch);
        output_data += (output_x * output_y * output_ch);
    }
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /*
  * Scratch buffer size, in bytes, needed by arm_convolve_s8.
  *
  * One im2col column holds input_c * filter_w * filter_h elements.
  *  - ARM_MATH_MVEI: four int8 columns, each rounded up to a multiple of 8 elements.
  *  - otherwise:     two int16 columns.
  */
 int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
 {
    const int32_t column_elems = input_dims->c * filter_dims->w * filter_dims->h;
 #if defined(ARM_MATH_MVEI)
    /* Round the column up to whole groups of 8 elements; this depends on the
     * implementation of arm_nn_mat_mult_s8. */
    const int32_t groups_of_8 = (column_elems + 7) / 8;
    /* 4 -> number of im2col buffers, 8 -> 8 elements per Q register */
    return 4 * groups_of_8 * 8 * (int32_t)sizeof(int8_t);
 #else
    return (2 * column_elems) * (int32_t)sizeof(int16_t);
 #endif
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c
@@ -1,130 +0,0 @@
 /*
 * Copyright (C) 2021-2022 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_wrapper_s16.c
 * Description:  s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in
 * cmsis-nn to perform the convolution.
 *
 * $Date:        13 January 2022
 * $Revision:    V.1.2.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 * Convolution layer
 *
 * Refer header file for details.
 *
 */
 /*
  * Dispatch an s16 convolution to the most suitable kernel.
  *
  * When built with DSP and without MVEI, arm_convolve_fast_s16 is chosen if
  * filter_w * filter_h * input_c < 512 and both dilation factors are 1.
  * In every other case arm_convolve_s16 is used. The selected kernel's status
  * is returned unchanged.
  */
 arm_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
                                     const cmsis_nn_conv_params *conv_params,
                                     const cmsis_nn_per_channel_quant_params *quant_params,
                                     const cmsis_nn_dims *input_dims,
                                     const q15_t *input_data,
                                     const cmsis_nn_dims *filter_dims,
                                     const q7_t *filter_data,
                                     const cmsis_nn_dims *bias_dims,
                                     const int64_t *bias_data,
                                     const cmsis_nn_dims *output_dims,
                                     q15_t *output_data)
 {
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Fast path: short columns and no dilation */
    if (filter_dims->w * filter_dims->h * input_dims->c < 512 && conv_params->dilation.w == 1 &&
        conv_params->dilation.h == 1)
    {
        return arm_convolve_fast_s16(ctx,
                                     conv_params,
                                     quant_params,
                                     input_dims,
                                     input_data,
                                     filter_dims,
                                     filter_data,
                                     bias_dims,
                                     bias_data,
                                     output_dims,
                                     output_data);
    }
 #endif
    /* Generic path, shared by all build configurations */
    return arm_convolve_s16(ctx,
                            conv_params,
                            quant_params,
                            input_dims,
                            input_data,
                            filter_dims,
                            filter_data,
                            bias_dims,
                            bias_data,
                            output_dims,
                            output_data);
 }
 /*
  * Scratch buffer size, in bytes, for the kernel arm_convolve_wrapper_s16 would pick.
  *
  * Uses the same selection rule as the wrapper: with DSP and without MVEI,
  * filter_w * filter_h * input_c < 512 together with unit dilation selects the
  * fast kernel's size query; otherwise the size query of arm_convolve_s16 is used.
  * output_dims is unused.
  */
 int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
                                                  const cmsis_nn_dims *input_dims,
                                                  const cmsis_nn_dims *filter_dims,
                                                  const cmsis_nn_dims *output_dims)
 {
    (void)output_dims;
    /* conv_params is only read on the DSP path below */
    (void)conv_params;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    if (filter_dims->w * filter_dims->h * input_dims->c < 512 && conv_params->dilation.w == 1 &&
        conv_params->dilation.h == 1)
    {
        return arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims);
    }
 #endif
    return arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c
@@ -1,133 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_wrapper_s8.c
 * Description:  s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in
 * cmsis-nn to perform the convolution.
 *
 * $Date:        02. December 2021
 * $Revision:    V.1.1.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 * s8 convolution layer wrapper.
 *
 * Refer to the header file for the parameter description.
 *
 * Picks one of three kernels from the layer parameters and forwards every
 * argument unchanged; the status of the selected kernel is returned:
 *  - arm_convolve_1x1_s8_fast: no padding, unit stride, 1x1 filter, input
 *    channel count divisible by 4, no dilation.
 *  - arm_convolve_1_x_n_s8: input, filter and output height of 1, output
 *    width divisible by 4, a single batch, no dilation.
 *  - arm_convolve_s8: everything else.
 */
 arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
                                   const cmsis_nn_conv_params *conv_params,
                                   const cmsis_nn_per_channel_quant_params *quant_params,
                                   const cmsis_nn_dims *input_dims,
                                   const q7_t *input_data,
                                   const cmsis_nn_dims *filter_dims,
                                   const q7_t *filter_data,
                                   const cmsis_nn_dims *bias_dims,
                                   const int32_t *bias_data,
                                   const cmsis_nn_dims *output_dims,
                                   q7_t *output_data)
 {
    /* Properties that depend on the convolution parameters only */
    const int no_dilation = (conv_params->dilation.w == 1) && (conv_params->dilation.h == 1);
    const int no_padding = (conv_params->padding.w == 0) && (conv_params->padding.h == 0);
    const int unit_stride = (conv_params->stride.w == 1) && (conv_params->stride.h == 1);

    /* Pointwise (1x1) convolution */
    if (no_padding && (input_dims->c % 4 == 0) && unit_stride && (filter_dims->w == 1) && (filter_dims->h == 1) &&
        no_dilation)
    {
        return arm_convolve_1x1_s8_fast(ctx,
                                        conv_params,
                                        quant_params,
                                        input_dims,
                                        input_data,
                                        filter_dims,
                                        filter_data,
                                        bias_dims,
                                        bias_data,
                                        output_dims,
                                        output_data);
    }

    /* Single-row (1 x n) convolution */
    if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) &&
        (input_dims->n == 1) && no_dilation)
    {
        return arm_convolve_1_x_n_s8(ctx,
                                     conv_params,
                                     quant_params,
                                     input_dims,
                                     input_data,
                                     filter_dims,
                                     filter_data,
                                     bias_dims,
                                     bias_data,
                                     output_dims,
                                     output_data);
    }

    /* Generic kernel */
    return arm_convolve_s8(ctx,
                           conv_params,
                           quant_params,
                           input_dims,
                           input_data,
                           filter_dims,
                           filter_data,
                           bias_dims,
                           bias_data,
                           output_dims,
                           output_data);
 }
 /*
 * Scratch buffer size query for the s8 convolution wrapper.
 *
 * Evaluates the same kind of selection conditions as the s8 wrapper (1x1 fast
 * kernel, 1 x n kernel, generic kernel) and returns the value of the matching
 * *_get_buffer_size() function.
 */
 int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
                                                const cmsis_nn_dims *input_dims,
                                                const cmsis_nn_dims *filter_dims,
                                                const cmsis_nn_dims *output_dims)
 {
    /* Properties that depend on the convolution parameters only */
    const int no_dilation = (conv_params->dilation.w == 1) && (conv_params->dilation.h == 1);
    const int no_padding = (conv_params->padding.w == 0) && (conv_params->padding.h == 0);
    const int unit_stride = (conv_params->stride.w == 1) && (conv_params->stride.h == 1);

    /* Pointwise (1x1) convolution */
    if (no_padding && (input_dims->c % 4 == 0) && unit_stride && (filter_dims->w == 1) && (filter_dims->h == 1) &&
        no_dilation)
    {
        return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims);
    }

    /* Single-row (1 x n) convolution */
    if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) &&
        (input_dims->n == 1) && no_dilation)
    {
        return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims);
    }

    /* Generic kernel */
    return arm_convolve_s8_get_buffer_size(input_dims, filter_dims);
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c
@@ -1,212 +0,0 @@
 /*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_conv_3x3_s8.c
 * Description:  Optimized s8 depthwise convolution function for channel
 *               multiplier of 1 and 3x3 kernel size.
 *
 * $Date:        09. October 2020
 * $Revision:    V.2.0.1
 *
 * Target Processor:  Cortex-M CPUs
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 * Optimized s8 depthwise convolution function with constraint that
 * in_channel == out_channel and kernel_x == kernel_y == 3 with a horizontal
 * pad of at most 1.
 *
 * Refer to the prototype in the header file for the parameter description.
 *
 * Behaviour of this implementation:
 *  - Returns ARM_MATH_SIZE_MISMATCH when input_dims->c != output_dims->c.
 *  - Returns ARM_MATH_ARGUMENT_ERROR when padding.w > 1 or the filter is
 *    not 3x3.
 *  - Returns ARM_MATH_SUCCESS otherwise.
 *  - bias may be NULL; the accumulators then start at zero.
 *  - ctx and bias_dims are unused. input_dims->n and the dilation are not
 *    read by this function.
 */
 arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
                                     const cmsis_nn_dw_conv_params *dw_conv_params,
                                     const cmsis_nn_per_channel_quant_params *quant_params,
                                     const cmsis_nn_dims *input_dims,
                                     const q7_t *input,
                                     const cmsis_nn_dims *filter_dims,
                                     const q7_t *kernel,
                                     const cmsis_nn_dims *bias_dims,
                                     const int32_t *bias,
                                     const cmsis_nn_dims *output_dims,
                                     q7_t *output)
 {
    (void)ctx;
    (void)bias_dims;
    const int32_t input_x = input_dims->w;
    const int32_t input_y = input_dims->h;
    const int32_t input_ch = input_dims->c;
    const int32_t output_ch = output_dims->c;
    const int32_t pad_x = dw_conv_params->padding.w;
    const int32_t pad_y = dw_conv_params->padding.h;
    const int32_t stride_x = dw_conv_params->stride.w;
    const int32_t stride_y = dw_conv_params->stride.h;
    const int32_t *output_shift = quant_params->shift;
    const int32_t *output_mult = quant_params->multiplier;
    const int32_t output_x = output_dims->w;
    const int32_t output_y = output_dims->h;
    const int32_t output_offset = dw_conv_params->output_offset;
    const int32_t input_offset = dw_conv_params->input_offset;
    const int32_t output_activation_min = dw_conv_params->activation.min;
    const int32_t output_activation_max = dw_conv_params->activation.max;
    /* Check input constraint input_ch == output_ch */
    if (input_ch != output_ch)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
    /* Check input constraints pad_x <= 1 and a 3x3 filter */
    if (pad_x > 1 || filter_dims->w != 3 || filter_dims->h != 3)
    {
        return ARM_MATH_ARGUMENT_ERROR;
    }
    /* in_h / in_w: input coordinate that lines up with kernel tap (0, 0) for the current output element */
    for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
    {
        /* ker_h_start: first kernel row that does not lie above the input */
        for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
        {
            int32_t in_ch = 0;
            /* Non-zero when the left kernel column lies outside the input */
            int32_t ker_w_start = MAX(0, -in_w);
            /* Main loop: four channels per iteration */
            for (; in_ch <= (input_ch - 4); in_ch += 4)
            {
                int32_t out_buff0 = 0;
                int32_t out_buff1 = 0;
                int32_t out_buff2 = 0;
                int32_t out_buff3 = 0;
                /* Bias is optional; without it the accumulators start at zero */
                if (bias)
                {
                    out_buff0 = bias[in_ch + 0];
                    out_buff1 = bias[in_ch + 1];
                    out_buff2 = bias[in_ch + 2];
                    out_buff3 = bias[in_ch + 3];
                }
                const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
                const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;
                /* The upper bound skips kernel rows that lie below the input */
                for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
                {
                    /* in_val / ker_val carry four channels, one per byte; they are unpacked with shifts below */
                    int32_t in_val = 0;
                    int32_t ker_val = 0;
                    /* Left kernel column, only when it is inside the input */
                    if (ker_w_start == 0)
                    {
                        in_val = arm_nn_read_q7x4(input_ptr);
                        ker_val = arm_nn_read_q7x4(kernel_ptr);
                        out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                        out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                        out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                        out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
                    }
                    /* Middle kernel column, accumulated unconditionally */
                    in_val = arm_nn_read_q7x4(input_ptr + input_ch);
                    ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch);
                    out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                    out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                    out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                    out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
                    /* Right kernel column, only when it is inside the input */
                    if ((input_x - in_w) >= 3)
                    {
                        in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1));
                        ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1));
                        out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                        out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                        out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                        out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
                    }
                    /* Advance to the next input row and the next kernel row */
                    input_ptr += (input_ch * input_x);
                    kernel_ptr += (input_ch * 3);
                }
                /* Per-channel requantization, output offset and activation clamp */
                out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]);
                out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]);
                out_buff2 = arm_nn_requantize(out_buff2, output_mult[in_ch + 2], output_shift[in_ch + 2]);
                out_buff3 = arm_nn_requantize(out_buff3, output_mult[in_ch + 3], output_shift[in_ch + 3]);
                out_buff0 += output_offset;
                out_buff1 += output_offset;
                out_buff2 += output_offset;
                out_buff3 += output_offset;
                out_buff0 = MIN(MAX(out_buff0, output_activation_min), output_activation_max);
                out_buff1 = MIN(MAX(out_buff1, output_activation_min), output_activation_max);
                out_buff2 = MIN(MAX(out_buff2, output_activation_min), output_activation_max);
                out_buff3 = MIN(MAX(out_buff3, output_activation_min), output_activation_max);
                output[out_idx++] = (int8_t)out_buff0;
                output[out_idx++] = (int8_t)out_buff1;
                output[out_idx++] = (int8_t)out_buff2;
                output[out_idx++] = (int8_t)out_buff3;
            }
            /* Leftover channels (input_ch not a multiple of 4), one at a time */
            for (; in_ch < input_ch; ++in_ch)
            {
                /* Bias is optional; without it the accumulator starts at zero */
                int32_t out_buff = bias ? bias[in_ch] : 0;
                const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
                const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;
                for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
                {
                    /* Left kernel column, only when it is inside the input */
                    if (ker_w_start == 0)
                    {
                        out_buff += (*(input_ptr) + input_offset) * *(kernel_ptr);
                    }
                    /* Middle kernel column, accumulated unconditionally */
                    out_buff += (*(input_ptr + input_ch) + input_offset) * *(kernel_ptr + input_ch);
                    /* Right kernel column, only when it is inside the input */
                    if ((input_x - in_w) >= 3)
                    {
                        out_buff += (*(input_ptr + (input_ch << 1)) + input_offset) * *(kernel_ptr + (input_ch << 1));
                    }
                    input_ptr += (input_ch * input_x);
                    kernel_ptr += (input_ch * 3);
                }
                out_buff = arm_nn_requantize(out_buff, output_mult[in_ch], output_shift[in_ch]);
                out_buff += output_offset;
                out_buff = MIN(MAX(out_buff, output_activation_min), output_activation_max);
                output[out_idx++] = (int8_t)out_buff;
            }
        }
    }
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c
@@ -1,292 +0,0 @@
 /*
 * Copyright (C) 2022 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_conv_s16.c
 * Description:  s16 version of depthwise convolution.
 *
 * $Date:        26. Jan 2022
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M CPUs
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 * s16 depthwise convolution variant that produces four output channels per
 * step of the channel-multiplier loop (the loop advances in steps of 4).
 * The function is marked unused.
 *
 * Accumulation is done in 64 bit; each result is requantized with
 * arm_nn_requantize_s64() using the reduced multiplier, clamped to
 * [output_activation_min, output_activation_max] and stored as int16_t.
 * bias may be NULL, in which case the accumulators start at zero.
 */
 static void __attribute__((unused)) depthwise_conv_s16_mult_4_s16(const int16_t *input,
                                                                  const int32_t input_x,
                                                                  const int32_t input_y,
                                                                  const int32_t input_ch,
                                                                  const int8_t *kernel,
                                                                  const int32_t output_ch,
                                                                  const int32_t ch_mult,
                                                                  const int32_t kernel_x,
                                                                  const int32_t kernel_y,
                                                                  const int32_t pad_x,
                                                                  const int32_t pad_y,
                                                                  const int32_t stride_x,
                                                                  const int32_t stride_y,
                                                                  const int64_t *bias,
                                                                  int16_t *output,
                                                                  const int32_t *output_shift,
                                                                  const int32_t *output_mult,
                                                                  const int32_t output_x,
                                                                  const int32_t output_y,
                                                                  const int32_t output_activation_min,
                                                                  const int32_t output_activation_max)
 {
    /* Write position in the output, advanced once per stored value */
    int32_t dst_pos = 0;
    /* Input row that lines up with kernel row 0 */
    int32_t row_in = -pad_y;
    for (int32_t row_out = 0; row_out < output_y; ++row_out, row_in += stride_y)
    {
        /* Kernel rows that overlap the input for this output row */
        const int32_t first_ker_row = MAX(0, -row_in);
        const int32_t end_ker_row = MIN(kernel_y, input_y - row_in);
        /* Input column that lines up with kernel column 0 */
        int32_t col_in = -pad_x;
        for (int32_t col_out = 0; col_out < output_x; ++col_out, col_in += stride_x)
        {
            /* Kernel columns that overlap the input for this output column */
            const int32_t first_ker_col = MAX(0, -col_in);
            const int32_t end_ker_col = MIN(kernel_x, input_x - col_in);
            int32_t ch_in = 0;
            for (int32_t ch_out = 0; ch_out < output_ch; ch_out += ch_mult, ++ch_in)
            {
                for (int tile = 0; tile < ch_mult; tile += 4)
                {
                    /* Index of the first of the four output channels handled here */
                    const int32_t lane0 = ch_out + tile;
                    int32_t mult[4];
                    int64_t acc[4] = {0, 0, 0, 0};
                    for (int lane = 0; lane < 4; ++lane)
                    {
                        mult[lane] = REDUCE_MULTIPLIER(output_mult[lane0 + lane]);
                    }
                    if (bias)
                    {
                        for (int lane = 0; lane < 4; ++lane)
                        {
                            acc[lane] = bias[lane0 + lane];
                        }
                    }
                    for (int32_t kr = first_ker_row; kr < end_ker_row; ++kr)
                    {
                        int32_t w_pos = kr * (output_ch * kernel_x) + first_ker_col * output_ch + ch_out;
                        const int32_t src_row = (row_in + kr) * (input_ch * input_x) + col_in * input_ch + ch_in;
 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
 #pragma clang loop unroll(disable)
 #endif
                        for (int32_t kc = first_ker_col; kc < end_ker_col; ++kc, w_pos += output_ch)
                        {
                            // TODO: Unroll of 4 with 64 bit accumulator will probably result in too much register
                            // spills. Try with unroll of 2 when enabling this.
                            const int32_t sample = input[src_row + kc * input_ch];
                            acc[0] += sample * kernel[w_pos + 0 + tile];
                            acc[1] += sample * kernel[w_pos + 1 + tile];
                            acc[2] += sample * kernel[w_pos + 2 + tile];
                            acc[3] += sample * kernel[w_pos + 3 + tile];
                        }
                    }
                    /* Requantize, clamp and store the four results in channel order */
                    for (int lane = 0; lane < 4; ++lane)
                    {
                        int32_t quantized = arm_nn_requantize_s64(acc[lane], mult[lane], output_shift[lane0 + lane]);
                        quantized = MIN(MAX(quantized, output_activation_min), output_activation_max);
                        output[dst_pos++] = (int16_t)quantized;
                    }
                }
            }
        }
    }
 }
 /*
 * Generic s16 depthwise convolution with optional dilation.
 *
 * Loops over batches, output rows, output columns, input channels and the
 * channel multiplier. For each output value the kernel taps that fall inside
 * the input are accumulated in 64 bit, requantized with
 * arm_nn_requantize_s64(), clamped to
 * [output_activation_min, output_activation_max] and written sequentially to
 * output. bias may be NULL, in which case the accumulator starts at zero.
 */
 static void depthwise_conv_s16_generic_s16(const int16_t *input,
                                           const uint16_t input_batches,
                                           const uint16_t input_x,
                                           const uint16_t input_y,
                                           const uint16_t input_ch,
                                           const int8_t *kernel,
                                           const uint16_t ch_mult,
                                           const uint16_t kernel_x,
                                           const uint16_t kernel_y,
                                           const uint16_t pad_x,
                                           const uint16_t pad_y,
                                           const uint16_t stride_x,
                                           const uint16_t stride_y,
                                           const int64_t *bias,
                                           int16_t *output,
                                           const int32_t *output_shift,
                                           const int32_t *output_mult,
                                           const uint16_t output_x,
                                           const uint16_t output_y,
                                           const int32_t output_activation_min,
                                           const int32_t output_activation_max,
                                           const uint16_t dilation_x,
                                           const uint16_t dilation_y)
 {
    /* Number of output channels; also the kernel index step between taps */
    const int total_out_ch = input_ch * ch_mult;
    for (int batch = 0; batch < input_batches; batch++)
    {
        for (int oy = 0; oy < output_y; oy++)
        {
            /* Input row that lines up with kernel row 0 */
            const int16_t origin_y = (oy * stride_y) - pad_y;
            /* Kernel rows [ky_first, ky_last) that land inside the input */
            int ky_first;
            int ky_last;
            if (dilation_y > 1)
            {
                const int32_t start_y_max = (-origin_y + dilation_y - 1) / dilation_y;
                ky_first = MAX(0, start_y_max);
                const int32_t end_min_y = (input_y - origin_y + dilation_y - 1) / dilation_y;
                ky_last = MIN(kernel_y, end_min_y);
            }
            else
            {
                ky_first = MAX(0, -origin_y);
                ky_last = MIN(kernel_y, input_y - origin_y);
            }
            for (int ox = 0; ox < output_x; ox++)
            {
                /* Input column that lines up with kernel column 0 */
                const int16_t origin_x = (ox * stride_x) - pad_x;
                /* Kernel columns [kx_first, kx_last) that land inside the input */
                int kx_first;
                int kx_last;
                if (dilation_x > 1)
                {
                    const int32_t start_x_max = (-origin_x + dilation_x - 1) / dilation_x;
                    kx_first = MAX(0, start_x_max);
                    const int32_t end_min_x = (input_x - origin_x + dilation_x - 1) / dilation_x;
                    kx_last = MIN(kernel_x, end_min_x);
                }
                else
                {
                    kx_first = MAX(0, -origin_x);
                    kx_last = MIN(kernel_x, input_x - origin_x);
                }
                for (int ch_in = 0; ch_in < input_ch; ch_in++)
                {
                    for (int m = 0; m < ch_mult; m++)
                    {
                        const int ch_out = m + ch_in * ch_mult;
                        const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[ch_out]);
                        /* 64-bit accumulator, seeded with the bias when one is supplied */
                        int64_t acc = 0;
                        if (bias)
                        {
                            acc = bias[ch_out];
                        }
                        for (int ky = ky_first; ky < ky_last; ky++)
                        {
                            const int32_t in_y = origin_y + dilation_y * ky;
                            for (int kx = kx_first; kx < kx_last; kx++)
                            {
                                const int32_t in_x = origin_x + dilation_x * kx;
                                const int32_t src_idx = (in_y * input_x + in_x) * input_ch + ch_in;
                                const int32_t ker_idx = (ky * kernel_x + kx) * total_out_ch + ch_out;
                                acc += input[src_idx] * kernel[ker_idx];
                            }
                        }
                        /* Requantize and clamp output to provided range */
                        int32_t quantized = arm_nn_requantize_s64(acc, reduced_multiplier, output_shift[ch_out]);
                        quantized = MAX(quantized, output_activation_min);
                        quantized = MIN(quantized, output_activation_max);
                        *output++ = (int16_t)quantized;
                    }
                }
            }
        }
        /* Advance to the next batch */
        input += (input_x * input_y * input_ch);
    }
 }
 /*
 *  Basic s16 depthwise convolution function.
 *
 *  Refer to the header file for the parameter description.
 *
 *  Unpacks the dimension and parameter structures and forwards the values to
 *  depthwise_conv_s16_generic_s16(). ctx and bias_dims are unused. The
 *  function always returns ARM_MATH_SUCCESS.
 */
 arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
                                  const cmsis_nn_dw_conv_params *dw_conv_params,
                                  const cmsis_nn_per_channel_quant_params *quant_params,
                                  const cmsis_nn_dims *input_dims,
                                  const q15_t *input,
                                  const cmsis_nn_dims *filter_dims,
                                  const q7_t *kernel,
                                  const cmsis_nn_dims *bias_dims,
                                  const int64_t *bias,
                                  const cmsis_nn_dims *output_dims,
                                  q15_t *output)
 {
    (void)ctx;
    (void)bias_dims;
    depthwise_conv_s16_generic_s16(input,
                                   /* input shape: batches, width, height, channels */
                                   input_dims->n,
                                   input_dims->w,
                                   input_dims->h,
                                   input_dims->c,
                                   kernel,
                                   dw_conv_params->ch_mult,
                                   /* filter width and height */
                                   filter_dims->w,
                                   filter_dims->h,
                                   /* padding and stride, width first */
                                   dw_conv_params->padding.w,
                                   dw_conv_params->padding.h,
                                   dw_conv_params->stride.w,
                                   dw_conv_params->stride.h,
                                   bias,
                                   output,
                                   /* per-channel quantization parameters */
                                   quant_params->shift,
                                   quant_params->multiplier,
                                   /* output width and height */
                                   output_dims->w,
                                   output_dims->h,
                                   /* activation clamp range */
                                   dw_conv_params->activation.min,
                                   dw_conv_params->activation.max,
                                   /* dilation, width first */
                                   dw_conv_params->dilation.w,
                                   dw_conv_params->dilation.h);
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c
@@ -1,347 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_conv_s8.c
 * Description:  s8 version of depthwise convolution.
 *
 * $Date:        30. Dec 2021
 * $Revision:    V.2.7.1
 *
 * Target Processor:  Cortex-M CPUs
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
  * @brief s8 depthwise convolution kernel that emits four output channels per tile.
  *
  * For every output position the channel-multiplier range of each input channel is
  * walked in tiles of four (mult_tile advances by 4). The four accumulators of a tile
  * share the same input sample and differ only in the kernel element they read.
  * Kernel taps that would land outside the input are skipped by clamping the kernel
  * row/column ranges, so no padding values are read.
  *
  * Layout, as implied by the index arithmetic below:
  *   input  : (y * input_x + x) * input_ch + in_channel
  *   kernel : (ker_y * kernel_x + ker_x) * output_ch + out_channel
  *   output : written contiguously in (y, x, out_channel) order
  *
  * @param[in]  input                  Input tensor (single batch is processed)
  * @param[in]  input_x, input_y       Input width / height
  * @param[in]  input_ch               Number of input channels
  * @param[in]  kernel                 Filter weights
  * @param[in]  output_ch              Number of output channels
  * @param[in]  ch_mult                Channel multiplier; consumed in steps of four
  * @param[in]  kernel_x, kernel_y     Filter width / height
  * @param[in]  pad_x, pad_y           Left / top padding
  * @param[in]  stride_x, stride_y     Strides
  * @param[in]  bias                   Per output channel bias, may be NULL
  * @param[out] output                 Output tensor
  * @param[in]  output_shift           Per output channel requantization shift
  * @param[in]  output_mult            Per output channel requantization multiplier
  * @param[in]  output_x, output_y     Output width / height
  * @param[in]  output_offset          Added after requantization
  * @param[in]  input_offset           Added to every input sample before multiplication
  * @param[in]  output_activation_min  Lower clamp applied to the result
  * @param[in]  output_activation_max  Upper clamp applied to the result
  */
 static void depthwise_conv_s8_mult_4(const int8_t *input,
                                      const int32_t input_x,
                                      const int32_t input_y,
                                      const int32_t input_ch,
                                      const int8_t *kernel,
                                      const int32_t output_ch,
                                      const int32_t ch_mult,
                                      const int32_t kernel_x,
                                      const int32_t kernel_y,
                                      const int32_t pad_x,
                                      const int32_t pad_y,
                                      const int32_t stride_x,
                                      const int32_t stride_y,
                                      const int32_t *bias,
                                      int8_t *output,
                                      const int32_t *output_shift,
                                      const int32_t *output_mult,
                                      const int32_t output_x,
                                      const int32_t output_y,
                                      const int32_t output_offset,
                                      const int32_t input_offset,
                                      const int32_t output_activation_min,
                                      const int32_t output_activation_max)
 {
    /* Running write position for the scalar store path */
    int32_t out_idx = 0;

    for (int32_t out_h = 0; out_h < output_y; ++out_h)
    {
        /* Input row that lines up with kernel row 0 (negative inside the top padding) */
        const int32_t in_h = out_h * stride_y - pad_y;
        const int32_t ker_h_first = MAX(0, -in_h);
        const int32_t ker_h_limit = MIN(kernel_y, input_y - in_h);

        for (int32_t out_w = 0; out_w < output_x; ++out_w)
        {
            /* Input column that lines up with kernel column 0 */
            const int32_t in_w = out_w * stride_x - pad_x;
            const int32_t ker_w_first = MAX(0, -in_w);
            const int32_t ker_w_limit = MIN(kernel_x, input_x - in_w);

            for (int32_t in_ch = 0, out_ch = 0; out_ch < output_ch; ++in_ch, out_ch += ch_mult)
            {
                for (int32_t mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
                {
                    /* First of the four output channels handled by this tile */
                    const int32_t lane_base = out_ch + mult_tile;
                    int32_t acc[4] = {0, 0, 0, 0};

                    if (bias)
                    {
                        for (int32_t lane = 0; lane < 4; ++lane)
                        {
                            acc[lane] = bias[lane_base + lane];
                        }
                    }

                    for (int32_t ker_h = ker_h_first; ker_h < ker_h_limit; ++ker_h)
                    {
                        const int32_t in_row = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
                        const int32_t ker_row = ker_h * (output_ch * kernel_x) + lane_base;
 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
 #pragma clang loop unroll(disable)
 #endif
                        for (int32_t ker_w = ker_w_first; ker_w < ker_w_limit; ++ker_w)
                        {
                            const int32_t in_val = input[in_row + ker_w * input_ch] + input_offset;
                            const int32_t ker_pos = ker_row + ker_w * output_ch;

                            for (int32_t lane = 0; lane < 4; ++lane)
                            {
                                acc[lane] += in_val * kernel[ker_pos + lane];
                            }
                        }
                    }
 #if defined(ARM_MATH_MVEI)
                    /* Vector path: requantize, offset, clamp and store all four lanes at once */
                    (void)out_idx;
                    int32x4_t res = vldrwq_s32(acc);
                    res = arm_requantize_mve_32x4(
                        res, vldrwq_s32(&output_mult[lane_base]), vldrwq_s32(&output_shift[lane_base]));
                    res = vaddq_n_s32(res, output_offset);
                    res = vmaxq_s32(res, vdupq_n_s32(output_activation_min));
                    res = vminq_s32(res, vdupq_n_s32(output_activation_max));
                    vstrbq_s32(output, res);
                    output += 4;
 #else
                    /* Scalar path: requantize, offset, clamp and store lane by lane */
                    for (int32_t lane = 0; lane < 4; ++lane)
                    {
                        int32_t result =
                            arm_nn_requantize(acc[lane], output_mult[lane_base + lane], output_shift[lane_base + lane]);
                        result += output_offset;
                        result = MAX(result, output_activation_min);
                        result = MIN(result, output_activation_max);
                        output[out_idx++] = (int8_t)result;
                    }
 #endif
                }
            }
        }
    }
 }
 /**
  * @brief Generic s8 depthwise convolution supporting batches, any channel multiplier
  *        and dilation.
  *
  * One output value is produced per (batch, y, x, input channel, multiplier index).
  * The range of kernel taps is clamped so that only taps landing inside the input are
  * accumulated; with dilation > 1 the clamp is computed in units of dilated steps.
  *
  * Layout, as implied by the index arithmetic below:
  *   input  : (y * input_x + x) * input_ch + in_channel, batches stored back to back
  *   kernel : (ker_y * kernel_x + ker_x) * (input_ch * ch_mult) + out_channel
  *   output : written contiguously in iteration order
  *
  * @param[in]  input                  Input tensor
  * @param[in]  input_batches          Number of batches
  * @param[in]  input_x, input_y       Input width / height
  * @param[in]  input_ch               Number of input channels
  * @param[in]  kernel                 Filter weights
  * @param[in]  output_ch              Unused; output channels are input_ch * ch_mult
  * @param[in]  ch_mult                Channel multiplier
  * @param[in]  kernel_x, kernel_y     Filter width / height
  * @param[in]  pad_x, pad_y           Left / top padding
  * @param[in]  stride_x, stride_y     Strides
  * @param[in]  bias                   Per output channel bias, may be NULL
  * @param[out] output                 Output tensor
  * @param[in]  output_shift           Per output channel requantization shift
  * @param[in]  output_mult            Per output channel requantization multiplier
  * @param[in]  output_x, output_y     Output width / height
  * @param[in]  output_offset          Added after requantization
  * @param[in]  input_offset           Added to every input sample before multiplication
  * @param[in]  output_activation_min  Lower clamp applied to the result
  * @param[in]  output_activation_max  Upper clamp applied to the result
  * @param[in]  dilation_x, dilation_y Dilation factors
  */
 static void depthwise_conv_s8_generic(const q7_t *input,
                                       const uint16_t input_batches,
                                       const uint16_t input_x,
                                       const uint16_t input_y,
                                       const uint16_t input_ch,
                                       const q7_t *kernel,
                                       const uint16_t output_ch,
                                       const uint16_t ch_mult,
                                       const uint16_t kernel_x,
                                       const uint16_t kernel_y,
                                       const uint16_t pad_x,
                                       const uint16_t pad_y,
                                       const uint16_t stride_x,
                                       const uint16_t stride_y,
                                       const int32_t *bias,
                                       q7_t *output,
                                       const int32_t *output_shift,
                                       const int32_t *output_mult,
                                       const uint16_t output_x,
                                       const uint16_t output_y,
                                       const int32_t output_offset,
                                       const int32_t input_offset,
                                       const int32_t output_activation_min,
                                       const int32_t output_activation_max,
                                       const uint16_t dilation_x,
                                       const uint16_t dilation_y)
 {
    (void)output_ch;

    const q7_t *batch_input = input;
    int out_pos = 0;

    for (int batch = 0; batch < input_batches; ++batch)
    {
        for (int out_y = 0; out_y < output_y; ++out_y)
        {
            /* Input row aligned with kernel row 0 (negative inside the top padding) */
            const int16_t origin_y = (out_y * stride_y) - pad_y;

            /* Valid kernel rows: undilated clamp first, replaced below when dilated */
            int ker_y_first = MAX(0, -origin_y);
            int ker_y_limit = MIN(kernel_y, input_y - origin_y);
            if (dilation_y > 1)
            {
                const int32_t first_step_y = (-origin_y + dilation_y - 1) / dilation_y;
                const int32_t limit_step_y = (input_y - origin_y + dilation_y - 1) / dilation_y;
                ker_y_first = MAX(0, first_step_y);
                ker_y_limit = MIN(kernel_y, limit_step_y);
            }

            for (int out_x = 0; out_x < output_x; ++out_x)
            {
                /* Input column aligned with kernel column 0 */
                const int16_t origin_x = (out_x * stride_x) - pad_x;

                /* Valid kernel columns, same scheme as for the rows */
                int ker_x_first = MAX(0, -origin_x);
                int ker_x_limit = MIN(kernel_x, input_x - origin_x);
                if (dilation_x > 1)
                {
                    const int32_t first_step_x = (-origin_x + dilation_x - 1) / dilation_x;
                    const int32_t limit_step_x = (input_x - origin_x + dilation_x - 1) / dilation_x;
                    ker_x_first = MAX(0, first_step_x);
                    ker_x_limit = MIN(kernel_x, limit_step_x);
                }

                for (int in_ch = 0; in_ch < input_ch; ++in_ch)
                {
                    for (int mult = 0; mult < ch_mult; ++mult)
                    {
                        const int out_ch = mult + in_ch * ch_mult;
                        int32_t acc = bias ? bias[out_ch] : 0;

                        for (int ker_y = ker_y_first; ker_y < ker_y_limit; ++ker_y)
                        {
                            const int32_t in_y = origin_y + dilation_y * ker_y;

                            for (int ker_x = ker_x_first; ker_x < ker_x_limit; ++ker_x)
                            {
                                const int32_t in_x = origin_x + dilation_x * ker_x;
                                const int32_t in_pos = (in_y * input_x + in_x) * input_ch + in_ch;
                                const int32_t ker_pos = (ker_y * kernel_x + ker_x) * (input_ch * ch_mult) + out_ch;

                                acc += (batch_input[in_pos] + input_offset) * kernel[ker_pos];
                            }
                        }

                        /* Requantize, apply the output offset and clamp to the activation range */
                        acc = arm_nn_requantize(acc, output_mult[out_ch], output_shift[out_ch]);
                        acc += output_offset;
                        acc = MAX(acc, output_activation_min);
                        acc = MIN(acc, output_activation_max);

                        output[out_pos++] = (q7_t)acc;
                    }
                }
            }
        }

        /* Step to the first sample of the next batch */
        batch_input += (input_x * input_y * input_ch);
    }
 }
 /*
  *  Basic s8 depthwise convolution function.
  *
  *  Refer header file for details.
  *  Optimization using DSP extension is not available for the generic case where channel multiplier is > 1.
  *
  *  Dispatch rule implemented here: the four-channels-per-tile kernel is used when the
  *  channel multiplier is a multiple of 4, there is a single batch and both dilations
  *  are 1. Every other configuration goes through the generic kernel.
  *  ctx and bias_dims are not used. ARM_MATH_SUCCESS is always returned.
  */
 arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
                                  const cmsis_nn_dw_conv_params *dw_conv_params,
                                  const cmsis_nn_per_channel_quant_params *quant_params,
                                  const cmsis_nn_dims *input_dims,
                                  const q7_t *input,
                                  const cmsis_nn_dims *filter_dims,
                                  const q7_t *kernel,
                                  const cmsis_nn_dims *bias_dims,
                                  const int32_t *bias,
                                  const cmsis_nn_dims *output_dims,
                                  q7_t *output)
 {
    (void)bias_dims;
    (void)ctx;

    /* Narrowed copies handed to the generic kernel */
    const uint16_t dilation_x = dw_conv_params->dilation.w;
    const uint16_t dilation_y = dw_conv_params->dilation.h;

    /* The predicate tests the un-narrowed dilation fields on purpose */
    const int use_mult_4_kernel = (dw_conv_params->ch_mult % 4 == 0) && (input_dims->n == 1) &&
        (dw_conv_params->dilation.w == 1) && (dw_conv_params->dilation.h == 1);

    if (!use_mult_4_kernel)
    {
        depthwise_conv_s8_generic(input,
                                  input_dims->n,
                                  input_dims->w,
                                  input_dims->h,
                                  input_dims->c,
                                  kernel,
                                  output_dims->c,
                                  dw_conv_params->ch_mult,
                                  filter_dims->w,
                                  filter_dims->h,
                                  dw_conv_params->padding.w,
                                  dw_conv_params->padding.h,
                                  dw_conv_params->stride.w,
                                  dw_conv_params->stride.h,
                                  bias,
                                  output,
                                  quant_params->shift,
                                  quant_params->multiplier,
                                  output_dims->w,
                                  output_dims->h,
                                  dw_conv_params->output_offset,
                                  dw_conv_params->input_offset,
                                  dw_conv_params->activation.min,
                                  dw_conv_params->activation.max,
                                  dilation_x,
                                  dilation_y);

        /* Return to application */
        return ARM_MATH_SUCCESS;
    }

    depthwise_conv_s8_mult_4(input,
                             input_dims->w,
                             input_dims->h,
                             input_dims->c,
                             kernel,
                             output_dims->c,
                             dw_conv_params->ch_mult,
                             filter_dims->w,
                             filter_dims->h,
                             dw_conv_params->padding.w,
                             dw_conv_params->padding.h,
                             dw_conv_params->stride.w,
                             dw_conv_params->stride.h,
                             bias,
                             output,
                             quant_params->shift,
                             quant_params->multiplier,
                             output_dims->w,
                             output_dims->h,
                             dw_conv_params->output_offset,
                             dw_conv_params->input_offset,
                             dw_conv_params->activation.min,
                             dw_conv_params->activation.max);

    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c
@@ -1,433 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_conv_s8_opt.c
 * Description:  Optimized s8 depthwise separable convolution function for
 *               channel multiplier of 1.
 *
 * $Date:        January 26, 2021
 * $Revision:    V.2.0.3
 *
 * Target Processor:  Cortex-M CPUs
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
  * Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel
  *
  *  Refer prototype header file for details.
  *
  *  Behaviour visible in this definition:
  *   - Returns ARM_MATH_SIZE_MISMATCH when input_dims->c differs from output_dims->c.
  *   - Returns ARM_MATH_ARGUMENT_ERROR when ctx->buf is NULL although
  *     arm_depthwise_conv_s8_opt_get_buffer_size() reports a non-zero scratch size.
  *   - ARM_MATH_DSP and ARM_MATH_MVEI defined: input patches are gathered as q7 data into
  *     ctx->buf and handed to the arm_nn_depthwise_conv_nt_t_* helpers four output
  *     positions at a time; leftover positions are computed inline with MVE intrinsics.
  *   - ARM_MATH_DSP only: one patch per output position is expanded to q15 in ctx->buf and
  *     four channels at a time are accumulated with SMLAD.
  *   - ARM_MATH_DSP not defined: the call is forwarded to arm_depthwise_conv_s8().
  *   - The DSP/MVEI paths do not read dw_conv_params->dilation or input_dims->n.
  *   - NOTE(review): the DSP loop and the MVE leftover loop read bias without a NULL
  *     check; confirm callers always provide a bias on these paths.
  *
  */
 arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
                                      const cmsis_nn_dw_conv_params *dw_conv_params,
                                      const cmsis_nn_per_channel_quant_params *quant_params,
                                      const cmsis_nn_dims *input_dims,
                                      const q7_t *input,
                                      const cmsis_nn_dims *filter_dims,
                                      const q7_t *kernel,
                                      const cmsis_nn_dims *bias_dims,
                                      const int32_t *bias,
                                      const cmsis_nn_dims *output_dims,
                                      q7_t *output)
 {
    const int32_t input_ch = input_dims->c;
    const int32_t output_ch = output_dims->c;
    /* Check input constraints input_ch == output_ch */
    if (input_ch != output_ch)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
    /* A scratch buffer is mandatory whenever the size query reports a non-zero requirement */
    if (ctx->buf == NULL && arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims) > 0)
    {
        return ARM_MATH_ARGUMENT_ERROR;
    }
 #ifdef ARM_MATH_DSP
    /* Unpack the parameter structures once; both accelerated paths below use these locals */
    const int32_t input_x = input_dims->w;
    const int32_t input_y = input_dims->h;
    const int32_t kernel_x = filter_dims->w;
    const int32_t kernel_y = filter_dims->h;
    const int32_t pad_x = dw_conv_params->padding.w;
    const int32_t pad_y = dw_conv_params->padding.h;
    const int32_t stride_x = dw_conv_params->stride.w;
    const int32_t stride_y = dw_conv_params->stride.h;
    const int32_t *output_shift = quant_params->shift;
    const int32_t *output_mult = quant_params->multiplier;
    const int32_t output_x = output_dims->w;
    const int32_t output_y = output_dims->h;
    const int32_t output_offset = dw_conv_params->output_offset;
    const int32_t input_offset = dw_conv_params->input_offset;
    const int32_t output_activation_min = dw_conv_params->activation.min;
    const int32_t output_activation_max = dw_conv_params->activation.max;
    /* Caller supplied scratch memory */
    q15_t *buffer_a = (q15_t *)ctx->buf;
 #ifdef ARM_MATH_MVEI
    (void)bias_dims;
    /* Gather im2col patches as q7 data; four patches are collected before the kernel helper is called */
    q7_t *lhs_buffer = (q7_t *)buffer_a;
    q7_t *out = output;
    /* Set when at least one patch of the current group contains an out-of-bounds tap */
    int padded = 0;
    /* Number of patches currently stored in the scratch buffer */
    int buffer_count = 0;
    const int32_t kernel_size = kernel_x * kernel_y;
    /* This part implements the im2col function */
    for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++)
    {
        for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++)
        {
            for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++)
            {
                for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
                    {
                        /* Out-of-bounds tap: store -input_offset for every channel. The leftover loop
                           below adds input_offset back, which makes such taps contribute zero. */
                        arm_memset_q7(lhs_buffer, (int8_t)-input_offset, (uint32_t)input_ch);
                        padded = 1;
                    }
                    else
                    {
                        /* In-bounds tap: copy all channels of this input pixel */
                        arm_memcpy_q7(lhs_buffer, input + (i_ker_y * input_x + i_ker_x) * input_ch, (uint32_t)input_ch);
                    }
                    lhs_buffer += input_ch;
                }
            }
            buffer_count++;
            /* Four patches collected: process them together and start refilling from the buffer start */
            if (buffer_count == 4)
            {
                lhs_buffer = (q7_t *)buffer_a;
                if (padded == 0)
                {
                    out = arm_nn_depthwise_conv_nt_t_s8(lhs_buffer,
                                                        kernel,
                                                        input_offset,
                                                        input_ch,
                                                        output_shift,
                                                        output_mult,
                                                        output_offset,
                                                        output_activation_min,
                                                        output_activation_max,
                                                        kernel_size,
                                                        bias,
                                                        out);
                }
                else
                {
                    /* At least one patch of this group touched the padding: use the padded variant */
                    out = arm_nn_depthwise_conv_nt_t_padded_s8(lhs_buffer,
                                                               kernel,
                                                               input_offset,
                                                               input_ch,
                                                               output_shift,
                                                               output_mult,
                                                               output_offset,
                                                               output_activation_min,
                                                               output_activation_max,
                                                               kernel_size,
                                                               bias,
                                                               out);
                    padded = 0;
                }
                buffer_count = 0;
            }
        }
    }
    /* Handle left over buffers */
    /* buffer_count (0..3) patches are still pending; they are computed here four channels per vector */
    lhs_buffer = (q7_t *)buffer_a;
    for (int i_buf = 0; i_buf < buffer_count; i_buf++)
    {
        /* Number of 4-channel groups, rounded up */
        int32_t loop_count = (input_ch + 3) / 4;
        int32_t num_ch_to_process = input_ch;
        for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; num_ch_to_process -= 4, offset += 4, i_loop_cnt++)
        {
            /* col_0: patch i_buf at channel 'offset'; row_0: kernel weights for the same channels */
            const int8_t *col_0 = lhs_buffer + (kernel_size * input_ch * i_buf) + offset;
            const int8_t *row_0 = kernel + offset;
            /* NOTE(review): bias, output_mult and output_shift are loaded as full 4-lane vectors even
               for the last partial channel group, and bias is not checked for NULL here. */
            int32x4_t out_0 = vldrwq_s32(&bias[offset]);
            for (int i_ker = 0; i_ker < kernel_size; i_ker++)
            {
                const int32x4_t ker_0 = vldrbq_s32(row_0);
                int32x4_t ip_0 = vldrbq_s32(col_0);
                ip_0 = vaddq_n_s32(ip_0, input_offset);
                out_0 += vmulq_s32(ip_0, ker_0);
                /* Consecutive taps of one channel are input_ch elements apart in both buffers */
                col_0 += input_ch;
                row_0 += input_ch;
            }
            const int32x4_t mult = vldrwq_s32(&output_mult[offset]);
            const int32x4_t shift = vldrwq_s32(&output_shift[offset]);
            out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
            out_0 = vaddq_n_s32(out_0, output_offset);
            out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min));
            out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max));
            /* Predicate built from the number of channels still to be written, used for the store below */
            mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process);
            vstrbq_p_s32(out, out_0, p);
            out += 4;
        }
        /* The loop above always advanced 'out' by 4; step back over the lanes of a partial last group */
        const int tail_ch = input_ch & 0x3;
        if (tail_ch != 0)
        {
            out -= (4 - tail_ch);
        }
    }
 #else /* ARM_MATH_DSP defined, ARM_MATH_MVEI not defined */
    (void)bias_dims;
    /* Run the following code in cores using DSP extension */
    q15_t *const col_buffer_start = buffer_a;
    q15_t *col_buffer = col_buffer_start;
    /* Start positions of the per-channel arrays; the walking pointers are rewound for every output position */
    const int32_t *const bias_start_pos = bias;
    const q31_t *const out_mult_start_pos = output_mult;
    const q31_t *const out_shift_start_pos = output_shift;
    uint16_t row_count;
    uint16_t row_shift;
    for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
    {
        const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
        {
            const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
            /* Out of bounds is only considered for the y axis as it provides a contiguous zero'ing opportunity than
               along the x axis */
            const int ker_y_start = MAX(0, -base_idx_y);
            /* Condition for kernel end dimension: (base_idx_y + ker_y_end) < input_y */
            const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
            /* Write position inside the q15 column buffer */
            int32_t index = 0;
            if (ker_y_start != 0)
            {
                /* Kernel rows above the input: zero them in one go */
                memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t));
                index += (kernel_x * input_ch) * ker_y_start;
            }
            for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
            {
                const int32_t idx_y = base_idx_y + i_ker_y;
                for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
                {
                    const int32_t idx_x = base_idx_x + i_ker_x;
                    if (idx_x < 0 || idx_x >= input_x)
                    {
                        /* Tap left or right of the input: zero all channels of this tap */
                        memset(&col_buffer[index], 0, input_ch * sizeof(q15_t));
                    }
                    else
                    {
                        /* Widen the pixel's channels to q15; presumably input_offset is added by the helper
                           (the accumulation below applies no offset itself) */
                        arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch,
                                                  &col_buffer[index],
                                                  input_ch,
                                                  input_offset);
                    }
                    index += input_ch;
                }
            }
            /* Kernel rows below the input: zero the remainder of the column buffer */
            const int diff = kernel_y - ker_y_end;
            if (diff != 0)
            {
                memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t));
            }
            /* Channels are processed four at a time; the remainder is handled after this loop */
            row_count = output_ch / 4;
            row_shift = 0;
            bias = bias_start_pos;
            output_mult = out_mult_start_pos;
            output_shift = out_shift_start_pos;
            while (row_count)
            {
                /* One accumulator per channel, seeded with its bias */
                q31_t sum = *bias++;
                q31_t sum_2 = *bias++;
                q31_t sum_3 = *bias++;
                q31_t sum_4 = *bias++;
                /* Kernel taps are consumed in pairs; an odd last tap is handled after the paired loop */
                uint16_t col_count = (kernel_x * kernel_y) / 2;
                q15_t *col_pos = col_buffer_start + row_shift;
                const q7_t *row_pos = kernel + row_shift;
                row_shift += 4;
                while (col_count)
                {
                    /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to
                    use in a SMLAD instruction . One run of this loop produces 4 partial outputs with 8 MACs. */
                    /* Note: variable names can be improved here to align with rows and columns. */
                    q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c;
                    /* Read 4 weights */
                    ip_b1 = arm_nn_read_q7x4(row_pos);
                    ip_a1 = arm_nn_read_q7x4(row_pos + input_ch);
                    op_a = arm_nn_read_q15x2(col_pos);
                    op_b = arm_nn_read_q15x2(col_pos + input_ch);
                    ip_a2 = __SXTB16(ip_b1);
                    ip_b1 = __SXTB16(__ROR(ip_b1, 8));
                    ip_b2 = __SXTB16(ip_a1);
                    ip_a1 = __SXTB16(__ROR(ip_a1, 8));
                    op_c = __PKHBT(op_b, op_a, 16);
                    op_a = __PKHTB(op_b, op_a, 16);
                    op_b = __PKHBT(ip_b2, ip_a2, 16);
                    sum = __SMLAD(op_c, op_b, sum);
                    op_b = __PKHBT(ip_b1, ip_a1, 16);
                    sum_2 = __SMLAD(op_a, op_b, sum_2);
                    op_a = arm_nn_read_q15x2(col_pos + 2);
                    op_b = arm_nn_read_q15x2(col_pos + input_ch + 2);
                    op_c = __PKHBT(op_b, op_a, 16);
                    op_a = __PKHTB(op_b, op_a, 16);
                    op_b = __PKHTB(ip_a2, ip_b2, 16);
                    sum_3 = __SMLAD(op_c, op_b, sum_3);
                    op_b = __PKHTB(ip_a1, ip_b1, 16);
                    sum_4 = __SMLAD(op_a, op_b, sum_4);
                    /* Two taps were consumed, so advance by two tap strides */
                    row_pos += input_ch << 1;
                    col_pos += input_ch << 1;
                    col_count--;
                }
                /* Remaining single tap when kernel_x * kernel_y is odd */
                col_count = (kernel_x * kernel_y) & 0x1;
                while (col_count)
                {
                    sum += row_pos[0] * col_pos[0];
                    sum_2 += row_pos[1] * col_pos[1];
                    sum_3 += row_pos[2] * col_pos[2];
                    sum_4 += row_pos[3] * col_pos[3];
                    row_pos += input_ch;
                    col_pos += input_ch;
                    col_count--;
                }
                /* Requantize, add the output offset, clamp and store each of the four channels */
                sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
                sum += output_offset;
                sum = MAX(sum, output_activation_min);
                sum = MIN(sum, output_activation_max);
                *output++ = (q7_t)sum;
                sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++);
                sum_2 += output_offset;
                sum_2 = MAX(sum_2, output_activation_min);
                sum_2 = MIN(sum_2, output_activation_max);
                *output++ = (q7_t)sum_2;
                sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++);
                sum_3 += output_offset;
                sum_3 = MAX(sum_3, output_activation_min);
                sum_3 = MIN(sum_3, output_activation_max);
                *output++ = (q7_t)sum_3;
                sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++);
                sum_4 += output_offset;
                sum_4 = MAX(sum_4, output_activation_min);
                sum_4 = MIN(sum_4, output_activation_max);
                *output++ = (q7_t)sum_4;
                row_count--;
            }
            /* Channels left over after the groups of four are processed one at a time */
            row_count = output_ch & 0x3;
            while (row_count)
            {
                q15_t *col_pos = col_buffer_start + row_shift;
                const q7_t *row_pos = kernel + row_shift;
                q31_t sum = *bias++;
                const uint16_t col_count = (kernel_x * kernel_y);
                row_shift += 1;
                for (int i = 0; i < col_count; i++)
                {
                    sum += row_pos[i * input_ch] * col_pos[i * input_ch];
                }
                sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
                sum += output_offset;
                sum = MAX(sum, output_activation_min);
                sum = MIN(sum, output_activation_max);
                *output++ = (q7_t)sum;
                row_count--;
            }
            // clear counter and pointers
            col_buffer = col_buffer_start;
        }
    }
 #endif
 #else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    return arm_depthwise_conv_s8(ctx,
                                 dw_conv_params,
                                 quant_params,
                                 input_dims,
                                 input,
                                 filter_dims,
                                 kernel,
                                 bias_dims,
                                 bias,
                                 output_dims,
                                 output);
 #endif /* ARM_MATH_MVEI | ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /*
  * Scratch buffer size, in bytes, that arm_depthwise_conv_s8_opt() expects in ctx->buf.
  *
  *  - ARM_MATH_MVEI : two int16-sized slots per (channel, kernel tap) element, plus 4 bytes.
  *  - ARM_MATH_DSP  : one int16 slot per (channel, kernel tap) element.
  *  - otherwise     : 0, the arguments are not read.
  */
 int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
 {
 #if defined(ARM_MATH_MVEI)
    const int32_t slot_count = 2 * input_dims->c * filter_dims->w * filter_dims->h;
    /* The + 4 accounts for out of bounds read of the lhs buffers in the *_nt_t_* functions.  */
    return slot_count * (int32_t)sizeof(int16_t) + 4;
 #elif defined(ARM_MATH_DSP)
    const int32_t slot_count = input_dims->c * filter_dims->w * filter_dims->h;
    return slot_count * sizeof(int16_t);
 #else
    (void)input_dims;
    (void)filter_dims;
    return 0;
 #endif
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c
@@ -1,336 +0,0 @@
 /*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_conv_u8_basic_ver1.c
 * Description:  u8 depthwise convolution function
 *
 * $Date:        09. October 2020
 * $Revision:    V.1.1.1
 *
 * Target :  Cortex-M CPUs
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 * u8 depthwise convolution kernel that produces four output channels per step.
 *
 * Output channels are handled in tiles of four starting at out_ch + mult_tile; every
 * tile reads four kernel values (and four bias values when bias is not NULL) and
 * writes four output bytes, so the caller is expected to use it only when ch_mult is
 * a multiple of 4.
 * Kernel taps that would fall outside the input are skipped by clipping the kernel
 * window rather than by reading padding.
 * Each accumulator is requantized with output_mult/output_shift, shifted by
 * output_offset, clamped to [output_activation_min, output_activation_max] and stored
 * sequentially into output as uint8_t.
 */
 static void depthwise_conv_u8_mult_4(const uint8_t *input,
                                     const int32_t input_x,
                                     const int32_t input_y,
                                     const int32_t input_ch,
                                     const uint8_t *kernel,
                                     const int32_t output_ch,
                                     const int32_t ch_mult,
                                     const int32_t kernel_x,
                                     const int32_t kernel_y,
                                     const int32_t pad_x,
                                     const int32_t pad_y,
                                     const int32_t stride_x,
                                     const int32_t stride_y,
                                     const int32_t *bias,
                                     uint8_t *output,
                                     const int32_t output_shift,
                                     const int32_t output_mult,
                                     const int32_t output_x,
                                     const int32_t output_y,
                                     const int32_t output_offset,
                                     const int32_t input_offset,
                                     const int32_t filter_offset,
                                     const int32_t output_activation_min,
                                     const int32_t output_activation_max)
 {
    int32_t out_idx = 0;
    int32_t in_h = -pad_y;

    for (int32_t out_h = 0; out_h < output_y; ++out_h, in_h += stride_y)
    {
        /* First kernel row that lands inside the input for this output row */
        const int32_t ker_h_start = MAX(0, -in_h);
        int32_t in_w = -pad_x;

        for (int32_t out_w = 0; out_w < output_x; ++out_w, in_w += stride_x)
        {
            /* First kernel column that lands inside the input for this output column */
            const int32_t ker_w_start = MAX(0, -in_w);
            int32_t in_ch = 0;

            for (int32_t out_ch = 0; out_ch < output_ch; out_ch += ch_mult, ++in_ch)
            {
                for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
                {
                    int32_t acc[4] = {0, 0, 0, 0};
                    const int32_t ker_h_end = MIN(kernel_y, input_y - in_h);

                    for (int32_t ker_h = ker_h_start; ker_h < ker_h_end; ++ker_h)
                    {
                        const int32_t ker_w_end = MIN(kernel_x, input_x - in_w);
                        const int32_t in_base = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
                        int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;

                        for (int32_t ker_w = ker_w_start; ker_w < ker_w_end; ++ker_w)
                        {
                            /* One input sample feeds all four channels of the tile */
                            const int32_t in_val = input[in_base + ker_w * input_ch] + input_offset;

                            for (int lane = 0; lane < 4; ++lane)
                            {
                                acc[lane] += in_val * (kernel[ker_idx + lane + mult_tile] + filter_offset);
                            }
                            ker_idx += output_ch;
                        }
                    }

                    if (bias != NULL)
                    {
                        for (int lane = 0; lane < 4; ++lane)
                        {
                            acc[lane] += bias[out_ch + lane + mult_tile];
                        }
                    }

                    /* Requantize, re-centre, clamp and emit the four results in order */
                    for (int lane = 0; lane < 4; ++lane)
                    {
                        int32_t value = arm_nn_requantize(acc[lane], output_mult, output_shift);
                        value += output_offset;
                        value = MIN(MAX(value, output_activation_min), output_activation_max);
                        output[out_idx++] = (uint8_t)value;
                    }
                }
            }
        }
    }
 }
 /*
 * Generic u8 depthwise convolution kernel, one output channel at a time.
 *
 * For every output position and every (input channel, channel multiplier) pair the
 * kernel window is clipped to the part that overlaps the input, the products of
 * (input + input_offset) and (kernel + filter_offset) are accumulated, the optional
 * bias is added, and the result is requantized with output_mult/output_shift, shifted
 * by output_offset, clamped to [output_activation_min, output_activation_max] and
 * stored sequentially into output.
 *
 * output_ch is unused; the output channel index is derived from input_ch and ch_mult.
 *
 * The window origin (base_idx_x/y) is kept in int32_t: the operands are int32_t and a
 * narrower type would silently truncate large output_x * stride_x products.
 */
 static void depthwise_conv_u8_generic(const uint8_t *input,
                                      const int32_t input_x,
                                      const int32_t input_y,
                                      const int32_t input_ch,
                                      const uint8_t *kernel,
                                      const int32_t output_ch,
                                      const int32_t ch_mult,
                                      const int32_t kernel_x,
                                      const int32_t kernel_y,
                                      const int32_t pad_x,
                                      const int32_t pad_y,
                                      const int32_t stride_x,
                                      const int32_t stride_y,
                                      const int32_t *bias,
                                      uint8_t *output,
                                      const int32_t output_shift,
                                      const int32_t output_mult,
                                      const int32_t output_x,
                                      const int32_t output_y,
                                      const int32_t output_offset,
                                      const int32_t input_offset,
                                      const int32_t filter_offset,
                                      const int32_t output_activation_min,
                                      const int32_t output_activation_max)
 {
    (void)output_ch;
    int32_t i_out = 0;
    for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++)
    {
        const int32_t base_idx_y = (i_out_y * stride_y) - pad_y;
        /* Condition for kernel start dimension: (base_idx_y + ker_y_start) >= 0 */
        const int32_t ker_y_start = MAX(0, -base_idx_y);
        /* Condition for kernel end dimension: (base_idx_y + ker_y_end) < input_y */
        const int32_t ker_y_end = MIN(kernel_y, input_y - base_idx_y);
        for (int32_t i_out_x = 0; i_out_x < output_x; i_out_x++)
        {
            const int32_t base_idx_x = (i_out_x * stride_x) - pad_x;
            /* Same clipping along the width; independent of the channel loops below */
            const int32_t ker_x_start = MAX(0, -base_idx_x);
            const int32_t ker_x_end = MIN(kernel_x, input_x - base_idx_x);
            for (int32_t i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
            {
                for (int32_t i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
                {
                    const int32_t idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
                    int32_t acc_0 = 0;
                    for (int32_t i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
                    {
                        const int32_t idx_y = base_idx_y + i_ker_y;
                        for (int32_t i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
                        {
                            const int32_t idx_x = base_idx_x + i_ker_x;
                            const int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
                            const int32_t ker_idx_0 =
                                (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;
                            acc_0 += (input[idx_0] + input_offset) * (kernel[ker_idx_0] + filter_offset);
                        }
                    }
                    if (bias != NULL)
                    {
                        acc_0 += bias[idx_out_ch];
                    }
                    /* Requantize and clamp output to provided range */
                    acc_0 = arm_nn_requantize(acc_0, output_mult, output_shift);
                    acc_0 += output_offset;
                    acc_0 = MAX(acc_0, output_activation_min);
                    acc_0 = MIN(acc_0, output_activation_max);
                    output[i_out++] = (uint8_t)acc_0;
                }
            }
        }
    }
 }
 /**
 * @brief uint8 depthwise convolution function with asymmetric quantization
 *
 * @param[in]     input     Pointer to input tensor
 * @param[in]     input_x   Width of input tensor
 * @param[in]     input_y   Height of input tensor
 * @param[in]     input_ch  Channels in input tensor
 * @param[in]     kernel    Pointer to kernel weights
 * @param[in]     kernel_x  Width of kernel
 * @param[in]     kernel_y  Height of kernel
 * @param[in]     ch_mult   Number of channel multiplier
 * @param[in]     pad_x     Padding sizes x
 * @param[in]     pad_y     Padding sizes y
 * @param[in]     stride_x  Convolution stride along the width
 * @param[in]     stride_y  Convolution stride along the height
 * @param[in]     dilation_x Dilation along width. Not used and intended for future enhancement.
 * @param[in]     dilation_y Dilation along height. Not used and intended for future enhancement.
 * @param[in]     bias       Pointer to optional bias values. If no bias is
 *                           available, NULL is expected
 * @param[in]     input_offset  Input tensor zero offset
 * @param[in]     filter_offset Kernel tensor zero offset
 * @param[in]     output_offset Output tensor zero offset
 * @param[in,out] output        Pointer to output tensor
 * @param[in]     output_x  Width of output tensor
 * @param[in]     output_y  Height of output tensor
 * @param[in]     output_activation_min   Minimum value to clamp the output to. Range : {0, 255}
 * @param[in]     output_activation_max   Maximum value to clamp the output to. Range : {0, 255}
 * @param[in]     output_shift  Amount of right-shift for output
 * @param[in]     output_mult   Output multiplier for requantization
 * @return        The function returns one of the following
 *                <code>ARM_MATH_SIZE_MISMATCH</code> - Not supported dimension of tensors
 *                <code>ARM_MATH_SUCCESS</code> - Successful operation
 *                <code>ARM_MATH_ARGUMENT_ERROR</code> - Implementation not available
 *
 *
 */
 arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
                                            const uint16_t input_x,
                                            const uint16_t input_y,
                                            const uint16_t input_ch,
                                            const uint8_t *kernel,
                                            const uint16_t kernel_x,
                                            const uint16_t kernel_y,
                                            const int16_t ch_mult,
                                            const int16_t pad_x,
                                            const int16_t pad_y,
                                            const int16_t stride_x,
                                            const int16_t stride_y,
                                            const int16_t dilation_x,
                                            const int16_t dilation_y,
                                            const int32_t *bias,
                                            const int32_t input_offset,
                                            const int32_t filter_offset,
                                            const int32_t output_offset,
                                            uint8_t *output,
                                            const uint16_t output_x,
                                            const uint16_t output_y,
                                            const int32_t output_activation_min,
                                            const int32_t output_activation_max,
                                            const int32_t output_shift,
                                            const int32_t output_mult)
 {
    /* Dilation is part of the signature but is not applied by either kernel. */
    (void)dilation_x;
    (void)dilation_y;

    /* Both kernels take the total number of output channels. */
    const int32_t output_ch = ch_mult * input_ch;

    if (ch_mult % 4 != 0)
    {
        /* Channel multiplier is not a multiple of 4: one output channel at a time. */
        depthwise_conv_u8_generic(input, input_x, input_y, input_ch,
                                  kernel, output_ch, ch_mult, kernel_x, kernel_y,
                                  pad_x, pad_y, stride_x, stride_y,
                                  bias, output, output_shift, output_mult,
                                  output_x, output_y,
                                  output_offset, input_offset, filter_offset,
                                  output_activation_min, output_activation_max);
    }
    else
    {
        /* Channel multiplier is a multiple of 4: four output channels per step. */
        depthwise_conv_u8_mult_4(input, input_x, input_y, input_ch,
                                 kernel, output_ch, ch_mult, kernel_x, kernel_y,
                                 pad_x, pad_y, stride_x, stride_y,
                                 bias, output, output_shift, output_mult,
                                 output_x, output_y,
                                 output_offset, input_offset, filter_offset,
                                 output_activation_min, output_activation_max);
    }

    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c
@@ -1,135 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_conv_wrapper_s8.c
 * Description:  Wrapper API to select appropriate depthwise conv API based
 *               on dimensions.
 *
 * $Date:        20. Dec 2021
 * $Revision:    V.1.4.0
 *
 * Target Processor:  Cortex-M CPUs
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /*
 *  s8 Depthwise conv wrapper function
 *
 *  Refer header file for details.
 *
 */
 arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
                                         const cmsis_nn_dw_conv_params *dw_conv_params,
                                         const cmsis_nn_per_channel_quant_params *quant_params,
                                         const cmsis_nn_dims *input_dims,
                                         const q7_t *input,
                                         const cmsis_nn_dims *filter_dims,
                                         const q7_t *filter,
                                         const cmsis_nn_dims *bias_dims,
                                         const int32_t *bias,
                                         const cmsis_nn_dims *output_dims,
                                         q7_t *output)
 {
    /* The specialised kernels are only selected for ch_mult == 1, batch == 1, no dilation. */
    const int specialised_ok = (1 == dw_conv_params->ch_mult) && (input_dims->n == 1) &&
        (dw_conv_params->dilation.w == 1) && (dw_conv_params->dilation.h == 1);

    if (!specialised_ok)
    {
        /* Everything else goes through the general s8 implementation. */
        return arm_depthwise_conv_s8(
            ctx, dw_conv_params, quant_params, input_dims, input, filter_dims, filter, bias_dims, bias, output_dims, output);
    }

 #if !defined(ARM_MATH_MVEI)
    /* Without MVE, 3x3 filters with padding of at most 1 use the dedicated 3x3 kernel. */
    if ((filter_dims->w == 3) && (filter_dims->h == 3) && (dw_conv_params->padding.h <= 1) &&
        (dw_conv_params->padding.w <= 1))
    {
        return arm_depthwise_conv_3x3_s8(
            ctx, dw_conv_params, quant_params, input_dims, input, filter_dims, filter, bias_dims, bias, output_dims, output);
    }
 #endif

    return arm_depthwise_conv_s8_opt(
        ctx, dw_conv_params, quant_params, input_dims, input, filter_dims, filter, bias_dims, bias, output_dims, output);
 }
 /*
 * Scratch-buffer size for arm_depthwise_conv_wrapper_s8.
 *
 * Returns the value of arm_depthwise_conv_s8_opt_get_buffer_size() when input and
 * output channel counts match, the batch size is 1 and both dilations are 1;
 * returns 0 in every other case.
 */
 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
                                                      const cmsis_nn_dims *input_dims,
                                                      const cmsis_nn_dims *filter_dims,
                                                      const cmsis_nn_dims *output_dims)
 {
    if (input_dims->c != output_dims->c || input_dims->n != 1 || dw_conv_params->dilation.w != 1 ||
        dw_conv_params->dilation.h != 1)
    {
        return 0;
    }

    return arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims);
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c
@@ -1,422 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_separable_conv_HWC_q7.c
 * Description:  Q7 depthwise separable convolution function
 *
 * $Date:        July 20, 2021
 * $Revision:    V.1.1.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Q7 depthwise separable convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in equals ch_im_out
 *
 * Implementation:
 * There are 3 nested loops here:
 * Inner loop: calculate each output value with MAC instructions over an accumulator
 * Mid   loop: loop over the different output channels
 * Outer loop: loop over the different output positions (x, y)
 */
 arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
                                               const uint16_t dim_im_in,
                                               const uint16_t ch_im_in,
                                               const q7_t *wt,
                                               const uint16_t ch_im_out,
                                               const uint16_t dim_kernel,
                                               const uint16_t padding,
                                               const uint16_t stride,
                                               const q7_t *bias,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               q7_t *Im_out,
                                               const uint16_t dim_im_out,
                                               q15_t *bufferA,
                                               q7_t *bufferB)
 {
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    int16_t i_out_y, i_out_x;
    int16_t i_ker_y, i_ker_x;
    q7_t *colBuffer = (q7_t *)bufferA;
    q7_t *pBuffer = colBuffer;
    const q7_t *pBias = bias;
    q7_t *pOut = Im_out;
    uint16_t rowCnt;
    uint16_t row_shift;
    /* do some checking here, basically ch_im_in == ch_im_out */
    if (ch_im_in != ch_im_out)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            /* we first do im2col here */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* arm_fill_q7(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, ch_im_in);
                    }
                    else
                    {
                        /* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
                         */
                        memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }
            /* we will do the computation here for each channel */
            rowCnt = ch_im_out >> 2;
            row_shift = 0;
            pBias = bias;
            while (rowCnt)
            {
                q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                uint16_t colCnt = (dim_kernel * dim_kernel) >> 1;
                q7_t *pB = colBuffer + row_shift;
                const q7_t *pA = wt + row_shift;
                row_shift += 4;
 #ifdef USE_INTRINSIC
 #ifndef ARM_MATH_BIG_ENDIAN
                while (colCnt)
                {
                    q31_t inA1, inA2, inB1, inB2, opA, opB;
                    inB1 = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    opB = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    inB2 = __PKHTB(opB, inB1, 16);
                    inB1 = __PKHBT(inB1, opB, 16);
                    inA1 = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    opB = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inA2 = __PKHTB(opB, inA1, 16);
                    inA1 = __PKHBT(inA1, opB, 16);
                    opA = __SXTB16(inA1);
                    opB = __SXTB16(inB1);
                    sum = __SMLAD(opA, opB, sum);
                    opA = __SXTB16(__ROR(inA1, 8));
                    opB = __SXTB16(__ROR(inB1, 8));
                    sum2 = __SMLAD(opA, opB, sum2);
                    opA = __SXTB16(inA2);
                    opB = __SXTB16(inB2);
                    sum3 = __SMLAD(opA, opB, sum3);
                    opA = __SXTB16(__ROR(inA2, 8));
                    opB = __SXTB16(__ROR(inB2, 8));
                    sum4 = __SMLAD(opA, opB, sum4);
                    colCnt--;
                }
 #else
                while (colCnt)
                {
                    q31_t inA1, inA2, inB1, inB2, opA, opB;
                    inB1 = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    opB = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    inB2 = __PKHBT(opB, inB1, 16);
                    inB1 = __PKHTB(inB1, opB, 16);
                    inA1 = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    opB = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inA2 = __PKHBT(opB, inA1, 16);
                    inA1 = __PKHTB(inA1, opB, 16);
                    opA = __SXTB16(inA1);
                    opB = __SXTB16(inB1);
                    sum2 = __SMLAD(opA, opB, sum2);
                    opA = __SXTB16(__ROR(inA1, 8));
                    opB = __SXTB16(__ROR(inB1, 8));
                    sum = __SMLAD(opA, opB, sum);
                    opA = __SXTB16(inA2);
                    opB = __SXTB16(inB2);
                    sum4 = __SMLAD(opA, opB, sum4);
                    opA = __SXTB16(__ROR(inA2, 8));
                    opB = __SXTB16(__ROR(inB2, 8));
                    sum3 = __SMLAD(opA, opB, sum3);
                    colCnt--;
                }
 #endif /* ARM_MATH_BIG_ENDIAN */
 #else
 #ifndef ARM_MATH_BIG_ENDIAN
                /*
                 *   r0    r1    r2    r3    r4   r5
                 *  inA1, inA2, inB1, inB2, opA, opB
                 */
                asm volatile("COL_LOOP_%=:\n"
                             "ldr.w r2, [%[pB], #0]\n"
                             "add.w %[pB], %[pB], %[ch_im_in]\n"
                             "ldr.w r5, [%[pB], #0]\n"
                             "add.w %[pB], %[pB], %[ch_im_in]\n"
                             "pkhtb r3, r5, r2, ASR #16\n"
                             "pkhbt r2, r2, r5, LSL #16\n"
                             "ldr.w r0, [%[pA], #0]\n"
                             "add.w %[pA], %[pA], %[ch_im_in]\n"
                             "ldr.w r5, [%[pA], #0]\n"
                             "add.w %[pA], %[pA], %[ch_im_in]\n"
                             "pkhtb r1, r5, r0, ASR #16\n"
                             "pkhbt r0, r0, r5, LSL #16\n"
                             "sxtb16 r4, r0\n"
                             "sxtb16 r5, r2\n"
                             "smlad %[sum], r4, r5, %[sum]\n"
                             "mov.w r4, r0, ror #8\n"
                             "mov.w r5, r2, ror #8\n"
                             "sxtb16 r4, r4\n"
                             "sxtb16 r5, r5\n"
                             "smlad %[sum2], r4, r5, %[sum2]\n"
                             "sxtb16 r4, r1\n"
                             "sxtb16 r5, r3\n"
                             "smlad %[sum3], r4, r5, %[sum3]\n"
                             "mov.w r4, r1, ror #8\n"
                             "mov.w r5, r3, ror #8\n"
                             "sxtb16 r4, r4\n"
                             "sxtb16 r5, r5\n"
                             "smlad %[sum4], r4, r5, %[sum4]\n"
                             "subs %[colCnt], #1\n"
                             "bne COL_LOOP_%=\n"
                             : [ sum ] "+r"(sum),
                               [ sum2 ] "+r"(sum2),
                               [ sum3 ] "+r"(sum3),
                               [ sum4 ] "+r"(sum4),
                               [ pB ] "+r"(pB),
                               [ pA ] "+r"(pA)
                             : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in)
                             : "r0", "r1", "r2", "r3", "r4", "r5");
 #else
                /*
                 *  r0    r1    r2    r3    r4   r5
                 * inA1, inA2, inB1, inB2, opA, opB
                 */
                asm volatile("COL_LOOP_%=:\n"
                             "ldr.w r2, [%[pB], #0]\n"
                             "add.w %[pB], %[pB], %[ch_im_in]\n"
                             "ldr.w r5, [%[pB], #0]\n"
                             "add.w %[pB], %[pB], %[ch_im_in]\n"
                             "pkhbt r3, r5, r2, LSL #16\n"
                             "pkhtb r2, r2, r5, ASR #16\n"
                             "ldr.w r0, [%[pA], #0]\n"
                             "add.w %[pA], %[pA], %[ch_im_in]\n"
                             "ldr.w r5, [%[pA], #0]\n"
                             "add.w %[pA], %[pA], %[ch_im_in]\n"
                             "pkhbt r1, r5, r0, LSL #16\n"
                             "pkhtb r0, r0, r5, ASR #16\n"
                             "sxtb16 r4, r0\n"
                             "sxtb16 r5, r2\n"
                             "smlad %[sum2], r4, r5, %[sum2]\n"
                             "mov.w r4, r0, ror #8\n"
                             "mov.w r5, r2, ror #8\n"
                             "sxtb16 r4, r4\n"
                             "sxtb16 r5, r5\n"
                             "smlad %[sum], r4, r5, %[sum]\n"
                             "sxtb16 r4, r1\n"
                             "sxtb16 r5, r3\n"
                             "smlad %[sum4], r4, r5, %[sum4]\n"
                             "mov.w r4, r1, ror #8\n"
                             "mov.w r5, r3, ror #8\n"
                             "sxtb16 r4, r4\n"
                             "sxtb16 r5, r5\n"
                             "smlad %[sum3], r4, r5, %[sum3]\n"
                             "subs %[colCnt], #1\n"
                             "bne COL_LOOP_%=\n"
                             : [ sum ] "+r"(sum),
                               [ sum2 ] "+r"(sum2),
                               [ sum3 ] "+r"(sum3),
                               [ sum4 ] "+r"(sum4),
                               [ pB ] "+r"(pB),
                               [ pA ] "+r"(pA)
                             : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in)
                             : "r0", "r1", "r2", "r3", "r4", "r5");
 #endif /* ARM_MATH_BIG_ENDIAN */
 #endif /* USE_INTRINSIC */
                colCnt = (dim_kernel * dim_kernel) & 0x1;
                while (colCnt)
                {
                    union arm_nnword inA, inB;
                    inA.word = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inB.word = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    sum += inA.bytes[0] * inB.bytes[0];
                    sum2 += inA.bytes[1] * inB.bytes[1];
                    sum3 += inA.bytes[2] * inB.bytes[2];
                    sum4 += inA.bytes[3] * inB.bytes[3];
                    colCnt--;
                }
                *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
                rowCnt--;
            }
            rowCnt = ch_im_out & 0x3;
            while (rowCnt)
            {
                q7_t *pB = colBuffer + row_shift;
                const q7_t *pA = wt + row_shift;
                q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                uint16_t colCnt = (dim_kernel * dim_kernel);
                row_shift += 1;
                while (colCnt)
                {
                    q7_t A1 = *pA;
                    q7_t B1 = *pB;
                    pA += ch_im_in;
                    pB += ch_im_in;
                    sum += A1 * B1;
                    colCnt--;
                }
                *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
                rowCnt--;
            }
            /* clear counter and pointers */
            pBuffer = colBuffer;
        }
    }
 #else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y;
    int conv_out;
    /* do some checking here, basically ch_im_in == ch_im_out */
    if (ch_im_in != ch_im_out)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
            {
                // for each output
                conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
                for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++)
                {
                    for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++)
                    {
                        int in_row = stride * i_out_y + i_ker_y - padding;
                        int in_col = stride * i_out_x + i_ker_x - padding;
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
                        {
                            conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + i_ch_out] *
                                wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out];
                        }
                    }
                }
                Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] =
                    (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c
@@ -1,427 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_separable_conv_HWC_q7_nonsquare.c
 * Description:  Q7 depthwise separable convolution function (non-square shape)
 *
 * $Date:        July 20, 2021
 * $Revision:    V.1.1.2
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
 *  @ingroup groupNN
 */
 /**
 * @addtogroup NNConv
 * @{
 */
 /**
 * @brief Q7 depthwise separable convolution function (non-square shape)
 * @param[in]       Im_in         pointer to input tensor
 * @param[in]       dim_im_in_x   input tensor dimension x
 * @param[in]       dim_im_in_y   input tensor dimension y
 * @param[in]       ch_im_in      number of input tensor channels
 * @param[in]       wt            pointer to kernel weights
 * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x  filter kernel size x
 * @param[in]       dim_kernel_y  filter kernel size y
 * @param[in]       padding_x     padding sizes x
 * @param[in]       padding_y     padding sizes y
 * @param[in]       stride_x      convolution stride x
 * @param[in]       stride_y      convolution stride y
 * @param[in]       bias          pointer to bias
 * @param[in]       bias_shift    amount of left-shift for bias
 * @param[in]       out_shift     amount of right-shift for output
 * @param[in,out]   Im_out        pointer to output tensor
 * @param[in]       dim_im_out_x  output tensor dimension x
 * @param[in]       dim_im_out_y  output tensor dimension y
 * @param[in,out]   bufferA       pointer to scratch space. The DSP path uses it as a q7 im2col
 *                                buffer and writes dim_kernel_x * dim_kernel_y * ch_im_in bytes
 *                                into it; the reference path does not touch it.
 * @param[in,out]   bufferB       not used by this function
 * @return     <code>ARM_MATH_SIZE_MISMATCH</code> when ch_im_in differs from ch_im_out,
 *             <code>ARM_MATH_SUCCESS</code> otherwise.
 *
 * Output channel c is computed only from input channel c and the weights of channel c,
 * which is why ch_im_in must be equal to ch_im_out. Input, output and weights are indexed
 * as (row * width + column) * channels + channel.
 *
 * Each accumulator starts from the bias shifted left by bias_shift plus NN_ROUND(out_shift);
 * the result is shifted right by out_shift and saturated to 8 bits.
 *
 */
 arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
                                                         const uint16_t dim_im_in_x,
                                                         const uint16_t dim_im_in_y,
                                                         const uint16_t ch_im_in,
                                                         const q7_t *wt,
                                                         const uint16_t ch_im_out,
                                                         const uint16_t dim_kernel_x,
                                                         const uint16_t dim_kernel_y,
                                                         const uint16_t padding_x,
                                                         const uint16_t padding_y,
                                                         const uint16_t stride_x,
                                                         const uint16_t stride_y,
                                                         const q7_t *bias,
                                                         const uint16_t bias_shift,
                                                         const uint16_t out_shift,
                                                         q7_t *Im_out,
                                                         const uint16_t dim_im_out_x,
                                                         const uint16_t dim_im_out_y,
                                                         q15_t *bufferA,
                                                         q7_t *bufferB)
 {
    (void)bufferB;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    /*
     * Implementation:
     * There are 3 nested loop here:
     * Inner loop: calculate each output value with MAC instruction over an accumulator
     * Mid   loop: loop over different output channel
     * Outer loop: loop over different output (x, y)
     *
     */
    int16_t i_out_y, i_out_x;
    int16_t i_ker_y, i_ker_x;
    /* bufferA is reused as a byte buffer holding one im2col column */
    q7_t *colBuffer = (q7_t *)bufferA;
    q7_t *pBuffer = colBuffer;
    const q7_t *pBias = bias;
    q7_t *pOut = Im_out;
    uint16_t rowCnt;
    uint16_t row_shift;
    /* do some checking here, basically ch_im_in == ch_im_out */
    if (ch_im_in != ch_im_out)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* we first do im2col here: gather every input pixel under the kernel window,
             * all channels of a pixel stored contiguously, zeros where the window is padded */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* arm_fill_q7(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, ch_im_in);
                    }
                    else
                    {
                        /* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer,
                         * ch_im_in); */
                        memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }
            /* we will do the computation here for each channel, four channels per pass */
            rowCnt = ch_im_out >> 2;
            row_shift = 0;
            pBias = bias;
            while (rowCnt)
            {
                q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                /* each pass of the main loop consumes two kernel positions */
                uint16_t colCnt = (dim_kernel_x * dim_kernel_y) >> 1;
                q7_t *pB = colBuffer + row_shift;
                const q7_t *pA = wt + row_shift;
                row_shift += 4;
 #ifdef USE_INTRINSIC
 #ifndef ARM_MATH_BIG_ENDIAN
                while (colCnt)
                {
                    q31_t inA1, inA2, inB1, inB2, opA, opB;
                    inB1 = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    opB = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    inB2 = __PKHTB(opB, inB1, 16);
                    inB1 = __PKHBT(inB1, opB, 16);
                    inA1 = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    opB = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inA2 = __PKHTB(opB, inA1, 16);
                    inA1 = __PKHBT(inA1, opB, 16);
                    opA = __SXTB16(inA1);
                    opB = __SXTB16(inB1);
                    sum = __SMLAD(opA, opB, sum);
                    opA = __SXTB16(__ROR(inA1, 8));
                    opB = __SXTB16(__ROR(inB1, 8));
                    sum2 = __SMLAD(opA, opB, sum2);
                    opA = __SXTB16(inA2);
                    opB = __SXTB16(inB2);
                    sum3 = __SMLAD(opA, opB, sum3);
                    opA = __SXTB16(__ROR(inA2, 8));
                    opB = __SXTB16(__ROR(inB2, 8));
                    sum4 = __SMLAD(opA, opB, sum4);
                    colCnt--;
                }
 #else
                while (colCnt)
                {
                    q31_t inA1, inA2, inB1, inB2, opA, opB;
                    inB1 = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    opB = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    inB2 = __PKHBT(opB, inB1, 16);
                    inB1 = __PKHTB(inB1, opB, 16);
                    inA1 = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    opB = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inA2 = __PKHBT(opB, inA1, 16);
                    inA1 = __PKHTB(inA1, opB, 16);
                    opA = __SXTB16(inA1);
                    opB = __SXTB16(inB1);
                    sum2 = __SMLAD(opA, opB, sum2);
                    opA = __SXTB16(__ROR(inA1, 8));
                    opB = __SXTB16(__ROR(inB1, 8));
                    sum = __SMLAD(opA, opB, sum);
                    opA = __SXTB16(inA2);
                    opB = __SXTB16(inB2);
                    sum4 = __SMLAD(opA, opB, sum4);
                    opA = __SXTB16(__ROR(inA2, 8));
                    opB = __SXTB16(__ROR(inB2, 8));
                    sum3 = __SMLAD(opA, opB, sum3);
                    colCnt--;
                }
 #endif /* ARM_MATH_BIG_ENDIAN */
 #else
                /*
                 * The assembly loop tests its counter only after the first pass, so it must
                 * not be entered with colCnt == 0 (kernel with a single element); the
                 * intrinsic variant above skips its loop in that case as well.
                 *
                 * Operand notes:
                 *  - the label carries %= so every emitted copy of the statement is unique
                 *  - colCnt is a read-write operand because "subs" counts it down in place
                 *  - "cc" is clobbered by "subs", "memory" because the loop reads through pA/pB
                 */
                if (colCnt != 0U)
                {
 #ifndef ARM_MATH_BIG_ENDIAN
                    /*
                     *   r0    r1    r2    r3    r4   r5
                     *  inA1, inA2, inB1, inB2, opA, opB
                     */
                    asm volatile("COL_LOOP_%=:\n"
                                 "ldr.w r2, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "pkhtb r3, r5, r2, ASR #16\n"
                                 "pkhbt r2, r2, r5, LSL #16\n"
                                 "ldr.w r0, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "pkhtb r1, r5, r0, ASR #16\n"
                                 "pkhbt r0, r0, r5, LSL #16\n"
                                 "sxtb16 r4, r0\n"
                                 "sxtb16 r5, r2\n"
                                 "smlad %[sum], r4, r5, %[sum]\n"
                                 "mov.w r4, r0, ror #8\n"
                                 "mov.w r5, r2, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum2], r4, r5, %[sum2]\n"
                                 "sxtb16 r4, r1\n"
                                 "sxtb16 r5, r3\n"
                                 "smlad %[sum3], r4, r5, %[sum3]\n"
                                 "mov.w r4, r1, ror #8\n"
                                 "mov.w r5, r3, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum4], r4, r5, %[sum4]\n"
                                 "subs %[colCnt], #1\n"
                                 "bne COL_LOOP_%=\n"
                                 : [ sum ] "+r"(sum),
                                   [ sum2 ] "+r"(sum2),
                                   [ sum3 ] "+r"(sum3),
                                   [ sum4 ] "+r"(sum4),
                                   [ pB ] "+r"(pB),
                                   [ pA ] "+r"(pA),
                                   [ colCnt ] "+r"(colCnt)
                                 : [ ch_im_in ] "r"(ch_im_in)
                                 : "r0", "r1", "r2", "r3", "r4", "r5", "cc", "memory");
 #else
                    /*
                     *   r0    r1    r2    r3    r4   r5
                     *  inA1, inA2, inB1, inB2, opA, opB
                     */
                    asm volatile("COL_LOOP_%=:\n"
                                 "ldr.w r2, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "pkhbt r3, r5, r2, LSL #16\n"
                                 "pkhtb r2, r2, r5, ASR #16\n"
                                 "ldr.w r0, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "pkhbt r1, r5, r0, LSL #16\n"
                                 "pkhtb r0, r0, r5, ASR #16\n"
                                 "sxtb16 r4, r0\n"
                                 "sxtb16 r5, r2\n"
                                 "smlad %[sum2], r4, r5, %[sum2]\n"
                                 "mov.w r4, r0, ror #8\n"
                                 "mov.w r5, r2, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum], r4, r5, %[sum]\n"
                                 "sxtb16 r4, r1\n"
                                 "sxtb16 r5, r3\n"
                                 "smlad %[sum4], r4, r5, %[sum4]\n"
                                 "mov.w r4, r1, ror #8\n"
                                 "mov.w r5, r3, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum3], r4, r5, %[sum3]\n"
                                 "subs %[colCnt], #1\n"
                                 "bne COL_LOOP_%=\n"
                                 : [ sum ] "+r"(sum),
                                   [ sum2 ] "+r"(sum2),
                                   [ sum3 ] "+r"(sum3),
                                   [ sum4 ] "+r"(sum4),
                                   [ pB ] "+r"(pB),
                                   [ pA ] "+r"(pA),
                                   [ colCnt ] "+r"(colCnt)
                                 : [ ch_im_in ] "r"(ch_im_in)
                                 : "r0", "r1", "r2", "r3", "r4", "r5", "cc", "memory");
 #endif /*ARM_MATH_BIG_ENDIAN */
                }
 #endif /* USE_INTRINSIC */
                /* an odd kernel element count leaves one position for a plain byte-wise MAC */
                colCnt = (dim_kernel_x * dim_kernel_y) & 0x1;
                while (colCnt)
                {
                    union arm_nnword inA, inB;
                    inA.word = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inB.word = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    sum += inA.bytes[0] * inB.bytes[0];
                    sum2 += inA.bytes[1] * inB.bytes[1];
                    sum3 += inA.bytes[2] * inB.bytes[2];
                    sum4 += inA.bytes[3] * inB.bytes[3];
                    colCnt--;
                }
                *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
                rowCnt--;
            }
            /* channels left over when ch_im_out is not a multiple of four, one at a time */
            rowCnt = ch_im_out & 0x3;
            while (rowCnt)
            {
                q7_t *pB = colBuffer + row_shift;
                const q7_t *pA = wt + row_shift;
                q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                uint16_t colCnt = (dim_kernel_x * dim_kernel_y);
                row_shift += 1;
                while (colCnt)
                {
                    q7_t A1 = *pA;
                    q7_t B1 = *pB;
                    pA += ch_im_in;
                    pB += ch_im_in;
                    sum += A1 * B1;
                    colCnt--;
                }
                *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
                rowCnt--;
            }
            /* rewind the im2col write pointer for the next output pixel */
            pBuffer = colBuffer;
        }
    }
 #else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i_out_y, i_out_x, i_ch_out;
    int i_ker_y, i_ker_x;
    /* do some checking here, basically ch_im_in == ch_im_out */
    if (ch_im_in != ch_im_out)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
    for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
            {
                /* for each output: start from the shifted bias plus the rounding term */
                int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
                for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++)
                {
                    for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++)
                    {
                        int in_row = stride_y * i_out_y + i_ker_y - padding_y;
                        int in_col = stride_x * i_out_x + i_ker_x - padding_x;
                        /* positions that fall into the padding contribute nothing */
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
                        {
                            conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + i_ch_out] *
                                wt[(i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out];
                        }
                    }
                }
                Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out + i_ch_out] =
                    (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
 #endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
 }
 /**
 * @} end of NNConv group
 */
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c
@@ -1,218 +0,0 @@
 /*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_depthwise_conv_s8_core.c
 * Description:  Depthwise convolution on im2col buffers.
 *
 * $Date:        09. October 2020
 * $Revision:    V.1.0.4
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */
 #include "arm_nnsupportfunctions.h"
 /*
 * Depthwise conv on an im2col buffer where the input channel equals
 * output channel.
 *
 * Refer to the header file for details.
 *
 * With ARM_MATH_MVEI defined, two im2col columns are processed per call: the
 * first starts at col, the second at col + kernel_size * num_ch. Their results
 * are written to out and out + num_ch, and the returned pointer is
 * out + 2 * num_ch. Channels are handled four at a time, followed by a
 * predicated pass for the remaining (num_ch & 3) channels.
 *
 * Without ARM_MATH_MVEI every argument is ignored and NULL is returned.
 *
 */
 q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
                                    const q15_t *col,
                                    const uint16_t num_ch,
                                    const int32_t *out_shift,
                                    const int32_t *out_mult,
                                    const int32_t out_offset,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const uint16_t kernel_size,
                                    const int32_t *const output_bias,
                                    q7_t *out)
 {
 #if defined(ARM_MATH_MVEI)
    int32_t ch_per_loop = num_ch / 4;
    const int32_t *bias = output_bias;
    int8_t *out_tmp = out;
    int32_t idx = 0;
    while (ch_per_loop > 0)
    {
        int32x4_t ip_0;
        int32x4_t ip_1;
        /* the main kernel loop is unrolled three rows deep */
        int32_t ker_loop = kernel_size / 3;
        /* both output columns start from the same per-channel bias */
        int32x4_t out_0 = vldrwq_s32(bias);
        int32x4_t out_1 = out_0;
        bias += 4;
        const int32_t offset = idx * 4;
        const int8_t *row_0 = row + offset;
        const int16_t *col_0 = col + offset;
        const int16_t *col_1 = col + kernel_size * num_ch + offset;
        int32x4_t ker_0 = vldrbq_s32(row_0);
        while (ker_loop > 0)
        {
            const int8_t *row_1 = row_0 + num_ch;
            const int8_t *row_2 = row_0 + 2 * num_ch;
            const int32x4_t ker_1 = vldrbq_s32(row_1);
            const int32x4_t ker_2 = vldrbq_s32(row_2);
            ip_0 = vldrhq_s32(col_0);
            ip_1 = vldrhq_s32(col_1);
            col_0 += num_ch;
            col_1 += num_ch;
            out_0 += vmulq_s32(ip_0, ker_0);
            out_1 += vmulq_s32(ip_1, ker_0);
            ip_0 = vldrhq_s32(col_0);
            ip_1 = vldrhq_s32(col_1);
            col_0 += num_ch;
            col_1 += num_ch;
            out_0 += vmulq_s32(ip_0, ker_1);
            out_1 += vmulq_s32(ip_1, ker_1);
            ip_0 = vldrhq_s32(col_0);
            ip_1 = vldrhq_s32(col_1);
            col_0 += num_ch;
            col_1 += num_ch;
            out_0 += vmulq_s32(ip_0, ker_2);
            out_1 += vmulq_s32(ip_1, ker_2);
            row_0 += 3 * num_ch;
            /* preload the first kernel row of the next pass (or of the tail loop).
             * NOTE(review): after the last row has been consumed this still loads from the
             * following row position; confirm the weight buffer tolerates that read. */
            ker_0 = vldrbq_s32(row_0);
            ker_loop--;
        }
        idx++;
        /* Handle tail kernel elements */
        ker_loop = kernel_size - ((kernel_size / 3) * 3);
        while (ker_loop > 0)
        {
            ip_0 = vldrhq_s32(col_0);
            ip_1 = vldrhq_s32(col_1);
            out_0 += vmulq_s32(ip_0, ker_0);
            out_1 += vmulq_s32(ip_1, ker_0);
            col_0 += num_ch;
            col_1 += num_ch;
            /* the column inputs are loaded at the top of each pass; no reload is needed
             * here, and on the last pass it would address data past the consumed column */
            row_0 += num_ch;
            ker_0 = vldrbq_s32(row_0);
            ker_loop--;
        }
        const int32x4_t mult = vldrwq_s32(out_mult);
        const int32x4_t shift = vldrwq_s32(out_shift);
        out_mult += 4;
        out_shift += 4;
        out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
        out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
        out_0 = vaddq_n_s32(out_0, out_offset);
        out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
        out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
        vstrbq_s32(out_tmp, out_0);
        out_1 = vaddq_n_s32(out_1, out_offset);
        out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
        out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
        vstrbq_s32(out_tmp + num_ch, out_1);
        out_tmp += 4;
        ch_per_loop--;
    }
    int32_t tail_ch = num_ch & 3;
    if (tail_ch != 0)
    {
        int32_t ch_idx = (num_ch & ~3);
        /* zero all lanes first: only tail_ch lanes are filled below, but the whole
         * vector goes through the requantize/offset/clamp steps */
        int32x4_t col_0_sum = vdupq_n_s32(0);
        int32x4_t col_1_sum = vdupq_n_s32(0);
        const int32_t single_buffer_size = kernel_size * num_ch;
        for (int i = 0; i < tail_ch; i++)
        {
            const int16_t *col_pos_0 = col + ch_idx;
            const int16_t *col_pos_1 = col_pos_0 + single_buffer_size;
            const int8_t *row_pos = row + ch_idx;
            /* bias already points past the channels handled by the vector loop */
            int32_t sum_0 = bias[i];
            int32_t sum_1 = bias[i];
            for (int j = 0; j < kernel_size; j++)
            {
                const int8_t row_val = row_pos[j * num_ch];
                sum_0 += row_val * col_pos_0[j * num_ch];
                sum_1 += row_val * col_pos_1[j * num_ch];
            }
            col_0_sum[i] = sum_0;
            col_1_sum[i] = sum_1;
            ch_idx++;
        }
        /* predicate covering the first tail_ch lanes for the loads and stores below */
        const mve_pred16_t p = vctp32q((uint32_t)tail_ch);
        const int32x4_t mult = vldrwq_z_s32(out_mult, p);
        const int32x4_t shift = vldrwq_z_s32(out_shift, p);
        col_0_sum = arm_requantize_mve_32x4(col_0_sum, mult, shift);
        col_1_sum = arm_requantize_mve_32x4(col_1_sum, mult, shift);
        col_0_sum = vaddq_n_s32(col_0_sum, out_offset);
        col_0_sum = vmaxq_s32(col_0_sum, vdupq_n_s32(activation_min));
        col_0_sum = vminq_s32(col_0_sum, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out_tmp, col_0_sum, p);
        col_1_sum = vaddq_n_s32(col_1_sum, out_offset);
        col_1_sum = vmaxq_s32(col_1_sum, vdupq_n_s32(activation_min));
        col_1_sum = vminq_s32(col_1_sum, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out_tmp + num_ch, col_1_sum, p);
        out_tmp += tail_ch;
    }
    /* out_tmp has advanced over the first output column; skip the second one as well */
    return out_tmp + num_ch;
 #else
    (void)row;
    (void)col;
    (void)num_ch;
    (void)out_shift;
    (void)out_mult;
    (void)out_offset;
    (void)activation_min;
    (void)activation_max;
    (void)kernel_size;
    (void)output_bias;
    (void)out;
    return NULL;
 #endif
 }
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c
@@ -1,186 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_mat_mult_kernel_q7_q15.c
 * Description:  Matrix-multiplication function for convolution
 *
 * $Date:        January 26, 2021
 * $Revision:    V.1.0.2
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
  * @brief Matrix-multiplication function for convolution.
  *
  * @details Multiplies the rows of the q7 matrix A with two consecutive q15
  *          columns held in pInBuffer. Every accumulator starts from the row
  *          bias shifted left by bias_shift plus NN_ROUND(out_shift), and is
  *          shifted right by out_shift and saturated to 8 bits on output.
  *
  * @param[in]     pA          rows of A, each numCol_A elements long, stored back to back
  * @param[in]     pInBuffer   two q15 columns of numCol_A elements each, stored back to back
  * @param[in]     ch_im_out   number of rows of A (outputs produced per column)
  * @param[in]     numCol_A    number of elements in one row of A
  * @param[in]     bias_shift  left shift applied to each bias value
  * @param[in]     out_shift   right shift applied to each accumulator
  * @param[in]     bias        one q7 bias per row of A
  * @param[in,out] pOut        receives ch_im_out results for the first column followed by
  *                            ch_im_out results for the second column
  *
  * @return pOut advanced by 2 * ch_im_out when ARM_MATH_DSP is defined, NULL otherwise.
  */
 q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA,
                                     const q15_t *pInBuffer,
                                     const uint16_t ch_im_out,
                                     const uint16_t numCol_A,
                                     const uint16_t bias_shift,
                                     const uint16_t out_shift,
                                     const q7_t *bias,
                                     q7_t *pOut)
 {
 #if defined(ARM_MATH_DSP)
    /* results for the second column land ch_im_out entries after the first */
    q7_t *out_col_1 = pOut + ch_im_out;
    const q7_t *bias_ptr = bias;
    /* walk the rows of A two at a time */
    for (uint16_t pairs_left = ch_im_out >> 1; pairs_left != 0; pairs_left--)
    {
        /* both columns of B are re-read from the start for every row pair */
        const q15_t *col_0 = pInBuffer;
        const q15_t *col_1 = col_0 + numCol_A;
        /* the second row of the pair directly follows the first */
        const q7_t *row_1 = pA + numCol_A;
        /* one seed per row: shifted bias plus the rounding term */
        const q31_t seed_r0 = ((q31_t)bias_ptr[0] << bias_shift) + NN_ROUND(out_shift);
        const q31_t seed_r1 = ((q31_t)bias_ptr[1] << bias_shift) + NN_ROUND(out_shift);
        bias_ptr += 2;
        q31_t acc_r0_c0 = seed_r0;
        q31_t acc_r0_c1 = seed_r0;
        q31_t acc_r1_c0 = seed_r1;
        q31_t acc_r1_c1 = seed_r1;
        /* four elements of each row per iteration */
        for (uint16_t quads_left = numCol_A >> 2; quads_left != 0; quads_left--)
        {
            q31_t a0_first, a0_second, a1_first, a1_second;
            q31_t b0 = arm_nn_read_q15x2_ia(&col_0);
            q31_t b1 = arm_nn_read_q15x2_ia(&col_1);
            pA = read_and_pad(pA, &a0_first, &a0_second);
            row_1 = read_and_pad(row_1, &a1_first, &a1_second);
            acc_r0_c0 = __SMLAD(a0_first, b0, acc_r0_c0);
            acc_r0_c1 = __SMLAD(a0_first, b1, acc_r0_c1);
            acc_r1_c0 = __SMLAD(a1_first, b0, acc_r1_c0);
            acc_r1_c1 = __SMLAD(a1_first, b1, acc_r1_c1);
            b0 = arm_nn_read_q15x2_ia(&col_0);
            b1 = arm_nn_read_q15x2_ia(&col_1);
            acc_r0_c0 = __SMLAD(a0_second, b0, acc_r0_c0);
            acc_r0_c1 = __SMLAD(a0_second, b1, acc_r0_c1);
            acc_r1_c0 = __SMLAD(a1_second, b0, acc_r1_c0);
            acc_r1_c1 = __SMLAD(a1_second, b1, acc_r1_c1);
        }
        /* remaining (numCol_A % 4) elements, one at a time */
        for (uint16_t tail_left = numCol_A & 0x3; tail_left != 0; tail_left--)
        {
            const q7_t a0 = *pA++;
            const q15_t b0 = *col_0++;
            const q7_t a1 = *row_1++;
            const q15_t b1 = *col_1++;
            acc_r0_c0 += a0 * b0;
            acc_r0_c1 += a0 * b1;
            acc_r1_c0 += a1 * b0;
            acc_r1_c1 += a1 * b1;
        }
        /* scale, saturate to 8 bits and store two rows for each column */
        pOut[0] = (q7_t)__SSAT((acc_r0_c0 >> out_shift), 8);
        pOut[1] = (q7_t)__SSAT((acc_r1_c0 >> out_shift), 8);
        pOut += 2;
        out_col_1[0] = (q7_t)__SSAT((acc_r0_c1 >> out_shift), 8);
        out_col_1[1] = (q7_t)__SSAT((acc_r1_c1 >> out_shift), 8);
        out_col_1 += 2;
        /* pA has consumed one row; step over the row handled through row_1 */
        pA += numCol_A;
    }
    /* a single row remains when ch_im_out is odd */
    if (ch_im_out & 0x1)
    {
        const q15_t *col_0 = pInBuffer;
        const q15_t *col_1 = col_0 + numCol_A;
        const q31_t seed = ((q31_t)(*bias_ptr) << bias_shift) + NN_ROUND(out_shift);
        q31_t acc_c0 = seed;
        q31_t acc_c1 = seed;
        for (uint16_t quads_left = numCol_A >> 2; quads_left != 0; quads_left--)
        {
            q31_t a_first, a_second;
            q31_t b0 = arm_nn_read_q15x2_ia(&col_0);
            q31_t b1 = arm_nn_read_q15x2_ia(&col_1);
            pA = read_and_pad(pA, &a_first, &a_second);
            acc_c0 = __SMLAD(a_first, b0, acc_c0);
            acc_c1 = __SMLAD(a_first, b1, acc_c1);
            b0 = arm_nn_read_q15x2_ia(&col_0);
            b1 = arm_nn_read_q15x2_ia(&col_1);
            acc_c0 = __SMLAD(a_second, b0, acc_c0);
            acc_c1 = __SMLAD(a_second, b1, acc_c1);
        }
        for (uint16_t tail_left = numCol_A & 0x3; tail_left != 0; tail_left--)
        {
            const q7_t a0 = *pA++;
            const q15_t b0 = *col_0++;
            const q15_t b1 = *col_1++;
            acc_c0 += a0 * b0;
            acc_c1 += a0 * b1;
        }
        *pOut++ = (q7_t)__SSAT((acc_c0 >> out_shift), 8);
        *out_col_1++ = (q7_t)__SSAT((acc_c1 >> out_shift), 8);
    }
    /* skip the second column's results so the caller continues after them */
    return pOut + ch_im_out;
 #else
    /* no implementation without the DSP extension; silence unused warnings */
    (void)pA;
    (void)pInBuffer;
    (void)ch_im_out;
    (void)numCol_A;
    (void)bias_shift;
    (void)out_shift;
    (void)bias;
    (void)pOut;
    /* To be completed */
    return NULL;
 #endif /* ARM_MATH_DSP */
 }
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c
@@ -1,137 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_mat_mult_kernel_q7_q15_reordered.c
 * Description:  Matrix-multiplication function for convolution with reordered columns
 *
 * $Date:        January 26, 2021
 * $Revision:    V.1.0.2
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /**
  * @brief Matrix-multiplication function for convolution with re-ordered input.
  *
  * @details Multiplies the rows of the q7 matrix A with two consecutive q15
  *          columns held in pInBuffer, reading A through read_and_pad_reordered().
  *          Every accumulator starts from the row bias shifted left by bias_shift
  *          plus NN_ROUND(out_shift), and is shifted right by out_shift and
  *          saturated to 8 bits on output. Rows are processed in pairs; a
  *          left-over row (odd ch_im_out) is processed on its own so that no
  *          bias, row of A or output slot beyond ch_im_out is touched.
  *
  * @param[in]     pA          rows of A, each numCol_A elements long, stored back to back
  * @param[in]     pInBuffer   two q15 columns of numCol_A elements each, stored back to back
  * @param[in]     ch_im_out   number of rows of A (outputs produced per column)
  * @param[in]     numCol_A    number of elements in one row of A
  * @param[in]     bias_shift  left shift applied to each bias value
  * @param[in]     out_shift   right shift applied to each accumulator
  * @param[in]     bias        one q7 bias per row of A
  * @param[in,out] pOut        receives ch_im_out results for the first column followed by
  *                            ch_im_out results for the second column
  *
  * @return pOut advanced by 2 * ch_im_out when ARM_MATH_DSP is defined, NULL otherwise.
  */
 q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA,
                                               const q15_t *pInBuffer,
                                               const uint16_t ch_im_out,
                                               const uint16_t numCol_A,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               const q7_t *bias,
                                               q7_t *pOut)
 {
 #if defined(ARM_MATH_DSP)
    /* set up the second output pointers */
    q7_t *pOut2 = pOut + ch_im_out;
    int i;
    /* this loop over complete row pairs in A; "i + 1" keeps an odd last row out of it */
    for (i = 0; i + 1 < ch_im_out; i += 2)
    {
        /* setup pointers for B */
        const q15_t *pB = pInBuffer;
        const q15_t *pB2 = pB + numCol_A;
        /* align the second pointer for A */
        const q7_t *pA2 = pA + numCol_A;
        /* init the sum with bias */
        q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum2 = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum3 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum4 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift);
        uint16_t colCnt = numCol_A >> 2;
        /* accumulate over the vector */
        while (colCnt)
        {
            q31_t inA11, inA12, inA21, inA22;
            q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
            q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
            pA = read_and_pad_reordered(pA, &inA11, &inA12);
            pA2 = read_and_pad_reordered(pA2, &inA21, &inA22);
            sum = __SMLAD(inA11, inB1, sum);
            sum2 = __SMLAD(inA11, inB2, sum2);
            sum3 = __SMLAD(inA21, inB1, sum3);
            sum4 = __SMLAD(inA21, inB2, sum4);
            inB1 = arm_nn_read_q15x2_ia(&pB);
            inB2 = arm_nn_read_q15x2_ia(&pB2);
            sum = __SMLAD(inA12, inB1, sum);
            sum2 = __SMLAD(inA12, inB2, sum2);
            sum3 = __SMLAD(inA22, inB1, sum3);
            sum4 = __SMLAD(inA22, inB2, sum4);
            colCnt--;
        } /* while over colCnt */
        colCnt = numCol_A & 0x3;
        while (colCnt)
        {
            q7_t inA1 = *pA++;
            q15_t inB1 = *pB++;
            q7_t inA2 = *pA2++;
            q15_t inB2 = *pB2++;
            sum += inA1 * inB1;
            sum2 += inA1 * inB2;
            sum3 += inA2 * inB1;
            sum4 += inA2 * inB2;
            colCnt--;
        } /* while over colCnt */
        *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
        *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
        *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
        *pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
        /* skip the row computed with A2 */
        pA += numCol_A;
    } /* for over ch_im_out */
    /* compute left-over row if any; here i == ch_im_out - 1 and pA points at that row */
    if (ch_im_out & 0x1)
    {
        /* setup pointers for B */
        const q15_t *pB = pInBuffer;
        const q15_t *pB2 = pB + numCol_A;
        /* both columns start from the same bias-derived value */
        q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum2 = sum;
        uint16_t colCnt = numCol_A >> 2;
        while (colCnt)
        {
            q31_t inA11, inA12;
            q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
            q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
            pA = read_and_pad_reordered(pA, &inA11, &inA12);
            sum = __SMLAD(inA11, inB1, sum);
            sum2 = __SMLAD(inA11, inB2, sum2);
            inB1 = arm_nn_read_q15x2_ia(&pB);
            inB2 = arm_nn_read_q15x2_ia(&pB2);
            sum = __SMLAD(inA12, inB1, sum);
            sum2 = __SMLAD(inA12, inB2, sum2);
            colCnt--;
        }
        colCnt = numCol_A & 0x3;
        while (colCnt)
        {
            q7_t inA1 = *pA++;
            q15_t inB1 = *pB++;
            q15_t inB2 = *pB2++;
            sum += inA1 * inB1;
            sum2 += inA1 * inB2;
            colCnt--;
        }
        *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
        *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
    }
    pOut += ch_im_out;
    /* return the new output pointer with offset */
    return pOut;
 #else
    /* no implementation without the DSP extension; silence unused warnings */
    (void)pA;
    (void)pInBuffer;
    (void)ch_im_out;
    (void)numCol_A;
    (void)bias_shift;
    (void)out_shift;
    (void)bias;
    (void)pOut;
    /* To be completed */
    return NULL;
 #endif /* ARM_MATH_DSP */
 }
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c
@@ -1,245 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_mat_mult_kernel_s8_s16.c
 * Description:  Matrix-multiplication function for convolution
 *
 * $Date:        14. December 2021
 * $Revision:    V.1.1.0
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /*
  * Matrix-multiplication function for convolution with per-channel requantization.
  *
  * Multiplies the rows of input_a (one row of num_col_a elements per output
  * channel) with two consecutive columns of num_col_a elements in input_b.
  * Each accumulator starts from the channel's bias (0 when output_bias is NULL),
  * is requantized with that channel's out_mult / out_shift, offset by out_offset
  * and clamped to [activation_min, activation_max]. Results for the first column
  * are written at out_0, results for the second column at out_0 + output_ch.
  *
  * Returns out_0 advanced by 2 * output_ch, or NULL when ARM_MATH_MVEI is defined.
  *
  * Refer header file for details.
  *
  */
 q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
                                     const q15_t *input_b,
                                     const uint16_t output_ch,
                                     const int32_t *out_shift,
                                     const int32_t *out_mult,
                                     const int32_t out_offset,
                                     const int16_t activation_min,
                                     const int16_t activation_max,
                                     const uint16_t num_col_a,
                                     const int32_t *const output_bias,
                                     q7_t *out_0)
 {
 #if !defined(ARM_MATH_MVEI)
    /* results for the second column land output_ch entries after the first */
    q7_t *out_1 = out_0 + output_ch;
    const int32_t *bias = output_bias;
    const q7_t *ip_a0 = input_a;
    /* walk the rows of A two at a time */
    for (uint16_t pairs_left = output_ch / 2; pairs_left != 0; pairs_left--)
    {
        /* both columns of B are re-read from the start for every row pair */
        const q15_t *ip_b0 = input_b;
        const q15_t *ip_b1 = ip_b0 + num_col_a;
        /* the second row of the pair directly follows the first */
        const q7_t *ip_a1 = ip_a0 + num_col_a;
        /* accumulators are named acc_<row><column> */
        q31_t acc_00 = 0;
        q31_t acc_01 = 0;
        q31_t acc_10 = 0;
        q31_t acc_11 = 0;
        /* seed with the bias of channel N and N + 1 when a bias is supplied */
        if (bias)
        {
            acc_00 = bias[0];
            acc_01 = bias[0];
            acc_10 = bias[1];
            acc_11 = bias[1];
            bias += 2;
        }
 #if defined(ARM_MATH_DSP)
        uint16_t col_count = num_col_a / 4;
        /* four elements of each row per iteration */
        for (; col_count != 0; col_count--)
        {
            q31_t a0_first, a0_second, a1_first, a1_second;
            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
            ip_a0 = read_and_pad(ip_a0, &a0_first, &a0_second);
            ip_a1 = read_and_pad(ip_a1, &a1_first, &a1_second);
            acc_00 = __SMLAD(a0_first, b0, acc_00);
            acc_01 = __SMLAD(a0_first, b1, acc_01);
            acc_10 = __SMLAD(a1_first, b0, acc_10);
            acc_11 = __SMLAD(a1_first, b1, acc_11);
            b0 = arm_nn_read_q15x2_ia(&ip_b0);
            b1 = arm_nn_read_q15x2_ia(&ip_b1);
            acc_00 = __SMLAD(a0_second, b0, acc_00);
            acc_01 = __SMLAD(a0_second, b1, acc_01);
            acc_10 = __SMLAD(a1_second, b0, acc_10);
            acc_11 = __SMLAD(a1_second, b1, acc_11);
        }
        /* remaining (num_col_a % 4) elements go through the scalar loop */
        col_count = num_col_a & 0x3;
 #else
        /* without DSP every element goes through the scalar loop */
        uint16_t col_count = num_col_a;
 #endif
        for (; col_count != 0; col_count--)
        {
            const q7_t a0 = *ip_a0++;
            const q15_t b0 = *ip_b0++;
            const q7_t a1 = *ip_a1++;
            const q15_t b1 = *ip_b1++;
            acc_00 += a0 * b0;
            acc_01 += a0 * b1;
            acc_10 += a1 * b0;
            acc_11 += a1 * b1;
        }
        /* channel N: both columns use out_mult[0] / out_shift[0] */
        acc_00 = arm_nn_requantize(acc_00, out_mult[0], out_shift[0]);
        acc_00 += out_offset;
        acc_00 = MIN(MAX(acc_00, activation_min), activation_max);
        *out_0++ = (q7_t)acc_00;
        acc_01 = arm_nn_requantize(acc_01, out_mult[0], out_shift[0]);
        acc_01 += out_offset;
        acc_01 = MIN(MAX(acc_01, activation_min), activation_max);
        *out_1++ = (q7_t)acc_01;
        /* channel N + 1: both columns use out_mult[1] / out_shift[1] */
        acc_10 = arm_nn_requantize(acc_10, out_mult[1], out_shift[1]);
        acc_10 += out_offset;
        acc_10 = MIN(MAX(acc_10, activation_min), activation_max);
        *out_0++ = (q7_t)acc_10;
        acc_11 = arm_nn_requantize(acc_11, out_mult[1], out_shift[1]);
        acc_11 += out_offset;
        acc_11 = MIN(MAX(acc_11, activation_min), activation_max);
        *out_1++ = (q7_t)acc_11;
        out_mult += 2;
        out_shift += 2;
        /* ip_a0 has consumed one row; step over the row handled through ip_a1 */
        ip_a0 += num_col_a;
    }
    /* a single row remains when output_ch is odd */
    if (output_ch & 0x1)
    {
        const q15_t *ip_b0 = input_b;
        const q15_t *ip_b1 = ip_b0 + num_col_a;
        q31_t acc_0 = 0;
        q31_t acc_1 = 0;
        if (bias)
        {
            acc_0 = *bias;
            acc_1 = *bias;
        }
 #if defined(ARM_MATH_DSP)
        uint16_t col_count = num_col_a >> 2;
        for (; col_count != 0; col_count--)
        {
            q31_t a_first, a_second;
            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
            ip_a0 = read_and_pad(ip_a0, &a_first, &a_second);
            acc_0 = __SMLAD(a_first, b0, acc_0);
            acc_1 = __SMLAD(a_first, b1, acc_1);
            b0 = arm_nn_read_q15x2_ia(&ip_b0);
            b1 = arm_nn_read_q15x2_ia(&ip_b1);
            acc_0 = __SMLAD(a_second, b0, acc_0);
            acc_1 = __SMLAD(a_second, b1, acc_1);
        }
        col_count = num_col_a & 0x3;
 #else
        uint16_t col_count = num_col_a;
 #endif
        for (; col_count != 0; col_count--)
        {
            const q7_t a0 = *ip_a0++;
            const q15_t b0 = *ip_b0++;
            const q15_t b1 = *ip_b1++;
            acc_0 += a0 * b0;
            acc_1 += a0 * b1;
        }
        acc_0 = arm_nn_requantize(acc_0, *out_mult, *out_shift);
        acc_0 += out_offset;
        acc_0 = MIN(MAX(acc_0, activation_min), activation_max);
        *out_0++ = (q7_t)acc_0;
        acc_1 = arm_nn_requantize(acc_1, *out_mult, *out_shift);
        acc_1 += out_offset;
        acc_1 = MIN(MAX(acc_1, activation_min), activation_max);
        *out_1++ = (q7_t)acc_1;
    }
    /* skip the second column's results so the caller continues after them */
    return out_0 + output_ch;
 #else
    /* not implemented for MVE builds; silence unused warnings */
    (void)input_a;
    (void)input_b;
    (void)output_ch;
    (void)out_shift;
    (void)out_mult;
    (void)out_offset;
    (void)activation_min;
    (void)activation_max;
    (void)num_col_a;
    (void)output_bias;
    (void)out_0;
    /* To be completed */
    return NULL;
 #endif
 }
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
@@ -1,201 +0,0 @@
 /*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_mat_mult_kernel_s8_s16_reordered.c
 * Description:  Matrix-multiplication function for convolution with reordered columns
 *
 * $Date:        09. October 2020
 * $Revision:    V.1.0.3
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 /*
  * Matrix-multiplication with re-ordered input and bias inputs for convolution with per-channel
  *        requantization. The re-ordering is a consequence of how sign extension is done by the
  *        SXTB16 instruction.
  *
  * Refer header file for details. This function differs from arm_nn_mat_mult_kernel_s8_s16() in that it uses
  *        read_and_pad_reordered() instead of read_and_pad(). Investigating the cycles impact and
  *        unifying these two functions is a potential future improvement.
  *
  * Each accumulator starts from the channel's bias (0 when output_bias is NULL, matching
  *        arm_nn_mat_mult_kernel_s8_s16()), is requantized with that channel's out_mult / out_shift,
  *        offset by out_offset and clamped to [activation_min, activation_max]. Results for the first
  *        column are written at out_0, results for the second column at out_0 + output_ch.
  *
  * Only num_col_a / 4 groups of four elements are accumulated per row; there is no scalar tail, so
  *        num_col_a is expected to be a multiple of 4.
  *
  * Returns out_0 advanced by 2 * output_ch when ARM_MATH_DSP is defined, NULL otherwise.
  *
  */
 q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
                                               const q15_t *input_b,
                                               const uint16_t output_ch,
                                               const int32_t *out_shift,
                                               const int32_t *out_mult,
                                               const int32_t out_offset,
                                               const int16_t activation_min,
                                               const int16_t activation_max,
                                               const uint16_t num_col_a,
                                               const int32_t *const output_bias,
                                               q7_t *out_0)
 {
 #if defined(ARM_MATH_DSP)
    /* set up the second output pointers */
    q7_t *out_1 = out_0 + output_ch;
    const int32_t *bias = output_bias;
    uint16_t row_count = output_ch / 2;
    const q7_t *ip_a0 = input_a;
    /* this loop over rows in A */
    while (row_count)
    {
        /* setup pointers for B */
        const q15_t *ip_b0 = input_b;
        const q15_t *ip_b1 = ip_b0 + num_col_a;
        /* align the second pointer for A */
        const q7_t *ip_a1 = ip_a0 + num_col_a;
        q31_t ch_0_out_0 = 0;
        q31_t ch_0_out_1 = 0;
        q31_t ch_1_out_0 = 0;
        q31_t ch_1_out_1 = 0;
        /* Init accumulator with bias for channel N and N + 1; a NULL bias means "no bias" */
        if (bias)
        {
            ch_0_out_0 = *bias;
            ch_0_out_1 = *bias++;
            ch_1_out_0 = *bias;
            ch_1_out_1 = *bias++;
        }
        uint16_t col_count = num_col_a / 4;
        /* accumulate over the vector */
        while (col_count)
        {
            q31_t a01, a02, a11, a12;
            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
            ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
            ip_a1 = read_and_pad_reordered(ip_a1, &a11, &a12);
            ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
            ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
            ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
            ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);
            b0 = arm_nn_read_q15x2_ia(&ip_b0);
            b1 = arm_nn_read_q15x2_ia(&ip_b1);
            ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
            ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
            ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
            ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);
            col_count--;
        } /* while over col_count */
        /* channel N: requantize, offset, clamp and store for both columns */
        ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
        ch_0_out_0 += out_offset;
        ch_0_out_0 = MAX(ch_0_out_0, activation_min);
        ch_0_out_0 = MIN(ch_0_out_0, activation_max);
        *out_0++ = (q7_t)ch_0_out_0;
        ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
        ch_0_out_1 += out_offset;
        ch_0_out_1 = MAX(ch_0_out_1, activation_min);
        ch_0_out_1 = MIN(ch_0_out_1, activation_max);
        *out_1++ = (q7_t)ch_0_out_1;
        out_mult++;
        out_shift++;
        /* channel N + 1 */
        ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
        ch_1_out_0 += out_offset;
        ch_1_out_0 = MAX(ch_1_out_0, activation_min);
        ch_1_out_0 = MIN(ch_1_out_0, activation_max);
        *out_0++ = (q7_t)ch_1_out_0;
        ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
        ch_1_out_1 += out_offset;
        ch_1_out_1 = MAX(ch_1_out_1, activation_min);
        ch_1_out_1 = MIN(ch_1_out_1, activation_max);
        *out_1++ = (q7_t)ch_1_out_1;
        out_mult++;
        out_shift++;
        /* skip row */
        ip_a0 += num_col_a;
        row_count--;
    }
    /* compute the last odd numbered row if any */
    if (output_ch & 1)
    {
        /* setup pointers for B */
        const q15_t *ip_b0 = input_b;
        const q15_t *ip_b1 = ip_b0 + num_col_a;
        /* Init accumulator with bias for the last channel; a NULL bias means "no bias" */
        q31_t ch_0_out_0 = 0;
        if (bias)
        {
            ch_0_out_0 = *bias;
        }
        q31_t ch_0_out_1 = ch_0_out_0;
        int32_t col_count = num_col_a / 4;
        while (col_count)
        {
            q31_t a01, a02;
            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
            ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
            ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
            ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
            b0 = arm_nn_read_q15x2_ia(&ip_b0);
            b1 = arm_nn_read_q15x2_ia(&ip_b1);
            ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
            ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
            col_count--;
        } /* while over col_count */
        ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
        ch_0_out_0 += out_offset;
        ch_0_out_0 = MAX(ch_0_out_0, activation_min);
        ch_0_out_0 = MIN(ch_0_out_0, activation_max);
        *out_0++ = (q7_t)ch_0_out_0;
        ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
        ch_0_out_1 += out_offset;
        ch_0_out_1 = MAX(ch_0_out_1, activation_min);
        ch_0_out_1 = MIN(ch_0_out_1, activation_max);
        *out_1++ = (q7_t)ch_0_out_1;
    }
    out_0 += output_ch;
    /* return the new output pointer with offset */
    return out_0;
 #else
    /* no implementation without the DSP extension; silence unused warnings */
    (void)input_a;
    (void)input_b;
    (void)output_ch;
    (void)out_shift;
    (void)out_mult;
    (void)out_offset;
    (void)activation_min;
    (void)activation_max;
    (void)num_col_a;
    (void)output_bias;
    (void)out_0;
    /* To be completed */
    return NULL;
 #endif
 }
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
@@ -1,180 +0,0 @@
 /*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_mat_mult_s8.c
 * Description:  General Matrix-multiplication function
 *
 * $Date:        27. October 2021
 * $Revision:    V.2.0.6
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */
 #include "arm_nnsupportfunctions.h"
 /*
  * s8 General matrix multiplication function with per-channel requantization for up to 4 column batches.
  *
  * For every output channel, the row of input_row belonging to that channel (row_len elements)
  * is multiplied with each column batch of input_col (row_len elements per batch) after col_offset
  * has been added to the column values. The sum gets the channel's bias (when bias is non-NULL),
  * is requantized with output_mult / output_shift of that channel, offset by out_offset and
  * clamped to [activation_min, activation_max]. Results of one column batch occupy output_ch
  * consecutive entries of out.
  *
  * row_offset is not used. Only the ARM_MATH_MVEI build has an implementation; other builds
  * return NULL. The MVEI build returns out advanced past the entries it wrote.
  *
  * Refer header file for details.
  *
  */
 q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
                          const q7_t *input_col,
                          const uint16_t output_ch,
                          const uint16_t col_batches,
                          const int32_t *output_shift,
                          const int32_t *output_mult,
                          const int32_t out_offset,
                          const int32_t col_offset,
                          const int32_t row_offset,
                          const int16_t activation_min,
                          const int16_t activation_max,
                          const uint16_t row_len,
                          const int32_t *const bias,
                          q7_t *out)
 {
 #if defined(ARM_MATH_MVEI)
    (void)row_offset;
    /* Exactly four column batches: compute all four dot products of a channel together. */
    if (col_batches == 4)
    {
        for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
        {
            /* Elements of the row still to be processed; drives the tail predicate. */
            int32_t row_len_tmp = row_len;
            const int8_t *ip_r0 = input_row + (i_out_ch * row_len);
            /* The four column batches are stored back to back, row_len elements each. */
            const int8_t *ip_c0 = input_col;
            const int8_t *ip_c1 = input_col + row_len;
            const int8_t *ip_c2 = input_col + (2 * row_len);
            const int8_t *ip_c3 = input_col + (3 * row_len);
            int32_t acc_0 = 0;
            int32_t acc_1 = 0;
            int32_t acc_2 = 0;
            int32_t acc_3 = 0;
            /* Eight elements per iteration, rounding up so a partial last group is included. */
            const int32_t row_loop_cnt = (row_len + 7) / 8;
            for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
            {
                /* Predicate for the lanes that still correspond to row elements. */
                mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
                const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
                row_len_tmp -= 8;
                /* NOTE(review): the column loads are not predicated, so the last group appears to
                 * read a full 8 bytes even when fewer remain - confirm the buffers allow this. */
                int16x8_t c0 = vldrbq_s16(ip_c0);
                ip_c0 += 8;
                c0 = vaddq_s16(c0, offset);
                int16x8_t c1 = vldrbq_s16(ip_c1);
                ip_c1 += 8;
                c1 = vaddq_s16(c1, offset);
                int16x8_t c2 = vldrbq_s16(ip_c2);
                ip_c2 += 8;
                c2 = vaddq_s16(c2, offset);
                int16x8_t c3 = vldrbq_s16(ip_c3);
                ip_c3 += 8;
                c3 = vaddq_s16(c3, offset);
                /* The row load and the accumulations use the predicate p. */
                int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
                ip_r0 += 8;
                acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
                acc_1 = vmladavaq_p_s16(acc_1, r0, c1, p);
                acc_2 = vmladavaq_p_s16(acc_2, r0, c2, p);
                acc_3 = vmladavaq_p_s16(acc_3, r0, c3, p);
            }
            /* One lane per column batch; all four share this channel's bias and scaling. */
            int32x4_t res = {acc_0, acc_1, acc_2, acc_3};
            if (bias)
            {
                res = vaddq_n_s32(res, bias[i_out_ch]);
            }
            res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
            res = vaddq_n_s32(res, out_offset);
            res = vmaxq_s32(res, vdupq_n_s32(activation_min));
            res = vminq_s32(res, vdupq_n_s32(activation_max));
            /* Results of consecutive batches for this channel are output_ch entries apart. */
            const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
            vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res);
        }
        out += 4 * output_ch;
    }
    else
    {
        /* One batch at a time. For col_batches of 1..3 the loop runs from 0 to col_batches - 1.
         * For col_batches above 4 the start index (col_batches & ~0x3) is at least 4 while the
         * bound (col_batches & 0x3) is at most 3, so the body never executes. */
        for (int i_col_batch = (col_batches & ~0x3); i_col_batch < (col_batches & 0x3); i_col_batch++)
        {
            for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
            {
                int32_t row_len_tmp = row_len;
                const int8_t *ip_r0 = input_row + (i_out_ch * row_len);
                const int8_t *ip_c0 = input_col + (i_col_batch * row_len);
                int32_t acc_0 = 0;
                /* Eight elements per iteration, rounding up so a partial last group is included. */
                const int32_t row_loop_cnt = (row_len + 7) / 8;
                for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
                {
                    /* Predicate for the lanes that still correspond to row elements. */
                    const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
                    const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
                    row_len_tmp -= 8;
                    int16x8_t c0 = vldrbq_s16(ip_c0);
                    ip_c0 += 8;
                    c0 = vaddq_s16(c0, offset);
                    int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
                    ip_r0 += 8;
                    acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
                }
                if (bias)
                {
                    acc_0 += bias[i_out_ch];
                }
                /* Scalar requantize, offset and clamp for the single result. */
                acc_0 = arm_nn_requantize(acc_0, output_mult[i_out_ch], output_shift[i_out_ch]);
                acc_0 += out_offset;
                acc_0 = MAX(acc_0, activation_min);
                acc_0 = MIN(acc_0, activation_max);
                out[i_out_ch] = (q7_t)acc_0;
            }
            /* The next batch writes its output_ch results right after this one. */
            out += output_ch;
        }
    }
    return out;
 #else
    /* No implementation without MVE; silence unused-parameter warnings. */
    (void)input_row;
    (void)input_col;
    (void)output_ch;
    (void)col_batches;
    (void)output_shift;
    (void)output_mult;
    (void)out_offset;
    (void)col_offset;
    (void)row_offset;
    (void)activation_min;
    (void)activation_max;
    (void)row_len;
    (void)bias;
    (void)out;
    return NULL;
 #endif
 }
--- a/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/CMakeLists.txt
+++ b/MATLAB/MCU_STM32_Matlab/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/CMakeLists.txt
@@ -1,21 +0,0 @@
 #
 # Copyright (c) 2019-2021 Arm Limited.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Collect every file in this directory whose name ends in "_s8.c".
 file(GLOB SRC "./*_s8.c")
 # Add the globbed files, plus arm_fully_connected_s16.c (not matched by the pattern above),
 # as private sources of the cmsis-nn target.
 target_sources(cmsis-nn PRIVATE ${SRC} arm_fully_connected_s16.c)
--- a/Show More
+++ b/Show More